From 0b3985d307b3ee389f0476d0f9230ab3e0e70bc7 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 22 Jun 2020 10:22:26 +0200 Subject: [PATCH] limit arg for Corpus --- spacy/gold/corpus.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index d04a7bb7a..25252a1ca 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -1,5 +1,3 @@ -import srsly -from pathlib import Path import random from .. import util from .example import Example @@ -7,8 +5,8 @@ from ..tokens import DocBin class Corpus: - """An annotated corpus, using the JSON file format. Manages - annotations for tagging, dependency parsing and NER. + """An annotated corpus, reading train and dev datasets from + the DocBin (.spacy) format. DOCS: https://spacy.io/api/goldcorpus """ @@ -18,10 +16,12 @@ class Corpus: train (str / Path): File or directory of training data. dev (str / Path): File or directory of development data. + limit (int): Max. number of examples returned RETURNS (Corpus): The newly created object. """ self.train_loc = train_loc self.dev_loc = dev_loc + self.limit = limit @staticmethod def walk_corpus(path): @@ -48,7 +48,7 @@ class Corpus: predicted = nlp.make_doc(reference.text) yield Example(predicted, reference) - def read_docbin(self, vocab, locs, limit=0): + def read_docbin(self, vocab, locs): """ Yield training examples as example dicts """ i = 0 for loc in locs: @@ -57,6 +57,9 @@ class Corpus: with loc.open("rb") as file_: doc_bin = DocBin().from_bytes(file_.read()) yield from doc_bin.get_docs(vocab) + i += len(doc_bin) # TODO: should we restrict to EXACTLY the limit ? + if i >= self.limit: + break def count_train(self, nlp): """Returns count of words in train examples""" @@ -64,7 +67,7 @@ class Corpus: i = 0 for example in self.train_dataset(nlp): n += len(example.predicted) - if self.limit and i >= self.limit: + if i >= self.limit: break i += 1 return n