spaCy/spacy/gold/corpus.py

import random
from .. import util
from .example import Example
from ..tokens import DocBin


class Corpus:
    """An annotated corpus, reading train and dev datasets from
    the DocBin (.spacy) format.

    DOCS: https://spacy.io/api/goldcorpus
    """

    def __init__(self, train_loc, dev_loc, limit=0):
        """Create a Corpus.

        train_loc (str / Path): File or directory of training data.
        dev_loc (str / Path): File or directory of development data.
        limit (int): Max. number of examples returned. 0 (the default) or
            any value below 1 means "no limit".
        RETURNS (Corpus): The newly created object.
        """
        self.train_loc = train_loc
        self.dev_loc = dev_loc
        self.limit = limit

    @staticmethod
    def walk_corpus(path):
        """Collect the .spacy files found at or below a location.

        path (str / Path): A single file or a directory to search.
        RETURNS (list): If `path` is not a directory, a one-element list
            holding `path` itself (regardless of its extension). Otherwise
            every file whose name ends in ".spacy" found under the directory;
            entries whose name starts with "." are skipped, directories
            included.
        """
        root = util.ensure_path(path)
        if not root.is_dir():
            return [root]
        # `pending` is a work list that grows while we iterate over it:
        # sub-directories append their children, which are visited later.
        pending = [root]
        locs = []
        seen = set()
        for current in pending:
            key = str(current)
            if key in seen:
                continue
            seen.add(key)
            # `.name` rather than `.parts[-1]`: the latter raises IndexError
            # for a path with no parts (e.g. Path(".")).
            if current.name.startswith("."):
                continue
            elif current.is_dir():
                pending.extend(current.iterdir())
            elif current.name.endswith(".spacy"):
                locs.append(current)
        return locs

    def make_examples(self, nlp, reference_docs):
        """Pair each reference doc with a freshly tokenized doc.

        nlp: Object whose `make_doc` is called on each reference text.
        reference_docs (iterable): Annotated docs to use as references.
        YIELDS (Example): `Example(predicted, reference)` where `predicted`
            is `nlp.make_doc(reference.text)`.
        """
        for reference in reference_docs:
            predicted = nlp.make_doc(reference.text)
            yield Example(predicted, reference)

    def read_docbin(self, vocab, locs):
        """Yield the docs stored in the given .spacy files.

        vocab: Vocab passed on to `DocBin.get_docs`.
        locs (iterable): File locations; those whose name does not end in
            ".spacy" are ignored.
        YIELDS: The docs returned by `DocBin.get_docs`, at most `self.limit`
            of them when the limit is 1 or more, otherwise all of them.
        """
        # A limit below 1 (including the default 0) or None means "no limit".
        limit = self.limit or 0
        n_yielded = 0
        for loc in locs:
            loc = util.ensure_path(loc)
            if not loc.name.endswith(".spacy"):
                continue
            with loc.open("rb") as file_:
                doc_bin = DocBin().from_bytes(file_.read())
            for doc in doc_bin.get_docs(vocab):
                yield doc
                n_yielded += 1
                # Stop at exactly the limit, even in the middle of a file.
                if limit >= 1 and n_yielded >= limit:
                    return

    def count_train(self, nlp):
        """Returns count of words in train examples.

        nlp: Passed through to `train_dataset`.
        RETURNS (int): Sum of `len(example.predicted)` over the training
            examples, looking at no more than `self.limit` examples when the
            limit is 1 or more.
        """
        limit = self.limit or 0
        n = 0
        for i, example in enumerate(self.train_dataset(nlp)):
            # Check before adding so that exactly `limit` examples count.
            if limit >= 1 and i >= limit:
                break
            n += len(example.predicted)
        return n

    def train_dataset(self, nlp, shuffle=True):
        """Yield training examples read from `self.train_loc`.

        nlp: Supplies `vocab` for reading and `make_doc` for tokenizing.
        shuffle (bool): If True, load all examples into memory and shuffle
            them with `random.shuffle` before yielding.
        YIELDS (Example): The training examples.
        """
        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
        examples = self.make_examples(nlp, ref_docs)
        if shuffle:
            # Shuffling needs the full list, so the stream is materialized.
            examples = list(examples)
            random.shuffle(examples)
        yield from examples

    def dev_dataset(self, nlp):
        """Yield development examples read from `self.dev_loc`, in file order.

        nlp: Supplies `vocab` for reading and `make_doc` for tokenizing.
        YIELDS (Example): The development examples.
        """
        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc))
        examples = self.make_examples(nlp, ref_docs)
        yield from examples
Remove GoldCorpus Update imports Update after removing GoldCorpus Fix module name of corpus Fix mimport 2020-06-20 23:13:37 +03:00			`import random`
Add GoldCorpus 2020-06-06 15:28:37 +03:00			`from .. import util`
Remove GoldParse WIP on removing goldparse Get ArcEager compiling after GoldParse excise Update setup.py Get spacy.syntax compiling after removing GoldParse Rename NewExample -> Example and clean up Clean html files Start updating tests Update Morphologizer 2020-06-14 18:45:46 +03:00			`from .example import Example`
Remove GoldCorpus Update imports Update after removing GoldCorpus Fix module name of corpus Fix mimport 2020-06-20 23:13:37 +03:00			`from ..tokens import DocBin`
Add GoldCorpus 2020-06-06 15:28:37 +03:00

Remove GoldCorpus Update imports Update after removing GoldCorpus Fix module name of corpus Fix mimport 2020-06-20 23:13:37 +03:00			`class Corpus:`
limit arg for Corpus 2020-06-22 11:22:26 +03:00			`"""An annotated corpus, reading train and dev datasets from`
			`the DocBin (.spacy) format.`
Add GoldCorpus 2020-06-06 15:28:37 +03:00
			`DOCS: https://spacy.io/api/goldcorpus`
			`"""`
Format 2020-06-22 02:11:43 +03:00
Remove GoldCorpus Update imports Update after removing GoldCorpus Fix module name of corpus Fix mimport 2020-06-20 23:13:37 +03:00			`def __init__(self, train_loc, dev_loc, limit=0):`
small fixes 2020-06-22 11:05:12 +03:00			`"""Create a Corpus.`
Add GoldCorpus 2020-06-06 15:28:37 +03:00
			`train (str / Path): File or directory of training data.`
			`dev (str / Path): File or directory of development data.`
limit arg for Corpus 2020-06-22 11:22:26 +03:00			`limit (int): Max. number of examples returned`
small fixes 2020-06-22 11:05:12 +03:00			`RETURNS (Corpus): The newly created object.`
Add GoldCorpus 2020-06-06 15:28:37 +03:00			`"""`
Format 2020-06-22 02:11:43 +03:00			`self.train_loc = train_loc`
Remove GoldCorpus Update imports Update after removing GoldCorpus Fix module name of corpus Fix mimport 2020-06-20 23:13:37 +03:00			`self.dev_loc = dev_loc`
limit arg for Corpus 2020-06-22 11:22:26 +03:00			`self.limit = limit`
Add GoldCorpus 2020-06-06 15:28:37 +03:00
			`@staticmethod`
			`def walk_corpus(path):`
			`path = util.ensure_path(path)`
			`if not path.is_dir():`
			`return [path]`
			`paths = [path]`
			`locs = []`
			`seen = set()`
			`for path in paths:`
			`if str(path) in seen:`
			`continue`
			`seen.add(str(path))`
			`if path.parts[-1].startswith("."):`
			`continue`
			`elif path.is_dir():`
			`paths.extend(path.iterdir())`
Remove GoldCorpus Update imports Update after removing GoldCorpus Fix module name of corpus Fix mimport 2020-06-20 23:13:37 +03:00			`elif path.parts[-1].endswith(".spacy"):`
Add GoldCorpus 2020-06-06 15:28:37 +03:00			`locs.append(path)`
			`return locs`

avoid writing temp dir in json2docs, fixing 4402 test 2020-06-22 15:27:35 +03:00			`def make_examples(self, nlp, reference_docs):`
Remove GoldCorpus Update imports Update after removing GoldCorpus Fix module name of corpus Fix mimport 2020-06-20 23:13:37 +03:00			`for reference in reference_docs:`
			`predicted = nlp.make_doc(reference.text)`
			`yield Example(predicted, reference)`

limit arg for Corpus 2020-06-22 11:22:26 +03:00			`def read_docbin(self, vocab, locs):`
prevent writing dummy values like deps because that could interfer with sent_start values 2020-06-18 18:47:59 +03:00			`""" Yield training examples as example dicts """`
Add GoldCorpus 2020-06-06 15:28:37 +03:00			`i = 0`
			`for loc in locs:`
			`loc = util.ensure_path(loc)`
Remove GoldCorpus Update imports Update after removing GoldCorpus Fix module name of corpus Fix mimport 2020-06-20 23:13:37 +03:00			`if loc.parts[-1].endswith(".spacy"):`
			`with loc.open("rb") as file_:`
			`doc_bin = DocBin().from_bytes(file_.read())`
			`yield from doc_bin.get_docs(vocab)`
limit arg for Corpus 2020-06-22 11:22:26 +03:00			`i += len(doc_bin) # TODO: should we restrict to EXACTLY the limit ?`
			`if i >= self.limit:`
			`break`
Format 2020-06-22 02:11:43 +03:00
Remove GoldCorpus Update imports Update after removing GoldCorpus Fix module name of corpus Fix mimport 2020-06-20 23:13:37 +03:00			`def count_train(self, nlp):`
Add GoldCorpus 2020-06-06 15:28:37 +03:00			`"""Returns count of words in train examples"""`
			`n = 0`
			`i = 0`
Remove GoldCorpus Update imports Update after removing GoldCorpus Fix module name of corpus Fix mimport 2020-06-20 23:13:37 +03:00			`for example in self.train_dataset(nlp):`
			`n += len(example.predicted)`
limit arg for Corpus 2020-06-22 11:22:26 +03:00			`if i >= self.limit:`
Add GoldCorpus 2020-06-06 15:28:37 +03:00			`break`
			`i += 1`
			`return n`

avoid writing temp dir in json2docs, fixing 4402 test 2020-06-22 15:27:35 +03:00			`def train_dataset(self, nlp, shuffle=True):`
Remove GoldCorpus Update imports Update after removing GoldCorpus Fix module name of corpus Fix mimport 2020-06-20 23:13:37 +03:00			`ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))`
avoid writing temp dir in json2docs, fixing 4402 test 2020-06-22 15:27:35 +03:00			`examples = self.make_examples(nlp, ref_docs)`
Remove GoldCorpus Update imports Update after removing GoldCorpus Fix module name of corpus Fix mimport 2020-06-20 23:13:37 +03:00			`if shuffle:`
			`examples = list(examples)`
			`random.shuffle(examples)`
Add GoldCorpus 2020-06-06 15:28:37 +03:00			`yield from examples`

avoid writing temp dir in json2docs, fixing 4402 test 2020-06-22 15:27:35 +03:00			`def dev_dataset(self, nlp):`
Fix Corpus 2020-06-22 01:24:15 +03:00			`ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc))`
avoid writing temp dir in json2docs, fixing 4402 test 2020-06-22 15:27:35 +03:00			`examples = self.make_examples(nlp, ref_docs)`
Add GoldCorpus 2020-06-06 15:28:37 +03:00			`yield from examples`