mirror of
https://github.com/explosion/spaCy.git
synced 2025-03-11 23:05:50 +03:00
Add GoldCorpus
This commit is contained in:
parent
1fb8fc6ea9
commit
a663d44b1b
277
spacy/_gold/corpus.py
Normal file
277
spacy/_gold/corpus.py
Normal file
|
@ -0,0 +1,277 @@
|
|||
# Standard library
import itertools
import random
import shutil
import tempfile
from pathlib import Path

# Third-party
import srsly

# Local
from .. import util
from ..errors import Errors
from ..tokens import Doc
from .augment import make_orth_variants, add_noise
# BUG FIX: was "from .exmaple import Example" — misspelled module name
# would raise ImportError as soon as this module is imported.
from .example import Example
from .gold_utils import read_json_file, read_json_object
class GoldCorpus(object):
    """An annotated corpus, using the JSON file format. Manages
    annotations for tagging, dependency parsing and NER.

    On construction, the train/dev inputs are materialized into a temporary
    directory with one msgpack file per example, so training can shuffle and
    stream them without holding everything in memory.

    DOCS: https://spacy.io/api/goldcorpus
    """

    def __init__(self, train, dev, gold_preproc=False, limit=None):
        """Create a GoldCorpus.

        train (str / Path / iterable): File or directory of training data,
            or an iterable of already-loaded Example objects.
        dev (str / Path / iterable): File or directory of development data,
            or an iterable of already-loaded Example objects.
        gold_preproc (bool): Accepted for API compatibility; not used in
            __init__ (pre-processing is chosen per dataset call).
        limit (int / None): If set, keep at most this many examples per split.
        RETURNS (GoldCorpus): The newly created object.
        """
        self.limit = limit
        if isinstance(train, str) or isinstance(train, Path):
            # Paths are walked and parsed; otherwise we assume the caller
            # passed iterables of Example objects directly.
            train = self.read_examples(self.walk_corpus(train))
            dev = self.read_examples(self.walk_corpus(dev))
        # Write temp directory with one doc per file, so we can shuffle and stream
        self.tmp_dir = Path(tempfile.mkdtemp())
        self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit)
        self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit)

    def __del__(self):
        # Remove the temp directory created in __init__.
        shutil.rmtree(self.tmp_dir)

    @staticmethod
    def write_msgpack(directory, examples, limit=0):
        """Serialize examples to `directory`, one msgpack file per example.

        directory (Path): Output directory; created if it doesn't exist.
        examples (iterable): Example objects to serialize.
        limit (int): If non-zero, stop after this many examples.
        """
        if not directory.exists():
            directory.mkdir()
        n = 0
        for i, example in enumerate(examples):
            ex_dict = example.to_dict()
            text = example.text
            # Each file holds a (text, annotation-dict) pair; read back by
            # read_examples() via the ".msg" branch.
            srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict))
            n += 1
            if limit and n >= limit:
                break

    @staticmethod
    def walk_corpus(path):
        """Collect data files under `path`.

        path (str / Path): A file or directory. A non-directory path is
            returned as a single-element list. Directories are walked
            recursively; hidden entries (name starting with ".") are skipped.
        RETURNS (list): Paths of all .json / .jsonl files found.
        """
        path = util.ensure_path(path)
        if not path.is_dir():
            return [path]
        paths = [path]
        locs = []
        seen = set()
        for path in paths:
            # Guard against cycles / duplicate entries.
            if str(path) in seen:
                continue
            seen.add(str(path))
            if path.parts[-1].startswith("."):
                continue
            elif path.is_dir():
                # Breadth-first: queue children onto the worklist.
                paths.extend(path.iterdir())
            elif path.parts[-1].endswith((".json", ".jsonl")):
                locs.append(path)
        return locs

    @staticmethod
    def read_examples(locs, limit=0):
        """Yield training examples parsed from the given file locations.

        locs (iterable): Paths of .json / .jsonl / .msg files.
        limit (int): If non-zero, stop after yielding this many examples
            in total (across all files).
        YIELDS (Example): Parsed examples.
        RAISES (ValueError / KeyError): On unsupported file extensions or
            malformed document structure (Errors.E124 / E996 / E987).
        """
        i = 0
        for loc in locs:
            loc = util.ensure_path(loc)
            file_name = loc.parts[-1]
            if file_name.endswith("json"):
                examples = read_json_file(loc)
            elif file_name.endswith("jsonl"):
                gold_tuples = srsly.read_jsonl(loc)
                # Peek at the first record to decide the schema, then put it
                # back so the full stream is processed.
                first_gold_tuple = next(gold_tuples)
                gold_tuples = itertools.chain([first_gold_tuple], gold_tuples)
                # TODO: proper format checks with schemas
                if isinstance(first_gold_tuple, dict):
                    if first_gold_tuple.get("paragraphs", None):
                        # JSON-style annotations, one dict per doc.
                        examples = read_json_object(gold_tuples)
                    elif first_gold_tuple.get("doc_annotation", None):
                        examples = []
                        for ex_dict in gold_tuples:
                            doc = ex_dict.get("doc", None)
                            if doc is None:
                                doc = ex_dict.get("text", None)
                            if not (
                                doc is None
                                or isinstance(doc, Doc)
                                or isinstance(doc, str)
                            ):
                                raise ValueError(Errors.E987.format(type=type(doc)))
                            examples.append(Example.from_dict(ex_dict, doc=doc))
                    # NOTE: if the dict matches neither schema, `examples`
                    # stays unbound and the UnboundLocalError handler below
                    # converts that into a ValueError (E996).
            elif file_name.endswith("msg"):
                # Files produced by write_msgpack(): (text, annotation-dict).
                text, ex_dict = srsly.read_msgpack(loc)
                examples = [Example.from_dict(ex_dict, doc=text)]
            else:
                supported = ("json", "jsonl", "msg")
                raise ValueError(Errors.E124.format(path=loc, formats=supported))
            try:
                for example in examples:
                    yield example
                    i += 1
                    if limit and i >= limit:
                        return
            except KeyError as e:
                msg = "Missing key {}".format(e)
                raise KeyError(Errors.E996.format(file=file_name, msg=msg))
            except UnboundLocalError:
                msg = "Unexpected document structure"
                raise ValueError(Errors.E996.format(file=file_name, msg=msg))

    @property
    def dev_examples(self):
        """Stream dev examples back out of the temp directory."""
        locs = (self.tmp_dir / "dev").iterdir()
        yield from self.read_examples(locs, limit=self.limit)

    @property
    def train_examples(self):
        """Stream train examples back out of the temp directory."""
        locs = (self.tmp_dir / "train").iterdir()
        yield from self.read_examples(locs, limit=self.limit)

    def count_train(self):
        """Returns count of words in train examples.

        RETURNS (int): Total number of words across (at most `self.limit`)
            training examples.
        """
        # BUG FIX: previously the counter was incremented *after* the limit
        # check, so `limit + 1` examples were counted when a limit was set.
        n = 0
        for i, example in enumerate(self.train_examples):
            if self.limit and i >= self.limit:
                break
            n += len(example.token_annotation.words)
        return n

    def train_dataset(
        self,
        nlp,
        gold_preproc=False,
        max_length=None,
        noise_level=0.0,
        orth_variant_level=0.0,
        ignore_misaligned=False,
    ):
        """Yield shuffled training examples with gold parses attached.

        nlp (Language): Pipeline used for tokenization / vocab.
        gold_preproc (bool): If True, split each example into one doc per
            sentence.
        max_length (int / None): Skip docs with this many tokens or more.
        noise_level (float): Level of corruption noise (see add_noise).
        orth_variant_level (float): Level of orthographic variation.
        ignore_misaligned (bool): Skip examples whose tokens can't be aligned.
        YIELDS (Example): Training examples with goldparse set.
        """
        # Shuffle file locations (not loaded examples) so the stream stays lazy.
        locs = list((self.tmp_dir / "train").iterdir())
        random.shuffle(locs)
        train_examples = self.read_examples(locs, limit=self.limit)
        gold_examples = self.iter_gold_docs(
            nlp,
            train_examples,
            gold_preproc,
            max_length=max_length,
            noise_level=noise_level,
            orth_variant_level=orth_variant_level,
            make_projective=True,
            ignore_misaligned=ignore_misaligned,
        )
        yield from gold_examples

    def train_dataset_without_preprocessing(
        self, nlp, gold_preproc=False, ignore_misaligned=False
    ):
        """Yield training examples in stored order, with no noise, no
        orthographic variants and no projectivization."""
        examples = self.iter_gold_docs(
            nlp,
            self.train_examples,
            gold_preproc=gold_preproc,
            ignore_misaligned=ignore_misaligned,
        )
        yield from examples

    def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False):
        """Yield dev examples with gold parses attached (no augmentation)."""
        examples = self.iter_gold_docs(
            nlp,
            self.dev_examples,
            gold_preproc=gold_preproc,
            ignore_misaligned=ignore_misaligned,
        )
        yield from examples

    @classmethod
    def iter_gold_docs(
        cls,
        nlp,
        examples,
        gold_preproc,
        max_length=None,
        noise_level=0.0,
        orth_variant_level=0.0,
        make_projective=False,
        ignore_misaligned=False,
    ):
        """Attach docs and gold parses to a stream of examples.

        Setting gold_preproc will result in creating a doc per sentence.
        Examples without a goldparse, and docs at or above max_length,
        are filtered out.
        """
        for example in examples:
            if gold_preproc:
                split_examples = example.split_sents()
                example_golds = []
                for split_example in split_examples:
                    split_example_docs = cls._make_docs(
                        nlp,
                        split_example,
                        gold_preproc,
                        noise_level=noise_level,
                        orth_variant_level=orth_variant_level,
                    )
                    split_example_golds = cls._make_golds(
                        split_example_docs,
                        vocab=nlp.vocab,
                        make_projective=make_projective,
                        ignore_misaligned=ignore_misaligned,
                    )
                    example_golds.extend(split_example_golds)
            else:
                example_docs = cls._make_docs(
                    nlp,
                    example,
                    gold_preproc,
                    noise_level=noise_level,
                    orth_variant_level=orth_variant_level,
                )
                example_golds = cls._make_golds(
                    example_docs,
                    vocab=nlp.vocab,
                    make_projective=make_projective,
                    ignore_misaligned=ignore_misaligned,
                )
            for ex in example_golds:
                if ex.goldparse is not None:
                    # max_length of None/0 disables the length filter.
                    if (not max_length) or len(ex.doc) < max_length:
                        yield ex

    @classmethod
    def _make_docs(
        cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0
    ):
        """Build a Doc for one example (with optional orth variants/noise).

        RETURNS (list): A single-element list containing the example with
            its `doc` attribute set.
        """
        var_example = make_orth_variants(
            nlp, example, orth_variant_level=orth_variant_level
        )
        # NOTE(review): gold_preproc is accepted but not used here — the
        # split-per-sentence handling happens in iter_gold_docs; confirm
        # the parameter is intentionally unused.
        if example.text is not None:
            # We have raw text: noise it, then tokenize.
            var_text = add_noise(var_example.text, noise_level)
            var_doc = nlp.make_doc(var_text)
            var_example.doc = var_doc
        else:
            # No raw text: build the Doc directly from (noised) gold tokens.
            var_doc = Doc(
                nlp.vocab,
                words=add_noise(var_example.token_annotation.words, noise_level),
            )
            var_example.doc = var_doc
        return [var_example]

    @classmethod
    def _make_golds(
        cls, examples, vocab=None, make_projective=False, ignore_misaligned=False
    ):
        """Attach a GoldParse to each example.

        examples (list): Examples whose `doc` is already set (so each yields
            exactly one gold parse).
        RETURNS (list): The same examples, with `goldparse` set.
        """
        filtered_examples = []
        for example in examples:
            gold_parses = example.get_gold_parses(
                vocab=vocab,
                make_projective=make_projective,
                ignore_misaligned=ignore_misaligned,
            )
            # A doc-bearing example produces exactly one (doc, gold) pair.
            assert len(gold_parses) == 1
            doc, gold = gold_parses[0]
            if doc:
                assert doc == example.doc
            example.goldparse = gold
            filtered_examples.append(example)
        return filtered_examples
|
Loading…
Reference in New Issue
Block a user