From 75a5f2d499eae394573cc3f6078c26ab3db566a8 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 20 Jun 2020 22:13:37 +0200
Subject: [PATCH] Remove GoldCorpus

Update imports

Update after removing GoldCorpus

Fix module name of corpus

Fix import
---
 spacy/about.py                           |   2 +-
 spacy/cli/debug_data.py                  |   4 +-
 spacy/cli/evaluate.py                    |   4 +-
 spacy/cli/train.py                       |   2 +-
 spacy/gold/__init__.py                   |   2 +-
 spacy/gold/corpus.py                     | 200 ++++------------------
 spacy/gold/corpus_docbin.py              |  82 ----------
 spacy/tests/regression/test_issue4402.py |   4 +-
 spacy/tests/test_gold.py                 |  12 +-
 9 files changed, 45 insertions(+), 267 deletions(-)
 delete mode 100644 spacy/gold/corpus_docbin.py

diff --git a/spacy/about.py b/spacy/about.py
index 04a660ad1..14ea60c8c 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.0.dev9"
+__version__ = "3.0.0"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index c86408170..e0a6cba2e 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -4,7 +4,7 @@ import sys
 import srsly
 from wasabi import Printer, MESSAGES
 
-from ..gold import GoldCorpus
+from ..gold import Corpus
 from ..syntax import nonproj
 from ..util import load_model, get_lang_class
 
@@ -68,7 +68,7 @@ def debug_data(
     loading_train_error_message = ""
     loading_dev_error_message = ""
     with msg.loading("Loading corpus..."):
-        corpus = GoldCorpus(train_path, dev_path)
+        corpus = Corpus(train_path, dev_path)
         try:
             train_dataset = list(corpus.train_dataset(nlp))
             train_dataset_unpreprocessed = list(
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index bae252b1c..09ce7c1b5 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -1,7 +1,7 @@
 from timeit import default_timer as timer
 from wasabi import msg
 
-from ..gold import GoldCorpus
+from ..gold import Corpus
 from .. import util
 from .. import displacy
 
@@ -31,7 +31,7 @@ def evaluate(
         msg.fail("Evaluation data not found", data_path, exits=1)
     if displacy_path and not displacy_path.exists():
         msg.fail("Visualization output directory not found", displacy_path, exits=1)
-    corpus = GoldCorpus(data_path, data_path)
+    corpus = Corpus(data_path, data_path)
     if model.startswith("blank:"):
         nlp = util.get_lang_class(model.replace("blank:", ""))()
     else:
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 3420c96fa..6a1d74934 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -12,7 +12,7 @@ import thinc.schedules
 from thinc.api import Model, use_pytorch_for_gpu_memory
 import random
 
-from ..gold.corpus_docbin import Corpus
+from ..gold import Corpus
 from ..lookups import Lookups
 from .. import util
 from ..errors import Errors
diff --git a/spacy/gold/__init__.py b/spacy/gold/__init__.py
index 22530a757..9416bdd81 100644
--- a/spacy/gold/__init__.py
+++ b/spacy/gold/__init__.py
@@ -1,4 +1,4 @@
-from .corpus import GoldCorpus
+from .corpus import Corpus
 from .example import Example
 from .align import align
 
diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py
index c84f8355f..750217c8c 100644
--- a/spacy/gold/corpus.py
+++ b/spacy/gold/corpus.py
@@ -1,54 +1,26 @@
-import random
-import shutil
-import tempfile
 import srsly
 from pathlib import Path
-import itertools
-
-from ..tokens import Doc
+import random
 from .. import util
-from ..errors import Errors, AlignmentError
-from .gold_io import read_json_file, json_to_annotations
-from .augment import make_orth_variants
 from .example import Example
+from ..tokens import DocBin
 
 
-class GoldCorpus(object):
-    """An annotated corpus, using the JSON file format. Manages
-    annotations for tagging, dependency parsing and NER.
+class Corpus:
+    """An annotated corpus, reading train and dev datasets from
+    DocBin (.spacy) files. Manages annotations for tagging,
+    dependency parsing and NER.
 
     DOCS: https://spacy.io/api/goldcorpus
     """
-
-    def __init__(self, train, dev, gold_preproc=False, limit=None):
-        """Create a GoldCorpus.
+    def __init__(self, train_loc, dev_loc, limit=0):
+        """Create a Corpus.
 
-        train (str / Path): File or directory of training data.
-        dev (str / Path): File or directory of development data.
-        RETURNS (GoldCorpus): The newly created object.
+        train_loc (str / Path): File or directory of training data.
+        dev_loc (str / Path): File or directory of development data.
+        limit (int): Max number of examples to yield, 0 for no limit.
+        RETURNS (Corpus): The newly created object.
         """
-        self.limit = limit
-        if isinstance(train, str) or isinstance(train, Path):
-            train = self.read_annotations(self.walk_corpus(train))
-            dev = self.read_annotations(self.walk_corpus(dev))
-        # Write temp directory with one doc per file, so we can shuffle and stream
-        self.tmp_dir = Path(tempfile.mkdtemp())
-        self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit)
-        self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit)
-
-    def __del__(self):
-        shutil.rmtree(self.tmp_dir)
-
-    @staticmethod
-    def write_msgpack(directory, examples, limit=0):
-        if not directory.exists():
-            directory.mkdir()
-        n = 0
-        for i, ex_dict in enumerate(examples):
-            text = ex_dict["text"]
-            srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict))
-            n += 1
-            if limit and n >= limit:
-                break
+        self.train_loc = train_loc
+        self.dev_loc = dev_loc
+        self.limit = limit
 
     @staticmethod
     def walk_corpus(path):
@@ -66,157 +38,45 @@ class GoldCorpus(object):
                 continue
             elif path.is_dir():
                 paths.extend(path.iterdir())
-            elif path.parts[-1].endswith((".json", ".jsonl")):
+            elif path.parts[-1].endswith(".spacy"):
                 locs.append(path)
         return locs
 
-    @staticmethod
-    def read_annotations(locs, limit=0):
-        """ Yield training examples as example dicts """
+    def make_examples(self, nlp, reference_docs, **kwargs):
+        for reference in reference_docs:
+            predicted = nlp.make_doc(reference.text)
+            yield Example(predicted, reference)
+
+    def read_docbin(self, vocab, locs, limit=0):
+        """ Yield Doc objects from DocBin (.spacy) files. """
         i = 0
         for loc in locs:
             loc = util.ensure_path(loc)
-            file_name = loc.parts[-1]
-            if file_name.endswith("json"):
-                examples = read_json_file(loc)
-            elif file_name.endswith("jsonl"):
-                gold_tuples = srsly.read_jsonl(loc)
-                first_gold_tuple = next(gold_tuples)
-                gold_tuples = itertools.chain([first_gold_tuple], gold_tuples)
-                # TODO: proper format checks with schemas
-                if isinstance(first_gold_tuple, dict):
-                    if first_gold_tuple.get("paragraphs", None):
-                        examples = []
-                        for json_doc in gold_tuples:
-                            examples.extend(json_to_annotations(json_doc))
-                    elif first_gold_tuple.get("doc_annotation", None):
-                        examples = []
-                        for ex_dict in gold_tuples:
-                            doc = ex_dict.get("doc", None)
-                            if doc is None:
-                                doc = ex_dict.get("text", None)
-                            if not (
-                                doc is None
-                                or isinstance(doc, Doc)
-                                or isinstance(doc, str)
-                            ):
-                                raise ValueError(Errors.E987.format(type=type(doc)))
-                            examples.append(ex_dict)
-
-            elif file_name.endswith("msg"):
-                text, ex_dict = srsly.read_msgpack(loc)
-                examples = [ex_dict]
-            else:
-                supported = ("json", "jsonl", "msg")
-                raise ValueError(Errors.E124.format(path=loc, formats=supported))
-            try:
-                for example in examples:
-                    yield example
-                    i += 1
-                    if limit and i >= limit:
-                        return
-            except KeyError as e:
-                msg = "Missing key {}".format(e)
-                raise KeyError(Errors.E996.format(file=file_name, msg=msg))
-            except UnboundLocalError as e:
-                msg = "Unexpected document structure"
structure" - raise ValueError(Errors.E996.format(file=file_name, msg=msg)) - - @property - def dev_annotations(self): - locs = (self.tmp_dir / "dev").iterdir() - yield from self.read_annotations(locs, limit=self.limit) - - @property - def train_annotations(self): - locs = (self.tmp_dir / "train").iterdir() - yield from self.read_annotations(locs, limit=self.limit) - - def count_train(self): + if loc.parts[-1].endswith(".spacy"): + with loc.open("rb") as file_: + doc_bin = DocBin().from_bytes(file_.read()) + yield from doc_bin.get_docs(vocab) + + def count_train(self, nlp): """Returns count of words in train examples""" n = 0 i = 0 - for eg_dict in self.train_annotations: - n += len(eg_dict["token_annotation"]["words"]) + for example in self.train_dataset(nlp): + n += len(example.predicted) if self.limit and i >= self.limit: break i += 1 return n - def train_dataset( - self, - nlp, - gold_preproc=False, - max_length=None, - orth_variant_level=0.0, - ignore_misaligned=False, - ): - locs = list((self.tmp_dir / "train").iterdir()) - random.shuffle(locs) - train_annotations = self.read_annotations(locs, limit=self.limit) - examples = self.iter_examples( - nlp, - train_annotations, - gold_preproc, - max_length=max_length, - orth_variant_level=orth_variant_level, - make_projective=True, - ignore_misaligned=ignore_misaligned, - ) + def train_dataset(self, nlp, shuffle=True, **kwargs): + ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc)) + examples = self.make_examples(nlp, ref_docs, **kwargs) + if shuffle: + examples = list(examples) + random.shuffle(examples) yield from examples - def train_dataset_without_preprocessing( - self, nlp, gold_preproc=False, ignore_misaligned=False - ): - examples = self.iter_examples( - nlp, - self.train_annotations, - gold_preproc=gold_preproc, - ignore_misaligned=ignore_misaligned, - ) + def dev_dataset(self, nlp): + ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc)) + examples = self.make_examples(nlp, ref_docs, **kwargs) yield from examples - - def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False): - examples = self.iter_examples( - nlp, - self.dev_annotations, - gold_preproc=gold_preproc, - ignore_misaligned=ignore_misaligned, - ) - yield from examples - - @classmethod - def iter_examples( - cls, - nlp, - annotations, - gold_preproc, - max_length=None, - orth_variant_level=0.0, - make_projective=False, - ignore_misaligned=False, - ): - """ Setting gold_preproc will result in creating a doc per sentence """ - for eg_dict in annotations: - token_annot = eg_dict.get("token_annotation", {}) - if eg_dict["text"]: - doc = nlp.make_doc(eg_dict["text"]) - elif "words" in token_annot: - doc = Doc(nlp.vocab, words=token_annot["words"]) - else: - raise ValueError("Expecting either 'text' or token_annotation.words annotation") - - if gold_preproc: - variant_text, variant_token_annot = make_orth_variants(nlp, doc.text, token_annot, orth_variant_level) - doc = nlp.make_doc(variant_text) - eg_dict["token_annotation"] = variant_token_annot - example = Example.from_dict(doc, eg_dict) - examples = example.split_sents() - - else: - example = Example.from_dict(doc, eg_dict) - examples = [example] - - for eg in examples: - if (not max_length) or len(eg.predicted) < max_length: - yield eg diff --git a/spacy/gold/corpus_docbin.py b/spacy/gold/corpus_docbin.py deleted file mode 100644 index 750217c8c..000000000 --- a/spacy/gold/corpus_docbin.py +++ /dev/null @@ -1,82 +0,0 @@ -import srsly -from pathlib import Path -import 
-from .. import util
-from .example import Example
-from ..tokens import DocBin
-
-
-class Corpus:
-    """An annotated corpus, using the JSON file format. Manages
-    annotations for tagging, dependency parsing and NER.
-
-    DOCS: https://spacy.io/api/goldcorpus
-    """
-    def __init__(self, train_loc, dev_loc, limit=0):
-        """Create a GoldCorpus.
-
-        train (str / Path): File or directory of training data.
-        dev (str / Path): File or directory of development data.
-        RETURNS (GoldCorpus): The newly created object.
-        """
-        self.train_loc = train_loc
-        self.dev_loc = dev_loc
-
-    @staticmethod
-    def walk_corpus(path):
-        path = util.ensure_path(path)
-        if not path.is_dir():
-            return [path]
-        paths = [path]
-        locs = []
-        seen = set()
-        for path in paths:
-            if str(path) in seen:
-                continue
-            seen.add(str(path))
-            if path.parts[-1].startswith("."):
-                continue
-            elif path.is_dir():
-                paths.extend(path.iterdir())
-            elif path.parts[-1].endswith(".spacy"):
-                locs.append(path)
-        return locs
-
-    def make_examples(self, nlp, reference_docs, **kwargs):
-        for reference in reference_docs:
-            predicted = nlp.make_doc(reference.text)
-            yield Example(predicted, reference)
-
-    def read_docbin(self, vocab, locs, limit=0):
-        """ Yield training examples as example dicts """
-        i = 0
-        for loc in locs:
-            loc = util.ensure_path(loc)
-            if loc.parts[-1].endswith(".spacy"):
-                with loc.open("rb") as file_:
-                    doc_bin = DocBin().from_bytes(file_.read())
-                yield from doc_bin.get_docs(vocab)
-
-    def count_train(self, nlp):
-        """Returns count of words in train examples"""
-        n = 0
-        i = 0
-        for example in self.train_dataset(nlp):
-            n += len(example.predicted)
-            if self.limit and i >= self.limit:
-                break
-            i += 1
-        return n
-
-    def train_dataset(self, nlp, shuffle=True, **kwargs):
-        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
-        examples = self.make_examples(nlp, ref_docs, **kwargs)
-        if shuffle:
-            examples = list(examples)
-            random.shuffle(examples)
-        yield from examples
-
-    def dev_dataset(self, nlp):
-        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
-        examples = self.make_examples(nlp, ref_docs, **kwargs)
-        yield from examples
diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py
index 80d37b1e6..71ed7ec14 100644
--- a/spacy/tests/regression/test_issue4402.py
+++ b/spacy/tests/regression/test_issue4402.py
@@ -1,5 +1,5 @@
 import srsly
-from spacy.gold import GoldCorpus
+from spacy.gold import Corpus
 from spacy.lang.en import English
 
 from ..util import make_tempdir
 
@@ -11,7 +11,7 @@ def test_issue4402():
         json_path = tmpdir / "test4402.json"
         srsly.write_json(json_path, json_data)
 
-        corpus = GoldCorpus(str(json_path), str(json_path))
+        corpus = Corpus(str(json_path), str(json_path))
         train_data = list(corpus.train_dataset(nlp, gold_preproc=True, max_length=0))
 
         # assert that the data got split into 4 sentences
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 726492138..7af62accb 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -1,7 +1,7 @@
 from spacy.errors import AlignmentError
 from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
 from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align
-from spacy.gold import GoldCorpus, docs_to_json
+from spacy.gold import Corpus, docs_to_json
 from spacy.gold.example import Example
 from spacy.lang.en import English
 from spacy.syntax.nonproj import is_nonproj_tree
@@ -299,7 +299,7 @@ def test_roundtrip_docs_to_json(doc):
     with make_tempdir() as tmpdir:
         json_file = tmpdir / "roundtrip.json"
         srsly.write_json(json_file, [docs_to_json(doc)])
-        goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file))
+        goldcorpus = Corpus(train_loc=str(json_file), dev_loc=str(json_file))
 
     reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp))
     assert len(doc) == goldcorpus.count_train()
@@ -328,7 +328,7 @@ def test_projective_train_vs_nonprojective_dev(doc):
         json_file = tmpdir / "test.json"
         # write to JSON train dicts
         srsly.write_json(json_file, [docs_to_json(doc)])
-        goldcorpus = GoldCorpus(str(json_file), str(json_file))
+        goldcorpus = Corpus(str(json_file), str(json_file))
 
     train_reloaded_example = next(goldcorpus.train_dataset(nlp))
     train_goldparse = get_parses_from_example(train_reloaded_example)[0][1]
@@ -360,7 +360,7 @@ def test_ignore_misaligned(doc):
         data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
         # write to JSON train dicts
         srsly.write_json(json_file, data)
-        goldcorpus = GoldCorpus(str(json_file), str(json_file))
+        goldcorpus = Corpus(str(json_file), str(json_file))
 
     with pytest.raises(AlignmentError):
         train_reloaded_example = next(goldcorpus.train_dataset(nlp))
@@ -371,7 +371,7 @@ def test_ignore_misaligned(doc):
         data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
         # write to JSON train dicts
         srsly.write_json(json_file, data)
-        goldcorpus = GoldCorpus(str(json_file), str(json_file))
+        goldcorpus = Corpus(str(json_file), str(json_file))
 
     # doesn't raise an AlignmentError, but there is nothing to iterate over
     # because the only example can't be aligned
@@ -385,7 +385,7 @@ def test_make_orth_variants(doc):
         json_file = tmpdir / "test.json"
         # write to JSON train dicts
         srsly.write_json(json_file, [docs_to_json(doc)])
-        goldcorpus = GoldCorpus(str(json_file), str(json_file))
+        goldcorpus = Corpus(str(json_file), str(json_file))
 
         # due to randomness, test only that this runs with no errors for now
         train_example = next(goldcorpus.train_dataset(nlp))
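
Below is a minimal usage sketch of the Corpus API introduced by this patch, for
reference only; it is not part of the commit. The file name "train.spacy" is a
hypothetical path, and the snippet assumes the v3 development-era imports shown
above (spacy.gold.Corpus, spacy.tokens.DocBin) plus spacy.blank() to create an
empty English pipeline:

    import spacy
    from spacy.tokens import DocBin
    from spacy.gold import Corpus

    nlp = spacy.blank("en")

    # Serialize a reference Doc to the binary .spacy format that
    # Corpus.walk_corpus() looks for and Corpus.read_docbin() loads.
    doc_bin = DocBin()
    doc_bin.add(nlp("Apple is looking at buying a startup."))
    with open("train.spacy", "wb") as file_:  # hypothetical path
        file_.write(doc_bin.to_bytes())

    # Train and dev point at the same file here purely for illustration.
    corpus = Corpus("train.spacy", "train.spacy")

    # train_dataset() re-tokenizes each stored reference doc with
    # nlp.make_doc() and yields Example objects pairing the fresh
    # prediction doc with the annotated reference doc.
    for example in corpus.train_dataset(nlp):
        print(example.predicted.text)

Storing corpora as DocBin rather than JSON is what lets train_dataset() defer
all annotation handling to the Doc objects themselves, which is why the JSON
parsing, msgpack caching, and orth-variant logic above could be deleted.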