From 75a5f2d499eae394573cc3f6078c26ab3db566a8 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 20 Jun 2020 22:13:37 +0200
Subject: [PATCH] Remove GoldCorpus

Update imports

Update after removing GoldCorpus

Fix module name of corpus

Fix import
---
 spacy/about.py                           |   2 +-
 spacy/cli/debug_data.py                  |   4 +-
 spacy/cli/evaluate.py                    |   4 +-
 spacy/cli/train.py                       |   2 +-
 spacy/gold/__init__.py                   |   2 +-
 spacy/gold/corpus.py                     | 200 ++++------------------
 spacy/gold/corpus_docbin.py              |  82 ----------
 spacy/tests/regression/test_issue4402.py |   4 +-
 spacy/tests/test_gold.py                 |  12 +-
 9 files changed, 45 insertions(+), 267 deletions(-)
 delete mode 100644 spacy/gold/corpus_docbin.py

diff --git a/spacy/about.py b/spacy/about.py
index 04a660ad1..14ea60c8c 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.0.dev9"
+__version__ = "3.0.0"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index c86408170..e0a6cba2e 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -4,7 +4,7 @@ import sys
 import srsly
 from wasabi import Printer, MESSAGES
 
-from ..gold import GoldCorpus
+from ..gold import Corpus
 from ..syntax import nonproj
 from ..util import load_model, get_lang_class
 
@@ -68,7 +68,7 @@ def debug_data(
     loading_train_error_message = ""
     loading_dev_error_message = ""
     with msg.loading("Loading corpus..."):
-        corpus = GoldCorpus(train_path, dev_path)
+        corpus = Corpus(train_path, dev_path)
         try:
             train_dataset = list(corpus.train_dataset(nlp))
             train_dataset_unpreprocessed = list(
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index bae252b1c..09ce7c1b5 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -1,7 +1,7 @@
 from timeit import default_timer as timer
 from wasabi import msg
 
-from ..gold import GoldCorpus
+from ..gold import Corpus
 from .. import util
 from .. import displacy
 
@@ -31,7 +31,7 @@ def evaluate(
         msg.fail("Evaluation data not found", data_path, exits=1)
     if displacy_path and not displacy_path.exists():
         msg.fail("Visualization output directory not found", displacy_path, exits=1)
-    corpus = GoldCorpus(data_path, data_path)
+    corpus = Corpus(data_path, data_path)
     if model.startswith("blank:"):
         nlp = util.get_lang_class(model.replace("blank:", ""))()
     else:
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 3420c96fa..6a1d74934 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -12,7 +12,7 @@ import thinc.schedules
 from thinc.api import Model, use_pytorch_for_gpu_memory
 import random
 
-from ..gold.corpus_docbin import Corpus
+from ..gold import Corpus
 from ..lookups import Lookups
 from .. import util
 from ..errors import Errors
diff --git a/spacy/gold/__init__.py b/spacy/gold/__init__.py
index 22530a757..9416bdd81 100644
--- a/spacy/gold/__init__.py
+++ b/spacy/gold/__init__.py
@@ -1,4 +1,4 @@
-from .corpus import GoldCorpus
+from .corpus import Corpus
 from .example import Example
 from .align import align
 
diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py
index c84f8355f..750217c8c 100644
--- a/spacy/gold/corpus.py
+++ b/spacy/gold/corpus.py
@@ -1,54 +1,26 @@
-import random
-import shutil
-import tempfile
 import srsly
 from pathlib import Path
-import itertools
-
-from ..tokens import Doc
+import random
 from .. import util
-from ..errors import Errors, AlignmentError
-from .gold_io import read_json_file, json_to_annotations
-from .augment import make_orth_variants
 from .example import Example
+from ..tokens import DocBin
 
 
-class GoldCorpus(object):
-    """An annotated corpus, using the JSON file format. Manages
-    annotations for tagging, dependency parsing and NER.
+class Corpus:
+    """An annotated corpus, reading train and dev datasets from
+    DocBin (.spacy) files. Manages annotations for tagging,
+    dependency parsing and NER.
 
     DOCS: https://spacy.io/api/goldcorpus
     """
-
-    def __init__(self, train, dev, gold_preproc=False, limit=None):
-        """Create a GoldCorpus.
+    def __init__(self, train_loc, dev_loc, limit=0):
+        """Create a Corpus.
 
-        train (str / Path): File or directory of training data.
-        dev (str / Path): File or directory of development data.
-        RETURNS (GoldCorpus): The newly created object.
+        train_loc (str / Path): File or directory of training data.
+        dev_loc (str / Path): File or directory of development data.
+        limit (int): Max number of examples to yield, 0 for no limit.
+        RETURNS (Corpus): The newly created object.
         """
-        self.limit = limit
-        if isinstance(train, str) or isinstance(train, Path):
-            train = self.read_annotations(self.walk_corpus(train))
-            dev = self.read_annotations(self.walk_corpus(dev))
-        # Write temp directory with one doc per file, so we can shuffle and stream
-        self.tmp_dir = Path(tempfile.mkdtemp())
-        self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit)
-        self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit)
-
-    def __del__(self):
-        shutil.rmtree(self.tmp_dir)
-
-    @staticmethod
-    def write_msgpack(directory, examples, limit=0):
-        if not directory.exists():
-            directory.mkdir()
-        n = 0
-        for i, ex_dict in enumerate(examples):
-            text = ex_dict["text"]
-            srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict))
-            n += 1
-            if limit and n >= limit:
-                break
+        self.train_loc = train_loc
+        self.dev_loc = dev_loc
+        self.limit = limit
 
     @staticmethod
     def walk_corpus(path):
@@ -66,157 +38,45 @@ class GoldCorpus(object):
                 continue
             elif path.is_dir():
                 paths.extend(path.iterdir())
-            elif path.parts[-1].endswith((".json", ".jsonl")):
+            elif path.parts[-1].endswith(".spacy"):
                 locs.append(path)
         return locs
 
-    @staticmethod
-    def read_annotations(locs, limit=0):
-        """ Yield training examples as example dicts """
+    def make_examples(self, nlp, reference_docs, **kwargs):
+        for reference in reference_docs:
+            predicted = nlp.make_doc(reference.text)
+            yield Example(predicted, reference)
+
+    def read_docbin(self, vocab, locs, limit=0):
+        """ Yield Doc objects from DocBin (.spacy) files. """
         i = 0
         for loc in locs:
             loc = util.ensure_path(loc)
-            file_name = loc.parts[-1]
-            if file_name.endswith("json"):
-                examples = read_json_file(loc)
-            elif file_name.endswith("jsonl"):
-                gold_tuples = srsly.read_jsonl(loc)
-                first_gold_tuple = next(gold_tuples)
-                gold_tuples = itertools.chain([first_gold_tuple], gold_tuples)
-                # TODO: proper format checks with schemas
-                if isinstance(first_gold_tuple, dict):
-                    if first_gold_tuple.get("paragraphs", None):
-                        examples = []
-                        for json_doc in gold_tuples:
-                            examples.extend(json_to_annotations(json_doc))
-                    elif first_gold_tuple.get("doc_annotation", None):
-                        examples = []
-                        for ex_dict in gold_tuples:
-                            doc = ex_dict.get("doc", None)
-                            if doc is None:
-                                doc = ex_dict.get("text", None)
-                            if not (
-                                doc is None
-                                or isinstance(doc, Doc)
-                                or isinstance(doc, str)
-                            ):
-                                raise ValueError(Errors.E987.format(type=type(doc)))
-                            examples.append(ex_dict)
-
-            elif file_name.endswith("msg"):
-                text, ex_dict = srsly.read_msgpack(loc)
-                examples = [ex_dict]
-            else:
-                supported = ("json", "jsonl", "msg")
-                raise ValueError(Errors.E124.format(path=loc, formats=supported))
-            try:
-                for example in examples:
-                    yield example
-                    i += 1
-                    if limit and i >= limit:
-                        return
-            except KeyError as e:
-                msg = "Missing key {}".format(e)
-                raise KeyError(Errors.E996.format(file=file_name, msg=msg))
-            except UnboundLocalError as e:
-                msg = "Unexpected document structure"
structure" - raise ValueError(Errors.E996.format(file=file_name, msg=msg)) - - @property - def dev_annotations(self): - locs = (self.tmp_dir / "dev").iterdir() - yield from self.read_annotations(locs, limit=self.limit) - - @property - def train_annotations(self): - locs = (self.tmp_dir / "train").iterdir() - yield from self.read_annotations(locs, limit=self.limit) - - def count_train(self): + if loc.parts[-1].endswith(".spacy"): + with loc.open("rb") as file_: + doc_bin = DocBin().from_bytes(file_.read()) + yield from doc_bin.get_docs(vocab) + + def count_train(self, nlp): """Returns count of words in train examples""" n = 0 i = 0 - for eg_dict in self.train_annotations: - n += len(eg_dict["token_annotation"]["words"]) + for example in self.train_dataset(nlp): + n += len(example.predicted) if self.limit and i >= self.limit: break i += 1 return n - def train_dataset( - self, - nlp, - gold_preproc=False, - max_length=None, - orth_variant_level=0.0, - ignore_misaligned=False, - ): - locs = list((self.tmp_dir / "train").iterdir()) - random.shuffle(locs) - train_annotations = self.read_annotations(locs, limit=self.limit) - examples = self.iter_examples( - nlp, - train_annotations, - gold_preproc, - max_length=max_length, - orth_variant_level=orth_variant_level, - make_projective=True, - ignore_misaligned=ignore_misaligned, - ) + def train_dataset(self, nlp, shuffle=True, **kwargs): + ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc)) + examples = self.make_examples(nlp, ref_docs, **kwargs) + if shuffle: + examples = list(examples) + random.shuffle(examples) yield from examples - def train_dataset_without_preprocessing( - self, nlp, gold_preproc=False, ignore_misaligned=False - ): - examples = self.iter_examples( - nlp, - self.train_annotations, - gold_preproc=gold_preproc, - ignore_misaligned=ignore_misaligned, - ) + def dev_dataset(self, nlp): + ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc)) + examples = self.make_examples(nlp, ref_docs, **kwargs) yield from examples - - def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False): - examples = self.iter_examples( - nlp, - self.dev_annotations, - gold_preproc=gold_preproc, - ignore_misaligned=ignore_misaligned, - ) - yield from examples - - @classmethod - def iter_examples( - cls, - nlp, - annotations, - gold_preproc, - max_length=None, - orth_variant_level=0.0, - make_projective=False, - ignore_misaligned=False, - ): - """ Setting gold_preproc will result in creating a doc per sentence """ - for eg_dict in annotations: - token_annot = eg_dict.get("token_annotation", {}) - if eg_dict["text"]: - doc = nlp.make_doc(eg_dict["text"]) - elif "words" in token_annot: - doc = Doc(nlp.vocab, words=token_annot["words"]) - else: - raise ValueError("Expecting either 'text' or token_annotation.words annotation") - - if gold_preproc: - variant_text, variant_token_annot = make_orth_variants(nlp, doc.text, token_annot, orth_variant_level) - doc = nlp.make_doc(variant_text) - eg_dict["token_annotation"] = variant_token_annot - example = Example.from_dict(doc, eg_dict) - examples = example.split_sents() - - else: - example = Example.from_dict(doc, eg_dict) - examples = [example] - - for eg in examples: - if (not max_length) or len(eg.predicted) < max_length: - yield eg diff --git a/spacy/gold/corpus_docbin.py b/spacy/gold/corpus_docbin.py deleted file mode 100644 index 750217c8c..000000000 --- a/spacy/gold/corpus_docbin.py +++ /dev/null @@ -1,82 +0,0 @@ -import srsly -from pathlib import Path -import 
-from .. import util
-from .example import Example
-from ..tokens import DocBin
-
-
-class Corpus:
-    """An annotated corpus, using the JSON file format. Manages
-    annotations for tagging, dependency parsing and NER.
-
-    DOCS: https://spacy.io/api/goldcorpus
-    """
-    def __init__(self, train_loc, dev_loc, limit=0):
-        """Create a GoldCorpus.
-
-        train (str / Path): File or directory of training data.
-        dev (str / Path): File or directory of development data.
-        RETURNS (GoldCorpus): The newly created object.
-        """
-        self.train_loc = train_loc
-        self.dev_loc = dev_loc
-
-    @staticmethod
-    def walk_corpus(path):
-        path = util.ensure_path(path)
-        if not path.is_dir():
-            return [path]
-        paths = [path]
-        locs = []
-        seen = set()
-        for path in paths:
-            if str(path) in seen:
-                continue
-            seen.add(str(path))
-            if path.parts[-1].startswith("."):
-                continue
-            elif path.is_dir():
-                paths.extend(path.iterdir())
-            elif path.parts[-1].endswith(".spacy"):
-                locs.append(path)
-        return locs
-
-    def make_examples(self, nlp, reference_docs, **kwargs):
-        for reference in reference_docs:
-            predicted = nlp.make_doc(reference.text)
-            yield Example(predicted, reference)
-
-    def read_docbin(self, vocab, locs, limit=0):
-        """ Yield training examples as example dicts """
-        i = 0
-        for loc in locs:
-            loc = util.ensure_path(loc)
-            if loc.parts[-1].endswith(".spacy"):
-                with loc.open("rb") as file_:
-                    doc_bin = DocBin().from_bytes(file_.read())
-                yield from doc_bin.get_docs(vocab)
-
-    def count_train(self, nlp):
-        """Returns count of words in train examples"""
-        n = 0
-        i = 0
-        for example in self.train_dataset(nlp):
-            n += len(example.predicted)
-            if self.limit and i >= self.limit:
-                break
-            i += 1
-        return n
-
-    def train_dataset(self, nlp, shuffle=True, **kwargs):
-        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
-        examples = self.make_examples(nlp, ref_docs, **kwargs)
-        if shuffle:
-            examples = list(examples)
-            random.shuffle(examples)
-        yield from examples
-
-    def dev_dataset(self, nlp):
-        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
-        examples = self.make_examples(nlp, ref_docs, **kwargs)
-        yield from examples
diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py
index 80d37b1e6..71ed7ec14 100644
--- a/spacy/tests/regression/test_issue4402.py
+++ b/spacy/tests/regression/test_issue4402.py
@@ -1,5 +1,5 @@
 import srsly
-from spacy.gold import GoldCorpus
+from spacy.gold import Corpus
 from spacy.lang.en import English
 
 from ..util import make_tempdir
 
@@ -11,7 +11,7 @@ def test_issue4402():
         json_path = tmpdir / "test4402.json"
         srsly.write_json(json_path, json_data)
 
-        corpus = GoldCorpus(str(json_path), str(json_path))
+        corpus = Corpus(str(json_path), str(json_path))
         train_data = list(corpus.train_dataset(nlp, gold_preproc=True, max_length=0))
 
         # assert that the data got split into 4 sentences
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 726492138..7af62accb 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -1,7 +1,7 @@
 from spacy.errors import AlignmentError
 from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
 from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align
-from spacy.gold import GoldCorpus, docs_to_json
+from spacy.gold import Corpus, docs_to_json
 from spacy.gold.example import Example
 from spacy.lang.en import English
 from spacy.syntax.nonproj import is_nonproj_tree
@@ -299,7 +299,7 @@ def test_roundtrip_docs_to_json(doc):
     with make_tempdir() as tmpdir:
         json_file = tmpdir / "roundtrip.json"
         srsly.write_json(json_file, [docs_to_json(doc)])
-        goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file))
+        goldcorpus = Corpus(train_loc=str(json_file), dev_loc=str(json_file))
 
     reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp))
     assert len(doc) == goldcorpus.count_train()
@@ -328,7 +328,7 @@ def test_projective_train_vs_nonprojective_dev(doc):
         json_file = tmpdir / "test.json"
         # write to JSON train dicts
         srsly.write_json(json_file, [docs_to_json(doc)])
-        goldcorpus = GoldCorpus(str(json_file), str(json_file))
+        goldcorpus = Corpus(str(json_file), str(json_file))
 
     train_reloaded_example = next(goldcorpus.train_dataset(nlp))
     train_goldparse = get_parses_from_example(train_reloaded_example)[0][1]
@@ -360,7 +360,7 @@ def test_ignore_misaligned(doc):
         data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
         # write to JSON train dicts
         srsly.write_json(json_file, data)
-        goldcorpus = GoldCorpus(str(json_file), str(json_file))
+        goldcorpus = Corpus(str(json_file), str(json_file))
 
     with pytest.raises(AlignmentError):
         train_reloaded_example = next(goldcorpus.train_dataset(nlp))
@@ -371,7 +371,7 @@ def test_ignore_misaligned(doc):
         data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
         # write to JSON train dicts
         srsly.write_json(json_file, data)
-        goldcorpus = GoldCorpus(str(json_file), str(json_file))
+        goldcorpus = Corpus(str(json_file), str(json_file))
 
     # doesn't raise an AlignmentError, but there is nothing to iterate over
     # because the only example can't be aligned
@@ -385,7 +385,7 @@ def test_make_orth_variants(doc):
         json_file = tmpdir / "test.json"
         # write to JSON train dicts
         srsly.write_json(json_file, [docs_to_json(doc)])
-        goldcorpus = GoldCorpus(str(json_file), str(json_file))
+        goldcorpus = Corpus(str(json_file), str(json_file))
 
         # due to randomness, test only that this runs with no errors for now
         train_example = next(goldcorpus.train_dataset(nlp))
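
Below is a minimal usage sketch of the Corpus API introduced by this patch, for
reference only; it is not part of the commit. The file name "train.spacy" is a
hypothetical path, and the snippet assumes the v3 development-era imports shown
above (spacy.gold.Corpus, spacy.tokens.DocBin) plus spacy.blank() to create an
empty English pipeline:

    import spacy
    from spacy.tokens import DocBin
    from spacy.gold import Corpus

    nlp = spacy.blank("en")

    # Serialize a reference Doc to the binary .spacy format that
    # Corpus.walk_corpus() looks for and Corpus.read_docbin() loads.
    doc_bin = DocBin()
    doc_bin.add(nlp("Apple is looking at buying a startup."))
    with open("train.spacy", "wb") as file_:  # hypothetical path
        file_.write(doc_bin.to_bytes())

    # Train and dev point at the same file here purely for illustration.
    corpus = Corpus("train.spacy", "train.spacy")

    # train_dataset() re-tokenizes each stored reference doc with
    # nlp.make_doc() and yields Example objects pairing the fresh
    # prediction doc with the annotated reference doc.
    for example in corpus.train_dataset(nlp):
        print(example.predicted.text)

Storing corpora as DocBin rather than JSON is what lets train_dataset() defer
all annotation handling to the Doc objects themselves, which is why the JSON
parsing, msgpack caching, and orth-variant logic above could be deleted.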