Remove GoldCorpus

Update imports

Update after removing GoldCorpus

Fix module name of corpus

Fix import
This commit is contained in:
Matthew Honnibal 2020-06-20 22:13:37 +02:00
parent 50d4b21743
commit 75a5f2d499
9 changed files with 45 additions and 267 deletions

View File

@ -1,6 +1,6 @@
# fmt: off # fmt: off
__title__ = "spacy" __title__ = "spacy"
__version__ = "3.0.0.dev9" __version__ = "3.0.0"
__release__ = True __release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download" __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -4,7 +4,7 @@ import sys
import srsly import srsly
from wasabi import Printer, MESSAGES from wasabi import Printer, MESSAGES
from ..gold import GoldCorpus from ..gold import Corpus
from ..syntax import nonproj from ..syntax import nonproj
from ..util import load_model, get_lang_class from ..util import load_model, get_lang_class
@ -68,7 +68,7 @@ def debug_data(
loading_train_error_message = "" loading_train_error_message = ""
loading_dev_error_message = "" loading_dev_error_message = ""
with msg.loading("Loading corpus..."): with msg.loading("Loading corpus..."):
corpus = GoldCorpus(train_path, dev_path) corpus = Corpus(train_path, dev_path)
try: try:
train_dataset = list(corpus.train_dataset(nlp)) train_dataset = list(corpus.train_dataset(nlp))
train_dataset_unpreprocessed = list( train_dataset_unpreprocessed = list(

View File

@ -1,7 +1,7 @@
from timeit import default_timer as timer from timeit import default_timer as timer
from wasabi import msg from wasabi import msg
from ..gold import GoldCorpus from ..gold import Corpus
from .. import util from .. import util
from .. import displacy from .. import displacy
@ -31,7 +31,7 @@ def evaluate(
msg.fail("Evaluation data not found", data_path, exits=1) msg.fail("Evaluation data not found", data_path, exits=1)
if displacy_path and not displacy_path.exists(): if displacy_path and not displacy_path.exists():
msg.fail("Visualization output directory not found", displacy_path, exits=1) msg.fail("Visualization output directory not found", displacy_path, exits=1)
corpus = GoldCorpus(data_path, data_path) corpus = Corpus(data_path, data_path)
if model.startswith("blank:"): if model.startswith("blank:"):
nlp = util.get_lang_class(model.replace("blank:", ""))() nlp = util.get_lang_class(model.replace("blank:", ""))()
else: else:

View File

@ -12,7 +12,7 @@ import thinc.schedules
from thinc.api import Model, use_pytorch_for_gpu_memory from thinc.api import Model, use_pytorch_for_gpu_memory
import random import random
from ..gold.corpus_docbin import Corpus from ..gold import Corpus
from ..lookups import Lookups from ..lookups import Lookups
from .. import util from .. import util
from ..errors import Errors from ..errors import Errors

View File

@ -1,4 +1,4 @@
from .corpus import GoldCorpus from .corpus import Corpus
from .example import Example from .example import Example
from .align import align from .align import align

View File

@ -1,54 +1,26 @@
import random
import shutil
import tempfile
import srsly import srsly
from pathlib import Path from pathlib import Path
import itertools import random
from ..tokens import Doc
from .. import util from .. import util
from ..errors import Errors, AlignmentError
from .gold_io import read_json_file, json_to_annotations
from .augment import make_orth_variants
from .example import Example from .example import Example
from ..tokens import DocBin
class GoldCorpus(object): class Corpus:
"""An annotated corpus, using the JSON file format. Manages """An annotated corpus, using the JSON file format. Manages
annotations for tagging, dependency parsing and NER. annotations for tagging, dependency parsing and NER.
DOCS: https://spacy.io/api/goldcorpus DOCS: https://spacy.io/api/goldcorpus
""" """
def __init__(self, train_loc, dev_loc, limit=0):
def __init__(self, train, dev, gold_preproc=False, limit=None):
"""Create a GoldCorpus. """Create a GoldCorpus.
train (str / Path): File or directory of training data. train (str / Path): File or directory of training data.
dev (str / Path): File or directory of development data. dev (str / Path): File or directory of development data.
RETURNS (GoldCorpus): The newly created object. RETURNS (GoldCorpus): The newly created object.
""" """
self.limit = limit self.train_loc = train_loc
if isinstance(train, str) or isinstance(train, Path): self.dev_loc = dev_loc
train = self.read_annotations(self.walk_corpus(train))
dev = self.read_annotations(self.walk_corpus(dev))
# Write temp directory with one doc per file, so we can shuffle and stream
self.tmp_dir = Path(tempfile.mkdtemp())
self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit)
self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit)
def __del__(self):
shutil.rmtree(self.tmp_dir)
@staticmethod
def write_msgpack(directory, examples, limit=0):
if not directory.exists():
directory.mkdir()
n = 0
for i, ex_dict in enumerate(examples):
text = ex_dict["text"]
srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict))
n += 1
if limit and n >= limit:
break
@staticmethod @staticmethod
def walk_corpus(path): def walk_corpus(path):
@ -66,157 +38,45 @@ class GoldCorpus(object):
continue continue
elif path.is_dir(): elif path.is_dir():
paths.extend(path.iterdir()) paths.extend(path.iterdir())
elif path.parts[-1].endswith((".json", ".jsonl")): elif path.parts[-1].endswith(".spacy"):
locs.append(path) locs.append(path)
return locs return locs
@staticmethod def make_examples(self, nlp, reference_docs, **kwargs):
def read_annotations(locs, limit=0): for reference in reference_docs:
predicted = nlp.make_doc(reference.text)
yield Example(predicted, reference)
def read_docbin(self, vocab, locs, limit=0):
""" Yield training examples as example dicts """ """ Yield training examples as example dicts """
i = 0 i = 0
for loc in locs: for loc in locs:
loc = util.ensure_path(loc) loc = util.ensure_path(loc)
file_name = loc.parts[-1] if loc.parts[-1].endswith(".spacy"):
if file_name.endswith("json"): with loc.open("rb") as file_:
examples = read_json_file(loc) doc_bin = DocBin().from_bytes(file_.read())
elif file_name.endswith("jsonl"): yield from doc_bin.get_docs(vocab)
gold_tuples = srsly.read_jsonl(loc)
first_gold_tuple = next(gold_tuples)
gold_tuples = itertools.chain([first_gold_tuple], gold_tuples)
# TODO: proper format checks with schemas
if isinstance(first_gold_tuple, dict):
if first_gold_tuple.get("paragraphs", None):
examples = []
for json_doc in gold_tuples:
examples.extend(json_to_annotations(json_doc))
elif first_gold_tuple.get("doc_annotation", None):
examples = []
for ex_dict in gold_tuples:
doc = ex_dict.get("doc", None)
if doc is None:
doc = ex_dict.get("text", None)
if not (
doc is None
or isinstance(doc, Doc)
or isinstance(doc, str)
):
raise ValueError(Errors.E987.format(type=type(doc)))
examples.append(ex_dict)
elif file_name.endswith("msg"): def count_train(self, nlp):
text, ex_dict = srsly.read_msgpack(loc)
examples = [ex_dict]
else:
supported = ("json", "jsonl", "msg")
raise ValueError(Errors.E124.format(path=loc, formats=supported))
try:
for example in examples:
yield example
i += 1
if limit and i >= limit:
return
except KeyError as e:
msg = "Missing key {}".format(e)
raise KeyError(Errors.E996.format(file=file_name, msg=msg))
except UnboundLocalError as e:
msg = "Unexpected document structure"
raise ValueError(Errors.E996.format(file=file_name, msg=msg))
@property
def dev_annotations(self):
locs = (self.tmp_dir / "dev").iterdir()
yield from self.read_annotations(locs, limit=self.limit)
@property
def train_annotations(self):
locs = (self.tmp_dir / "train").iterdir()
yield from self.read_annotations(locs, limit=self.limit)
def count_train(self):
"""Returns count of words in train examples""" """Returns count of words in train examples"""
n = 0 n = 0
i = 0 i = 0
for eg_dict in self.train_annotations: for example in self.train_dataset(nlp):
n += len(eg_dict["token_annotation"]["words"]) n += len(example.predicted)
if self.limit and i >= self.limit: if self.limit and i >= self.limit:
break break
i += 1 i += 1
return n return n
def train_dataset( def train_dataset(self, nlp, shuffle=True, **kwargs):
self, ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
nlp, examples = self.make_examples(nlp, ref_docs, **kwargs)
gold_preproc=False, if shuffle:
max_length=None, examples = list(examples)
orth_variant_level=0.0, random.shuffle(examples)
ignore_misaligned=False,
):
locs = list((self.tmp_dir / "train").iterdir())
random.shuffle(locs)
train_annotations = self.read_annotations(locs, limit=self.limit)
examples = self.iter_examples(
nlp,
train_annotations,
gold_preproc,
max_length=max_length,
orth_variant_level=orth_variant_level,
make_projective=True,
ignore_misaligned=ignore_misaligned,
)
yield from examples yield from examples
def train_dataset_without_preprocessing( def dev_dataset(self, nlp):
self, nlp, gold_preproc=False, ignore_misaligned=False ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
): examples = self.make_examples(nlp, ref_docs, **kwargs)
examples = self.iter_examples(
nlp,
self.train_annotations,
gold_preproc=gold_preproc,
ignore_misaligned=ignore_misaligned,
)
yield from examples yield from examples
def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False):
examples = self.iter_examples(
nlp,
self.dev_annotations,
gold_preproc=gold_preproc,
ignore_misaligned=ignore_misaligned,
)
yield from examples
@classmethod
def iter_examples(
cls,
nlp,
annotations,
gold_preproc,
max_length=None,
orth_variant_level=0.0,
make_projective=False,
ignore_misaligned=False,
):
""" Setting gold_preproc will result in creating a doc per sentence """
for eg_dict in annotations:
token_annot = eg_dict.get("token_annotation", {})
if eg_dict["text"]:
doc = nlp.make_doc(eg_dict["text"])
elif "words" in token_annot:
doc = Doc(nlp.vocab, words=token_annot["words"])
else:
raise ValueError("Expecting either 'text' or token_annotation.words annotation")
if gold_preproc:
variant_text, variant_token_annot = make_orth_variants(nlp, doc.text, token_annot, orth_variant_level)
doc = nlp.make_doc(variant_text)
eg_dict["token_annotation"] = variant_token_annot
example = Example.from_dict(doc, eg_dict)
examples = example.split_sents()
else:
example = Example.from_dict(doc, eg_dict)
examples = [example]
for eg in examples:
if (not max_length) or len(eg.predicted) < max_length:
yield eg

View File

@ -1,82 +0,0 @@
import srsly
from pathlib import Path
import random
from .. import util
from .example import Example
from ..tokens import DocBin
class Corpus:
    """An annotated corpus, read from binary ``.spacy`` (DocBin) files.

    Every stored document serves as the gold-standard reference and is paired
    with a freshly tokenized document to form an Example.

    DOCS: https://spacy.io/api/goldcorpus
    """

    def __init__(self, train_loc, dev_loc, limit=0):
        """Create a Corpus.

        train_loc (str / Path): File or directory of training data.
        dev_loc (str / Path): File or directory of development data.
        limit (int): Maximum number of documents to read per dataset.
            0 (the default) means no limit.
        RETURNS (Corpus): The newly created object.
        """
        self.train_loc = train_loc
        self.dev_loc = dev_loc
        # Must be stored: the dataset readers below consult self.limit.
        self.limit = limit

    @staticmethod
    def walk_corpus(path):
        """Collect the ``.spacy`` files found at or below ``path``.

        path (str / Path): A single file, or a directory to search recursively.
        RETURNS (list): The paths to read. A non-directory path is returned
            as-is, without checking its suffix. While walking a directory,
            entries whose name starts with "." are skipped.
        """
        path = util.ensure_path(path)
        if not path.is_dir():
            return [path]
        paths = [path]
        locs = []
        seen = set()
        # `paths` doubles as a work queue: it grows while being iterated,
        # because every directory appends its children.
        for path in paths:
            if str(path) in seen:
                continue
            seen.add(str(path))
            if path.parts[-1].startswith("."):
                continue
            elif path.is_dir():
                paths.extend(path.iterdir())
            elif path.parts[-1].endswith(".spacy"):
                locs.append(path)
        return locs

    def make_examples(self, nlp, reference_docs, **kwargs):
        """Pair each reference document with a newly tokenized prediction.

        nlp (Language): Pipeline whose ``make_doc`` tokenizes the raw text.
        reference_docs (iterable): Gold-standard documents.
        **kwargs: Accepted for call compatibility; currently unused.
        YIELDS (Example): One example per reference document.
        """
        for reference in reference_docs:
            predicted = nlp.make_doc(reference.text)
            yield Example(predicted, reference)

    def read_docbin(self, vocab, locs, limit=0):
        """Yield the documents stored in the given DocBin files.

        vocab (Vocab): Vocabulary passed to ``DocBin.get_docs``.
        locs (iterable): Paths to read. Paths whose name does not end in
            ".spacy" are skipped.
        limit (int): Stop after yielding this many documents. 0 means no limit.
        YIELDS (Doc): The deserialized reference documents.
        """
        n_yielded = 0
        for loc in locs:
            loc = util.ensure_path(loc)
            if not loc.parts[-1].endswith(".spacy"):
                continue
            # The bytes are read eagerly, so the file handle can be released
            # before the documents are consumed downstream.
            with loc.open("rb") as file_:
                doc_bin = DocBin().from_bytes(file_.read())
            for doc in doc_bin.get_docs(vocab):
                yield doc
                n_yielded += 1
                if limit and n_yielded >= limit:
                    return

    def count_train(self, nlp):
        """Return the number of tokens in the training examples.

        nlp (Language): Pipeline used to tokenize the training texts.
        RETURNS (int): Total length of the predicted documents, honoring
            ``self.limit``.
        """
        # Order is irrelevant for counting, so skip the shuffle and avoid
        # materializing the whole dataset in memory.
        return sum(
            len(example.predicted)
            for example in self.train_dataset(nlp, shuffle=False)
        )

    def train_dataset(self, nlp, shuffle=True, **kwargs):
        """Yield the training examples read from ``self.train_loc``.

        nlp (Language): Pipeline providing the vocab and tokenizer.
        shuffle (bool): Load all examples into memory and shuffle them.
        **kwargs: Forwarded to ``make_examples``.
        YIELDS (Example): The training examples.
        """
        ref_docs = self.read_docbin(
            nlp.vocab, self.walk_corpus(self.train_loc), limit=self.limit
        )
        examples = self.make_examples(nlp, ref_docs, **kwargs)
        if shuffle:
            examples = list(examples)
            random.shuffle(examples)
        yield from examples

    def dev_dataset(self, nlp, **kwargs):
        """Yield the development examples read from ``self.dev_loc``.

        nlp (Language): Pipeline providing the vocab and tokenizer.
        **kwargs: Forwarded to ``make_examples``.
        YIELDS (Example): The development examples, in file order.
        """
        ref_docs = self.read_docbin(
            nlp.vocab, self.walk_corpus(self.dev_loc), limit=self.limit
        )
        examples = self.make_examples(nlp, ref_docs, **kwargs)
        yield from examples

View File

@ -1,5 +1,5 @@
import srsly import srsly
from spacy.gold import GoldCorpus from spacy.gold import Corpus
from spacy.lang.en import English from spacy.lang.en import English
from ..util import make_tempdir from ..util import make_tempdir
@ -11,7 +11,7 @@ def test_issue4402():
json_path = tmpdir / "test4402.json" json_path = tmpdir / "test4402.json"
srsly.write_json(json_path, json_data) srsly.write_json(json_path, json_data)
corpus = GoldCorpus(str(json_path), str(json_path)) corpus = Corpus(str(json_path), str(json_path))
train_data = list(corpus.train_dataset(nlp, gold_preproc=True, max_length=0)) train_data = list(corpus.train_dataset(nlp, gold_preproc=True, max_length=0))
# assert that the data got split into 4 sentences # assert that the data got split into 4 sentences

View File

@ -1,7 +1,7 @@
from spacy.errors import AlignmentError from spacy.errors import AlignmentError
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align
from spacy.gold import GoldCorpus, docs_to_json from spacy.gold import Corpus, docs_to_json
from spacy.gold.example import Example from spacy.gold.example import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.syntax.nonproj import is_nonproj_tree from spacy.syntax.nonproj import is_nonproj_tree
@ -299,7 +299,7 @@ def test_roundtrip_docs_to_json(doc):
with make_tempdir() as tmpdir: with make_tempdir() as tmpdir:
json_file = tmpdir / "roundtrip.json" json_file = tmpdir / "roundtrip.json"
srsly.write_json(json_file, [docs_to_json(doc)]) srsly.write_json(json_file, [docs_to_json(doc)])
goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file)) goldcorpus = Corpus(train=str(json_file), dev=str(json_file))
reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp)) reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp))
assert len(doc) == goldcorpus.count_train() assert len(doc) == goldcorpus.count_train()
@ -328,7 +328,7 @@ def test_projective_train_vs_nonprojective_dev(doc):
json_file = tmpdir / "test.json" json_file = tmpdir / "test.json"
# write to JSON train dicts # write to JSON train dicts
srsly.write_json(json_file, [docs_to_json(doc)]) srsly.write_json(json_file, [docs_to_json(doc)])
goldcorpus = GoldCorpus(str(json_file), str(json_file)) goldcorpus = Corpus(str(json_file), str(json_file))
train_reloaded_example = next(goldcorpus.train_dataset(nlp)) train_reloaded_example = next(goldcorpus.train_dataset(nlp))
train_goldparse = get_parses_from_example(train_reloaded_example)[0][1] train_goldparse = get_parses_from_example(train_reloaded_example)[0][1]
@ -360,7 +360,7 @@ def test_ignore_misaligned(doc):
data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane") data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
# write to JSON train dicts # write to JSON train dicts
srsly.write_json(json_file, data) srsly.write_json(json_file, data)
goldcorpus = GoldCorpus(str(json_file), str(json_file)) goldcorpus = Corpus(str(json_file), str(json_file))
with pytest.raises(AlignmentError): with pytest.raises(AlignmentError):
train_reloaded_example = next(goldcorpus.train_dataset(nlp)) train_reloaded_example = next(goldcorpus.train_dataset(nlp))
@ -371,7 +371,7 @@ def test_ignore_misaligned(doc):
data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane") data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
# write to JSON train dicts # write to JSON train dicts
srsly.write_json(json_file, data) srsly.write_json(json_file, data)
goldcorpus = GoldCorpus(str(json_file), str(json_file)) goldcorpus = Corpus(str(json_file), str(json_file))
# doesn't raise an AlignmentError, but there is nothing to iterate over # doesn't raise an AlignmentError, but there is nothing to iterate over
# because the only example can't be aligned # because the only example can't be aligned
@ -385,7 +385,7 @@ def test_make_orth_variants(doc):
json_file = tmpdir / "test.json" json_file = tmpdir / "test.json"
# write to JSON train dicts # write to JSON train dicts
srsly.write_json(json_file, [docs_to_json(doc)]) srsly.write_json(json_file, [docs_to_json(doc)])
goldcorpus = GoldCorpus(str(json_file), str(json_file)) goldcorpus = Corpus(str(json_file), str(json_file))
# due to randomness, test only that this runs with no errors for now # due to randomness, test only that this runs with no errors for now
train_example = next(goldcorpus.train_dataset(nlp)) train_example = next(goldcorpus.train_dataset(nlp))