Remove GoldCorpus

Update imports

Update after removing GoldCorpus

Fix module name of corpus

Fix import
Matthew Honnibal 2020-06-20 22:13:37 +02:00
parent 50d4b21743
commit 75a5f2d499
9 changed files with 45 additions and 267 deletions

View File

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.0.dev9"
+__version__ = "3.0.0"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@@ -4,7 +4,7 @@ import sys
 import srsly
 from wasabi import Printer, MESSAGES
-from ..gold import GoldCorpus
+from ..gold import Corpus
 from ..syntax import nonproj
 from ..util import load_model, get_lang_class
@@ -68,7 +68,7 @@ def debug_data(
     loading_train_error_message = ""
     loading_dev_error_message = ""
     with msg.loading("Loading corpus..."):
-        corpus = GoldCorpus(train_path, dev_path)
+        corpus = Corpus(train_path, dev_path)
     try:
         train_dataset = list(corpus.train_dataset(nlp))
         train_dataset_unpreprocessed = list(

View File

@@ -1,7 +1,7 @@
 from timeit import default_timer as timer
 from wasabi import msg
-from ..gold import GoldCorpus
+from ..gold import Corpus
 from .. import util
 from .. import displacy
@@ -31,7 +31,7 @@ def evaluate(
     msg.fail("Evaluation data not found", data_path, exits=1)
     if displacy_path and not displacy_path.exists():
         msg.fail("Visualization output directory not found", displacy_path, exits=1)
-    corpus = GoldCorpus(data_path, data_path)
+    corpus = Corpus(data_path, data_path)
     if model.startswith("blank:"):
         nlp = util.get_lang_class(model.replace("blank:", ""))()
     else:

View File

@@ -12,7 +12,7 @@ import thinc.schedules
 from thinc.api import Model, use_pytorch_for_gpu_memory
 import random
-from ..gold.corpus_docbin import Corpus
+from ..gold import Corpus
 from ..lookups import Lookups
 from .. import util
 from ..errors import Errors

View File

@@ -1,4 +1,4 @@
-from .corpus import GoldCorpus
+from .corpus import Corpus
 from .example import Example
 from .align import align
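With the re-export above, downstream call sites only change the imported name. A minimal sketch of the new usage, with placeholder train.spacy/dev.spacy paths (the constructor just records the two locations; the files are only read once the datasets are iterated):

    from spacy.gold import Corpus

    # Paths are placeholders; Corpus.walk_corpus() later collects ".spacy" files
    # under each location when train_dataset()/dev_dataset() are called.
    corpus = Corpus("train.spacy", "dev.spacy")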

View File

@@ -1,54 +1,26 @@
-import random
-import shutil
-import tempfile
 import srsly
 from pathlib import Path
-import itertools
-from ..tokens import Doc
+import random
 from .. import util
-from ..errors import Errors, AlignmentError
-from .gold_io import read_json_file, json_to_annotations
-from .augment import make_orth_variants
 from .example import Example
+from ..tokens import DocBin

-class GoldCorpus(object):
+class Corpus:
     """An annotated corpus, using the JSON file format. Manages
     annotations for tagging, dependency parsing and NER.

     DOCS: https://spacy.io/api/goldcorpus
     """

-    def __init__(self, train, dev, gold_preproc=False, limit=None):
+    def __init__(self, train_loc, dev_loc, limit=0):
         """Create a GoldCorpus.

         train (str / Path): File or directory of training data.
         dev (str / Path): File or directory of development data.
         RETURNS (GoldCorpus): The newly created object.
         """
-        self.limit = limit
-        if isinstance(train, str) or isinstance(train, Path):
-            train = self.read_annotations(self.walk_corpus(train))
-            dev = self.read_annotations(self.walk_corpus(dev))
-        # Write temp directory with one doc per file, so we can shuffle and stream
-        self.tmp_dir = Path(tempfile.mkdtemp())
-        self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit)
-        self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit)
-
-    def __del__(self):
-        shutil.rmtree(self.tmp_dir)
-
-    @staticmethod
-    def write_msgpack(directory, examples, limit=0):
-        if not directory.exists():
-            directory.mkdir()
-        n = 0
-        for i, ex_dict in enumerate(examples):
-            text = ex_dict["text"]
-            srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict))
-            n += 1
-            if limit and n >= limit:
-                break
+        self.train_loc = train_loc
+        self.dev_loc = dev_loc

     @staticmethod
     def walk_corpus(path):
@@ -66,157 +38,45 @@ class GoldCorpus(object):
                 continue
             elif path.is_dir():
                 paths.extend(path.iterdir())
-            elif path.parts[-1].endswith((".json", ".jsonl")):
+            elif path.parts[-1].endswith(".spacy"):
                 locs.append(path)
         return locs

-    @staticmethod
-    def read_annotations(locs, limit=0):
+    def make_examples(self, nlp, reference_docs, **kwargs):
+        for reference in reference_docs:
+            predicted = nlp.make_doc(reference.text)
+            yield Example(predicted, reference)
+
+    def read_docbin(self, vocab, locs, limit=0):
         """ Yield training examples as example dicts """
         i = 0
         for loc in locs:
             loc = util.ensure_path(loc)
-            file_name = loc.parts[-1]
-            if file_name.endswith("json"):
-                examples = read_json_file(loc)
-            elif file_name.endswith("jsonl"):
-                gold_tuples = srsly.read_jsonl(loc)
-                first_gold_tuple = next(gold_tuples)
-                gold_tuples = itertools.chain([first_gold_tuple], gold_tuples)
-                # TODO: proper format checks with schemas
-                if isinstance(first_gold_tuple, dict):
-                    if first_gold_tuple.get("paragraphs", None):
-                        examples = []
-                        for json_doc in gold_tuples:
-                            examples.extend(json_to_annotations(json_doc))
-                    elif first_gold_tuple.get("doc_annotation", None):
-                        examples = []
-                        for ex_dict in gold_tuples:
-                            doc = ex_dict.get("doc", None)
-                            if doc is None:
-                                doc = ex_dict.get("text", None)
-                            if not (
-                                doc is None
-                                or isinstance(doc, Doc)
-                                or isinstance(doc, str)
-                            ):
-                                raise ValueError(Errors.E987.format(type=type(doc)))
-                            examples.append(ex_dict)
-            elif file_name.endswith("msg"):
-                text, ex_dict = srsly.read_msgpack(loc)
-                examples = [ex_dict]
-            else:
-                supported = ("json", "jsonl", "msg")
-                raise ValueError(Errors.E124.format(path=loc, formats=supported))
-            try:
-                for example in examples:
-                    yield example
-                    i += 1
-                    if limit and i >= limit:
-                        return
-            except KeyError as e:
-                msg = "Missing key {}".format(e)
-                raise KeyError(Errors.E996.format(file=file_name, msg=msg))
-            except UnboundLocalError as e:
-                msg = "Unexpected document structure"
-                raise ValueError(Errors.E996.format(file=file_name, msg=msg))
-
-    @property
-    def dev_annotations(self):
-        locs = (self.tmp_dir / "dev").iterdir()
-        yield from self.read_annotations(locs, limit=self.limit)
-
-    @property
-    def train_annotations(self):
-        locs = (self.tmp_dir / "train").iterdir()
-        yield from self.read_annotations(locs, limit=self.limit)
-
-    def count_train(self):
+            if loc.parts[-1].endswith(".spacy"):
+                with loc.open("rb") as file_:
+                    doc_bin = DocBin().from_bytes(file_.read())
+                yield from doc_bin.get_docs(vocab)
+
+    def count_train(self, nlp):
         """Returns count of words in train examples"""
         n = 0
         i = 0
-        for eg_dict in self.train_annotations:
-            n += len(eg_dict["token_annotation"]["words"])
+        for example in self.train_dataset(nlp):
+            n += len(example.predicted)
             if self.limit and i >= self.limit:
                 break
             i += 1
         return n

-    def train_dataset(
-        self,
-        nlp,
-        gold_preproc=False,
-        max_length=None,
-        orth_variant_level=0.0,
-        ignore_misaligned=False,
-    ):
-        locs = list((self.tmp_dir / "train").iterdir())
-        random.shuffle(locs)
-        train_annotations = self.read_annotations(locs, limit=self.limit)
-        examples = self.iter_examples(
-            nlp,
-            train_annotations,
-            gold_preproc,
-            max_length=max_length,
-            orth_variant_level=orth_variant_level,
-            make_projective=True,
-            ignore_misaligned=ignore_misaligned,
-        )
+    def train_dataset(self, nlp, shuffle=True, **kwargs):
+        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
+        examples = self.make_examples(nlp, ref_docs, **kwargs)
+        if shuffle:
+            examples = list(examples)
+            random.shuffle(examples)
         yield from examples

-    def train_dataset_without_preprocessing(
-        self, nlp, gold_preproc=False, ignore_misaligned=False
-    ):
-        examples = self.iter_examples(
-            nlp,
-            self.train_annotations,
-            gold_preproc=gold_preproc,
-            ignore_misaligned=ignore_misaligned,
-        )
+    def dev_dataset(self, nlp):
+        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
+        examples = self.make_examples(nlp, ref_docs, **kwargs)
         yield from examples

-    def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False):
-        examples = self.iter_examples(
-            nlp,
-            self.dev_annotations,
-            gold_preproc=gold_preproc,
-            ignore_misaligned=ignore_misaligned,
-        )
-        yield from examples
-
-    @classmethod
-    def iter_examples(
-        cls,
-        nlp,
-        annotations,
-        gold_preproc,
-        max_length=None,
-        orth_variant_level=0.0,
-        make_projective=False,
-        ignore_misaligned=False,
-    ):
-        """ Setting gold_preproc will result in creating a doc per sentence """
-        for eg_dict in annotations:
-            token_annot = eg_dict.get("token_annotation", {})
-            if eg_dict["text"]:
-                doc = nlp.make_doc(eg_dict["text"])
-            elif "words" in token_annot:
-                doc = Doc(nlp.vocab, words=token_annot["words"])
-            else:
-                raise ValueError("Expecting either 'text' or token_annotation.words annotation")
-            if gold_preproc:
-                variant_text, variant_token_annot = make_orth_variants(nlp, doc.text, token_annot, orth_variant_level)
-                doc = nlp.make_doc(variant_text)
-                eg_dict["token_annotation"] = variant_token_annot
-                example = Example.from_dict(doc, eg_dict)
-                examples = example.split_sents()
-            else:
-                example = Example.from_dict(doc, eg_dict)
-                examples = [example]
-            for eg in examples:
-                if (not max_length) or len(eg.predicted) < max_length:
-                    yield eg
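The replacement reads reference docs from DocBin payloads serialized as ".spacy" files, instead of the JSON/JSONL/msgpack formats GoldCorpus handled. A minimal round-trip sketch under the API shown in this diff (the file name and example text are illustrative, and only train_dataset() is exercised):

    from pathlib import Path
    from spacy.gold import Corpus
    from spacy.lang.en import English
    from spacy.tokens import DocBin

    nlp = English()
    reference = nlp.make_doc("They like green eggs and ham.")

    # Serialize the reference doc; walk_corpus() only collects ".spacy" files.
    train_loc = Path("train.spacy")
    doc_bin = DocBin()
    doc_bin.add(reference)
    train_loc.write_bytes(doc_bin.to_bytes())

    # read_docbin() deserializes the docs, and make_examples() pairs each
    # reference doc with a fresh nlp.make_doc() prediction.
    corpus = Corpus(train_loc, train_loc)
    example = next(corpus.train_dataset(nlp))
    assert len(example.predicted) == len(reference)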

View File

@@ -1,82 +0,0 @@
-import srsly
-from pathlib import Path
-import random
-from .. import util
-from .example import Example
-from ..tokens import DocBin
-
-class Corpus:
-    """An annotated corpus, using the JSON file format. Manages
-    annotations for tagging, dependency parsing and NER.
-
-    DOCS: https://spacy.io/api/goldcorpus
-    """
-
-    def __init__(self, train_loc, dev_loc, limit=0):
-        """Create a GoldCorpus.
-
-        train (str / Path): File or directory of training data.
-        dev (str / Path): File or directory of development data.
-        RETURNS (GoldCorpus): The newly created object.
-        """
-        self.train_loc = train_loc
-        self.dev_loc = dev_loc
-
-    @staticmethod
-    def walk_corpus(path):
-        path = util.ensure_path(path)
-        if not path.is_dir():
-            return [path]
-        paths = [path]
-        locs = []
-        seen = set()
-        for path in paths:
-            if str(path) in seen:
-                continue
-            seen.add(str(path))
-            if path.parts[-1].startswith("."):
-                continue
-            elif path.is_dir():
-                paths.extend(path.iterdir())
-            elif path.parts[-1].endswith(".spacy"):
-                locs.append(path)
-        return locs
-
-    def make_examples(self, nlp, reference_docs, **kwargs):
-        for reference in reference_docs:
-            predicted = nlp.make_doc(reference.text)
-            yield Example(predicted, reference)
-
-    def read_docbin(self, vocab, locs, limit=0):
-        """ Yield training examples as example dicts """
-        i = 0
-        for loc in locs:
-            loc = util.ensure_path(loc)
-            if loc.parts[-1].endswith(".spacy"):
-                with loc.open("rb") as file_:
-                    doc_bin = DocBin().from_bytes(file_.read())
-                yield from doc_bin.get_docs(vocab)
-
-    def count_train(self, nlp):
-        """Returns count of words in train examples"""
-        n = 0
-        i = 0
-        for example in self.train_dataset(nlp):
-            n += len(example.predicted)
-            if self.limit and i >= self.limit:
-                break
-            i += 1
-        return n
-
-    def train_dataset(self, nlp, shuffle=True, **kwargs):
-        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
-        examples = self.make_examples(nlp, ref_docs, **kwargs)
-        if shuffle:
-            examples = list(examples)
-            random.shuffle(examples)
-        yield from examples
-
-    def dev_dataset(self, nlp):
-        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
-        examples = self.make_examples(nlp, ref_docs, **kwargs)
-        yield from examples

View File

@@ -1,5 +1,5 @@
 import srsly
-from spacy.gold import GoldCorpus
+from spacy.gold import Corpus
 from spacy.lang.en import English

 from ..util import make_tempdir
@@ -11,7 +11,7 @@ def test_issue4402():
         json_path = tmpdir / "test4402.json"
         srsly.write_json(json_path, json_data)

-        corpus = GoldCorpus(str(json_path), str(json_path))
+        corpus = Corpus(str(json_path), str(json_path))
         train_data = list(corpus.train_dataset(nlp, gold_preproc=True, max_length=0))
         # assert that the data got split into 4 sentences

View File

@@ -1,7 +1,7 @@
 from spacy.errors import AlignmentError
 from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
 from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align
-from spacy.gold import GoldCorpus, docs_to_json
+from spacy.gold import Corpus, docs_to_json
 from spacy.gold.example import Example
 from spacy.lang.en import English
 from spacy.syntax.nonproj import is_nonproj_tree
@@ -299,7 +299,7 @@ def test_roundtrip_docs_to_json(doc):
     with make_tempdir() as tmpdir:
         json_file = tmpdir / "roundtrip.json"
         srsly.write_json(json_file, [docs_to_json(doc)])
-        goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file))
+        goldcorpus = Corpus(train=str(json_file), dev=str(json_file))

         reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp))
         assert len(doc) == goldcorpus.count_train()
@@ -328,7 +328,7 @@ def test_projective_train_vs_nonprojective_dev(doc):
         json_file = tmpdir / "test.json"
         # write to JSON train dicts
         srsly.write_json(json_file, [docs_to_json(doc)])
-        goldcorpus = GoldCorpus(str(json_file), str(json_file))
+        goldcorpus = Corpus(str(json_file), str(json_file))

         train_reloaded_example = next(goldcorpus.train_dataset(nlp))
         train_goldparse = get_parses_from_example(train_reloaded_example)[0][1]
@@ -360,7 +360,7 @@ def test_ignore_misaligned(doc):
         data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
         # write to JSON train dicts
         srsly.write_json(json_file, data)
-        goldcorpus = GoldCorpus(str(json_file), str(json_file))
+        goldcorpus = Corpus(str(json_file), str(json_file))

         with pytest.raises(AlignmentError):
             train_reloaded_example = next(goldcorpus.train_dataset(nlp))
@@ -371,7 +371,7 @@ def test_ignore_misaligned(doc):
         data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
         # write to JSON train dicts
         srsly.write_json(json_file, data)
-        goldcorpus = GoldCorpus(str(json_file), str(json_file))
+        goldcorpus = Corpus(str(json_file), str(json_file))

     # doesn't raise an AlignmentError, but there is nothing to iterate over
     # because the only example can't be aligned
@@ -385,7 +385,7 @@ def test_make_orth_variants(doc):
         json_file = tmpdir / "test.json"
         # write to JSON train dicts
         srsly.write_json(json_file, [docs_to_json(doc)])
-        goldcorpus = GoldCorpus(str(json_file), str(json_file))
+        goldcorpus = Corpus(str(json_file), str(json_file))

     # due to randomness, test only that this runs with no errors for now
     train_example = next(goldcorpus.train_dataset(nlp))
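Against the new binary corpus, a test like these would build a ".spacy" file with DocBin instead of writing JSON. A hedged sketch of what that could look like, reusing the imports above plus an assumed `from spacy.tokens import DocBin` (the test name and file name are illustrative):

    def test_corpus_roundtrip_docbin(doc):
        nlp = English()
        with make_tempdir() as tmpdir:
            spacy_file = tmpdir / "roundtrip.spacy"
            # Serialize the reference doc in the binary DocBin format
            doc_bin = DocBin()
            doc_bin.add(doc)
            spacy_file.write_bytes(doc_bin.to_bytes())
            goldcorpus = Corpus(str(spacy_file), str(spacy_file))
            reloaded_example = next(goldcorpus.train_dataset(nlp))
            # The predicted doc is re-tokenized from the reference text, so
            # compare texts rather than token counts
            assert reloaded_example.predicted.text == doc.text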