Use new example class in GoldCorpus

This commit is contained in:
Matthew Honnibal 2020-06-09 23:31:19 +02:00
parent 0714f1fa5c
commit af1b5f129b

View File

@@ -9,7 +9,7 @@ from .. import util
from ..errors import Errors, AlignmentError from ..errors import Errors, AlignmentError
from .gold_io import read_json_file, json_to_annotations from .gold_io import read_json_file, json_to_annotations
from .augment import make_orth_variants, add_noise from .augment import make_orth_variants, add_noise
from .example import Example from .new_example import NewExample as Example
class GoldCorpus(object): class GoldCorpus(object):
@@ -203,59 +203,24 @@ class GoldCorpus(object):
for eg_dict in annotations: for eg_dict in annotations:
if eg_dict["text"]: if eg_dict["text"]:
example = Example.from_dict( example = Example.from_dict(
eg_dict, nlp.make_doc(eg_dict["text"]),
doc=nlp.make_doc(eg_dict["text"]) eg_dict
) )
else: else:
example = Example.from_dict( example = Example.from_dict(
eg_dict, Doc(nlp.vocab, words=eg_dict["words"]),
doc=Doc(nlp.vocab, words=eg_dict["words"]) eg_dict
) )
example_docs = []
if gold_preproc: if gold_preproc:
split_examples = example.split_sents() # TODO: Data augmentation
for split_example in split_examples: examples = example.split_sents()
split_example_docs = cls._make_docs(
nlp,
split_example,
gold_preproc,
noise_level=noise_level,
orth_variant_level=orth_variant_level,
)
example_docs.extend(split_example_docs)
else: else:
example_docs = cls._make_docs( examples = [example]
nlp, for ex in examples:
example, if (not max_length) or len(ex.predicted) < max_length:
gold_preproc,
noise_level=noise_level,
orth_variant_level=orth_variant_level,
)
for ex in example_docs:
if (not max_length) or len(ex.doc) < max_length:
if ignore_misaligned: if ignore_misaligned:
try: try:
_ = ex._deprecated_get_gold() _ = ex._deprecated_get_gold()
except AlignmentError: except AlignmentError:
continue continue
yield ex yield ex
@classmethod
def _make_docs(
cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0
):
var_example = make_orth_variants(
nlp, example, orth_variant_level=orth_variant_level
)
# gold_preproc is not used ?!
if example.text is not None:
var_text = add_noise(var_example.text, noise_level)
var_doc = nlp.make_doc(var_text)
var_example.doc = var_doc
else:
var_doc = Doc(
nlp.vocab,
words=add_noise(var_example.token_annotation.words, noise_level),
)
var_example.doc = var_doc
return [var_example]