Fix Example details for train CLI / pipeline components (#4624)

* Switch to train_dataset() function in train CLI

* Fixes for pipe() methods in pipeline components

* Don't clobber the `examples` variable when `as_example` is set in `pipe()` methods
* Remove unnecessary traversals of `examples`
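
For context, the bug looked roughly like this (a minimal sketch with a hypothetical `get_doc()` helper, not the actual pipeline code): rebinding `examples` to an empty list before the zip meant `zip(examples, docs)` iterated over the freshly emptied list, so `pipe(..., as_example=True)` yielded nothing.

    # Sketch of the clobbering bug in pipe(..., as_example=True):
    docs = [get_doc(ex) for ex in examples]
    examples = []                           # clobbers the input batch
    for ex, doc in zip(examples, docs):     # zips over the empty list
        ex.doc = doc
        examples.append(ex)
    yield from examples                     # yields nothing
    # The fix collects results in a separate annotated_examples list.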

* Update Parser.pipe() for Examples

* Add `as_example` kwarg to `pipe()` with an implementation to return
`Example`s

* Accept `Doc` or `Example` in `pipe()` with `_get_doc()` (copied from
`Pipe`)

* Fixes to Example implementation in spacy.gold

* Move `make_projective` from an attribute of Example to an argument of
`Example.get_gold_parses()`

* Heads of 0 are not treated as unset

* Unset heads are set to self rather than `None` (which causes problems
while projectivizing)

* Check for `Doc` (not just non-`None`) when creating GoldParses for a
pre-merged example
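
As a sketch of the new head handling during sentence merging (the helper below is hypothetical; the real change is an inline expression in `Example.merge_sents()`, visible in the diff further down):

    # Offset per-sentence heads by the sentence start i. A head of 0 is a
    # real reference to token 0; only a missing head (None or a negative
    # index) falls back to self-attachment.
    def merge_heads(heads, i):
        return [head + i if head is not None and head >= 0 else head_i + i
                for head_i, head in enumerate(heads)]

    merge_heads([2, None, 0], i=10)  # -> [12, 11, 10]
    # token 1's unset head attaches to itself; token 2's head of 0 stays a
    # valid reference to token 0 rather than becoming None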

* Don't clobber `examples` variable in `iter_gold_docs()`

* Add/modify gold tests for handling projectivity

* In the JSON roundtrip test, compare results from `dev_dataset` rather
than `train_dataset` to avoid projectivization (and other potential
modifications)

* Add test for projective train vs. nonprojective dev versions of the
same `Doc`

* Handle ignore_misaligned as arg rather than attr

Move `ignore_misaligned` from an attribute of `Example` to an argument
to `Example.get_gold_parses()`, which makes it parallel to
`make_projective`.
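
The resulting call looks like this (a usage sketch; the argument values are illustrative):

    # Both behaviors are now chosen per call rather than stored on the Example:
    parses = example.get_gold_parses(vocab=nlp.vocab,
                                     make_projective=True,
                                     ignore_misaligned=True)
    for doc, gold in parses:
        # gold is None when alignment failed and ignore_misaligned is set
        ...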

Add a test with both old and new alignment that checks whether
`ignore_misaligned` errors are raised as expected (only with the new
alignment).

* Remove unused attrs from gold.pxd

Remove `ignore_misaligned` and `make_projective` from `gold.pxd`

* Refer to Example.goldparse in iter_gold_docs()

Use `Example.goldparse` in `iter_gold_docs()` instead of `Example.gold`:
with `ignore_misaligned`, the stored `GoldParse` may be `None`, and
generating it on the fly can raise an unwanted `AlignmentError`.
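
A minimal sketch of the safe check (assuming, as described above, that `Example.gold` rebuilds the `GoldParse` on access, while `Example.goldparse` is the attribute stored by `_make_golds()`):

    for ex in example_golds:
        if ex.goldparse is not None:  # no re-alignment, no AlignmentError
            yield ex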

* Update test for ignore_misaligned
adrianeboyd authored on 2019-11-23 14:32:15 +01:00; committed by Matthew Honnibal
parent faaa832518 · commit 44829950ba
6 changed files with 179 additions and 74 deletions


@@ -340,7 +340,7 @@ def train(
     iter_since_best = 0
     best_score = 0.0
     for i in range(n_iter):
-        train_data = corpus.train_data(
+        train_data = corpus.train_dataset(
             nlp,
             noise_level=noise_level,
             orth_variant_level=orth_variant_level,


@@ -58,8 +58,6 @@ cdef class Example:
     cdef public object doc
     cdef public list token_annotations
     cdef public DocAnnotation doc_annotation
-    cdef public object make_projective
-    cdef public object ignore_misaligned
     cdef public object goldparse


@@ -311,47 +311,50 @@ class GoldCorpus(object):
                                               ignore_misaligned=ignore_misaligned)
         yield from gold_examples

-    def train_dataset_without_preprocessing(self, nlp, gold_preproc=False):
-        examples = self.iter_gold_docs(nlp, self.train_examples, gold_preproc=gold_preproc)
+    def train_dataset_without_preprocessing(self, nlp, gold_preproc=False,
+                                            ignore_misaligned=False):
+        examples = self.iter_gold_docs(nlp, self.train_examples,
+                                       gold_preproc=gold_preproc,
+                                       ignore_misaligned=ignore_misaligned)
         yield from examples

     def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False):
-        examples = self.iter_gold_docs(nlp, self.dev_examples, gold_preproc=gold_preproc,
-                                       ignore_misaligned=ignore_misaligned)
+        examples = self.iter_gold_docs(nlp, self.dev_examples,
+                                       gold_preproc=gold_preproc,
+                                       ignore_misaligned=ignore_misaligned)
         yield from examples

     @classmethod
     def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None,
-                       noise_level=0.0, orth_variant_level=0.0, make_projective=False,
-                       ignore_misaligned=False):
+                       noise_level=0.0, orth_variant_level=0.0,
+                       make_projective=False, ignore_misaligned=False):
         """ Setting gold_preproc will result in creating a doc per 'sentence' """
         for example in examples:
             if gold_preproc:
                 example.doc = None
             else:
                 example = example.merge_sents()
-            example.make_projective = make_projective
-            example.ignore_misaligned = ignore_misaligned
-            examples = cls._make_docs(nlp, example,
+            example_docs = cls._make_docs(nlp, example,
                 gold_preproc, noise_level=noise_level,
                 orth_variant_level=orth_variant_level)
-            examples = cls._make_golds(examples, vocab=nlp.vocab)
-            for ex in examples:
-                if ex.gold is not None:
+            example_golds = cls._make_golds(example_docs, vocab=nlp.vocab,
+                make_projective=make_projective,
+                ignore_misaligned=ignore_misaligned)
+            for ex in example_golds:
+                if ex.goldparse is not None:
                     if (not max_length) or len(ex.doc) < max_length:
                         yield ex

     @classmethod
     def _make_docs(cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
+        var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level)
         # gold_preproc is not used ?!
         if example.text is not None:
-            var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level)
             var_text = add_noise(var_example.text, noise_level)
             var_doc = nlp.make_doc(var_text)
             var_example.doc = var_doc
             return [var_example]
         else:
-            var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level)
             doc_examples = []
             for token_annotation in var_example.token_annotations:
                 t_doc = Doc(nlp.vocab, words=add_noise(token_annotation.words, noise_level))

@@ -362,10 +365,13 @@ class GoldCorpus(object):
         return doc_examples

     @classmethod
-    def _make_golds(cls, examples, vocab=None):
+    def _make_golds(cls, examples, vocab=None, make_projective=False,
+                    ignore_misaligned=False):
         gold_examples = []
         for example in examples:
-            gold_parses = example.get_gold_parses(vocab=vocab)
+            gold_parses = example.get_gold_parses(vocab=vocab,
+                                                  make_projective=make_projective,
+                                                  ignore_misaligned=ignore_misaligned)
             for (doc, gold) in gold_parses:
                 ex = Example(doc=doc)
                 ex.goldparse = gold

@@ -693,13 +699,11 @@ cdef class DocAnnotation:
 cdef class Example:
     def __init__(self, doc_annotation=None, token_annotations=None, doc=None,
-                 make_projective=False, ignore_misaligned=False, goldparse=None):
+                 goldparse=None):
         """ Doc can either be text, or an actual Doc """
         self.doc = doc
         self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
         self.token_annotations = token_annotations if token_annotations else []
-        self.make_projective = make_projective
-        self.ignore_misaligned = ignore_misaligned
         self.goldparse = goldparse

     @classmethod

@@ -760,7 +764,7 @@ cdef class Example:
             m_ids.extend(id_ + i for id_ in t.ids)
             m_words.extend(t.words)
             m_tags.extend(t.tags)
-            m_heads.extend(head + i if head else None for head in t.heads)
+            m_heads.extend(head + i if head is not None and head >= 0 else head_i + i for head_i, head in enumerate(t.heads))
             m_deps.extend(t.deps)
             m_ents.extend(t.entities)
             m_morph.extend(t.morphology)

@@ -773,7 +777,8 @@ cdef class Example:
         return m_example

-    def get_gold_parses(self, merge=False, vocab=None):
+    def get_gold_parses(self, merge=False, vocab=None, make_projective=False,
+                        ignore_misaligned=False):
         """Return a list of (doc, GoldParse) objects.
         If merge is set to True, add all Token annotations to one big list."""
         d = self.doc_annotation

@@ -788,20 +793,20 @@ cdef class Example:
                 raise ValueError(Errors.E998)
             m_doc = Doc(vocab, words=t.words)
             try:
-                gp = GoldParse.from_annotation(m_doc, d, t, make_projective=self.make_projective)
+                gp = GoldParse.from_annotation(m_doc, d, t, make_projective=make_projective)
             except AlignmentError:
-                if self.ignore_misaligned:
+                if ignore_misaligned:
                     gp = None
                 else:
                     raise
             return [(self.doc, gp)]
         # we only have one sentence and an appropriate doc
-        elif len(self.token_annotations) == 1 and self.doc is not None:
+        elif len(self.token_annotations) == 1 and isinstance(self.doc, Doc):
             t = self.token_annotations[0]
             try:
-                gp = GoldParse.from_annotation(self.doc, d, t, make_projective=self.make_projective)
+                gp = GoldParse.from_annotation(self.doc, d, t, make_projective=make_projective)
             except AlignmentError:
-                if self.ignore_misaligned:
+                if ignore_misaligned:
                     gp = None
                 else:
                     raise

@@ -814,9 +819,9 @@ cdef class Example:
                     raise ValueError(Errors.E998)
                 t_doc = Doc(vocab, words=t.words)
                 try:
-                    gp = GoldParse.from_annotation(t_doc, d, t, make_projective=self.make_projective)
+                    gp = GoldParse.from_annotation(t_doc, d, t, make_projective=make_projective)
                 except AlignmentError:
-                    if self.ignore_misaligned:
+                    if ignore_misaligned:
                         gp = None
                     else:
                         raise
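
Taken together, the reworked corpus entry points can be exercised like this (a usage sketch; the file paths are hypothetical, and the train/dev projectivization behavior follows the tests further down):

    from spacy.gold import GoldCorpus
    from spacy.lang.en import English

    nlp = English()
    corpus = GoldCorpus("train.jsonl", "dev.jsonl")  # hypothetical paths
    # train_dataset() projectivizes heads; misaligned examples can be skipped:
    train_examples = list(corpus.train_dataset(nlp, ignore_misaligned=True))
    # dev_dataset() leaves nonprojective trees untouched:
    dev_examples = list(corpus.dev_dataset(nlp, ignore_misaligned=True))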


@@ -61,7 +61,7 @@ class Pipe(object):
         return cls(nlp.vocab, **cfg)

     def _get_doc(self, example):
-        """ Use this method if the `example` method can be both a Doc or an Example """
+        """ Use this method if the `example` can be both a Doc or an Example """
         if isinstance(example, Doc):
             return example
         return example.doc

@@ -102,7 +102,6 @@ class Pipe(object):
         and `set_annotations()` methods.
         """
         for examples in util.minibatch(stream, size=batch_size):
-            examples = list(examples)
             docs = [self._get_doc(ex) for ex in examples]
             predictions = self.predict(docs)
             if isinstance(predictions, tuple) and len(tuple) == 2:

@@ -112,11 +111,11 @@ class Pipe(object):
             self.set_annotations(docs, predictions)
             if as_example:
-                examples = []
+                annotated_examples = []
                 for ex, doc in zip(examples, docs):
                     ex.doc = doc
-                    examples.append(ex)
-                yield from examples
+                    annotated_examples.append(ex)
+                yield from annotated_examples
             else:
                 yield from docs

@@ -312,11 +311,11 @@ class Tensorizer(Pipe):
             self.set_annotations(docs, tensors)
             if as_example:
-                examples = []
+                annotated_examples = []
                 for ex, doc in zip(examples, docs):
                     ex.doc = doc
-                    examples.append(ex)
-                yield from examples
+                    annotated_examples.append(ex)
+                yield from annotated_examples
             else:
                 yield from docs

@@ -434,17 +433,16 @@ class Tagger(Pipe):
     def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
         for examples in util.minibatch(stream, size=batch_size):
-            examples = list(examples)
             docs = [self._get_doc(ex) for ex in examples]
             tag_ids, tokvecs = self.predict(docs)
             self.set_annotations(docs, tag_ids, tensors=tokvecs)
             if as_example:
-                examples = []
+                annotated_examples = []
                 for ex, doc in zip(examples, docs):
                     ex.doc = doc
-                    examples.append(ex)
-                yield from examples
+                    annotated_examples.append(ex)
+                yield from annotated_examples
             else:
                 yield from docs

@@ -1000,17 +998,16 @@ class TextCategorizer(Pipe):
     def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
         for examples in util.minibatch(stream, size=batch_size):
-            examples = list(examples)
             docs = [self._get_doc(ex) for ex in examples]
             scores, tensors = self.predict(docs)
             self.set_annotations(docs, scores, tensors=tensors)
             if as_example:
-                examples = []
+                annotated_examples = []
                 for ex, doc in zip(examples, docs):
                     ex.doc = doc
-                    examples.append(ex)
-                yield from examples
+                    annotated_examples.append(ex)
+                yield from annotated_examples
             else:
                 yield from docs

@@ -1333,17 +1330,16 @@ class EntityLinker(Pipe):
     def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
         for examples in util.minibatch(stream, size=batch_size):
-            examples = list(examples)
             docs = [self._get_doc(ex) for ex in examples]
             kb_ids, tensors = self.predict(docs)
             self.set_annotations(docs, kb_ids, tensors=tensors)
             if as_example:
-                examples = []
+                annotated_examples = []
                 for ex, doc in zip(examples, docs):
                     ex.doc = doc
-                    examples.append(ex)
-                yield from examples
+                    annotated_examples.append(ex)
+                yield from annotated_examples
             else:
                 yield from docs
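
With these changes, each component's `pipe()` accepts a stream of `Doc`s or `Example`s and can hand back `Example`s; a usage sketch (the component lookup is illustrative):

    tagger = nlp.get_pipe("tagger")
    # Default: plain Docs out; with as_example=True, the input Examples are
    # returned with their wrapped docs annotated in place.
    for ex in tagger.pipe(examples, batch_size=128, as_example=True):
        doc = ex.doc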


@@ -227,7 +227,8 @@ cdef class Parser:
         self.set_annotations([doc], states, tensors=None)
         return doc

-    def pipe(self, docs, int batch_size=256, int n_threads=-1, beam_width=None):
+    def pipe(self, docs, int batch_size=256, int n_threads=-1, beam_width=None,
+             as_example=False):
         """Process a stream of documents.

         stream: The sequence of documents to process.

@@ -240,14 +241,21 @@
         cdef Doc doc
         for batch in util.minibatch(docs, size=batch_size):
             batch_in_order = list(batch)
-            by_length = sorted(batch_in_order, key=lambda doc: len(doc))
+            docs = [self._get_doc(ex) for ex in batch_in_order]
+            by_length = sorted(docs, key=lambda doc: len(doc))
             for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)):
                 subbatch = list(subbatch)
                 parse_states = self.predict(subbatch, beam_width=beam_width,
                                             beam_density=beam_density)
                 self.set_annotations(subbatch, parse_states, tensors=None)
-            for doc in batch_in_order:
-                yield doc
+            if as_example:
+                annotated_examples = []
+                for ex, doc in zip(batch_in_order, docs):
+                    ex.doc = doc
+                    annotated_examples.append(ex)
+                yield from annotated_examples
+            else:
+                yield from batch_in_order

     def require_model(self):
         """Raise an error if the component's model is not initialized."""

@@ -635,6 +643,12 @@ cdef class Parser:
         self.cfg.update(cfg)
         return sgd

+    def _get_doc(self, example):
+        """ Use this method if the `example` can be both a Doc or an Example """
+        if isinstance(example, Doc):
+            return example
+        return example.doc
+
     def to_disk(self, path, exclude=tuple(), **kwargs):
         serializers = {
             'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
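
The parser now mirrors the same interface; a sketch (assuming a trained pipeline handle):

    parser = nlp.get_pipe("parser")
    parsed = list(parser.pipe(docs))                           # Docs in, Docs out
    parsed_exs = list(parser.pipe(examples, as_example=True))  # Examples out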


@@ -1,16 +1,40 @@
 # coding: utf-8
 from __future__ import unicode_literals

+import spacy
+from spacy.errors import AlignmentError
 from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Example, DocAnnotation
 from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo
 from spacy.gold import GoldCorpus, docs_to_json, align
 from spacy.lang.en import English
+from spacy.syntax.nonproj import is_nonproj_tree
 from spacy.tokens import Doc
 from spacy.util import compounding, minibatch
 from .util import make_tempdir
 import pytest
 import srsly


+@pytest.fixture
+def doc():
+    text = "Sarah's sister flew to Silicon Valley via London."
+    tags = ['NNP', 'POS', 'NN', 'VBD', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.']
+    # head of '.' is intentionally nonprojective for testing
+    heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
+    deps = ['poss', 'case', 'nsubj', 'ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct']
+    biluo_tags = ["U-PERSON", "O", "O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
+    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
+    nlp = English()
+    doc = nlp(text)
+    for i in range(len(tags)):
+        doc[i].tag_ = tags[i]
+        doc[i].dep_ = deps[i]
+        doc[i].head = doc[heads[i]]
+    doc.ents = spans_from_biluo_tags(doc, biluo_tags)
+    doc.cats = cats
+    doc.is_tagged = True
+    doc.is_parsed = True
+    return doc
+
+
 def test_gold_biluo_U(en_vocab):
     words = ["I", "flew", "to", "London", "."]

@@ -98,23 +122,14 @@ def test_iob_to_biluo():
         iob_to_biluo(bad_iob)


-def test_roundtrip_docs_to_json():
-    text = "I flew to Silicon Valley via London."
-    tags = ["PRP", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
-    heads = [1, 1, 1, 4, 2, 1, 5, 1]
-    deps = ["nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
-    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
-    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
+def test_roundtrip_docs_to_json(doc):
     nlp = English()
-    doc = nlp(text)
-    for i in range(len(tags)):
-        doc[i].tag_ = tags[i]
-        doc[i].dep_ = deps[i]
-        doc[i].head = doc[heads[i]]
-    doc.ents = spans_from_biluo_tags(doc, biluo_tags)
-    doc.cats = cats
-    doc.is_tagged = True
-    doc.is_parsed = True
+    text = doc.text
+    tags = [t.tag_ for t in doc]
+    deps = [t.dep_ for t in doc]
+    heads = [t.head.i for t in doc]
+    biluo_tags = iob_to_biluo([t.ent_iob_ + "-" + t.ent_type_ if t.ent_type_ else "O" for t in doc])
+    cats = doc.cats

     # roundtrip to JSON
     with make_tempdir() as tmpdir:

@@ -122,7 +137,7 @@ def test_roundtrip_docs_to_json():
         srsly.write_json(json_file, [docs_to_json(doc)])
         goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file))

-        reloaded_example = next(goldcorpus.train_dataset(nlp))
+        reloaded_example = next(goldcorpus.dev_dataset(nlp))
         goldparse = reloaded_example.gold

         assert len(doc) == goldcorpus.count_train()

@@ -142,7 +157,7 @@ def test_roundtrip_docs_to_json():
         srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
         goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))

-        reloaded_example = next(goldcorpus.train_dataset(nlp))
+        reloaded_example = next(goldcorpus.dev_dataset(nlp))
         goldparse = reloaded_example.gold

         assert len(doc) == goldcorpus.count_train()

@@ -166,7 +181,7 @@ def test_roundtrip_docs_to_json():
         srsly.write_jsonl(jsonl_file, goldcorpus.train_examples)
         goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))

-        reloaded_example = next(goldcorpus.train_dataset(nlp))
+        reloaded_example = next(goldcorpus.dev_dataset(nlp))
         goldparse = reloaded_example.gold

         assert len(doc) == goldcorpus.count_train()

@@ -181,6 +196,83 @@ def test_roundtrip_docs_to_json():
     assert cats["BAKING"] == goldparse.cats["BAKING"]


+def test_projective_train_vs_nonprojective_dev(doc):
+    nlp = English()
+    text = doc.text
+    deps = [t.dep_ for t in doc]
+    heads = [t.head.i for t in doc]
+
+    with make_tempdir() as tmpdir:
+        jsonl_file = tmpdir / "test.jsonl"
+        # write to JSONL train dicts
+        srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
+        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
+
+        train_reloaded_example = next(goldcorpus.train_dataset(nlp))
+        train_goldparse = train_reloaded_example.gold
+
+        dev_reloaded_example = next(goldcorpus.dev_dataset(nlp))
+        dev_goldparse = dev_reloaded_example.gold
+
+        assert is_nonproj_tree([t.head.i for t in doc]) is True
+        assert is_nonproj_tree(train_goldparse.heads) is False
+        assert heads[:-1] == train_goldparse.heads[:-1]
+        assert heads[-1] != train_goldparse.heads[-1]
+        assert deps[:-1] == train_goldparse.labels[:-1]
+        assert deps[-1] != train_goldparse.labels[-1]
+
+        assert heads == dev_goldparse.heads
+        assert deps == dev_goldparse.labels
+
+
+def test_ignore_misaligned(doc):
+    nlp = English()
+    text = doc.text
+    deps = [t.dep_ for t in doc]
+    heads = [t.head.i for t in doc]
+
+    use_new_align = spacy.gold.USE_NEW_ALIGN
+
+    spacy.gold.USE_NEW_ALIGN = False
+    with make_tempdir() as tmpdir:
+        jsonl_file = tmpdir / "test.jsonl"
+        data = [docs_to_json(doc)]
+        data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
+        # write to JSONL train dicts
+        srsly.write_jsonl(jsonl_file, data)
+        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
+
+        train_reloaded_example = next(goldcorpus.train_dataset(nlp))
+
+    spacy.gold.USE_NEW_ALIGN = True
+    with make_tempdir() as tmpdir:
+        jsonl_file = tmpdir / "test.jsonl"
+        data = [docs_to_json(doc)]
+        data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
+        # write to JSONL train dicts
+        srsly.write_jsonl(jsonl_file, data)
+        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
+
+        with pytest.raises(AlignmentError):
+            train_reloaded_example = next(goldcorpus.train_dataset(nlp))
+
+    with make_tempdir() as tmpdir:
+        jsonl_file = tmpdir / "test.jsonl"
+        data = [docs_to_json(doc)]
+        data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
+        # write to JSONL train dicts
+        srsly.write_jsonl(jsonl_file, data)
+        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
+
+        # doesn't raise an AlignmentError, but there is nothing to iterate over
+        # because the only example can't be aligned
+        train_reloaded_example = list(goldcorpus.train_dataset(nlp,
+                                                               ignore_misaligned=True))
+        assert len(train_reloaded_example) == 0
+
+    spacy.gold.USE_NEW_ALIGN = use_new_align
+
+
 # xfail while we have backwards-compatible alignment
 @pytest.mark.xfail
 @pytest.mark.parametrize(