mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 17:54:39 +03:00
Fix Example details for train CLI / pipeline components (#4624)
* Switch to train_dataset() function in train CLI * Fixes for pipe() methods in pipeline components * Don't clobber `examples` variable with `as_example` in pipe() methods * Remove unnecessary traversals of `examples` * Update Parser.pipe() for Examples * Add `as_examples` kwarg to `pipe()` with implementation to return `Example`s * Accept `Doc` or `Example` in `pipe()` with `_get_doc()` (copied from `Pipe`) * Fixes to Example implementation in spacy.gold * Move `make_projective` from an attribute of Example to an argument of `Example.get_gold_parses()` * Head of 0 are not treated as unset * Unset heads are set to self rather than `None` (which causes problems while projectivizing) * Check for `Doc` (not just not `None`) when creating GoldParses for pre-merged example * Don't clobber `examples` variable in `iter_gold_docs()` * Add/modify gold tests for handling projectivity * In JSON roundtrip compare results from `dev_dataset` rather than `train_dataset` to avoid projectivization (and other potential modifications) * Add test for projective train vs. nonprojective dev versions of the same `Doc` * Handle ignore_misaligned as arg rather than attr Move `ignore_misaligned` from an attribute of `Example` to an argument to `Example.get_gold_parses()`, which makes it parallel to `make_projective`. Add test with old and new align that checks whether `ignore_misaligned` errors are raised as expected (only for new align). * Remove unused attrs from gold.pxd Remove `ignore_misaligned` and `make_projective` from `gold.pxd` * Refer to Example.goldparse in iter_gold_docs() Use `Example.goldparse` in `iter_gold_docs()` instead of `Example.gold` because a `None` `GoldParse` is generated with ignore_misaligned and generating it on-the-fly can raise an unwanted AlignmentError * Update test for ignore_misaligned
This commit is contained in:
parent
faaa832518
commit
44829950ba
|
@ -340,7 +340,7 @@ def train(
|
||||||
iter_since_best = 0
|
iter_since_best = 0
|
||||||
best_score = 0.0
|
best_score = 0.0
|
||||||
for i in range(n_iter):
|
for i in range(n_iter):
|
||||||
train_data = corpus.train_data(
|
train_data = corpus.train_dataset(
|
||||||
nlp,
|
nlp,
|
||||||
noise_level=noise_level,
|
noise_level=noise_level,
|
||||||
orth_variant_level=orth_variant_level,
|
orth_variant_level=orth_variant_level,
|
||||||
|
|
|
@ -58,8 +58,6 @@ cdef class Example:
|
||||||
cdef public object doc
|
cdef public object doc
|
||||||
cdef public list token_annotations
|
cdef public list token_annotations
|
||||||
cdef public DocAnnotation doc_annotation
|
cdef public DocAnnotation doc_annotation
|
||||||
cdef public object make_projective
|
|
||||||
cdef public object ignore_misaligned
|
|
||||||
cdef public object goldparse
|
cdef public object goldparse
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -311,47 +311,50 @@ class GoldCorpus(object):
|
||||||
ignore_misaligned=ignore_misaligned)
|
ignore_misaligned=ignore_misaligned)
|
||||||
yield from gold_examples
|
yield from gold_examples
|
||||||
|
|
||||||
def train_dataset_without_preprocessing(self, nlp, gold_preproc=False):
|
def train_dataset_without_preprocessing(self, nlp, gold_preproc=False,
|
||||||
examples = self.iter_gold_docs(nlp, self.train_examples, gold_preproc=gold_preproc)
|
ignore_misaligned=False):
|
||||||
|
examples = self.iter_gold_docs(nlp, self.train_examples,
|
||||||
|
gold_preproc=gold_preproc,
|
||||||
|
ignore_misaligned=ignore_misaligned)
|
||||||
yield from examples
|
yield from examples
|
||||||
|
|
||||||
def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False):
|
def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False):
|
||||||
examples = self.iter_gold_docs(nlp, self.dev_examples, gold_preproc=gold_preproc,
|
examples = self.iter_gold_docs(nlp, self.dev_examples,
|
||||||
ignore_misaligned=ignore_misaligned)
|
gold_preproc=gold_preproc,
|
||||||
|
ignore_misaligned=ignore_misaligned)
|
||||||
yield from examples
|
yield from examples
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None,
|
def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None,
|
||||||
noise_level=0.0, orth_variant_level=0.0, make_projective=False,
|
noise_level=0.0, orth_variant_level=0.0,
|
||||||
ignore_misaligned=False):
|
make_projective=False, ignore_misaligned=False):
|
||||||
""" Setting gold_preproc will result in creating a doc per 'sentence' """
|
""" Setting gold_preproc will result in creating a doc per 'sentence' """
|
||||||
for example in examples:
|
for example in examples:
|
||||||
if gold_preproc:
|
if gold_preproc:
|
||||||
example.doc = None
|
example.doc = None
|
||||||
else:
|
else:
|
||||||
example = example.merge_sents()
|
example = example.merge_sents()
|
||||||
example.make_projective = make_projective
|
example_docs = cls._make_docs(nlp, example,
|
||||||
example.ignore_misaligned = ignore_misaligned
|
|
||||||
examples = cls._make_docs(nlp, example,
|
|
||||||
gold_preproc, noise_level=noise_level,
|
gold_preproc, noise_level=noise_level,
|
||||||
orth_variant_level=orth_variant_level)
|
orth_variant_level=orth_variant_level)
|
||||||
examples = cls._make_golds(examples, vocab=nlp.vocab)
|
example_golds = cls._make_golds(example_docs, vocab=nlp.vocab,
|
||||||
for ex in examples:
|
make_projective=make_projective,
|
||||||
if ex.gold is not None:
|
ignore_misaligned=ignore_misaligned)
|
||||||
|
for ex in example_golds:
|
||||||
|
if ex.goldparse is not None:
|
||||||
if (not max_length) or len(ex.doc) < max_length:
|
if (not max_length) or len(ex.doc) < max_length:
|
||||||
yield ex
|
yield ex
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _make_docs(cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
|
def _make_docs(cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
|
||||||
|
var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level)
|
||||||
# gold_preproc is not used ?!
|
# gold_preproc is not used ?!
|
||||||
if example.text is not None:
|
if example.text is not None:
|
||||||
var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level)
|
|
||||||
var_text = add_noise(var_example.text, noise_level)
|
var_text = add_noise(var_example.text, noise_level)
|
||||||
var_doc = nlp.make_doc(var_text)
|
var_doc = nlp.make_doc(var_text)
|
||||||
var_example.doc = var_doc
|
var_example.doc = var_doc
|
||||||
return [var_example]
|
return [var_example]
|
||||||
else:
|
else:
|
||||||
var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level)
|
|
||||||
doc_examples = []
|
doc_examples = []
|
||||||
for token_annotation in var_example.token_annotations:
|
for token_annotation in var_example.token_annotations:
|
||||||
t_doc = Doc(nlp.vocab, words=add_noise(token_annotation.words, noise_level))
|
t_doc = Doc(nlp.vocab, words=add_noise(token_annotation.words, noise_level))
|
||||||
|
@ -362,10 +365,13 @@ class GoldCorpus(object):
|
||||||
return doc_examples
|
return doc_examples
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _make_golds(cls, examples, vocab=None):
|
def _make_golds(cls, examples, vocab=None, make_projective=False,
|
||||||
|
ignore_misaligned=False):
|
||||||
gold_examples = []
|
gold_examples = []
|
||||||
for example in examples:
|
for example in examples:
|
||||||
gold_parses = example.get_gold_parses(vocab=vocab)
|
gold_parses = example.get_gold_parses(vocab=vocab,
|
||||||
|
make_projective=make_projective,
|
||||||
|
ignore_misaligned=ignore_misaligned)
|
||||||
for (doc, gold) in gold_parses:
|
for (doc, gold) in gold_parses:
|
||||||
ex = Example(doc=doc)
|
ex = Example(doc=doc)
|
||||||
ex.goldparse = gold
|
ex.goldparse = gold
|
||||||
|
@ -693,13 +699,11 @@ cdef class DocAnnotation:
|
||||||
|
|
||||||
cdef class Example:
|
cdef class Example:
|
||||||
def __init__(self, doc_annotation=None, token_annotations=None, doc=None,
|
def __init__(self, doc_annotation=None, token_annotations=None, doc=None,
|
||||||
make_projective=False, ignore_misaligned=False, goldparse=None):
|
goldparse=None):
|
||||||
""" Doc can either be text, or an actual Doc """
|
""" Doc can either be text, or an actual Doc """
|
||||||
self.doc = doc
|
self.doc = doc
|
||||||
self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
|
self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
|
||||||
self.token_annotations = token_annotations if token_annotations else []
|
self.token_annotations = token_annotations if token_annotations else []
|
||||||
self.make_projective = make_projective
|
|
||||||
self.ignore_misaligned = ignore_misaligned
|
|
||||||
self.goldparse = goldparse
|
self.goldparse = goldparse
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -760,7 +764,7 @@ cdef class Example:
|
||||||
m_ids.extend(id_ + i for id_ in t.ids)
|
m_ids.extend(id_ + i for id_ in t.ids)
|
||||||
m_words.extend(t.words)
|
m_words.extend(t.words)
|
||||||
m_tags.extend(t.tags)
|
m_tags.extend(t.tags)
|
||||||
m_heads.extend(head + i if head else None for head in t.heads)
|
m_heads.extend(head + i if head is not None and head >= 0 else head_i + i for head_i, head in enumerate(t.heads))
|
||||||
m_deps.extend(t.deps)
|
m_deps.extend(t.deps)
|
||||||
m_ents.extend(t.entities)
|
m_ents.extend(t.entities)
|
||||||
m_morph.extend(t.morphology)
|
m_morph.extend(t.morphology)
|
||||||
|
@ -773,7 +777,8 @@ cdef class Example:
|
||||||
return m_example
|
return m_example
|
||||||
|
|
||||||
|
|
||||||
def get_gold_parses(self, merge=False, vocab=None):
|
def get_gold_parses(self, merge=False, vocab=None, make_projective=False,
|
||||||
|
ignore_misaligned=False):
|
||||||
"""Return a list of (doc, GoldParse) objects.
|
"""Return a list of (doc, GoldParse) objects.
|
||||||
If merge is set to True, add all Token annotations to one big list."""
|
If merge is set to True, add all Token annotations to one big list."""
|
||||||
d = self.doc_annotation
|
d = self.doc_annotation
|
||||||
|
@ -788,20 +793,20 @@ cdef class Example:
|
||||||
raise ValueError(Errors.E998)
|
raise ValueError(Errors.E998)
|
||||||
m_doc = Doc(vocab, words=t.words)
|
m_doc = Doc(vocab, words=t.words)
|
||||||
try:
|
try:
|
||||||
gp = GoldParse.from_annotation(m_doc, d, t, make_projective=self.make_projective)
|
gp = GoldParse.from_annotation(m_doc, d, t, make_projective=make_projective)
|
||||||
except AlignmentError:
|
except AlignmentError:
|
||||||
if self.ignore_misaligned:
|
if ignore_misaligned:
|
||||||
gp = None
|
gp = None
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
return [(self.doc, gp)]
|
return [(self.doc, gp)]
|
||||||
# we only have one sentence and an appropriate doc
|
# we only have one sentence and an appropriate doc
|
||||||
elif len(self.token_annotations) == 1 and self.doc is not None:
|
elif len(self.token_annotations) == 1 and isinstance(self.doc, Doc):
|
||||||
t = self.token_annotations[0]
|
t = self.token_annotations[0]
|
||||||
try:
|
try:
|
||||||
gp = GoldParse.from_annotation(self.doc, d, t, make_projective=self.make_projective)
|
gp = GoldParse.from_annotation(self.doc, d, t, make_projective=make_projective)
|
||||||
except AlignmentError:
|
except AlignmentError:
|
||||||
if self.ignore_misaligned:
|
if ignore_misaligned:
|
||||||
gp = None
|
gp = None
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
|
@ -814,9 +819,9 @@ cdef class Example:
|
||||||
raise ValueError(Errors.E998)
|
raise ValueError(Errors.E998)
|
||||||
t_doc = Doc(vocab, words=t.words)
|
t_doc = Doc(vocab, words=t.words)
|
||||||
try:
|
try:
|
||||||
gp = GoldParse.from_annotation(t_doc, d, t, make_projective=self.make_projective)
|
gp = GoldParse.from_annotation(t_doc, d, t, make_projective=make_projective)
|
||||||
except AlignmentError:
|
except AlignmentError:
|
||||||
if self.ignore_misaligned:
|
if ignore_misaligned:
|
||||||
gp = None
|
gp = None
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
|
|
|
@ -61,7 +61,7 @@ class Pipe(object):
|
||||||
return cls(nlp.vocab, **cfg)
|
return cls(nlp.vocab, **cfg)
|
||||||
|
|
||||||
def _get_doc(self, example):
|
def _get_doc(self, example):
|
||||||
""" Use this method if the `example` method can be both a Doc or an Example """
|
""" Use this method if the `example` can be both a Doc or an Example """
|
||||||
if isinstance(example, Doc):
|
if isinstance(example, Doc):
|
||||||
return example
|
return example
|
||||||
return example.doc
|
return example.doc
|
||||||
|
@ -102,7 +102,6 @@ class Pipe(object):
|
||||||
and `set_annotations()` methods.
|
and `set_annotations()` methods.
|
||||||
"""
|
"""
|
||||||
for examples in util.minibatch(stream, size=batch_size):
|
for examples in util.minibatch(stream, size=batch_size):
|
||||||
examples = list(examples)
|
|
||||||
docs = [self._get_doc(ex) for ex in examples]
|
docs = [self._get_doc(ex) for ex in examples]
|
||||||
predictions = self.predict(docs)
|
predictions = self.predict(docs)
|
||||||
if isinstance(predictions, tuple) and len(tuple) == 2:
|
if isinstance(predictions, tuple) and len(tuple) == 2:
|
||||||
|
@ -112,11 +111,11 @@ class Pipe(object):
|
||||||
self.set_annotations(docs, predictions)
|
self.set_annotations(docs, predictions)
|
||||||
|
|
||||||
if as_example:
|
if as_example:
|
||||||
examples = []
|
annotated_examples = []
|
||||||
for ex, doc in zip(examples, docs):
|
for ex, doc in zip(examples, docs):
|
||||||
ex.doc = doc
|
ex.doc = doc
|
||||||
examples.append(ex)
|
annotated_examples.append(ex)
|
||||||
yield from examples
|
yield from annotated_examples
|
||||||
else:
|
else:
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
|
@ -312,11 +311,11 @@ class Tensorizer(Pipe):
|
||||||
self.set_annotations(docs, tensors)
|
self.set_annotations(docs, tensors)
|
||||||
|
|
||||||
if as_example:
|
if as_example:
|
||||||
examples = []
|
annotated_examples = []
|
||||||
for ex, doc in zip(examples, docs):
|
for ex, doc in zip(examples, docs):
|
||||||
ex.doc = doc
|
ex.doc = doc
|
||||||
examples.append(ex)
|
annotated_examples.append(ex)
|
||||||
yield from examples
|
yield from annotated_examples
|
||||||
else:
|
else:
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
|
@ -434,17 +433,16 @@ class Tagger(Pipe):
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
|
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
|
||||||
for examples in util.minibatch(stream, size=batch_size):
|
for examples in util.minibatch(stream, size=batch_size):
|
||||||
examples = list(examples)
|
|
||||||
docs = [self._get_doc(ex) for ex in examples]
|
docs = [self._get_doc(ex) for ex in examples]
|
||||||
tag_ids, tokvecs = self.predict(docs)
|
tag_ids, tokvecs = self.predict(docs)
|
||||||
self.set_annotations(docs, tag_ids, tensors=tokvecs)
|
self.set_annotations(docs, tag_ids, tensors=tokvecs)
|
||||||
|
|
||||||
if as_example:
|
if as_example:
|
||||||
examples = []
|
annotated_examples = []
|
||||||
for ex, doc in zip(examples, docs):
|
for ex, doc in zip(examples, docs):
|
||||||
ex.doc = doc
|
ex.doc = doc
|
||||||
examples.append(ex)
|
annotated_examples.append(ex)
|
||||||
yield from examples
|
yield from annotated_examples
|
||||||
else:
|
else:
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
|
@ -1000,17 +998,16 @@ class TextCategorizer(Pipe):
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
|
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
|
||||||
for examples in util.minibatch(stream, size=batch_size):
|
for examples in util.minibatch(stream, size=batch_size):
|
||||||
examples = list(examples)
|
|
||||||
docs = [self._get_doc(ex) for ex in examples]
|
docs = [self._get_doc(ex) for ex in examples]
|
||||||
scores, tensors = self.predict(docs)
|
scores, tensors = self.predict(docs)
|
||||||
self.set_annotations(docs, scores, tensors=tensors)
|
self.set_annotations(docs, scores, tensors=tensors)
|
||||||
|
|
||||||
if as_example:
|
if as_example:
|
||||||
examples = []
|
annotated_examples = []
|
||||||
for ex, doc in zip(examples, docs):
|
for ex, doc in zip(examples, docs):
|
||||||
ex.doc = doc
|
ex.doc = doc
|
||||||
examples.append(ex)
|
annotated_examples.append(ex)
|
||||||
yield from examples
|
yield from annotated_examples
|
||||||
else:
|
else:
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
|
@ -1333,17 +1330,16 @@ class EntityLinker(Pipe):
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
|
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
|
||||||
for examples in util.minibatch(stream, size=batch_size):
|
for examples in util.minibatch(stream, size=batch_size):
|
||||||
examples = list(examples)
|
|
||||||
docs = [self._get_doc(ex) for ex in examples]
|
docs = [self._get_doc(ex) for ex in examples]
|
||||||
kb_ids, tensors = self.predict(docs)
|
kb_ids, tensors = self.predict(docs)
|
||||||
self.set_annotations(docs, kb_ids, tensors=tensors)
|
self.set_annotations(docs, kb_ids, tensors=tensors)
|
||||||
|
|
||||||
if as_example:
|
if as_example:
|
||||||
examples = []
|
annotated_examples = []
|
||||||
for ex, doc in zip(examples, docs):
|
for ex, doc in zip(examples, docs):
|
||||||
ex.doc = doc
|
ex.doc = doc
|
||||||
examples.append(ex)
|
annotated_examples.append(ex)
|
||||||
yield from examples
|
yield from annotated_examples
|
||||||
else:
|
else:
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
|
|
|
@ -227,7 +227,8 @@ cdef class Parser:
|
||||||
self.set_annotations([doc], states, tensors=None)
|
self.set_annotations([doc], states, tensors=None)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, docs, int batch_size=256, int n_threads=-1, beam_width=None):
|
def pipe(self, docs, int batch_size=256, int n_threads=-1, beam_width=None,
|
||||||
|
as_example=False):
|
||||||
"""Process a stream of documents.
|
"""Process a stream of documents.
|
||||||
|
|
||||||
stream: The sequence of documents to process.
|
stream: The sequence of documents to process.
|
||||||
|
@ -240,14 +241,21 @@ cdef class Parser:
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
for batch in util.minibatch(docs, size=batch_size):
|
for batch in util.minibatch(docs, size=batch_size):
|
||||||
batch_in_order = list(batch)
|
batch_in_order = list(batch)
|
||||||
by_length = sorted(batch_in_order, key=lambda doc: len(doc))
|
docs = [self._get_doc(ex) for ex in batch_in_order]
|
||||||
|
by_length = sorted(docs, key=lambda doc: len(doc))
|
||||||
for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)):
|
for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)):
|
||||||
subbatch = list(subbatch)
|
subbatch = list(subbatch)
|
||||||
parse_states = self.predict(subbatch, beam_width=beam_width,
|
parse_states = self.predict(subbatch, beam_width=beam_width,
|
||||||
beam_density=beam_density)
|
beam_density=beam_density)
|
||||||
self.set_annotations(subbatch, parse_states, tensors=None)
|
self.set_annotations(subbatch, parse_states, tensors=None)
|
||||||
for doc in batch_in_order:
|
if as_example:
|
||||||
yield doc
|
annotated_examples = []
|
||||||
|
for ex, doc in zip(batch_in_order, docs):
|
||||||
|
ex.doc = doc
|
||||||
|
annotated_examples.append(ex)
|
||||||
|
yield from annotated_examples
|
||||||
|
else:
|
||||||
|
yield from batch_in_order
|
||||||
|
|
||||||
def require_model(self):
|
def require_model(self):
|
||||||
"""Raise an error if the component's model is not initialized."""
|
"""Raise an error if the component's model is not initialized."""
|
||||||
|
@ -635,6 +643,12 @@ cdef class Parser:
|
||||||
self.cfg.update(cfg)
|
self.cfg.update(cfg)
|
||||||
return sgd
|
return sgd
|
||||||
|
|
||||||
|
def _get_doc(self, example):
|
||||||
|
""" Use this method if the `example` can be both a Doc or an Example """
|
||||||
|
if isinstance(example, Doc):
|
||||||
|
return example
|
||||||
|
return example.doc
|
||||||
|
|
||||||
def to_disk(self, path, exclude=tuple(), **kwargs):
|
def to_disk(self, path, exclude=tuple(), **kwargs):
|
||||||
serializers = {
|
serializers = {
|
||||||
'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
|
'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
|
||||||
|
|
|
@ -1,16 +1,40 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import spacy
|
||||||
|
from spacy.errors import AlignmentError
|
||||||
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Example, DocAnnotation
|
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Example, DocAnnotation
|
||||||
from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo
|
from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo
|
||||||
from spacy.gold import GoldCorpus, docs_to_json, align
|
from spacy.gold import GoldCorpus, docs_to_json, align
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
|
from spacy.syntax.nonproj import is_nonproj_tree
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.util import compounding, minibatch
|
from spacy.util import compounding, minibatch
|
||||||
from .util import make_tempdir
|
from .util import make_tempdir
|
||||||
import pytest
|
import pytest
|
||||||
import srsly
|
import srsly
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def doc():
|
||||||
|
text = "Sarah's sister flew to Silicon Valley via London."
|
||||||
|
tags = ['NNP', 'POS', 'NN', 'VBD', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.']
|
||||||
|
# head of '.' is intentionally nonprojective for testing
|
||||||
|
heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
|
||||||
|
deps = ['poss', 'case', 'nsubj', 'ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct']
|
||||||
|
biluo_tags = ["U-PERSON", "O", "O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
|
||||||
|
cats = {"TRAVEL": 1.0, "BAKING": 0.0}
|
||||||
|
nlp = English()
|
||||||
|
doc = nlp(text)
|
||||||
|
for i in range(len(tags)):
|
||||||
|
doc[i].tag_ = tags[i]
|
||||||
|
doc[i].dep_ = deps[i]
|
||||||
|
doc[i].head = doc[heads[i]]
|
||||||
|
doc.ents = spans_from_biluo_tags(doc, biluo_tags)
|
||||||
|
doc.cats = cats
|
||||||
|
doc.is_tagged = True
|
||||||
|
doc.is_parsed = True
|
||||||
|
return doc
|
||||||
|
|
||||||
|
|
||||||
def test_gold_biluo_U(en_vocab):
|
def test_gold_biluo_U(en_vocab):
|
||||||
words = ["I", "flew", "to", "London", "."]
|
words = ["I", "flew", "to", "London", "."]
|
||||||
|
@ -98,23 +122,14 @@ def test_iob_to_biluo():
|
||||||
iob_to_biluo(bad_iob)
|
iob_to_biluo(bad_iob)
|
||||||
|
|
||||||
|
|
||||||
def test_roundtrip_docs_to_json():
|
def test_roundtrip_docs_to_json(doc):
|
||||||
text = "I flew to Silicon Valley via London."
|
|
||||||
tags = ["PRP", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
|
|
||||||
heads = [1, 1, 1, 4, 2, 1, 5, 1]
|
|
||||||
deps = ["nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
|
|
||||||
biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
|
|
||||||
cats = {"TRAVEL": 1.0, "BAKING": 0.0}
|
|
||||||
nlp = English()
|
nlp = English()
|
||||||
doc = nlp(text)
|
text = doc.text
|
||||||
for i in range(len(tags)):
|
tags = [t.tag_ for t in doc]
|
||||||
doc[i].tag_ = tags[i]
|
deps = [t.dep_ for t in doc]
|
||||||
doc[i].dep_ = deps[i]
|
heads = [t.head.i for t in doc]
|
||||||
doc[i].head = doc[heads[i]]
|
biluo_tags = iob_to_biluo([t.ent_iob_ + "-" + t.ent_type_ if t.ent_type_ else "O" for t in doc])
|
||||||
doc.ents = spans_from_biluo_tags(doc, biluo_tags)
|
cats = doc.cats
|
||||||
doc.cats = cats
|
|
||||||
doc.is_tagged = True
|
|
||||||
doc.is_parsed = True
|
|
||||||
|
|
||||||
# roundtrip to JSON
|
# roundtrip to JSON
|
||||||
with make_tempdir() as tmpdir:
|
with make_tempdir() as tmpdir:
|
||||||
|
@ -122,7 +137,7 @@ def test_roundtrip_docs_to_json():
|
||||||
srsly.write_json(json_file, [docs_to_json(doc)])
|
srsly.write_json(json_file, [docs_to_json(doc)])
|
||||||
goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file))
|
goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file))
|
||||||
|
|
||||||
reloaded_example = next(goldcorpus.train_dataset(nlp))
|
reloaded_example = next(goldcorpus.dev_dataset(nlp))
|
||||||
goldparse = reloaded_example.gold
|
goldparse = reloaded_example.gold
|
||||||
|
|
||||||
assert len(doc) == goldcorpus.count_train()
|
assert len(doc) == goldcorpus.count_train()
|
||||||
|
@ -142,7 +157,7 @@ def test_roundtrip_docs_to_json():
|
||||||
srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
|
srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
|
||||||
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
|
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
|
||||||
|
|
||||||
reloaded_example = next(goldcorpus.train_dataset(nlp))
|
reloaded_example = next(goldcorpus.dev_dataset(nlp))
|
||||||
goldparse = reloaded_example.gold
|
goldparse = reloaded_example.gold
|
||||||
|
|
||||||
assert len(doc) == goldcorpus.count_train()
|
assert len(doc) == goldcorpus.count_train()
|
||||||
|
@ -166,7 +181,7 @@ def test_roundtrip_docs_to_json():
|
||||||
srsly.write_jsonl(jsonl_file, goldcorpus.train_examples)
|
srsly.write_jsonl(jsonl_file, goldcorpus.train_examples)
|
||||||
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
|
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
|
||||||
|
|
||||||
reloaded_example = next(goldcorpus.train_dataset(nlp))
|
reloaded_example = next(goldcorpus.dev_dataset(nlp))
|
||||||
goldparse = reloaded_example.gold
|
goldparse = reloaded_example.gold
|
||||||
|
|
||||||
assert len(doc) == goldcorpus.count_train()
|
assert len(doc) == goldcorpus.count_train()
|
||||||
|
@ -181,6 +196,83 @@ def test_roundtrip_docs_to_json():
|
||||||
assert cats["BAKING"] == goldparse.cats["BAKING"]
|
assert cats["BAKING"] == goldparse.cats["BAKING"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_projective_train_vs_nonprojective_dev(doc):
|
||||||
|
nlp = English()
|
||||||
|
text = doc.text
|
||||||
|
deps = [t.dep_ for t in doc]
|
||||||
|
heads = [t.head.i for t in doc]
|
||||||
|
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
jsonl_file = tmpdir / "test.jsonl"
|
||||||
|
# write to JSONL train dicts
|
||||||
|
srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
|
||||||
|
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
|
||||||
|
|
||||||
|
train_reloaded_example = next(goldcorpus.train_dataset(nlp))
|
||||||
|
train_goldparse = train_reloaded_example.gold
|
||||||
|
|
||||||
|
dev_reloaded_example = next(goldcorpus.dev_dataset(nlp))
|
||||||
|
dev_goldparse = dev_reloaded_example.gold
|
||||||
|
|
||||||
|
assert is_nonproj_tree([t.head.i for t in doc]) is True
|
||||||
|
assert is_nonproj_tree(train_goldparse.heads) is False
|
||||||
|
assert heads[:-1] == train_goldparse.heads[:-1]
|
||||||
|
assert heads[-1] != train_goldparse.heads[-1]
|
||||||
|
assert deps[:-1] == train_goldparse.labels[:-1]
|
||||||
|
assert deps[-1] != train_goldparse.labels[-1]
|
||||||
|
|
||||||
|
assert heads == dev_goldparse.heads
|
||||||
|
assert deps == dev_goldparse.labels
|
||||||
|
|
||||||
|
|
||||||
|
def test_ignore_misaligned(doc):
|
||||||
|
nlp = English()
|
||||||
|
text = doc.text
|
||||||
|
deps = [t.dep_ for t in doc]
|
||||||
|
heads = [t.head.i for t in doc]
|
||||||
|
|
||||||
|
use_new_align = spacy.gold.USE_NEW_ALIGN
|
||||||
|
|
||||||
|
spacy.gold.USE_NEW_ALIGN = False
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
jsonl_file = tmpdir / "test.jsonl"
|
||||||
|
data = [docs_to_json(doc)]
|
||||||
|
data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
|
||||||
|
# write to JSONL train dicts
|
||||||
|
srsly.write_jsonl(jsonl_file, data)
|
||||||
|
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
|
||||||
|
|
||||||
|
train_reloaded_example = next(goldcorpus.train_dataset(nlp))
|
||||||
|
|
||||||
|
spacy.gold.USE_NEW_ALIGN = True
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
jsonl_file = tmpdir / "test.jsonl"
|
||||||
|
data = [docs_to_json(doc)]
|
||||||
|
data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
|
||||||
|
# write to JSONL train dicts
|
||||||
|
srsly.write_jsonl(jsonl_file, data)
|
||||||
|
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
|
||||||
|
|
||||||
|
with pytest.raises(AlignmentError):
|
||||||
|
train_reloaded_example = next(goldcorpus.train_dataset(nlp))
|
||||||
|
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
jsonl_file = tmpdir / "test.jsonl"
|
||||||
|
data = [docs_to_json(doc)]
|
||||||
|
data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
|
||||||
|
# write to JSONL train dicts
|
||||||
|
srsly.write_jsonl(jsonl_file, data)
|
||||||
|
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
|
||||||
|
|
||||||
|
# doesn't raise an AlignmentError, but there is nothing to iterate over
|
||||||
|
# because the only example can't be aligned
|
||||||
|
train_reloaded_example = list(goldcorpus.train_dataset(nlp,
|
||||||
|
ignore_misaligned=True))
|
||||||
|
assert len(train_reloaded_example) == 0
|
||||||
|
|
||||||
|
spacy.gold.USE_NEW_ALIGN = use_new_align
|
||||||
|
|
||||||
|
|
||||||
# xfail while we have backwards-compatible alignment
|
# xfail while we have backwards-compatible alignment
|
||||||
@pytest.mark.xfail
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
|
|
Loading…
Reference in New Issue
Block a user