Restructure Example with merged sents as default (#4632)

* Switch to train_dataset() function in train CLI

* Fixes for pipe() methods in pipeline components

* Don't clobber `examples` variable with `as_example` in pipe() methods
* Remove unnecessary traversals of `examples`

* Update Parser.pipe() for Examples

* Add `as_examples` kwarg to `pipe()` with implementation to return
`Example`s

* Accept `Doc` or `Example` in `pipe()` with `_get_doc()` (copied from
`Pipe`)
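
A minimal sketch of the pattern these two bullets describe (illustrative only; the real `pipe()` methods batch their inputs and call `predict()`/`set_annotations()` rather than processing one item at a time):

```python
from spacy.gold import Example
from spacy.tokens import Doc

def _get_doc(example):
    # Accept either a bare Doc or an Example wrapping one.
    if isinstance(example, Doc):
        return example
    return example.doc

def pipe(self, examples, batch_size=128, as_examples=False):
    # Simplified sketch of the control flow, not the verbatim implementation.
    for example in examples:
        doc = self(_get_doc(example))
        yield Example(doc=doc) if as_examples else doc
```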

* Fixes to Example implementation in spacy.gold

* Move `make_projective` from an attribute of Example to an argument of
`Example.get_gold_parses()`

* Heads of 0 are not treated as unset

* Unset heads are set to self rather than `None` (which causes problems
while projectivizing); see the sketch below

* Check for `Doc` (not just not `None`) when creating GoldParses for
pre-merged example

* Don't clobber `examples` variable in `iter_gold_docs()`
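
To illustrate the head convention above (a hypothetical normalization step, not code from this diff): a head of `0` is a genuine attachment to token 0, so only a missing value counts as unset, and unset heads are filled with the token's own index so projectivization always sees a complete tree.

```python
heads = [None, 0, 1, None]          # None = unset; 0 attaches to token 0
heads = [i if h is None else h for i, h in enumerate(heads)]
assert heads == [0, 0, 1, 3]        # unset heads become self-attachments
```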

* Add/modify gold tests for handling projectivity

* In the JSON roundtrip test, compare results from `dev_dataset` rather
than `train_dataset` to avoid projectivization (and other potential
modifications)

* Add test for projective train vs. nonprojective dev versions of the
same `Doc`

* Handle ignore_misaligned as arg rather than attr

Move `ignore_misaligned` from an attribute of `Example` to an argument
of `Example.get_gold_parses()`, which makes it parallel to
`make_projective` (example call below).

Add a test with the old and new alignment that checks whether
`ignore_misaligned` errors are raised as expected (only for the new
alignment).
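
With the new signature (see the `get_gold_parses()` hunk in the diff), both behaviors are selected per call rather than stored on the `Example`, roughly as follows (assuming `nlp` and an `example` as in the sketch above):

```python
# make_projective and ignore_misaligned are plain keyword arguments now
parses = example.get_gold_parses(vocab=nlp.vocab,
                                 make_projective=True,
                                 ignore_misaligned=True)
for doc, gold in parses:
    # gold is None (rather than an AlignmentError) for misaligned examples
    print(doc, gold)
```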

* Remove unused attrs from gold.pxd

Remove `ignore_misaligned` and `make_projective` from `gold.pxd`

* Restructure Example with merged sents as default

An `Example` now includes a single `TokenAnnotation` that contains all
the information from one `Doc` (= one JSON `paragraph`). If required, the
individual sentences can be recovered as a list of examples with
`Example.split_sents()`, in which case no raw text is available (see the
sketch at the end of this list).

* Input/output a single `Example.token_annotation`

* Add `sent_starts` to `TokenAnnotation` to handle sentence boundaries

* Replace `Example.merge_sents()` with `Example.split_sents()`

* Modify components to use a single `Example.token_annotation`

  * Pipeline components
  * conllu2json converter

* Rework/rename `add_token_annotation()` and `add_doc_annotation()` to
`set_token_annotation()` and `set_doc_annotation()`, functions that set
rather than append/extend.

* Rename `morphology` to `morphs` in `TokenAnnotation` and `GoldParse`

* Add getters to `TokenAnnotation` to supply default values when a given
attribute is not available

* `Example.get_gold_parses()` in `spacy.gold._make_golds()` is only
applied to single examples, so the resulting `GoldParse` is saved in the
provided `Example` rather than in a new `Example` with no other
internal annotation

* Update tests for API changes and `merge_sents()` vs. `split_sents()`
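
Taken together, the new flow looks roughly like this (mirroring `test_split_sents()` in the test diff below):

```python
from spacy.gold import Example

example = Example()
example.set_token_annotation(
    ids=[1, 2, 3, 4, 5, 6, 7],
    words=["Hi", "there", "everyone", "It", "is", "just", "me"],
    tags=["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"],
    sent_starts=[1, 0, 0, 1, 0, 0, 0],
)
split_examples = example.split_sents()  # two Examples, no raw text
assert split_examples[1].token_annotation.words == ["It", "is", "just", "me"]
assert split_examples[1].token_annotation.ids == [4, 5, 6, 7]
```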

* Refer to Example.goldparse in iter_gold_docs()

Use `Example.goldparse` in `iter_gold_docs()` instead of `Example.gold`,
because a `None` `GoldParse` is generated with `ignore_misaligned` and
generating it on the fly can raise an unwanted `AlignmentError`.

* Fix make_orth_variants()

Fix a bug in `make_orth_variants()` related to the conversion from
multiple `TokenAnnotation`s to one per `Example`.

* Add basic test for make_orth_variants()
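
Usage follows the signature in the `gold.pyx` hunk: with probability `orth_variant_level` a modified variant is produced, otherwise the input `Example` is returned unchanged, which is why the new test only checks that training runs without errors.

```python
from spacy.gold import make_orth_variants

# Returns the input unchanged with probability 1 - orth_variant_level.
variant_example = make_orth_variants(nlp, example, orth_variant_level=0.2)
```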

* Replace try/except with conditionals

* Replace the default morph value with an empty set
adrianeboyd 2019-11-25 16:03:28 +01:00 committed by Matthew Honnibal
parent 44829950ba
commit 392c4880d9
12 changed files with 376 additions and 330 deletions

View File

@@ -24,17 +24,16 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
checked_for_ner = False
has_ner_tags = False
for i, example in enumerate(conll_data):
for token_annotation in example.token_annotations:
if not checked_for_ner:
has_ner_tags = is_ner(token_annotation.entities[0])
checked_for_ner = True
sentences.append(generate_sentence(token_annotation, has_ner_tags))
# Real-sized documents could be extracted using the comments on the
# conluu document
if len(sentences) % n_sents == 0:
doc = create_doc(sentences, i)
docs.append(doc)
sentences = []
if not checked_for_ner:
has_ner_tags = is_ner(example.token_annotation.entities[0])
checked_for_ner = True
sentences.append(generate_sentence(example.token_annotation, has_ner_tags))
# Real-sized documents could be extracted using the comments on the
# conllu document
if len(sentences) % n_sents == 0:
doc = create_doc(sentences, i)
docs.append(doc)
sentences = []
return docs
@@ -84,7 +83,7 @@ def read_conllx(input_data, use_morphology=False, n=0):
print(line)
raise
example = Example(doc=None)
example.add_token_annotation(ids=ids, words=words, tags=tags,
example.set_token_annotation(ids=ids, words=words, tags=tags,
heads=heads, deps=deps, entities=ents)
yield example
i += 1

View File

@@ -25,7 +25,7 @@ cdef class GoldParse:
cdef public int loss
cdef public list words
cdef public list tags
cdef public list morphology
cdef public list morphs
cdef public list heads
cdef public list labels
cdef public dict orths
@@ -45,7 +45,8 @@ cdef class TokenAnnotation:
cdef public list heads
cdef public list deps
cdef public list entities
cdef public list morphology
cdef public list morphs
cdef public list sent_starts
cdef public list brackets
@@ -56,7 +57,7 @@ cdef class DocAnnotation:
cdef class Example:
cdef public object doc
cdef public list token_annotations
cdef public TokenAnnotation token_annotation
cdef public DocAnnotation doc_annotation
cdef public object goldparse

View File

@@ -215,7 +215,7 @@ class GoldCorpus(object):
ex_dict = example.to_dict()
text = example.text
srsly.write_msgpack(directory / "{}.msg".format(i), (text, ex_dict))
n += len(example.token_annotations)
n += 1
if limit and n >= limit:
break
@@ -271,7 +271,7 @@ class GoldCorpus(object):
raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported))
for example in examples:
yield example
i += len(example.token_annotations)
i += 1
if limit and i >= limit:
return
@@ -286,15 +286,14 @@ class GoldCorpus(object):
yield from self.read_examples(locs, limit=self.limit)
def count_train(self):
# TODO: should this count words or sentences ?
"""Returns count of words in train examples"""
n = 0
i = 0
for example in self.train_examples:
for token_annotation in example.token_annotations:
n += len(token_annotation.words)
if self.limit and i >= self.limit:
break
i += 1
n += len(example.token_annotation.words)
if self.limit and i >= self.limit:
break
i += 1
return n
def train_dataset(self, nlp, gold_preproc=False, max_length=None,
@@ -328,18 +327,27 @@ class GoldCorpus(object):
def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None,
noise_level=0.0, orth_variant_level=0.0,
make_projective=False, ignore_misaligned=False):
""" Setting gold_preproc will result in creating a doc per 'sentence' """
""" Setting gold_preproc will result in creating a doc per sentence """
for example in examples:
if gold_preproc:
example.doc = None
split_examples = example.split_sents()
example_golds = []
for split_example in split_examples:
split_example_docs = cls._make_docs(nlp, split_example,
gold_preproc, noise_level=noise_level,
orth_variant_level=orth_variant_level)
split_example_golds = cls._make_golds(split_example_docs,
vocab=nlp.vocab, make_projective=make_projective,
ignore_misaligned=ignore_misaligned)
example_golds.extend(split_example_golds)
else:
example = example.merge_sents()
example_docs = cls._make_docs(nlp, example,
gold_preproc, noise_level=noise_level,
orth_variant_level=orth_variant_level)
example_golds = cls._make_golds(example_docs, vocab=nlp.vocab,
make_projective=make_projective,
ignore_misaligned=ignore_misaligned)
example_docs = cls._make_docs(nlp, example,
gold_preproc, noise_level=noise_level,
orth_variant_level=orth_variant_level)
example_golds = cls._make_golds(example_docs, vocab=nlp.vocab,
make_projective=make_projective,
ignore_misaligned=ignore_misaligned)
for ex in example_golds:
if ex.goldparse is not None:
if (not max_length) or len(ex.doc) < max_length:
@@ -353,35 +361,28 @@ class GoldCorpus(object):
var_text = add_noise(var_example.text, noise_level)
var_doc = nlp.make_doc(var_text)
var_example.doc = var_doc
return [var_example]
else:
doc_examples = []
for token_annotation in var_example.token_annotations:
t_doc = Doc(nlp.vocab, words=add_noise(token_annotation.words, noise_level))
doc_example = Example(doc_annotation=example.doc_annotation,
token_annotations=[token_annotation],
doc=t_doc)
doc_examples.append(doc_example)
return doc_examples
var_doc = Doc(nlp.vocab, words=add_noise(var_example.token_annotation.words, noise_level))
var_example.doc = var_doc
return [var_example]
@classmethod
def _make_golds(cls, examples, vocab=None, make_projective=False,
ignore_misaligned=False):
gold_examples = []
for example in examples:
gold_parses = example.get_gold_parses(vocab=vocab,
make_projective=make_projective,
ignore_misaligned=ignore_misaligned)
for (doc, gold) in gold_parses:
ex = Example(doc=doc)
ex.goldparse = gold
gold_examples.append(ex)
return gold_examples
assert len(gold_parses) == 1
assert gold_parses[0][0] == example.doc
example.goldparse = gold_parses[0][1]
return examples
def make_orth_variants(nlp, example, orth_variant_level=0.0):
if random.random() >= orth_variant_level:
return example
if not example.token_annotations:
if not example.token_annotation:
return example
raw = example.text
if random.random() >= 0.5:
@@ -392,46 +393,46 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
ndpv = nlp.Defaults.paired_orth_variants
# modify words in paragraph_tuples
variant_example = Example(doc=raw)
for token_annotation in example.token_annotations:
words = token_annotation.words
tags = token_annotation.tags
if not words or not tags:
# add the unmodified annotation
token_dict = token_annotation.to_dict()
variant_example.add_token_annotation(**token_dict)
else:
if lower:
words = [w.lower() for w in words]
# single variants
punct_choices = [random.choice(x["variants"]) for x in ndsv]
for word_idx in range(len(words)):
for punct_idx in range(len(ndsv)):
if tags[word_idx] in ndsv[punct_idx]["tags"] \
and words[word_idx] in ndsv[punct_idx]["variants"]:
words[word_idx] = punct_choices[punct_idx]
# paired variants
punct_choices = [random.choice(x["variants"]) for x in ndpv]
for word_idx in range(len(words)):
for punct_idx in range(len(ndpv)):
if tags[word_idx] in ndpv[punct_idx]["tags"] \
and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
# backup option: random left vs. right from pair
pair_idx = random.choice([0, 1])
# best option: rely on paired POS tags like `` / ''
if len(ndpv[punct_idx]["tags"]) == 2:
pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
# next best option: rely on position in variants
# (may not be unambiguous, so order of variants matters)
else:
for pair in ndpv[punct_idx]["variants"]:
if words[word_idx] in pair:
pair_idx = pair.index(words[word_idx])
words[word_idx] = punct_choices[punct_idx][pair_idx]
token_annotation = example.token_annotation
words = token_annotation.words
tags = token_annotation.tags
if not words or not tags:
# add the unmodified annotation
token_dict = token_annotation.to_dict()
variant_example.set_token_annotation(**token_dict)
else:
if lower:
words = [w.lower() for w in words]
# single variants
punct_choices = [random.choice(x["variants"]) for x in ndsv]
for word_idx in range(len(words)):
for punct_idx in range(len(ndsv)):
if tags[word_idx] in ndsv[punct_idx]["tags"] \
and words[word_idx] in ndsv[punct_idx]["variants"]:
words[word_idx] = punct_choices[punct_idx]
# paired variants
punct_choices = [random.choice(x["variants"]) for x in ndpv]
for word_idx in range(len(words)):
for punct_idx in range(len(ndpv)):
if tags[word_idx] in ndpv[punct_idx]["tags"] \
and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
# backup option: random left vs. right from pair
pair_idx = random.choice([0, 1])
# best option: rely on paired POS tags like `` / ''
if len(ndpv[punct_idx]["tags"]) == 2:
pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
# next best option: rely on position in variants
# (may not be unambiguous, so order of variants matters)
else:
for pair in ndpv[punct_idx]["variants"]:
if words[word_idx] in pair:
pair_idx = pair.index(words[word_idx])
words[word_idx] = punct_choices[punct_idx][pair_idx]
token_dict = token_annotation.to_dict()
token_dict["words"] = words
token_dict["tags"] = tags
variant_example.add_token_annotation(**token_dict)
token_dict = token_annotation.to_dict()
token_dict["words"] = words
token_dict["tags"] = tags
variant_example.set_token_annotation(**token_dict)
# modify raw to match variant_paragraph_tuples
if raw is not None:
variants = []
@@ -449,30 +450,29 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
variant_raw += raw[raw_idx]
raw_idx += 1
for token_annotation in variant_example.token_annotations:
for word in token_annotation.words:
match_found = False
# add identical word
if word not in variants and raw[raw_idx:].startswith(word):
variant_raw += word
raw_idx += len(word)
match_found = True
# add variant word
else:
for variant in variants:
if not match_found and \
raw[raw_idx:].startswith(variant):
raw_idx += len(variant)
variant_raw += word
match_found = True
# something went wrong, abort
# (add a warning message?)
if not match_found:
return example
# add following whitespace
while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
variant_raw += raw[raw_idx]
raw_idx += 1
for word in variant_example.token_annotation.words:
match_found = False
# add identical word
if word not in variants and raw[raw_idx:].startswith(word):
variant_raw += word
raw_idx += len(word)
match_found = True
# add variant word
else:
for variant in variants:
if not match_found and \
raw[raw_idx:].startswith(variant):
raw_idx += len(variant)
variant_raw += word
match_found = True
# something went wrong, abort
# (add a warning message?)
if not match_found:
return example
# add following whitespace
while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
variant_raw += raw[raw_idx]
raw_idx += 1
variant_example.doc = variant_raw
return variant_example
return variant_example
@@ -521,30 +521,43 @@ def json_to_examples(doc):
paragraphs = []
for paragraph in doc["paragraphs"]:
example = Example(doc=paragraph.get("raw", None))
words = []
ids = []
tags = []
heads = []
labels = []
ner = []
morphs = []
sent_starts = []
brackets = []
for sent in paragraph["sentences"]:
words = []
ids = []
tags = []
heads = []
labels = []
ner = []
sent_start_i = len(words)
for i, token in enumerate(sent["tokens"]):
words.append(token["orth"])
ids.append(i)
ids.append(token.get('id', sent_start_i + i))
tags.append(token.get('tag', "-"))
heads.append(token.get("head", 0) + i)
heads.append(token.get("head", 0) + sent_start_i + i)
labels.append(token.get("dep", ""))
# Ensure ROOT label is case-insensitive
if labels[-1].lower() == "root":
labels[-1] = "ROOT"
ner.append(token.get("ner", "-"))
example.add_token_annotation(ids=ids, words=words, tags=tags,
heads=heads, deps=labels, entities=ner,
brackets=sent.get("brackets", []))
morphs.append(token.get("morph", {}))
if i == 0:
sent_starts.append(True)
else:
sent_starts.append(False)
if "brackets" in sent:
brackets.extend((b["first"] + sent_start_i,
b["last"] + sent_start_i, b["label"])
for b in sent["brackets"])
cats = {}
for cat in paragraph.get("cats", {}):
cats[cat["label"]] = cat["value"]
example.add_doc_annotation(cats=cats)
example.set_token_annotation(ids=ids, words=words, tags=tags,
heads=heads, deps=labels, entities=ner, morphs=morphs,
sent_starts=sent_starts, brackets=brackets)
example.set_doc_annotation(cats=cats)
yield example
@@ -652,15 +665,16 @@ def _consume_ent(tags):
cdef class TokenAnnotation:
def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphology=None, brackets=None):
def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphs=None, sent_starts=None, brackets=None):
self.ids = ids if ids else []
self.words = words if words else []
self.tags = tags if tags else []
self.heads = heads if heads else []
self.deps = deps if deps else []
self.entities = entities if entities else []
self.morphs = morphs if morphs else []
self.sent_starts = sent_starts if sent_starts else []
self.brackets = brackets if brackets else []
self.morphology = morphology if morphology else []
@classmethod
def from_dict(cls, token_dict):
@@ -670,7 +684,8 @@ cdef class TokenAnnotation:
heads=token_dict.get("heads", None),
deps=token_dict.get("deps", None),
entities=token_dict.get("entities", None),
morphology=token_dict.get("morphology", None),
morphs=token_dict.get("morphs", None),
sent_starts=token_dict.get("sent_starts", None),
brackets=token_dict.get("brackets", None))
def to_dict(self):
@@ -680,9 +695,34 @@ cdef class TokenAnnotation:
"heads": self.heads,
"deps": self.deps,
"entities": self.entities,
"morphology": self.morphology,
"morphs": self.morphs,
"sent_starts": self.sent_starts,
"brackets": self.brackets}
def get_id(self, i):
return self.ids[i] if i < len(self.ids) else i
def get_word(self, i):
return self.words[i] if i < len(self.words) else ""
def get_tag(self, i):
return self.tags[i] if i < len(self.tags) else "-"
def get_head(self, i):
return self.heads[i] if i < len(self.heads) else i
def get_dep(self, i):
return self.deps[i] if i < len(self.deps) else ""
def get_entity(self, i):
return self.entities[i] if i < len(self.entities) else "-"
def get_morph(self, i):
return self.morphs[i] if i < len(self.morphs) else set()
def get_sent_start(self, i):
return self.sent_starts[i] if i < len(self.sent_starts) else None
cdef class DocAnnotation:
def __init__(self, cats=None, links=None):
@@ -698,33 +738,33 @@ cdef class DocAnnotation:
cdef class Example:
def __init__(self, doc_annotation=None, token_annotations=None, doc=None,
def __init__(self, doc_annotation=None, token_annotation=None, doc=None,
goldparse=None):
""" Doc can either be text, or an actual Doc """
self.doc = doc
self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
self.token_annotations = token_annotations if token_annotations else []
self.token_annotation = token_annotation if token_annotation else TokenAnnotation()
self.goldparse = goldparse
@classmethod
def from_gold(cls, goldparse, doc=None):
doc_annotation = DocAnnotation(cats=goldparse.cats, links=goldparse.links)
token_annotation = goldparse.get_token_annotation()
return cls(doc_annotation, [token_annotation], doc)
return cls(doc_annotation, token_annotation, doc)
@classmethod
def from_dict(cls, example_dict, doc=None):
token_dicts = example_dict["token_annotations"]
token_annotations = [TokenAnnotation.from_dict(t) for t in token_dicts]
token_dict = example_dict["token_annotation"]
token_annotation = TokenAnnotation.from_dict(token_dict)
doc_dict = example_dict["doc_annotation"]
doc_annotation = DocAnnotation.from_dict(doc_dict)
return cls(doc_annotation, token_annotations, doc)
return cls(doc_annotation, token_annotation, doc)
def to_dict(self):
""" Note that this method does NOT export the doc, only the annotations ! """
token_dicts = [t.to_dict() for t in self.token_annotations]
token_dict = self.token_annotation.to_dict()
doc_dict = self.doc_annotation.to_dict()
return {"token_annotations": token_dicts, "doc_annotation": doc_dict}
return {"token_annotation": token_dict, "doc_annotation": doc_dict}
@property
def text(self):
@@ -737,96 +777,108 @@ cdef class Example:
@property
def gold(self):
if self.goldparse is None:
doc, gold = self.get_gold_parses(merge=True)[0]
doc, gold = self.get_gold_parses()[0]
self.goldparse = gold
return self.goldparse
def add_token_annotation(self, ids=None, words=None, tags=None, heads=None,
deps=None, entities=None, morphology=None, brackets=None):
t = TokenAnnotation(ids=ids, words=words, tags=tags,
def set_token_annotation(self, ids=None, words=None, tags=None, heads=None,
deps=None, entities=None, morphs=None,
sent_starts=None, brackets=None):
self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags,
heads=heads, deps=deps, entities=entities,
morphology=morphology, brackets=brackets)
self.token_annotations.append(t)
morphs=morphs, sent_starts=sent_starts,
brackets=brackets)
def add_doc_annotation(self, cats=None, links=None):
def set_doc_annotation(self, cats=None, links=None):
if cats:
self.doc_annotation.cats.update(cats)
self.doc_annotation.cats = cats
if links:
self.doc_annotation.links.update(links)
self.doc_annotation.links = links
def merge_sents(self):
""" Merge the list of token annotations into one object and return this new object """
m_example = Example(doc=self.doc, doc_annotation=self.doc_annotation)
m_ids, m_words, m_tags, m_heads, m_deps, m_ents, m_morph = [], [], [], [], [], [], []
m_brackets = []
i = 0
for t in self.token_annotations:
m_ids.extend(id_ + i for id_ in t.ids)
m_words.extend(t.words)
m_tags.extend(t.tags)
m_heads.extend(head + i if head is not None and head >= 0 else head_i + i for head_i, head in enumerate(t.heads))
m_deps.extend(t.deps)
m_ents.extend(t.entities)
m_morph.extend(t.morphology)
m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])
for b in t.brackets)
i += len(t.ids)
m_example.add_token_annotation(ids=m_ids, words=m_words, tags=m_tags,
heads=m_heads, deps=m_deps, entities=m_ents,
morphology=m_morph, brackets=m_brackets)
return m_example
def split_sents(self):
""" Split the token annotations into multiple Examples based on
sent_starts and return a list of the new Examples"""
s_example = Example(doc=None, doc_annotation=self.doc_annotation)
s_ids, s_words, s_tags, s_heads = [], [], [], []
s_deps, s_ents, s_morphs, s_sent_starts = [], [], [], []
s_brackets = []
sent_start_i = 0
t = self.token_annotation
split_examples = []
for i in range(len(t.words)):
if i > 0 and t.sent_starts[i] == True:
s_example.set_token_annotation(ids=s_ids,
words=s_words, tags=s_tags, heads=s_heads, deps=s_deps,
entities=s_ents, morphs=s_morphs,
sent_starts=s_sent_starts, brackets=s_brackets)
split_examples.append(s_example)
s_example = Example(doc=None, doc_annotation=self.doc_annotation)
s_ids, s_words, s_tags, s_heads = [], [], [], []
s_deps, s_ents, s_morphs, s_sent_starts = [], [], [], []
s_brackets = []
sent_start_i = i
s_ids.append(t.get_id(i))
s_words.append(t.get_word(i))
s_tags.append(t.get_tag(i))
s_heads.append(t.get_head(i) - sent_start_i)
s_deps.append(t.get_dep(i))
s_ents.append(t.get_entity(i))
s_morphs.append(t.get_morph(i))
s_sent_starts.append(t.get_sent_start(i))
s_brackets.extend((b[0] - sent_start_i,
b[1] - sent_start_i, b[2])
for b in t.brackets if b[0] == i)
i += 1
s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags,
heads=s_heads, deps=s_deps, entities=s_ents,
morphs=s_morphs, sent_starts=s_sent_starts,
brackets=s_brackets)
split_examples.append(s_example)
return split_examples
def get_gold_parses(self, merge=False, vocab=None, make_projective=False,
def get_gold_parses(self, merge=True, vocab=None, make_projective=False,
ignore_misaligned=False):
"""Return a list of (doc, GoldParse) objects.
If merge is set to True, add all Token annotations to one big list."""
If merge is set to True, keep all Token annotations as one big list."""
d = self.doc_annotation
# merging different sentences
# merge == do not modify Example
if merge:
merged_example = self.merge_sents()
assert(len(merged_example.token_annotations)) == 1
t = merged_example.token_annotations[0]
m_doc = merged_example.doc
if not m_doc:
t = self.token_annotation
doc = self.doc
if not self.doc:
if not vocab:
raise ValueError(Errors.E998)
m_doc = Doc(vocab, words=t.words)
doc = Doc(vocab, words=t.words)
try:
gp = GoldParse.from_annotation(m_doc, d, t, make_projective=make_projective)
gp = GoldParse.from_annotation(doc, d, t,
make_projective=make_projective)
except AlignmentError:
if ignore_misaligned:
gp = None
else:
raise
return [(self.doc, gp)]
# we only have one sentence and an appropriate doc
elif len(self.token_annotations) == 1 and isinstance(self.doc, Doc):
t = self.token_annotations[0]
try:
gp = GoldParse.from_annotation(self.doc, d, t, make_projective=make_projective)
except AlignmentError:
if ignore_misaligned:
gp = None
else:
raise
return [(self.doc, gp)]
# not merging: one GoldParse per 'sentence', defining docs with the words from each sentence
return [(doc, gp)]
# not merging: one GoldParse per sentence, defining docs with the words
# from each sentence
else:
parses = []
for t in self.token_annotations:
split_examples = self.split_sents()
for split_example in split_examples:
if not vocab:
raise ValueError(Errors.E998)
t_doc = Doc(vocab, words=t.words)
split_doc = Doc(vocab, words=split_example.token_annotation.words)
try:
gp = GoldParse.from_annotation(t_doc, d, t, make_projective=make_projective)
gp = GoldParse.from_annotation(split_doc, d,
split_example.token_annotation,
make_projective=make_projective)
except AlignmentError:
if ignore_misaligned:
gp = None
else:
raise
if gp is not None:
parses.append((t_doc, gp))
parses.append((split_doc, gp))
return parses
@classmethod
@@ -881,9 +933,14 @@ cdef class GoldParse:
"""
@classmethod
def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
return cls(doc, words=token_annotation.words, tags=token_annotation.tags,
heads=token_annotation.heads, deps=token_annotation.deps, entities=token_annotation.entities,
morphology=token_annotation.morphology, cats=doc_annotation.cats, links=doc_annotation.links,
return cls(doc, words=token_annotation.words,
tags=token_annotation.tags,
heads=token_annotation.heads,
deps=token_annotation.deps,
entities=token_annotation.entities,
morphs=token_annotation.morphs,
cats=doc_annotation.cats,
links=doc_annotation.links,
make_projective=make_projective)
def get_token_annotation(self):
@@ -893,9 +950,9 @@ cdef class GoldParse:
return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
heads=self.heads, deps=self.labels, entities=self.ner,
morphology=self.morphology)
morphs=self.morphs)
def __init__(self, doc, words=None, tags=None, morphology=None,
def __init__(self, doc, words=None, tags=None, morphs=None,
heads=None, deps=None, entities=None, make_projective=False,
cats=None, links=None):
"""Create a GoldParse. The fields will not be initialized if len(doc) is zero.
@@ -944,8 +1001,8 @@ cdef class GoldParse:
heads = [None for _ in words]
if not deps:
deps = [None for _ in words]
if not morphology:
morphology = [None for _ in words]
if not morphs:
morphs = [None for _ in words]
if entities is None:
entities = ["-" for _ in words]
elif len(entities) == 0:
@@ -971,7 +1028,7 @@ cdef class GoldParse:
self.heads = [None] * len(doc)
self.labels = [None] * len(doc)
self.ner = [None] * len(doc)
self.morphology = [None] * len(doc)
self.morphs = [None] * len(doc)
# This needs to be done before we align the words
if make_projective and heads is not None and deps is not None:
@@ -990,7 +1047,7 @@ cdef class GoldParse:
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
self.orig = TokenAnnotation(ids=list(range(len(words))), words=words, tags=tags,
heads=heads, deps=deps, entities=entities, morphology=morphology,
heads=heads, deps=deps, entities=entities, morphs=morphs,
brackets=[])
for i, gold_i in enumerate(self.cand_to_gold):
@@ -1000,12 +1057,12 @@ cdef class GoldParse:
self.heads[i] = None
self.labels[i] = None
self.ner[i] = None
self.morphology[i] = set()
self.morphs[i] = set()
if gold_i is None:
if i in i2j_multi:
self.words[i] = words[i2j_multi[i]]
self.tags[i] = tags[i2j_multi[i]]
self.morphology[i] = morphology[i2j_multi[i]]
self.morphs[i] = morphs[i2j_multi[i]]
is_last = i2j_multi[i] != i2j_multi.get(i+1)
is_first = i2j_multi[i] != i2j_multi.get(i-1)
# Set next word in multi-token span as head, until last
@@ -1044,7 +1101,7 @@ cdef class GoldParse:
else:
self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i]
self.morphology[i] = morphology[gold_i]
self.morphs[i] = morphs[gold_i]
if heads[gold_i] is None:
self.heads[i] = None
else:

View File

@@ -574,9 +574,8 @@ class Language(object):
# Populate vocab
else:
for example in get_examples():
for token_annotation in example.token_annotations:
for word in token_annotation.words:
_ = self.vocab[word] # noqa: F841
for word in example.token_annotation.words:
_ = self.vocab[word] # noqa: F841
if cfg.get("device", -1) >= 0:
util.use_gpu(cfg["device"])

View File

@@ -565,12 +565,11 @@ class Tagger(Pipe):
orig_tag_map = dict(self.vocab.morphology.tag_map)
new_tag_map = OrderedDict()
for example in get_examples():
for token_annotation in example.token_annotations:
for tag in token_annotation.tags:
if tag in orig_tag_map:
new_tag_map[tag] = orig_tag_map[tag]
else:
new_tag_map[tag] = {POS: X}
for tag in example.token_annotation.tags:
if tag in orig_tag_map:
new_tag_map[tag] = orig_tag_map[tag]
else:
new_tag_map[tag] = {POS: X}
cdef Vocab vocab = self.vocab
if new_tag_map:
vocab.morphology = Morphology(vocab.strings, new_tag_map,
@@ -750,11 +749,10 @@ class MultitaskObjective(Tagger):
gold_examples = nonproj.preprocess_training_data(get_examples())
# for raw_text, doc_annot in gold_tuples:
for example in gold_examples:
for token_annotation in example.token_annotations:
for i in range(len(token_annotation.ids)):
label = self.make_label(i, token_annotation)
if label is not None and label not in self.labels:
self.labels[label] = len(self.labels)
for i in range(len(example.token_annotation.ids)):
label = self.make_label(i, example.token_annotation)
if label is not None and label not in self.labels:
self.labels[label] = len(self.labels)
if self.model is True:
token_vector_width = util.env_opt("token_vector_width")
self.model = self.Model(len(self.labels), tok2vec=tok2vec)

View File

@@ -237,7 +237,7 @@ class Scorer(object):
if len(doc) != len(gold):
doc_annotation = DocAnnotation(cats=gold.cats)
token_annotation = gold.orig
gold = GoldParse.from_annotation(doc, doc_annotation, [token_annotation])
gold = GoldParse.from_annotation(doc, doc_annotation, token_annotation)
orig = gold.orig
gold_deps = set()
gold_deps_per_dep = {}

View File

@@ -342,19 +342,19 @@ cdef class ArcEager(TransitionSystem):
actions[RIGHT][label] = 1
actions[REDUCE][label] = 1
for example in kwargs.get('gold_parses', []):
for token_annotation in example.token_annotations:
heads, labels = nonproj.projectivize(token_annotation.heads, token_annotation.deps)
for child, head, label in zip(token_annotation.ids, heads, labels):
if label.upper() == 'ROOT' :
label = 'ROOT'
if head == child:
actions[BREAK][label] += 1
elif head < child:
actions[RIGHT][label] += 1
actions[REDUCE][''] += 1
elif head > child:
actions[LEFT][label] += 1
actions[SHIFT][''] += 1
heads, labels = nonproj.projectivize(example.token_annotation.heads,
example.token_annotation.deps)
for child, head, label in zip(example.token_annotation.ids, heads, labels):
if label.upper() == 'ROOT' :
label = 'ROOT'
if head == child:
actions[BREAK][label] += 1
elif head < child:
actions[RIGHT][label] += 1
actions[REDUCE][''] += 1
elif head > child:
actions[LEFT][label] += 1
actions[SHIFT][''] += 1
if min_freq is not None:
for action, label_freqs in actions.items():
for label, freq in list(label_freqs.items()):

View File

@@ -73,12 +73,11 @@ cdef class BiluoPushDown(TransitionSystem):
actions[action][entity_type] = 1
moves = ('M', 'B', 'I', 'L', 'U')
for example in kwargs.get('gold_parses', []):
for token_annotation in example.token_annotations:
for i, ner_tag in enumerate(token_annotation.entities):
if ner_tag != 'O' and ner_tag != '-':
_, label = ner_tag.split('-', 1)
for action in (BEGIN, IN, LAST, UNIT):
actions[action][label] += 1
for i, ner_tag in enumerate(example.token_annotation.entities):
if ner_tag != 'O' and ner_tag != '-':
_, label = ner_tag.split('-', 1)
for action in (BEGIN, IN, LAST, UNIT):
actions[action][label] += 1
return actions
@property

View File

@@ -81,15 +81,15 @@ def is_decorated(label):
def count_decorated_labels(gold_data):
freqs = {}
for example in gold_data:
for token_annotation in example.token_annotations:
proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps)
# set the label to ROOT for each root dependent
deco_deps = ['ROOT' if head == i else deco_deps[i]
for i, head in enumerate(proj_heads)]
# count label frequencies
for label in deco_deps:
if is_decorated(label):
freqs[label] = freqs.get(label, 0) + 1
proj_heads, deco_deps = projectivize(example.token_annotation.heads,
example.token_annotation.deps)
# set the label to ROOT for each root dependent
deco_deps = ['ROOT' if head == i else deco_deps[i]
for i, head in enumerate(proj_heads)]
# count label frequencies
for label in deco_deps:
if is_decorated(label):
freqs[label] = freqs.get(label, 0) + 1
return freqs
@@ -98,21 +98,20 @@ def preprocess_training_data(gold_data, label_freq_cutoff=30):
freqs = {}
for example in gold_data:
new_example = Example(doc=example.doc)
for token_annotation in example.token_annotations:
proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps)
# set the label to ROOT for each root dependent
deco_deps = ['ROOT' if head == i else deco_deps[i]
for i, head in enumerate(proj_heads)]
# count label frequencies
if label_freq_cutoff > 0:
for label in deco_deps:
if is_decorated(label):
freqs[label] = freqs.get(label, 0) + 1
# TODO: the code would be less ugly when changing heads and deps in-place, but is this OK upstream ?
proj_token_dict = token_annotation.to_dict()
proj_token_dict["heads"] = proj_heads
proj_token_dict["deps"] = deco_deps
new_example.add_token_annotation(**proj_token_dict)
proj_heads, deco_deps = projectivize(example.token_annotation.heads,
example.token_annotation.deps)
# set the label to ROOT for each root dependent
deco_deps = ['ROOT' if head == i else deco_deps[i]
for i, head in enumerate(proj_heads)]
# count label frequencies
if label_freq_cutoff > 0:
for label in deco_deps:
if is_decorated(label):
freqs[label] = freqs.get(label, 0) + 1
proj_token_dict = example.token_annotation.to_dict()
proj_token_dict["heads"] = proj_heads
proj_token_dict["deps"] = deco_deps
new_example.set_token_annotation(**proj_token_dict)
preprocessed.append(new_example)
if label_freq_cutoff > 0:
return _filter_labels(preprocessed, label_freq_cutoff, freqs)
@@ -213,15 +212,14 @@ def _filter_labels(examples, cutoff, freqs):
filtered = []
for example in examples:
new_example = Example(doc=example.doc)
for token_annotation in example.token_annotations:
filtered_labels = []
for label in token_annotation.deps:
if is_decorated(label) and freqs.get(label, 0) < cutoff:
filtered_labels.append(decompose(label)[0])
else:
filtered_labels.append(label)
filtered_token_dict = token_annotation.to_dict()
filtered_token_dict["deps"] = filtered_labels
new_example.add_token_annotation(**filtered_token_dict)
filtered_labels = []
for label in example.token_annotation.deps:
if is_decorated(label) and freqs.get(label, 0) < cutoff:
filtered_labels.append(decompose(label)[0])
else:
filtered_labels.append(label)
filtered_token_dict = example.token_annotation.to_dict()
filtered_token_dict["deps"] = filtered_labels
new_example.set_token_annotation(**filtered_token_dict)
filtered.append(new_example)
return filtered

View File

@@ -273,7 +273,7 @@ def test_issue1963(en_tokenizer):
def test_issue1967(label):
ner = EntityRecognizer(Vocab())
example = Example(doc=None)
example.add_token_annotation(ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label])
example.set_token_annotation(ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label])
ner.moves.get_actions(gold_parses=[example])

View File

@@ -36,6 +36,16 @@ def doc():
return doc
@pytest.fixture()
def merged_dict():
return {
"ids": [1, 2, 3, 4, 5, 6, 7],
"words": ["Hi", "there", "everyone", "It", "is", "just", "me"],
"tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"],
"sent_starts": [1, 0, 0, 1, 0, 0, 0, 0],
}
def test_gold_biluo_U(en_vocab):
words = ["I", "flew", "to", "London", "."]
spaces = [True, True, True, False, True]
@@ -231,7 +241,7 @@ def test_ignore_misaligned(doc):
deps = [t.dep_ for t in doc]
heads = [t.head.i for t in doc]
use_new_align = spacy.gold.USE_NEW_ALIGN
saved_use_new_align = spacy.gold.USE_NEW_ALIGN
spacy.gold.USE_NEW_ALIGN = False
with make_tempdir() as tmpdir:
@@ -270,7 +280,25 @@ def test_ignore_misaligned(doc):
ignore_misaligned=True))
assert len(train_reloaded_example) == 0
spacy.gold.USE_NEW_ALIGN = use_new_align
spacy.gold.USE_NEW_ALIGN = saved_use_new_align
def test_make_orth_variants(doc):
nlp = English()
text = doc.text
deps = [t.dep_ for t in doc]
heads = [t.head.i for t in doc]
with make_tempdir() as tmpdir:
jsonl_file = tmpdir / "test.jsonl"
# write to JSONL train dicts
srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
# due to randomness, test only that this runs with no errors for now
train_reloaded_example = next(goldcorpus.train_dataset(nlp,
orth_variant_level=0.2))
train_goldparse = train_reloaded_example.gold
# xfail while we have backwards-compatible alignment
@@ -386,71 +414,38 @@ def _train(train_data):
nlp.update(batch, sgd=optimizer, losses=losses)
tokens_1 = {
"ids": [1, 2, 3],
"words": ["Hi", "there", "everyone"],
"tags": ["INTJ", "ADV", "PRON"],
}
tokens_2 = {
"ids": [1, 2, 3, 4],
"words": ["It", "is", "just", "me"],
"tags": ["PRON", "AUX", "ADV", "PRON"],
}
text0 = "Hi there everyone It is just me"
def test_merge_sents():
def test_split_sents(merged_dict):
nlp = English()
example = Example()
example.add_token_annotation(**tokens_1)
example.add_token_annotation(**tokens_2)
example.set_token_annotation(**merged_dict)
assert len(example.get_gold_parses(merge=False, vocab=nlp.vocab)) == 2
assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1 # this shouldn't change the original object
assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1
merged_example = example.merge_sents()
split_examples = example.split_sents()
assert len(split_examples) == 2
token_annotation_1 = example.token_annotations[0]
token_annotation_1 = split_examples[0].token_annotation
assert token_annotation_1.ids == [1, 2, 3]
assert token_annotation_1.words == ["Hi", "there", "everyone"]
assert token_annotation_1.tags == ["INTJ", "ADV", "PRON"]
assert token_annotation_1.sent_starts == [1, 0, 0]
token_annotation_m = merged_example.token_annotations[0]
assert token_annotation_m.ids == [1, 2, 3, 4, 5, 6, 7]
assert token_annotation_m.words == ["Hi", "there", "everyone", "It", "is", "just", "me"]
assert token_annotation_m.tags == ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"]
token_annotation_2 = split_examples[1].token_annotation
assert token_annotation_2.ids == [4, 5, 6, 7]
assert token_annotation_2.words == ["It", "is", "just", "me"]
assert token_annotation_2.tags == ["PRON", "AUX", "ADV", "PRON"]
assert token_annotation_2.sent_starts == [1, 0, 0, 0]
def test_tuples_to_example():
def test_tuples_to_example(merged_dict):
ex = Example()
ex.add_token_annotation(**tokens_1)
ex.add_token_annotation(**tokens_2)
ex.add_doc_annotation(cats={"TRAVEL": 1.0, "BAKING": 0.0})
ex.set_token_annotation(**merged_dict)
cats = {"TRAVEL": 1.0, "BAKING": 0.0}
ex.set_doc_annotation(cats=cats)
ex_dict = ex.to_dict()
token_dicts = [
{
"ids": [1, 2, 3],
"words": ["Hi", "there", "everyone"],
"tags": ["INTJ", "ADV", "PRON"],
"heads": [],
"deps": [],
"entities": [],
"morphology": [],
"brackets": [],
},
{
"ids": [1, 2, 3, 4],
"words": ["It", "is", "just", "me"],
"tags": ["PRON", "AUX", "ADV", "PRON"],
"heads": [],
"deps": [],
"entities": [],
"morphology": [],
"brackets": [],
},
]
doc_dict = {"cats": {"TRAVEL": 1.0, "BAKING": 0.0}, "links": {}}
assert ex_dict == {"token_annotations": token_dicts, "doc_annotation": doc_dict}
assert ex_dict["token_annotation"]["ids"] == merged_dict["ids"]
assert ex_dict["token_annotation"]["words"] == merged_dict["words"]
assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"]
assert ex_dict["token_annotation"]["sent_starts"] == merged_dict["sent_starts"]
assert ex_dict["doc_annotation"]["cats"] == cats

View File

@@ -86,7 +86,7 @@ def test_ner_per_type(en_vocab):
ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
)
ex = Example(doc=doc)
ex.add_token_annotation(entities=annot["entities"])
ex.set_token_annotation(entities=annot["entities"])
scorer.score(ex)
results = scorer.scores
@@ -107,7 +107,7 @@ def test_ner_per_type(en_vocab):
ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
)
ex = Example(doc=doc)
ex.add_token_annotation(entities=annot["entities"])
ex.set_token_annotation(entities=annot["entities"])
scorer.score(ex)
results = scorer.scores