Restructure Example with merged sents as default (#4632)

* Switch to train_dataset() function in train CLI

* Fixes for pipe() methods in pipeline components

* Don't clobber `examples` variable with `as_example` in pipe() methods
* Remove unnecessary traversals of `examples`

* Update Parser.pipe() for Examples

* Add `as_examples` kwarg to `pipe()`, implemented to return `Example`s

* Accept `Doc` or `Example` in `pipe()` with `_get_doc()` (copied from
`Pipe`); see the sketch below
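
A self-contained sketch of the `pipe()` contract described above (stand-in
types, not spaCy's actual `Parser`/`Pipe` implementation; the exact wrapping
behavior for plain `Doc` inputs is an assumption here):

```python
class Doc:
    def __init__(self, text):
        self.text = text


class Example:
    def __init__(self, doc):
        self.doc = doc


def _get_doc(example_or_doc):
    """Accept either a Doc or an Example and return the underlying Doc."""
    if isinstance(example_or_doc, Example):
        return example_or_doc.doc
    return example_or_doc


def pipe(stream, as_example=False):
    # Note: the incoming items are never reassigned, so the stream of
    # `examples` is not clobbered while processing.
    for item in stream:
        doc = _get_doc(item)
        # ... a real component would predict and set annotations on doc ...
        if as_example and isinstance(item, Example):
            yield item  # hand back the Example; annotations live on its doc
        else:
            yield doc


results = list(pipe([Doc("a"), Example(Doc("b"))], as_example=True))
```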

* Fixes to Example implementation in spacy.gold

* Move `make_projective` from an attribute of Example to an argument of
`Example.get_gold_parses()`

* Heads of 0 are not treated as unset

* Unset heads are set to self rather than `None` (which causes problems
while projectivizing)

* Check for `Doc` (not just not `None`) when creating GoldParses for
pre-merged example

* Don't clobber `examples` variable in `iter_gold_docs()`

* Add/modify gold tests for handling projectivity

* In JSON roundtrip compare results from `dev_dataset` rather than
`train_dataset` to avoid projectivization (and other potential
modifications)

* Add test for projective train vs. nonprojective dev versions of the
same `Doc`

* Handle ignore_misaligned as arg rather than attr

Move `ignore_misaligned` from an attribute of `Example` to an argument
to `Example.get_gold_parses()`, which makes it parallel to
`make_projective`.

Add a test with both the old and new alignment that checks whether
`ignore_misaligned` errors are raised as expected (only for the new alignment).
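
As a usage sketch, the new per-call arguments look like this (signature as in
`Example.get_gold_parses()` in the diff below; this assumes a spaCy checkout
that includes this commit):

```python
from spacy.gold import Example
from spacy.lang.en import English

nlp = English()
example = Example(doc=None)
example.set_token_annotation(ids=[0, 1], words=["Hello", "world"],
                             tags=["UH", "NN"], heads=[1, 1],
                             deps=["intj", "ROOT"], entities=["O", "O"])
# make_projective and ignore_misaligned are now per-call arguments,
# not attributes stored on the Example:
parses = example.get_gold_parses(vocab=nlp.vocab, make_projective=True,
                                 ignore_misaligned=True)
```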

* Remove unused attrs from gold.pxd

Remove `ignore_misaligned` and `make_projective` from `gold.pxd`

* Restructure Example with merged sents as default

An `Example` now holds a single `TokenAnnotation` that contains all the
information from one `Doc` (= one JSON `paragraph`). If required, the
individual sentences can be recovered as a list of examples with
`Example.split_sents()`, though with no raw text available; see the usage
sketch after this list of changes.

* Input/output a single `Example.token_annotation`

* Add `sent_starts` to `TokenAnnotation` to handle sentence boundaries

* Replace `Example.merge_sents()` with `Example.split_sents()`

* Modify components to use a single `Example.token_annotation`

  * Pipeline components
  * conllu2json converter

* Rework/rename `add_token_annotation()` and `add_doc_annotation()` to
`set_token_annotation()` and `set_doc_annotation()`, functions that set the
annotation rather than appending/extending.

* Rename `morphology` to `morphs` in `TokenAnnotation` and `GoldParse`

* Add getters to `TokenAnnotation` to supply default values when a given
attribute is not available

* In `spacy.gold._make_golds()`, `Example.get_gold_parses()` is only applied
to single examples, so the resulting `GoldParse` is stored in the provided
`Example` rather than in a newly created `Example` with no other internal
annotation

* Update tests for API changes and `merge_sents()` vs. `split_sents()`
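
A short usage sketch of the restructured `Example` (mirroring
`test_split_sents` in the diff below; the getter defaults are the ones defined
on `TokenAnnotation` in `spacy/gold.pyx`, and the snippet assumes a checkout
with this commit):

```python
from spacy.gold import Example

example = Example()
example.set_token_annotation(
    ids=[1, 2, 3, 4, 5, 6, 7],
    words=["Hi", "there", "everyone", "It", "is", "just", "me"],
    tags=["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"],
    sent_starts=[1, 0, 0, 1, 0, 0, 0],
)
first, second = example.split_sents()
assert first.token_annotation.words == ["Hi", "there", "everyone"]
assert second.token_annotation.ids == [4, 5, 6, 7]
# Getters fall back to defaults for attributes that were never set:
assert first.token_annotation.get_tag(99) == "-"
assert first.token_annotation.get_morph(0) == set()
```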

* Refer to Example.goldparse in iter_gold_docs()

Use `Example.goldparse` in `iter_gold_docs()` instead of `Example.gold`,
because a `None` `GoldParse` is expected when `ignore_misaligned` is set, and
generating it on the fly can raise an unwanted `AlignmentError`.
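
To illustrate the distinction, a self-contained sketch of the pattern
(stand-in classes, not spaCy's actual implementation):

```python
class AlignmentError(ValueError):
    pass


class ExampleSketch:
    """Stand-in showing why iter_gold_docs() reads the stored attribute."""

    def __init__(self, goldparse=None):
        # May legitimately be None when ignore_misaligned was used.
        self.goldparse = goldparse

    @property
    def gold(self):
        # Recomputed on the fly; can raise for misaligned tokenization.
        if self.goldparse is None:
            raise AlignmentError("tokens do not align")
        return self.goldparse


ex = ExampleSketch(goldparse=None)
assert ex.goldparse is None  # the safe check used in iter_gold_docs()
```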

* Fix make_orth_variants()

Fix a bug in `make_orth_variants()` related to the conversion from multiple
`TokenAnnotation`s to a single one per `Example`.

* Add basic test for make_orth_variants()

* Replace try/except with conditionals

* Replace default morph value with set
Author: adrianeboyd
Committed by: Matthew Honnibal
Date: 2019-11-25 16:03:28 +01:00
Parent: 44829950ba
Commit: 392c4880d9
12 changed files with 376 additions and 330 deletions

File: spacy/cli/converters/conllu2json.py

@@ -24,17 +24,16 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
     checked_for_ner = False
     has_ner_tags = False
     for i, example in enumerate(conll_data):
-        for token_annotation in example.token_annotations:
-            if not checked_for_ner:
-                has_ner_tags = is_ner(token_annotation.entities[0])
-                checked_for_ner = True
-            sentences.append(generate_sentence(token_annotation, has_ner_tags))
-            # Real-sized documents could be extracted using the comments on the
-            # conluu document
-            if len(sentences) % n_sents == 0:
-                doc = create_doc(sentences, i)
-                docs.append(doc)
-                sentences = []
+        if not checked_for_ner:
+            has_ner_tags = is_ner(example.token_annotation.entities[0])
+            checked_for_ner = True
+        sentences.append(generate_sentence(example.token_annotation, has_ner_tags))
+        # Real-sized documents could be extracted using the comments on the
+        # conllu document
+        if len(sentences) % n_sents == 0:
+            doc = create_doc(sentences, i)
+            docs.append(doc)
+            sentences = []
     return docs
@@ -84,7 +83,7 @@ def read_conllx(input_data, use_morphology=False, n=0):
                 print(line)
                 raise
             example = Example(doc=None)
-            example.add_token_annotation(ids=ids, words=words, tags=tags,
+            example.set_token_annotation(ids=ids, words=words, tags=tags,
                                          heads=heads, deps=deps, entities=ents)
             yield example
             i += 1

File: spacy/gold.pxd

@@ -25,7 +25,7 @@ cdef class GoldParse:
     cdef public int loss
     cdef public list words
    cdef public list tags
-    cdef public list morphology
+    cdef public list morphs
     cdef public list heads
     cdef public list labels
     cdef public dict orths
@@ -45,7 +45,8 @@ cdef class TokenAnnotation:
     cdef public list heads
     cdef public list deps
     cdef public list entities
-    cdef public list morphology
+    cdef public list morphs
+    cdef public list sent_starts
     cdef public list brackets
@@ -56,7 +57,7 @@ cdef class DocAnnotation:
 cdef class Example:
     cdef public object doc
-    cdef public list token_annotations
+    cdef public TokenAnnotation token_annotation
     cdef public DocAnnotation doc_annotation
     cdef public object goldparse

File: spacy/gold.pyx

@@ -215,7 +215,7 @@ class GoldCorpus(object):
             ex_dict = example.to_dict()
             text = example.text
             srsly.write_msgpack(directory / "{}.msg".format(i), (text, ex_dict))
-            n += len(example.token_annotations)
+            n += 1
             if limit and n >= limit:
                 break
@@ -271,7 +271,7 @@ class GoldCorpus(object):
                 raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported))
             for example in examples:
                 yield example
-                i += len(example.token_annotations)
+                i += 1
                 if limit and i >= limit:
                     return
@@ -286,15 +286,14 @@ class GoldCorpus(object):
         yield from self.read_examples(locs, limit=self.limit)

     def count_train(self):
-        # TODO: should this count words or sentences ?
+        """Returns count of words in train examples"""
         n = 0
         i = 0
         for example in self.train_examples:
-            for token_annotation in example.token_annotations:
-                n += len(token_annotation.words)
+            n += len(example.token_annotation.words)
             if self.limit and i >= self.limit:
                 break
             i += 1
         return n

     def train_dataset(self, nlp, gold_preproc=False, max_length=None,
@@ -328,18 +327,27 @@ class GoldCorpus(object):
     def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None,
                        noise_level=0.0, orth_variant_level=0.0,
                        make_projective=False, ignore_misaligned=False):
-        """ Setting gold_preproc will result in creating a doc per 'sentence' """
+        """ Setting gold_preproc will result in creating a doc per sentence """
         for example in examples:
             if gold_preproc:
                 example.doc = None
+                split_examples = example.split_sents()
+                example_golds = []
+                for split_example in split_examples:
+                    split_example_docs = cls._make_docs(nlp, split_example,
+                            gold_preproc, noise_level=noise_level,
+                            orth_variant_level=orth_variant_level)
+                    split_example_golds = cls._make_golds(split_example_docs,
+                            vocab=nlp.vocab, make_projective=make_projective,
+                            ignore_misaligned=ignore_misaligned)
+                    example_golds.extend(split_example_golds)
             else:
-                example = example.merge_sents()
-            example_docs = cls._make_docs(nlp, example,
-                gold_preproc, noise_level=noise_level,
-                orth_variant_level=orth_variant_level)
-            example_golds = cls._make_golds(example_docs, vocab=nlp.vocab,
-                make_projective=make_projective,
-                ignore_misaligned=ignore_misaligned)
+                example_docs = cls._make_docs(nlp, example,
+                        gold_preproc, noise_level=noise_level,
+                        orth_variant_level=orth_variant_level)
+                example_golds = cls._make_golds(example_docs, vocab=nlp.vocab,
+                        make_projective=make_projective,
+                        ignore_misaligned=ignore_misaligned)
             for ex in example_golds:
                 if ex.goldparse is not None:
                     if (not max_length) or len(ex.doc) < max_length:
@@ -353,35 +361,28 @@ class GoldCorpus(object):
             var_text = add_noise(var_example.text, noise_level)
             var_doc = nlp.make_doc(var_text)
             var_example.doc = var_doc
+            return [var_example]
         else:
-            doc_examples = []
-            for token_annotation in var_example.token_annotations:
-                t_doc = Doc(nlp.vocab, words=add_noise(token_annotation.words, noise_level))
-                doc_example = Example(doc_annotation=example.doc_annotation,
-                                      token_annotations=[token_annotation],
-                                      doc=t_doc)
-                doc_examples.append(doc_example)
-            return doc_examples
+            var_doc = Doc(nlp.vocab, words=add_noise(var_example.token_annotation.words, noise_level))
+            var_example.doc = var_doc
+            return [var_example]

     @classmethod
     def _make_golds(cls, examples, vocab=None, make_projective=False,
                     ignore_misaligned=False):
-        gold_examples = []
         for example in examples:
             gold_parses = example.get_gold_parses(vocab=vocab,
                     make_projective=make_projective,
                     ignore_misaligned=ignore_misaligned)
-            for (doc, gold) in gold_parses:
-                ex = Example(doc=doc)
-                ex.goldparse = gold
-                gold_examples.append(ex)
-        return gold_examples
+            assert len(gold_parses) == 1
+            assert gold_parses[0][0] == example.doc
+            example.goldparse = gold_parses[0][1]
+        return examples


 def make_orth_variants(nlp, example, orth_variant_level=0.0):
     if random.random() >= orth_variant_level:
         return example
-    if not example.token_annotations:
+    if not example.token_annotation:
         return example
     raw = example.text
     if random.random() >= 0.5:
@@ -392,46 +393,46 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
     ndpv = nlp.Defaults.paired_orth_variants
     # modify words in paragraph_tuples
     variant_example = Example(doc=raw)
-    for token_annotation in example.token_annotations:
-        words = token_annotation.words
-        tags = token_annotation.tags
-        if not words or not tags:
-            # add the unmodified annotation
-            token_dict = token_annotation.to_dict()
-            variant_example.add_token_annotation(**token_dict)
-        else:
-            if lower:
-                words = [w.lower() for w in words]
-            # single variants
-            punct_choices = [random.choice(x["variants"]) for x in ndsv]
-            for word_idx in range(len(words)):
-                for punct_idx in range(len(ndsv)):
-                    if tags[word_idx] in ndsv[punct_idx]["tags"] \
-                            and words[word_idx] in ndsv[punct_idx]["variants"]:
-                        words[word_idx] = punct_choices[punct_idx]
-            # paired variants
-            punct_choices = [random.choice(x["variants"]) for x in ndpv]
-            for word_idx in range(len(words)):
-                for punct_idx in range(len(ndpv)):
-                    if tags[word_idx] in ndpv[punct_idx]["tags"] \
-                            and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
-                        # backup option: random left vs. right from pair
-                        pair_idx = random.choice([0, 1])
-                        # best option: rely on paired POS tags like `` / ''
-                        if len(ndpv[punct_idx]["tags"]) == 2:
-                            pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
-                        # next best option: rely on position in variants
-                        # (may not be unambiguous, so order of variants matters)
-                        else:
-                            for pair in ndpv[punct_idx]["variants"]:
-                                if words[word_idx] in pair:
-                                    pair_idx = pair.index(words[word_idx])
-                        words[word_idx] = punct_choices[punct_idx][pair_idx]
-            token_dict = token_annotation.to_dict()
-            token_dict["words"] = words
-            token_dict["tags"] = tags
-            variant_example.add_token_annotation(**token_dict)
+    token_annotation = example.token_annotation
+    words = token_annotation.words
+    tags = token_annotation.tags
+    if not words or not tags:
+        # add the unmodified annotation
+        token_dict = token_annotation.to_dict()
+        variant_example.set_token_annotation(**token_dict)
+    else:
+        if lower:
+            words = [w.lower() for w in words]
+        # single variants
+        punct_choices = [random.choice(x["variants"]) for x in ndsv]
+        for word_idx in range(len(words)):
+            for punct_idx in range(len(ndsv)):
+                if tags[word_idx] in ndsv[punct_idx]["tags"] \
+                        and words[word_idx] in ndsv[punct_idx]["variants"]:
+                    words[word_idx] = punct_choices[punct_idx]
+        # paired variants
+        punct_choices = [random.choice(x["variants"]) for x in ndpv]
+        for word_idx in range(len(words)):
+            for punct_idx in range(len(ndpv)):
+                if tags[word_idx] in ndpv[punct_idx]["tags"] \
+                        and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
+                    # backup option: random left vs. right from pair
+                    pair_idx = random.choice([0, 1])
+                    # best option: rely on paired POS tags like `` / ''
+                    if len(ndpv[punct_idx]["tags"]) == 2:
+                        pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
+                    # next best option: rely on position in variants
+                    # (may not be unambiguous, so order of variants matters)
+                    else:
+                        for pair in ndpv[punct_idx]["variants"]:
+                            if words[word_idx] in pair:
+                                pair_idx = pair.index(words[word_idx])
+                    words[word_idx] = punct_choices[punct_idx][pair_idx]
+        token_dict = token_annotation.to_dict()
+        token_dict["words"] = words
+        token_dict["tags"] = tags
+        variant_example.set_token_annotation(**token_dict)
     # modify raw to match variant_paragraph_tuples
     if raw is not None:
         variants = []
@@ -449,30 +450,29 @@ def make_orth_variants(nlp, example, orth_variant_level=0.0):
         while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
             variant_raw += raw[raw_idx]
             raw_idx += 1
-        for token_annotation in variant_example.token_annotations:
-            for word in token_annotation.words:
-                match_found = False
-                # add identical word
-                if word not in variants and raw[raw_idx:].startswith(word):
-                    variant_raw += word
-                    raw_idx += len(word)
-                    match_found = True
-                # add variant word
-                else:
-                    for variant in variants:
-                        if not match_found and \
-                                raw[raw_idx:].startswith(variant):
-                            raw_idx += len(variant)
-                            variant_raw += word
-                            match_found = True
-                # something went wrong, abort
-                # (add a warning message?)
-                if not match_found:
-                    return example
-                # add following whitespace
-                while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
-                    variant_raw += raw[raw_idx]
-                    raw_idx += 1
+        for word in variant_example.token_annotation.words:
+            match_found = False
+            # add identical word
+            if word not in variants and raw[raw_idx:].startswith(word):
+                variant_raw += word
+                raw_idx += len(word)
+                match_found = True
+            # add variant word
+            else:
+                for variant in variants:
+                    if not match_found and \
+                            raw[raw_idx:].startswith(variant):
+                        raw_idx += len(variant)
+                        variant_raw += word
+                        match_found = True
+            # something went wrong, abort
+            # (add a warning message?)
+            if not match_found:
+                return example
+            # add following whitespace
+            while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
+                variant_raw += raw[raw_idx]
+                raw_idx += 1
         variant_example.doc = variant_raw
         return variant_example
     return variant_example
@@ -521,30 +521,43 @@ def json_to_examples(doc):
     paragraphs = []
     for paragraph in doc["paragraphs"]:
         example = Example(doc=paragraph.get("raw", None))
+        words = []
+        ids = []
+        tags = []
+        heads = []
+        labels = []
+        ner = []
+        morphs = []
+        sent_starts = []
+        brackets = []
         for sent in paragraph["sentences"]:
-            words = []
-            ids = []
-            tags = []
-            heads = []
-            labels = []
-            ner = []
+            sent_start_i = len(words)
             for i, token in enumerate(sent["tokens"]):
                 words.append(token["orth"])
-                ids.append(i)
+                ids.append(token.get('id', sent_start_i + i))
                 tags.append(token.get('tag', "-"))
-                heads.append(token.get("head", 0) + i)
+                heads.append(token.get("head", 0) + sent_start_i + i)
                 labels.append(token.get("dep", ""))
                 # Ensure ROOT label is case-insensitive
                 if labels[-1].lower() == "root":
                     labels[-1] = "ROOT"
                 ner.append(token.get("ner", "-"))
-            example.add_token_annotation(ids=ids, words=words, tags=tags,
-                                         heads=heads, deps=labels, entities=ner,
-                                         brackets=sent.get("brackets", []))
+                morphs.append(token.get("morph", {}))
+                if i == 0:
+                    sent_starts.append(True)
+                else:
+                    sent_starts.append(False)
+            if "brackets" in sent:
+                brackets.extend((b["first"] + sent_start_i,
+                                 b["last"] + sent_start_i, b["label"])
+                                for b in sent["brackets"])
         cats = {}
         for cat in paragraph.get("cats", {}):
             cats[cat["label"]] = cat["value"]
-        example.add_doc_annotation(cats=cats)
+        example.set_token_annotation(ids=ids, words=words, tags=tags,
+                                     heads=heads, deps=labels, entities=ner, morphs=morphs,
+                                     sent_starts=sent_starts, brackets=brackets)
+        example.set_doc_annotation(cats=cats)
         yield example
@@ -652,15 +665,16 @@ def _consume_ent(tags):

 cdef class TokenAnnotation:
-    def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphology=None, brackets=None):
+    def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphs=None, sent_starts=None, brackets=None):
         self.ids = ids if ids else []
         self.words = words if words else []
         self.tags = tags if tags else []
         self.heads = heads if heads else []
         self.deps = deps if deps else []
         self.entities = entities if entities else []
+        self.morphs = morphs if morphs else []
+        self.sent_starts = sent_starts if sent_starts else []
         self.brackets = brackets if brackets else []
-        self.morphology = morphology if morphology else []

     @classmethod
     def from_dict(cls, token_dict):
@@ -670,7 +684,8 @@ cdef class TokenAnnotation:
                    heads=token_dict.get("heads", None),
                    deps=token_dict.get("deps", None),
                    entities=token_dict.get("entities", None),
-                   morphology=token_dict.get("morphology", None),
+                   morphs=token_dict.get("morphs", None),
+                   sent_starts=token_dict.get("sent_starts", None),
                    brackets=token_dict.get("brackets", None))

     def to_dict(self):
@@ -680,9 +695,34 @@ cdef class TokenAnnotation:
                 "heads": self.heads,
                 "deps": self.deps,
                 "entities": self.entities,
-                "morphology": self.morphology,
+                "morphs": self.morphs,
+                "sent_starts": self.sent_starts,
                 "brackets": self.brackets}

+    def get_id(self, i):
+        return self.ids[i] if i < len(self.ids) else i
+
+    def get_word(self, i):
+        return self.words[i] if i < len(self.words) else ""
+
+    def get_tag(self, i):
+        return self.tags[i] if i < len(self.tags) else "-"
+
+    def get_head(self, i):
+        return self.heads[i] if i < len(self.heads) else i
+
+    def get_dep(self, i):
+        return self.deps[i] if i < len(self.deps) else ""
+
+    def get_entity(self, i):
+        return self.entities[i] if i < len(self.entities) else "-"
+
+    def get_morph(self, i):
+        return self.morphs[i] if i < len(self.morphs) else set()
+
+    def get_sent_start(self, i):
+        return self.sent_starts[i] if i < len(self.sent_starts) else None
+

 cdef class DocAnnotation:
     def __init__(self, cats=None, links=None):
@@ -698,33 +738,33 @@ cdef class DocAnnotation:

 cdef class Example:
-    def __init__(self, doc_annotation=None, token_annotations=None, doc=None,
+    def __init__(self, doc_annotation=None, token_annotation=None, doc=None,
                  goldparse=None):
         """ Doc can either be text, or an actual Doc """
         self.doc = doc
         self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
-        self.token_annotations = token_annotations if token_annotations else []
+        self.token_annotation = token_annotation if token_annotation else TokenAnnotation()
         self.goldparse = goldparse

     @classmethod
     def from_gold(cls, goldparse, doc=None):
         doc_annotation = DocAnnotation(cats=goldparse.cats, links=goldparse.links)
         token_annotation = goldparse.get_token_annotation()
-        return cls(doc_annotation, [token_annotation], doc)
+        return cls(doc_annotation, token_annotation, doc)

     @classmethod
     def from_dict(cls, example_dict, doc=None):
-        token_dicts = example_dict["token_annotations"]
-        token_annotations = [TokenAnnotation.from_dict(t) for t in token_dicts]
+        token_dict = example_dict["token_annotation"]
+        token_annotation = TokenAnnotation.from_dict(token_dict)
         doc_dict = example_dict["doc_annotation"]
         doc_annotation = DocAnnotation.from_dict(doc_dict)
-        return cls(doc_annotation, token_annotations, doc)
+        return cls(doc_annotation, token_annotation, doc)

     def to_dict(self):
         """ Note that this method does NOT export the doc, only the annotations ! """
-        token_dicts = [t.to_dict() for t in self.token_annotations]
+        token_dict = self.token_annotation.to_dict()
         doc_dict = self.doc_annotation.to_dict()
-        return {"token_annotations": token_dicts, "doc_annotation": doc_dict}
+        return {"token_annotation": token_dict, "doc_annotation": doc_dict}

     @property
     def text(self):
@@ -737,96 +777,108 @@ cdef class Example:
     @property
     def gold(self):
         if self.goldparse is None:
-            doc, gold = self.get_gold_parses(merge=True)[0]
+            doc, gold = self.get_gold_parses()[0]
             self.goldparse = gold
         return self.goldparse

-    def add_token_annotation(self, ids=None, words=None, tags=None, heads=None,
-                             deps=None, entities=None, morphology=None, brackets=None):
-        t = TokenAnnotation(ids=ids, words=words, tags=tags,
-                            heads=heads, deps=deps, entities=entities,
-                            morphology=morphology, brackets=brackets)
-        self.token_annotations.append(t)
+    def set_token_annotation(self, ids=None, words=None, tags=None, heads=None,
+                             deps=None, entities=None, morphs=None,
+                             sent_starts=None, brackets=None):
+        self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags,
+                                                heads=heads, deps=deps, entities=entities,
+                                                morphs=morphs, sent_starts=sent_starts,
+                                                brackets=brackets)

-    def add_doc_annotation(self, cats=None, links=None):
+    def set_doc_annotation(self, cats=None, links=None):
         if cats:
-            self.doc_annotation.cats.update(cats)
+            self.doc_annotation.cats = cats
         if links:
-            self.doc_annotation.links.update(links)
+            self.doc_annotation.links = links

-    def merge_sents(self):
-        """ Merge the list of token annotations into one object and return this new object """
-        m_example = Example(doc=self.doc, doc_annotation=self.doc_annotation)
-        m_ids, m_words, m_tags, m_heads, m_deps, m_ents, m_morph = [], [], [], [], [], [], []
-        m_brackets = []
-        i = 0
-        for t in self.token_annotations:
-            m_ids.extend(id_ + i for id_ in t.ids)
-            m_words.extend(t.words)
-            m_tags.extend(t.tags)
-            m_heads.extend(head + i if head is not None and head >= 0 else head_i + i for head_i, head in enumerate(t.heads))
-            m_deps.extend(t.deps)
-            m_ents.extend(t.entities)
-            m_morph.extend(t.morphology)
-            m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])
-                              for b in t.brackets)
-            i += len(t.ids)
-        m_example.add_token_annotation(ids=m_ids, words=m_words, tags=m_tags,
-                                       heads=m_heads, deps=m_deps, entities=m_ents,
-                                       morphology=m_morph, brackets=m_brackets)
-        return m_example
+    def split_sents(self):
+        """ Split the token annotations into multiple Examples based on
+        sent_starts and return a list of the new Examples"""
+        s_example = Example(doc=None, doc_annotation=self.doc_annotation)
+        s_ids, s_words, s_tags, s_heads = [], [], [], []
+        s_deps, s_ents, s_morphs, s_sent_starts = [], [], [], []
+        s_brackets = []
+        sent_start_i = 0
+        t = self.token_annotation
+        split_examples = []
+        for i in range(len(t.words)):
+            if i > 0 and t.sent_starts[i] == True:
+                s_example.set_token_annotation(ids=s_ids,
+                        words=s_words, tags=s_tags, heads=s_heads, deps=s_deps,
+                        entities=s_ents, morphs=s_morphs,
+                        sent_starts=s_sent_starts, brackets=s_brackets)
+                split_examples.append(s_example)
+                s_example = Example(doc=None, doc_annotation=self.doc_annotation)
+                s_ids, s_words, s_tags, s_heads = [], [], [], []
+                s_deps, s_ents, s_morphs, s_sent_starts = [], [], [], []
+                s_brackets = []
+                sent_start_i = i
+            s_ids.append(t.get_id(i))
+            s_words.append(t.get_word(i))
+            s_tags.append(t.get_tag(i))
+            s_heads.append(t.get_head(i) - sent_start_i)
+            s_deps.append(t.get_dep(i))
+            s_ents.append(t.get_entity(i))
+            s_morphs.append(t.get_morph(i))
+            s_sent_starts.append(t.get_sent_start(i))
+            s_brackets.extend((b[0] - sent_start_i,
+                               b[1] - sent_start_i, b[2])
+                              for b in t.brackets if b[0] == i)
+            i += 1
+        s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags,
+                heads=s_heads, deps=s_deps, entities=s_ents,
+                morphs=s_morphs, sent_starts=s_sent_starts,
+                brackets=s_brackets)
+        split_examples.append(s_example)
+        return split_examples

-    def get_gold_parses(self, merge=False, vocab=None, make_projective=False,
+    def get_gold_parses(self, merge=True, vocab=None, make_projective=False,
                         ignore_misaligned=False):
         """Return a list of (doc, GoldParse) objects.
-        If merge is set to True, add all Token annotations to one big list."""
+        If merge is set to True, keep all Token annotations as one big list."""
         d = self.doc_annotation
-        # merging different sentences
+        # merge == do not modify Example
         if merge:
-            merged_example = self.merge_sents()
-            assert(len(merged_example.token_annotations)) == 1
-            t = merged_example.token_annotations[0]
-            m_doc = merged_example.doc
-            if not m_doc:
+            t = self.token_annotation
+            doc = self.doc
+            if not self.doc:
                 if not vocab:
                     raise ValueError(Errors.E998)
-                m_doc = Doc(vocab, words=t.words)
+                doc = Doc(vocab, words=t.words)
             try:
-                gp = GoldParse.from_annotation(m_doc, d, t, make_projective=make_projective)
+                gp = GoldParse.from_annotation(doc, d, t,
+                        make_projective=make_projective)
             except AlignmentError:
                 if ignore_misaligned:
                     gp = None
                 else:
                     raise
-            return [(self.doc, gp)]
-        # we only have one sentence and an appropriate doc
-        elif len(self.token_annotations) == 1 and isinstance(self.doc, Doc):
-            t = self.token_annotations[0]
-            try:
-                gp = GoldParse.from_annotation(self.doc, d, t, make_projective=make_projective)
-            except AlignmentError:
-                if ignore_misaligned:
-                    gp = None
-                else:
-                    raise
-            return [(self.doc, gp)]
-        # not merging: one GoldParse per 'sentence', defining docs with the words from each sentence
+            return [(doc, gp)]
+        # not merging: one GoldParse per sentence, defining docs with the words
+        # from each sentence
         else:
             parses = []
-            for t in self.token_annotations:
+            split_examples = self.split_sents()
+            for split_example in split_examples:
                 if not vocab:
                     raise ValueError(Errors.E998)
-                t_doc = Doc(vocab, words=t.words)
+                split_doc = Doc(vocab, words=split_example.token_annotation.words)
                 try:
-                    gp = GoldParse.from_annotation(t_doc, d, t, make_projective=make_projective)
+                    gp = GoldParse.from_annotation(split_doc, d,
+                            split_example.token_annotation,
+                            make_projective=make_projective)
                 except AlignmentError:
                     if ignore_misaligned:
                         gp = None
                     else:
                         raise
                 if gp is not None:
-                    parses.append((t_doc, gp))
+                    parses.append((split_doc, gp))
             return parses

     @classmethod
@@ -881,9 +933,14 @@ cdef class GoldParse:
     """
     @classmethod
     def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
-        return cls(doc, words=token_annotation.words, tags=token_annotation.tags,
-                   heads=token_annotation.heads, deps=token_annotation.deps, entities=token_annotation.entities,
-                   morphology=token_annotation.morphology, cats=doc_annotation.cats, links=doc_annotation.links,
+        return cls(doc, words=token_annotation.words,
+                   tags=token_annotation.tags,
+                   heads=token_annotation.heads,
+                   deps=token_annotation.deps,
+                   entities=token_annotation.entities,
+                   morphs=token_annotation.morphs,
+                   cats=doc_annotation.cats,
+                   links=doc_annotation.links,
                    make_projective=make_projective)

     def get_token_annotation(self):
@@ -893,9 +950,9 @@ cdef class GoldParse:
         return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
                                heads=self.heads, deps=self.labels, entities=self.ner,
-                               morphology=self.morphology)
+                               morphs=self.morphs)

-    def __init__(self, doc, words=None, tags=None, morphology=None,
+    def __init__(self, doc, words=None, tags=None, morphs=None,
                  heads=None, deps=None, entities=None, make_projective=False,
                  cats=None, links=None):
         """Create a GoldParse. The fields will not be initialized if len(doc) is zero.
@@ -944,8 +1001,8 @@ cdef class GoldParse:
             heads = [None for _ in words]
         if not deps:
             deps = [None for _ in words]
-        if not morphology:
-            morphology = [None for _ in words]
+        if not morphs:
+            morphs = [None for _ in words]
         if entities is None:
             entities = ["-" for _ in words]
         elif len(entities) == 0:
@@ -971,7 +1028,7 @@ cdef class GoldParse:
             self.heads = [None] * len(doc)
             self.labels = [None] * len(doc)
             self.ner = [None] * len(doc)
-            self.morphology = [None] * len(doc)
+            self.morphs = [None] * len(doc)

             # This needs to be done before we align the words
             if make_projective and heads is not None and deps is not None:
@@ -990,7 +1047,7 @@ cdef class GoldParse:
             self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]

             self.orig = TokenAnnotation(ids=list(range(len(words))), words=words, tags=tags,
-                    heads=heads, deps=deps, entities=entities, morphology=morphology,
+                    heads=heads, deps=deps, entities=entities, morphs=morphs,
                     brackets=[])

             for i, gold_i in enumerate(self.cand_to_gold):
@@ -1000,12 +1057,12 @@ cdef class GoldParse:
                     self.heads[i] = None
                     self.labels[i] = None
                     self.ner[i] = None
-                    self.morphology[i] = set()
+                    self.morphs[i] = set()
                 if gold_i is None:
                     if i in i2j_multi:
                         self.words[i] = words[i2j_multi[i]]
                         self.tags[i] = tags[i2j_multi[i]]
-                        self.morphology[i] = morphology[i2j_multi[i]]
+                        self.morphs[i] = morphs[i2j_multi[i]]
                         is_last = i2j_multi[i] != i2j_multi.get(i+1)
                         is_first = i2j_multi[i] != i2j_multi.get(i-1)
                         # Set next word in multi-token span as head, until last
@@ -1044,7 +1101,7 @@ cdef class GoldParse:
                 else:
                     self.words[i] = words[gold_i]
                     self.tags[i] = tags[gold_i]
-                    self.morphology[i] = morphology[gold_i]
+                    self.morphs[i] = morphs[gold_i]
                     if heads[gold_i] is None:
                         self.heads[i] = None
                     else:

File: spacy/language.py

@@ -574,9 +574,8 @@ class Language(object):
         # Populate vocab
         else:
             for example in get_examples():
-                for token_annotation in example.token_annotations:
-                    for word in token_annotation.words:
-                        _ = self.vocab[word]  # noqa: F841
+                for word in example.token_annotation.words:
+                    _ = self.vocab[word]  # noqa: F841

         if cfg.get("device", -1) >= 0:
             util.use_gpu(cfg["device"])

File: spacy/pipeline/pipes.pyx

@@ -565,12 +565,11 @@ class Tagger(Pipe):
         orig_tag_map = dict(self.vocab.morphology.tag_map)
         new_tag_map = OrderedDict()
         for example in get_examples():
-            for token_annotation in example.token_annotations:
-                for tag in token_annotation.tags:
-                    if tag in orig_tag_map:
-                        new_tag_map[tag] = orig_tag_map[tag]
-                    else:
-                        new_tag_map[tag] = {POS: X}
+            for tag in example.token_annotation.tags:
+                if tag in orig_tag_map:
+                    new_tag_map[tag] = orig_tag_map[tag]
+                else:
+                    new_tag_map[tag] = {POS: X}

         cdef Vocab vocab = self.vocab
         if new_tag_map:
             vocab.morphology = Morphology(vocab.strings, new_tag_map,
@@ -750,11 +749,10 @@ class MultitaskObjective(Tagger):
         gold_examples = nonproj.preprocess_training_data(get_examples())
         # for raw_text, doc_annot in gold_tuples:
         for example in gold_examples:
-            for token_annotation in example.token_annotations:
-                for i in range(len(token_annotation.ids)):
-                    label = self.make_label(i, token_annotation)
-                    if label is not None and label not in self.labels:
-                        self.labels[label] = len(self.labels)
+            for i in range(len(example.token_annotation.ids)):
+                label = self.make_label(i, example.token_annotation)
+                if label is not None and label not in self.labels:
+                    self.labels[label] = len(self.labels)
         if self.model is True:
             token_vector_width = util.env_opt("token_vector_width")
             self.model = self.Model(len(self.labels), tok2vec=tok2vec)

File: spacy/scorer.py

@@ -237,7 +237,7 @@ class Scorer(object):
         if len(doc) != len(gold):
             doc_annotation = DocAnnotation(cats=gold.cats)
             token_annotation = gold.orig
-            gold = GoldParse.from_annotation(doc, doc_annotation, [token_annotation])
+            gold = GoldParse.from_annotation(doc, doc_annotation, token_annotation)
         orig = gold.orig
         gold_deps = set()
         gold_deps_per_dep = {}

File: spacy/syntax/arc_eager.pyx

@@ -342,19 +342,19 @@ cdef class ArcEager(TransitionSystem):
                 actions[RIGHT][label] = 1
                 actions[REDUCE][label] = 1
         for example in kwargs.get('gold_parses', []):
-            for token_annotation in example.token_annotations:
-                heads, labels = nonproj.projectivize(token_annotation.heads, token_annotation.deps)
-                for child, head, label in zip(token_annotation.ids, heads, labels):
-                    if label.upper() == 'ROOT' :
-                        label = 'ROOT'
-                    if head == child:
-                        actions[BREAK][label] += 1
-                    elif head < child:
-                        actions[RIGHT][label] += 1
-                        actions[REDUCE][''] += 1
-                    elif head > child:
-                        actions[LEFT][label] += 1
-                        actions[SHIFT][''] += 1
+            heads, labels = nonproj.projectivize(example.token_annotation.heads,
+                                                 example.token_annotation.deps)
+            for child, head, label in zip(example.token_annotation.ids, heads, labels):
+                if label.upper() == 'ROOT' :
+                    label = 'ROOT'
+                if head == child:
+                    actions[BREAK][label] += 1
+                elif head < child:
+                    actions[RIGHT][label] += 1
+                    actions[REDUCE][''] += 1
+                elif head > child:
+                    actions[LEFT][label] += 1
+                    actions[SHIFT][''] += 1
         if min_freq is not None:
             for action, label_freqs in actions.items():
                 for label, freq in list(label_freqs.items()):

File: spacy/syntax/ner.pyx

@@ -73,12 +73,11 @@ cdef class BiluoPushDown(TransitionSystem):
                     actions[action][entity_type] = 1
         moves = ('M', 'B', 'I', 'L', 'U')
         for example in kwargs.get('gold_parses', []):
-            for token_annotation in example.token_annotations:
-                for i, ner_tag in enumerate(token_annotation.entities):
-                    if ner_tag != 'O' and ner_tag != '-':
-                        _, label = ner_tag.split('-', 1)
-                        for action in (BEGIN, IN, LAST, UNIT):
-                            actions[action][label] += 1
+            for i, ner_tag in enumerate(example.token_annotation.entities):
+                if ner_tag != 'O' and ner_tag != '-':
+                    _, label = ner_tag.split('-', 1)
+                    for action in (BEGIN, IN, LAST, UNIT):
+                        actions[action][label] += 1
         return actions

     @property

File: spacy/syntax/nonproj.pyx

@@ -81,15 +81,15 @@ def is_decorated(label):
 def count_decorated_labels(gold_data):
     freqs = {}
     for example in gold_data:
-        for token_annotation in example.token_annotations:
-            proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps)
-            # set the label to ROOT for each root dependent
-            deco_deps = ['ROOT' if head == i else deco_deps[i]
-                         for i, head in enumerate(proj_heads)]
-            # count label frequencies
-            for label in deco_deps:
-                if is_decorated(label):
-                    freqs[label] = freqs.get(label, 0) + 1
+        proj_heads, deco_deps = projectivize(example.token_annotation.heads,
+                                             example.token_annotation.deps)
+        # set the label to ROOT for each root dependent
+        deco_deps = ['ROOT' if head == i else deco_deps[i]
+                     for i, head in enumerate(proj_heads)]
+        # count label frequencies
+        for label in deco_deps:
+            if is_decorated(label):
+                freqs[label] = freqs.get(label, 0) + 1
     return freqs
@@ -98,21 +98,20 @@ def preprocess_training_data(gold_data, label_freq_cutoff=30):
     freqs = {}
     for example in gold_data:
         new_example = Example(doc=example.doc)
-        for token_annotation in example.token_annotations:
-            proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps)
-            # set the label to ROOT for each root dependent
-            deco_deps = ['ROOT' if head == i else deco_deps[i]
-                         for i, head in enumerate(proj_heads)]
-            # count label frequencies
-            if label_freq_cutoff > 0:
-                for label in deco_deps:
-                    if is_decorated(label):
-                        freqs[label] = freqs.get(label, 0) + 1
-            # TODO: the code would be less ugly when changing heads and deps in-place, but is this OK upstream ?
-            proj_token_dict = token_annotation.to_dict()
-            proj_token_dict["heads"] = proj_heads
-            proj_token_dict["deps"] = deco_deps
-            new_example.add_token_annotation(**proj_token_dict)
+        proj_heads, deco_deps = projectivize(example.token_annotation.heads,
+                                             example.token_annotation.deps)
+        # set the label to ROOT for each root dependent
+        deco_deps = ['ROOT' if head == i else deco_deps[i]
+                     for i, head in enumerate(proj_heads)]
+        # count label frequencies
+        if label_freq_cutoff > 0:
+            for label in deco_deps:
+                if is_decorated(label):
+                    freqs[label] = freqs.get(label, 0) + 1
+        proj_token_dict = example.token_annotation.to_dict()
+        proj_token_dict["heads"] = proj_heads
+        proj_token_dict["deps"] = deco_deps
+        new_example.set_token_annotation(**proj_token_dict)
         preprocessed.append(new_example)
     if label_freq_cutoff > 0:
         return _filter_labels(preprocessed, label_freq_cutoff, freqs)
@@ -213,15 +212,14 @@ def _filter_labels(examples, cutoff, freqs):
     filtered = []
     for example in examples:
         new_example = Example(doc=example.doc)
-        for token_annotation in example.token_annotations:
-            filtered_labels = []
-            for label in token_annotation.deps:
-                if is_decorated(label) and freqs.get(label, 0) < cutoff:
-                    filtered_labels.append(decompose(label)[0])
-                else:
-                    filtered_labels.append(label)
-            filtered_token_dict = token_annotation.to_dict()
-            filtered_token_dict["deps"] = filtered_labels
-            new_example.add_token_annotation(**filtered_token_dict)
+        filtered_labels = []
+        for label in example.token_annotation.deps:
+            if is_decorated(label) and freqs.get(label, 0) < cutoff:
+                filtered_labels.append(decompose(label)[0])
+            else:
+                filtered_labels.append(label)
+        filtered_token_dict = example.token_annotation.to_dict()
+        filtered_token_dict["deps"] = filtered_labels
+        new_example.set_token_annotation(**filtered_token_dict)
         filtered.append(new_example)
     return filtered

File: spacy/tests/regression/test_issue1501-2000.py

@@ -273,7 +273,7 @@ def test_issue1963(en_tokenizer):
 def test_issue1967(label):
     ner = EntityRecognizer(Vocab())
     example = Example(doc=None)
-    example.add_token_annotation(ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label])
+    example.set_token_annotation(ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label])
     ner.moves.get_actions(gold_parses=[example])

File: spacy/tests/test_gold.py

@@ -36,6 +36,16 @@ def doc():
     return doc


+@pytest.fixture()
+def merged_dict():
+    return {
+        "ids": [1, 2, 3, 4, 5, 6, 7],
+        "words": ["Hi", "there", "everyone", "It", "is", "just", "me"],
+        "tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"],
+        "sent_starts": [1, 0, 0, 1, 0, 0, 0, 0],
+    }
+
+
 def test_gold_biluo_U(en_vocab):
     words = ["I", "flew", "to", "London", "."]
     spaces = [True, True, True, False, True]
@@ -231,7 +241,7 @@ def test_ignore_misaligned(doc):
     deps = [t.dep_ for t in doc]
     heads = [t.head.i for t in doc]

-    use_new_align = spacy.gold.USE_NEW_ALIGN
+    saved_use_new_align = spacy.gold.USE_NEW_ALIGN
     spacy.gold.USE_NEW_ALIGN = False
     with make_tempdir() as tmpdir:
@@ -270,7 +280,25 @@ def test_ignore_misaligned(doc):
                                                       ignore_misaligned=True))
         assert len(train_reloaded_example) == 0

-    spacy.gold.USE_NEW_ALIGN = use_new_align
+    spacy.gold.USE_NEW_ALIGN = saved_use_new_align
+
+
+def test_make_orth_variants(doc):
+    nlp = English()
+    text = doc.text
+    deps = [t.dep_ for t in doc]
+    heads = [t.head.i for t in doc]
+
+    with make_tempdir() as tmpdir:
+        jsonl_file = tmpdir / "test.jsonl"
+        # write to JSONL train dicts
+        srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
+        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
+
+        # due to randomness, test only that this runs with no errors for now
+        train_reloaded_example = next(goldcorpus.train_dataset(nlp,
+                                      orth_variant_level=0.2))
+        train_goldparse = train_reloaded_example.gold


 # xfail while we have backwards-compatible alignment
@@ -386,71 +414,38 @@ def _train(train_data):
         nlp.update(batch, sgd=optimizer, losses=losses)


-tokens_1 = {
-    "ids": [1, 2, 3],
-    "words": ["Hi", "there", "everyone"],
-    "tags": ["INTJ", "ADV", "PRON"],
-}
-
-tokens_2 = {
-    "ids": [1, 2, 3, 4],
-    "words": ["It", "is", "just", "me"],
-    "tags": ["PRON", "AUX", "ADV", "PRON"],
-}
-
-text0 = "Hi there everyone It is just me"
-
-
-def test_merge_sents():
+def test_split_sents(merged_dict):
     nlp = English()
     example = Example()
-    example.add_token_annotation(**tokens_1)
-    example.add_token_annotation(**tokens_2)
+    example.set_token_annotation(**merged_dict)
     assert len(example.get_gold_parses(merge=False, vocab=nlp.vocab)) == 2
-    assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1  # this shouldn't change the original object
-    merged_example = example.merge_sents()
+    assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1
+    split_examples = example.split_sents()
+    assert len(split_examples) == 2

-    token_annotation_1 = example.token_annotations[0]
+    token_annotation_1 = split_examples[0].token_annotation
     assert token_annotation_1.ids == [1, 2, 3]
     assert token_annotation_1.words == ["Hi", "there", "everyone"]
     assert token_annotation_1.tags == ["INTJ", "ADV", "PRON"]
+    assert token_annotation_1.sent_starts == [1, 0, 0]

-    token_annotation_m = merged_example.token_annotations[0]
-    assert token_annotation_m.ids == [1, 2, 3, 4, 5, 6, 7]
-    assert token_annotation_m.words == ["Hi", "there", "everyone", "It", "is", "just", "me"]
-    assert token_annotation_m.tags == ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"]
+    token_annotation_2 = split_examples[1].token_annotation
+    assert token_annotation_2.ids == [4, 5, 6, 7]
+    assert token_annotation_2.words == ["It", "is", "just", "me"]
+    assert token_annotation_2.tags == ["PRON", "AUX", "ADV", "PRON"]
+    assert token_annotation_2.sent_starts == [1, 0, 0, 0]


-def test_tuples_to_example():
+def test_tuples_to_example(merged_dict):
     ex = Example()
-    ex.add_token_annotation(**tokens_1)
-    ex.add_token_annotation(**tokens_2)
-    ex.add_doc_annotation(cats={"TRAVEL": 1.0, "BAKING": 0.0})
+    ex.set_token_annotation(**merged_dict)
+    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
+    ex.set_doc_annotation(cats=cats)
     ex_dict = ex.to_dict()

-    token_dicts = [
-        {
-            "ids": [1, 2, 3],
-            "words": ["Hi", "there", "everyone"],
-            "tags": ["INTJ", "ADV", "PRON"],
-            "heads": [],
-            "deps": [],
-            "entities": [],
-            "morphology": [],
-            "brackets": [],
-        },
-        {
-            "ids": [1, 2, 3, 4],
-            "words": ["It", "is", "just", "me"],
-            "tags": ["PRON", "AUX", "ADV", "PRON"],
-            "heads": [],
-            "deps": [],
-            "entities": [],
-            "morphology": [],
-            "brackets": [],
-        },
-    ]
-    doc_dict = {"cats": {"TRAVEL": 1.0, "BAKING": 0.0}, "links": {}}
-    assert ex_dict == {"token_annotations": token_dicts, "doc_annotation": doc_dict}
+    assert ex_dict["token_annotation"]["ids"] == merged_dict["ids"]
+    assert ex_dict["token_annotation"]["words"] == merged_dict["words"]
+    assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"]
+    assert ex_dict["token_annotation"]["sent_starts"] == merged_dict["sent_starts"]
+    assert ex_dict["doc_annotation"]["cats"] == cats

File: spacy/tests/test_scorer.py

@@ -86,7 +86,7 @@ def test_ner_per_type(en_vocab):
         ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
     )
     ex = Example(doc=doc)
-    ex.add_token_annotation(entities=annot["entities"])
+    ex.set_token_annotation(entities=annot["entities"])
     scorer.score(ex)
     results = scorer.scores
@@ -107,7 +107,7 @@ def test_ner_per_type(en_vocab):
         ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
     )
     ex = Example(doc=doc)
-    ex.add_token_annotation(entities=annot["entities"])
+    ex.set_token_annotation(entities=annot["entities"])
     scorer.score(ex)
     results = scorer.scores