Add support for pos/morphs/lemmas in training data (#4941)

Add support for pos/morphs/lemmas throughout `GoldParse`, `Example`, and
`docs_to_json()`.
This commit is contained in:
adrianeboyd 2020-01-28 11:36:29 +01:00 committed by Matthew Honnibal
parent adc9745718
commit 06b251dd1e
3 changed files with 124 additions and 68 deletions

View File

@@ -25,6 +25,7 @@ cdef class GoldParse:
cdef public int loss cdef public int loss
cdef public list words cdef public list words
cdef public list tags cdef public list tags
cdef public list pos
cdef public list morphs cdef public list morphs
cdef public list lemmas cdef public list lemmas
cdef public list sent_starts cdef public list sent_starts
@@ -44,11 +45,12 @@ cdef class TokenAnnotation:
cdef public list ids cdef public list ids
cdef public list words cdef public list words
cdef public list tags cdef public list tags
cdef public list pos
cdef public list morphs
cdef public list lemmas
cdef public list heads cdef public list heads
cdef public list deps cdef public list deps
cdef public list entities cdef public list entities
cdef public list morphs
cdef public list lemmas
cdef public list sent_starts cdef public list sent_starts
cdef public list brackets cdef public list brackets

View File

@@ -485,11 +485,12 @@ def json_to_examples(doc):
words = [] words = []
ids = [] ids = []
tags = [] tags = []
pos = []
morphs = []
lemmas = []
heads = [] heads = []
labels = [] labels = []
ner = [] ner = []
morphs = []
lemmas = []
sent_starts = [] sent_starts = []
brackets = [] brackets = []
for sent in paragraph["sentences"]: for sent in paragraph["sentences"]:
@@ -498,14 +499,15 @@ def json_to_examples(doc):
words.append(token["orth"]) words.append(token["orth"])
ids.append(token.get('id', sent_start_i + i)) ids.append(token.get('id', sent_start_i + i))
tags.append(token.get('tag', "-")) tags.append(token.get('tag', "-"))
pos.append(token.get("pos", ""))
morphs.append(token.get("morph", ""))
lemmas.append(token.get("lemma", ""))
heads.append(token.get("head", 0) + sent_start_i + i) heads.append(token.get("head", 0) + sent_start_i + i)
labels.append(token.get("dep", "")) labels.append(token.get("dep", ""))
# Ensure ROOT label is case-insensitive # Ensure ROOT label is case-insensitive
if labels[-1].lower() == "root": if labels[-1].lower() == "root":
labels[-1] = "ROOT" labels[-1] = "ROOT"
ner.append(token.get("ner", "-")) ner.append(token.get("ner", "-"))
morphs.append(token.get("morph", {}))
lemmas.append(token.get("lemma", ""))
if i == 0: if i == 0:
sent_starts.append(1) sent_starts.append(1)
else: else:
@@ -518,8 +520,9 @@ def json_to_examples(doc):
for cat in paragraph.get("cats", {}): for cat in paragraph.get("cats", {}):
cats[cat["label"]] = cat["value"] cats[cat["label"]] = cat["value"]
example.set_token_annotation(ids=ids, words=words, tags=tags, example.set_token_annotation(ids=ids, words=words, tags=tags,
heads=heads, deps=labels, entities=ner, morphs=morphs, pos=pos, morphs=morphs, lemmas=lemmas, heads=heads,
lemmas=lemmas, sent_starts=sent_starts, brackets=brackets) deps=labels, entities=ner, sent_starts=sent_starts,
brackets=brackets)
example.set_doc_annotation(cats=cats) example.set_doc_annotation(cats=cats)
yield example yield example
@@ -632,17 +635,18 @@ def _consume_ent(tags):
cdef class TokenAnnotation: cdef class TokenAnnotation:
def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, def __init__(self, ids=None, words=None, tags=None, pos=None, morphs=None,
entities=None, morphs=None, lemmas=None, sent_starts=None, lemmas=None, heads=None, deps=None, entities=None, sent_starts=None,
brackets=None): brackets=None):
self.ids = ids if ids else [] self.ids = ids if ids else []
self.words = words if words else [] self.words = words if words else []
self.tags = tags if tags else [] self.tags = tags if tags else []
self.pos = pos if pos else []
self.morphs = morphs if morphs else []
self.lemmas = lemmas if lemmas else []
self.heads = heads if heads else [] self.heads = heads if heads else []
self.deps = deps if deps else [] self.deps = deps if deps else []
self.entities = entities if entities else [] self.entities = entities if entities else []
self.morphs = morphs if morphs else []
self.lemmas = lemmas if lemmas else []
self.sent_starts = sent_starts if sent_starts else [] self.sent_starts = sent_starts if sent_starts else []
self.brackets = brackets if brackets else [] self.brackets = brackets if brackets else []
@@ -651,11 +655,12 @@ cdef class TokenAnnotation:
return cls(ids=token_dict.get("ids", None), return cls(ids=token_dict.get("ids", None),
words=token_dict.get("words", None), words=token_dict.get("words", None),
tags=token_dict.get("tags", None), tags=token_dict.get("tags", None),
pos=token_dict.get("pos", None),
morphs=token_dict.get("morphs", None),
lemmas=token_dict.get("lemmas", None),
heads=token_dict.get("heads", None), heads=token_dict.get("heads", None),
deps=token_dict.get("deps", None), deps=token_dict.get("deps", None),
entities=token_dict.get("entities", None), entities=token_dict.get("entities", None),
morphs=token_dict.get("morphs", None),
lemmas=token_dict.get("lemmas", None),
sent_starts=token_dict.get("sent_starts", None), sent_starts=token_dict.get("sent_starts", None),
brackets=token_dict.get("brackets", None)) brackets=token_dict.get("brackets", None))
@@ -663,11 +668,12 @@ cdef class TokenAnnotation:
return {"ids": self.ids, return {"ids": self.ids,
"words": self.words, "words": self.words,
"tags": self.tags, "tags": self.tags,
"pos": self.pos,
"morphs": self.morphs,
"lemmas": self.lemmas,
"heads": self.heads, "heads": self.heads,
"deps": self.deps, "deps": self.deps,
"entities": self.entities, "entities": self.entities,
"morphs": self.morphs,
"lemmas": self.lemmas,
"sent_starts": self.sent_starts, "sent_starts": self.sent_starts,
"brackets": self.brackets} "brackets": self.brackets}
@@ -680,6 +686,15 @@ cdef class TokenAnnotation:
def get_tag(self, i): def get_tag(self, i):
return self.tags[i] if i < len(self.tags) else "-" return self.tags[i] if i < len(self.tags) else "-"
def get_pos(self, i):
return self.pos[i] if i < len(self.pos) else ""
def get_morph(self, i):
return self.morphs[i] if i < len(self.morphs) else ""
def get_lemma(self, i):
return self.lemmas[i] if i < len(self.lemmas) else ""
def get_head(self, i): def get_head(self, i):
return self.heads[i] if i < len(self.heads) else i return self.heads[i] if i < len(self.heads) else i
@@ -689,12 +704,6 @@ cdef class TokenAnnotation:
def get_entity(self, i): def get_entity(self, i):
return self.entities[i] if i < len(self.entities) else "-" return self.entities[i] if i < len(self.entities) else "-"
def get_morph(self, i):
return self.morphs[i] if i < len(self.morphs) else set()
def get_lemma(self, i):
return self.lemmas[i] if i < len(self.lemmas) else ""
def get_sent_start(self, i): def get_sent_start(self, i):
return self.sent_starts[i] if i < len(self.sent_starts) else None return self.sent_starts[i] if i < len(self.sent_starts) else None
@@ -756,12 +765,12 @@ cdef class Example:
self.goldparse = gold self.goldparse = gold
return self.goldparse return self.goldparse
def set_token_annotation(self, ids=None, words=None, tags=None, heads=None, def set_token_annotation(self, ids=None, words=None, tags=None, pos=None,
deps=None, entities=None, morphs=None, lemmas=None, morphs=None, lemmas=None, heads=None, deps=None,
sent_starts=None, brackets=None): entities=None, sent_starts=None, brackets=None):
self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags, self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags,
heads=heads, deps=deps, entities=entities, pos=pos, morphs=morphs, lemmas=lemmas, heads=heads,
morphs=morphs, lemmas=lemmas, deps=deps, entities=entities,
sent_starts=sent_starts, brackets=brackets) sent_starts=sent_starts, brackets=brackets)
def set_doc_annotation(self, cats=None, links=None): def set_doc_annotation(self, cats=None, links=None):
@@ -774,8 +783,8 @@ cdef class Example:
""" Split the token annotations into multiple Examples based on """ Split the token annotations into multiple Examples based on
sent_starts and return a list of the new Examples""" sent_starts and return a list of the new Examples"""
s_example = Example(doc=None, doc_annotation=self.doc_annotation) s_example = Example(doc=None, doc_annotation=self.doc_annotation)
s_ids, s_words, s_tags, s_heads = [], [], [], [] s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], []
s_deps, s_ents, s_morphs, s_lemmas, s_sent_starts = [], [], [], [], [] s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], []
s_brackets = [] s_brackets = []
sent_start_i = 0 sent_start_i = 0
t = self.token_annotation t = self.token_annotation
@@ -783,31 +792,33 @@ cdef class Example:
for i in range(len(t.words)): for i in range(len(t.words)):
if i > 0 and t.sent_starts[i] == 1: if i > 0 and t.sent_starts[i] == 1:
s_example.set_token_annotation(ids=s_ids, s_example.set_token_annotation(ids=s_ids,
words=s_words, tags=s_tags, heads=s_heads, deps=s_deps, words=s_words, tags=s_tags, pos=s_pos, morphs=s_morphs,
entities=s_ents, morphs=s_morphs, lemmas=s_lemmas, lemmas=s_lemmas, heads=s_heads, deps=s_deps,
sent_starts=s_sent_starts, brackets=s_brackets) entities=s_ents, sent_starts=s_sent_starts,
brackets=s_brackets)
split_examples.append(s_example) split_examples.append(s_example)
s_example = Example(doc=None, doc_annotation=self.doc_annotation) s_example = Example(doc=None, doc_annotation=self.doc_annotation)
s_ids, s_words, s_tags, s_heads = [], [], [], [] s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], []
s_deps, s_ents, s_morphs, s_lemmas = [], [], [], [] s_deps, s_ents, s_morphs, s_lemmas = [], [], [], []
s_sent_starts, s_brackets = [], [] s_sent_starts, s_brackets = [], []
sent_start_i = i sent_start_i = i
s_ids.append(t.get_id(i)) s_ids.append(t.get_id(i))
s_words.append(t.get_word(i)) s_words.append(t.get_word(i))
s_tags.append(t.get_tag(i)) s_tags.append(t.get_tag(i))
s_pos.append(t.get_pos(i))
s_morphs.append(t.get_morph(i))
s_lemmas.append(t.get_lemma(i))
s_heads.append(t.get_head(i) - sent_start_i) s_heads.append(t.get_head(i) - sent_start_i)
s_deps.append(t.get_dep(i)) s_deps.append(t.get_dep(i))
s_ents.append(t.get_entity(i)) s_ents.append(t.get_entity(i))
s_morphs.append(t.get_morph(i))
s_lemmas.append(t.get_lemma(i))
s_sent_starts.append(t.get_sent_start(i)) s_sent_starts.append(t.get_sent_start(i))
s_brackets.extend((b[0] - sent_start_i, s_brackets.extend((b[0] - sent_start_i,
b[1] - sent_start_i, b[2]) b[1] - sent_start_i, b[2])
for b in t.brackets if b[0] == i) for b in t.brackets if b[0] == i)
i += 1 i += 1
s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags, s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags,
heads=s_heads, deps=s_deps, entities=s_ents, pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads,
morphs=s_morphs, lemmas=s_lemmas, sent_starts=s_sent_starts, deps=s_deps, entities=s_ents, sent_starts=s_sent_starts,
brackets=s_brackets) brackets=s_brackets)
split_examples.append(s_example) split_examples.append(s_example)
return split_examples return split_examples
@@ -911,11 +922,12 @@ cdef class GoldParse:
def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False): def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
return cls(doc, words=token_annotation.words, return cls(doc, words=token_annotation.words,
tags=token_annotation.tags, tags=token_annotation.tags,
pos=token_annotation.pos,
morphs=token_annotation.morphs,
lemmas=token_annotation.lemmas,
heads=token_annotation.heads, heads=token_annotation.heads,
deps=token_annotation.deps, deps=token_annotation.deps,
entities=token_annotation.entities, entities=token_annotation.entities,
morphs=token_annotation.morphs,
lemmas=token_annotation.lemmas,
sent_starts=token_annotation.sent_starts, sent_starts=token_annotation.sent_starts,
cats=doc_annotation.cats, cats=doc_annotation.cats,
links=doc_annotation.links, links=doc_annotation.links,
@@ -927,18 +939,25 @@ cdef class GoldParse:
ids = list(range(len(self.words))) ids = list(range(len(self.words)))
return TokenAnnotation(ids=ids, words=self.words, tags=self.tags, return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
heads=self.heads, deps=self.labels, pos=self.pos, morphs=self.morphs,
entities=self.ner, morphs=self.morphs, lemmas=self.lemmas, heads=self.heads,
sent_starts=self.sent_starts, lemmas=self.lemmas) deps=self.labels, entities=self.ner,
sent_starts=self.sent_starts)
def __init__(self, doc, words=None, tags=None, morphs=None, lemmas=None, def __init__(self, doc, words=None, tags=None, pos=None, morphs=None,
sent_starts=None, heads=None, deps=None, entities=None, lemmas=None, heads=None, deps=None, entities=None,
make_projective=False, cats=None, links=None): sent_starts=None, make_projective=False, cats=None,
links=None):
"""Create a GoldParse. The fields will not be initialized if len(doc) is zero. """Create a GoldParse. The fields will not be initialized if len(doc) is zero.
doc (Doc): The document the annotations refer to. doc (Doc): The document the annotations refer to.
words (iterable): A sequence of unicode word strings. words (iterable): A sequence of unicode word strings.
tags (iterable): A sequence of strings, representing tag annotations. tags (iterable): A sequence of strings, representing tag annotations.
pos (iterable): A sequence of strings, representing UPOS annotations.
morphs (iterable): A sequence of strings, representing morph
annotations.
lemmas (iterable): A sequence of strings, representing lemma
annotations.
heads (iterable): A sequence of integers, representing syntactic heads (iterable): A sequence of integers, representing syntactic
head offsets. head offsets.
deps (iterable): A sequence of strings, representing the syntactic deps (iterable): A sequence of strings, representing the syntactic
@@ -978,14 +997,16 @@ cdef class GoldParse:
words = [token.text for token in doc] words = [token.text for token in doc]
if not tags: if not tags:
tags = [None for _ in words] tags = [None for _ in words]
if not heads: if not pos:
heads = [None for _ in words] pos = [None for _ in words]
if not deps:
deps = [None for _ in words]
if not morphs: if not morphs:
morphs = [None for _ in words] morphs = [None for _ in words]
if not lemmas: if not lemmas:
lemmas = [None for _ in words] lemmas = [None for _ in words]
if not heads:
heads = [None for _ in words]
if not deps:
deps = [None for _ in words]
if not sent_starts: if not sent_starts:
sent_starts = [None for _ in words] sent_starts = [None for _ in words]
if entities is None: if entities is None:
@@ -1010,11 +1031,12 @@ cdef class GoldParse:
self.words = [None] * len(doc) self.words = [None] * len(doc)
self.tags = [None] * len(doc) self.tags = [None] * len(doc)
self.pos = [None] * len(doc)
self.morphs = [None] * len(doc)
self.lemmas = [None] * len(doc)
self.heads = [None] * len(doc) self.heads = [None] * len(doc)
self.labels = [None] * len(doc) self.labels = [None] * len(doc)
self.ner = [None] * len(doc) self.ner = [None] * len(doc)
self.morphs = [None] * len(doc)
self.lemmas = [None] * len(doc)
self.sent_starts = [None] * len(doc) self.sent_starts = [None] * len(doc)
# This needs to be done before we align the words # This needs to be done before we align the words
@@ -1034,24 +1056,26 @@ cdef class GoldParse:
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
self.orig = TokenAnnotation(ids=list(range(len(words))), self.orig = TokenAnnotation(ids=list(range(len(words))),
words=words, tags=tags, heads=heads, deps=deps, words=words, tags=tags, pos=pos, morphs=morphs,
entities=entities, morphs=morphs, lemmas=lemmas, lemmas=lemmas, heads=heads, deps=deps, entities=entities,
sent_starts=sent_starts, brackets=[]) sent_starts=sent_starts, brackets=[])
for i, gold_i in enumerate(self.cand_to_gold): for i, gold_i in enumerate(self.cand_to_gold):
if doc[i].text.isspace(): if doc[i].text.isspace():
self.words[i] = doc[i].text self.words[i] = doc[i].text
self.tags[i] = "_SP" self.tags[i] = "_SP"
self.pos[i] = "SPACE"
self.morphs[i] = None
self.lemmas[i] = None
self.heads[i] = None self.heads[i] = None
self.labels[i] = None self.labels[i] = None
self.ner[i] = None self.ner[i] = None
self.morphs[i] = set()
self.lemmas[i] = None
self.sent_starts[i] = 0 self.sent_starts[i] = 0
if gold_i is None: if gold_i is None:
if i in i2j_multi: if i in i2j_multi:
self.words[i] = words[i2j_multi[i]] self.words[i] = words[i2j_multi[i]]
self.tags[i] = tags[i2j_multi[i]] self.tags[i] = tags[i2j_multi[i]]
self.pos[i] = pos[i2j_multi[i]]
self.morphs[i] = morphs[i2j_multi[i]] self.morphs[i] = morphs[i2j_multi[i]]
self.lemmas[i] = lemmas[i2j_multi[i]] self.lemmas[i] = lemmas[i2j_multi[i]]
self.sent_starts[i] = sent_starts[i2j_multi[i]] self.sent_starts[i] = sent_starts[i2j_multi[i]]
@@ -1093,6 +1117,7 @@ cdef class GoldParse:
else: else:
self.words[i] = words[gold_i] self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i] self.tags[i] = tags[gold_i]
self.pos[i] = pos[gold_i]
self.morphs[i] = morphs[gold_i] self.morphs[i] = morphs[gold_i]
self.lemmas[i] = lemmas[gold_i] self.lemmas[i] = lemmas[gold_i]
self.sent_starts[i] = sent_starts[gold_i] self.sent_starts[i] = sent_starts[gold_i]
@@ -1156,9 +1181,11 @@ def docs_to_json(docs, id=0, ner_missing_tag="O"):
json_sent = {"tokens": [], "brackets": []} json_sent = {"tokens": [], "brackets": []}
for token in sent: for token in sent:
json_token = {"id": token.i, "orth": token.text} json_token = {"id": token.i, "orth": token.text}
json_token["lemma"] = token.lemma_
if doc.is_tagged: if doc.is_tagged:
json_token["tag"] = token.tag_ json_token["tag"] = token.tag_
json_token["pos"] = token.pos_
json_token["morph"] = token.morph_
json_token["lemma"] = token.lemma_
if doc.is_parsed: if doc.is_parsed:
json_token["head"] = token.head.i-token.i json_token["head"] = token.head.i-token.i
json_token["dep"] = token.dep_ json_token["dep"] = token.dep_

View File

@@ -1,12 +1,7 @@
from spacy.errors import AlignmentError from spacy.errors import AlignmentError
from spacy.gold import ( from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
biluo_tags_from_offsets, from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo, align
offsets_from_biluo_tags, from spacy.gold import GoldCorpus, docs_to_json, Example, DocAnnotation
Example,
DocAnnotation,
)
from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo
from spacy.gold import GoldCorpus, docs_to_json, align
from spacy.lang.en import English from spacy.lang.en import English
from spacy.syntax.nonproj import is_nonproj_tree from spacy.syntax.nonproj import is_nonproj_tree
from spacy.tokens import Doc from spacy.tokens import Doc
@@ -20,6 +15,30 @@ import srsly
def doc(): def doc():
text = "Sarah's sister flew to Silicon Valley via London." text = "Sarah's sister flew to Silicon Valley via London."
tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."] tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
pos = [
"PROPN",
"PART",
"NOUN",
"VERB",
"ADP",
"PROPN",
"PROPN",
"ADP",
"PROPN",
"PUNCT",
]
morphs = [
"NounType=prop|Number=sing",
"Poss=yes",
"Number=sing",
"Tense=past|VerbForm=fin",
"",
"NounType=prop|Number=sing",
"NounType=prop|Number=sing",
"",
"NounType=prop|Number=sing",
"PunctType=peri",
]
# head of '.' is intentionally nonprojective for testing # head of '.' is intentionally nonprojective for testing
heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5] heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
deps = [ deps = [
@@ -52,9 +71,11 @@ def doc():
doc = nlp(text) doc = nlp(text)
for i in range(len(tags)): for i in range(len(tags)):
doc[i].tag_ = tags[i] doc[i].tag_ = tags[i]
doc[i].pos_ = pos[i]
doc[i].morph_ = morphs[i]
doc[i].lemma_ = lemmas[i]
doc[i].dep_ = deps[i] doc[i].dep_ = deps[i]
doc[i].head = doc[heads[i]] doc[i].head = doc[heads[i]]
doc[i].lemma_ = lemmas[i]
doc.ents = spans_from_biluo_tags(doc, biluo_tags) doc.ents = spans_from_biluo_tags(doc, biluo_tags)
doc.cats = cats doc.cats = cats
doc.is_tagged = True doc.is_tagged = True
@@ -162,9 +183,11 @@ def test_roundtrip_docs_to_json(doc):
nlp = English() nlp = English()
text = doc.text text = doc.text
tags = [t.tag_ for t in doc] tags = [t.tag_ for t in doc]
pos = [t.pos_ for t in doc]
morphs = [t.morph_ for t in doc]
lemmas = [t.lemma_ for t in doc]
deps = [t.dep_ for t in doc] deps = [t.dep_ for t in doc]
heads = [t.head.i for t in doc] heads = [t.head.i for t in doc]
lemmas = [t.lemma_ for t in doc]
biluo_tags = iob_to_biluo( biluo_tags = iob_to_biluo(
[t.ent_iob_ + "-" + t.ent_type_ if t.ent_type_ else "O" for t in doc] [t.ent_iob_ + "-" + t.ent_type_ if t.ent_type_ else "O" for t in doc]
) )
@@ -182,9 +205,11 @@ def test_roundtrip_docs_to_json(doc):
assert len(doc) == goldcorpus.count_train() assert len(doc) == goldcorpus.count_train()
assert text == reloaded_example.text assert text == reloaded_example.text
assert tags == goldparse.tags assert tags == goldparse.tags
assert pos == goldparse.pos
assert morphs == goldparse.morphs
assert lemmas == goldparse.lemmas
assert deps == goldparse.labels assert deps == goldparse.labels
assert heads == goldparse.heads assert heads == goldparse.heads
assert lemmas == goldparse.lemmas
assert biluo_tags == goldparse.ner assert biluo_tags == goldparse.ner
assert "TRAVEL" in goldparse.cats assert "TRAVEL" in goldparse.cats
assert "BAKING" in goldparse.cats assert "BAKING" in goldparse.cats
@@ -203,9 +228,11 @@ def test_roundtrip_docs_to_json(doc):
assert len(doc) == goldcorpus.count_train() assert len(doc) == goldcorpus.count_train()
assert text == reloaded_example.text assert text == reloaded_example.text
assert tags == goldparse.tags assert tags == goldparse.tags
assert pos == goldparse.pos
assert morphs == goldparse.morphs
assert lemmas == goldparse.lemmas
assert deps == goldparse.labels assert deps == goldparse.labels
assert heads == goldparse.heads assert heads == goldparse.heads
assert lemmas == goldparse.lemmas
assert biluo_tags == goldparse.ner assert biluo_tags == goldparse.ner
assert "TRAVEL" in goldparse.cats assert "TRAVEL" in goldparse.cats
assert "BAKING" in goldparse.cats assert "BAKING" in goldparse.cats