Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-27 17:54:39 +03:00)
Add support for pos/morphs/lemmas in training data (#4941)
Add support for pos/morphs/lemmas throughout `GoldParse`, `Example`, and `docs_to_json()`.
This commit is contained in:
parent: adc9745718
commit: 06b251dd1e
|
@ -25,6 +25,7 @@ cdef class GoldParse:
|
||||||
cdef public int loss
|
cdef public int loss
|
||||||
cdef public list words
|
cdef public list words
|
||||||
cdef public list tags
|
cdef public list tags
|
||||||
|
cdef public list pos
|
||||||
cdef public list morphs
|
cdef public list morphs
|
||||||
cdef public list lemmas
|
cdef public list lemmas
|
||||||
cdef public list sent_starts
|
cdef public list sent_starts
|
||||||
|
@ -44,11 +45,12 @@ cdef class TokenAnnotation:
|
||||||
cdef public list ids
|
cdef public list ids
|
||||||
cdef public list words
|
cdef public list words
|
||||||
cdef public list tags
|
cdef public list tags
|
||||||
|
cdef public list pos
|
||||||
|
cdef public list morphs
|
||||||
|
cdef public list lemmas
|
||||||
cdef public list heads
|
cdef public list heads
|
||||||
cdef public list deps
|
cdef public list deps
|
||||||
cdef public list entities
|
cdef public list entities
|
||||||
cdef public list morphs
|
|
||||||
cdef public list lemmas
|
|
||||||
cdef public list sent_starts
|
cdef public list sent_starts
|
||||||
cdef public list brackets
|
cdef public list brackets
|
||||||
|
|
||||||
|
|
135
spacy/gold.pyx
135
spacy/gold.pyx
|
@ -485,11 +485,12 @@ def json_to_examples(doc):
|
||||||
words = []
|
words = []
|
||||||
ids = []
|
ids = []
|
||||||
tags = []
|
tags = []
|
||||||
|
pos = []
|
||||||
|
morphs = []
|
||||||
|
lemmas = []
|
||||||
heads = []
|
heads = []
|
||||||
labels = []
|
labels = []
|
||||||
ner = []
|
ner = []
|
||||||
morphs = []
|
|
||||||
lemmas = []
|
|
||||||
sent_starts = []
|
sent_starts = []
|
||||||
brackets = []
|
brackets = []
|
||||||
for sent in paragraph["sentences"]:
|
for sent in paragraph["sentences"]:
|
||||||
|
@ -498,14 +499,15 @@ def json_to_examples(doc):
|
||||||
words.append(token["orth"])
|
words.append(token["orth"])
|
||||||
ids.append(token.get('id', sent_start_i + i))
|
ids.append(token.get('id', sent_start_i + i))
|
||||||
tags.append(token.get('tag', "-"))
|
tags.append(token.get('tag', "-"))
|
||||||
|
pos.append(token.get("pos", ""))
|
||||||
|
morphs.append(token.get("morph", ""))
|
||||||
|
lemmas.append(token.get("lemma", ""))
|
||||||
heads.append(token.get("head", 0) + sent_start_i + i)
|
heads.append(token.get("head", 0) + sent_start_i + i)
|
||||||
labels.append(token.get("dep", ""))
|
labels.append(token.get("dep", ""))
|
||||||
# Ensure ROOT label is case-insensitive
|
# Ensure ROOT label is case-insensitive
|
||||||
if labels[-1].lower() == "root":
|
if labels[-1].lower() == "root":
|
||||||
labels[-1] = "ROOT"
|
labels[-1] = "ROOT"
|
||||||
ner.append(token.get("ner", "-"))
|
ner.append(token.get("ner", "-"))
|
||||||
morphs.append(token.get("morph", {}))
|
|
||||||
lemmas.append(token.get("lemma", ""))
|
|
||||||
if i == 0:
|
if i == 0:
|
||||||
sent_starts.append(1)
|
sent_starts.append(1)
|
||||||
else:
|
else:
|
||||||
|
@ -518,8 +520,9 @@ def json_to_examples(doc):
|
||||||
for cat in paragraph.get("cats", {}):
|
for cat in paragraph.get("cats", {}):
|
||||||
cats[cat["label"]] = cat["value"]
|
cats[cat["label"]] = cat["value"]
|
||||||
example.set_token_annotation(ids=ids, words=words, tags=tags,
|
example.set_token_annotation(ids=ids, words=words, tags=tags,
|
||||||
heads=heads, deps=labels, entities=ner, morphs=morphs,
|
pos=pos, morphs=morphs, lemmas=lemmas, heads=heads,
|
||||||
lemmas=lemmas, sent_starts=sent_starts, brackets=brackets)
|
deps=labels, entities=ner, sent_starts=sent_starts,
|
||||||
|
brackets=brackets)
|
||||||
example.set_doc_annotation(cats=cats)
|
example.set_doc_annotation(cats=cats)
|
||||||
yield example
|
yield example
|
||||||
|
|
||||||
|
@ -632,17 +635,18 @@ def _consume_ent(tags):
|
||||||
|
|
||||||
|
|
||||||
cdef class TokenAnnotation:
|
cdef class TokenAnnotation:
|
||||||
def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None,
|
def __init__(self, ids=None, words=None, tags=None, pos=None, morphs=None,
|
||||||
entities=None, morphs=None, lemmas=None, sent_starts=None,
|
lemmas=None, heads=None, deps=None, entities=None, sent_starts=None,
|
||||||
brackets=None):
|
brackets=None):
|
||||||
self.ids = ids if ids else []
|
self.ids = ids if ids else []
|
||||||
self.words = words if words else []
|
self.words = words if words else []
|
||||||
self.tags = tags if tags else []
|
self.tags = tags if tags else []
|
||||||
|
self.pos = pos if pos else []
|
||||||
|
self.morphs = morphs if morphs else []
|
||||||
|
self.lemmas = lemmas if lemmas else []
|
||||||
self.heads = heads if heads else []
|
self.heads = heads if heads else []
|
||||||
self.deps = deps if deps else []
|
self.deps = deps if deps else []
|
||||||
self.entities = entities if entities else []
|
self.entities = entities if entities else []
|
||||||
self.morphs = morphs if morphs else []
|
|
||||||
self.lemmas = lemmas if lemmas else []
|
|
||||||
self.sent_starts = sent_starts if sent_starts else []
|
self.sent_starts = sent_starts if sent_starts else []
|
||||||
self.brackets = brackets if brackets else []
|
self.brackets = brackets if brackets else []
|
||||||
|
|
||||||
|
@ -651,11 +655,12 @@ cdef class TokenAnnotation:
|
||||||
return cls(ids=token_dict.get("ids", None),
|
return cls(ids=token_dict.get("ids", None),
|
||||||
words=token_dict.get("words", None),
|
words=token_dict.get("words", None),
|
||||||
tags=token_dict.get("tags", None),
|
tags=token_dict.get("tags", None),
|
||||||
|
pos=token_dict.get("pos", None),
|
||||||
|
morphs=token_dict.get("morphs", None),
|
||||||
|
lemmas=token_dict.get("lemmas", None),
|
||||||
heads=token_dict.get("heads", None),
|
heads=token_dict.get("heads", None),
|
||||||
deps=token_dict.get("deps", None),
|
deps=token_dict.get("deps", None),
|
||||||
entities=token_dict.get("entities", None),
|
entities=token_dict.get("entities", None),
|
||||||
morphs=token_dict.get("morphs", None),
|
|
||||||
lemmas=token_dict.get("lemmas", None),
|
|
||||||
sent_starts=token_dict.get("sent_starts", None),
|
sent_starts=token_dict.get("sent_starts", None),
|
||||||
brackets=token_dict.get("brackets", None))
|
brackets=token_dict.get("brackets", None))
|
||||||
|
|
||||||
|
@ -663,11 +668,12 @@ cdef class TokenAnnotation:
|
||||||
return {"ids": self.ids,
|
return {"ids": self.ids,
|
||||||
"words": self.words,
|
"words": self.words,
|
||||||
"tags": self.tags,
|
"tags": self.tags,
|
||||||
|
"pos": self.pos,
|
||||||
|
"morphs": self.morphs,
|
||||||
|
"lemmas": self.lemmas,
|
||||||
"heads": self.heads,
|
"heads": self.heads,
|
||||||
"deps": self.deps,
|
"deps": self.deps,
|
||||||
"entities": self.entities,
|
"entities": self.entities,
|
||||||
"morphs": self.morphs,
|
|
||||||
"lemmas": self.lemmas,
|
|
||||||
"sent_starts": self.sent_starts,
|
"sent_starts": self.sent_starts,
|
||||||
"brackets": self.brackets}
|
"brackets": self.brackets}
|
||||||
|
|
||||||
|
@ -680,6 +686,15 @@ cdef class TokenAnnotation:
|
||||||
def get_tag(self, i):
|
def get_tag(self, i):
|
||||||
return self.tags[i] if i < len(self.tags) else "-"
|
return self.tags[i] if i < len(self.tags) else "-"
|
||||||
|
|
||||||
|
def get_pos(self, i):
|
||||||
|
return self.pos[i] if i < len(self.pos) else ""
|
||||||
|
|
||||||
|
def get_morph(self, i):
|
||||||
|
return self.morphs[i] if i < len(self.morphs) else ""
|
||||||
|
|
||||||
|
def get_lemma(self, i):
|
||||||
|
return self.lemmas[i] if i < len(self.lemmas) else ""
|
||||||
|
|
||||||
def get_head(self, i):
|
def get_head(self, i):
|
||||||
return self.heads[i] if i < len(self.heads) else i
|
return self.heads[i] if i < len(self.heads) else i
|
||||||
|
|
||||||
|
@ -689,12 +704,6 @@ cdef class TokenAnnotation:
|
||||||
def get_entity(self, i):
|
def get_entity(self, i):
|
||||||
return self.entities[i] if i < len(self.entities) else "-"
|
return self.entities[i] if i < len(self.entities) else "-"
|
||||||
|
|
||||||
def get_morph(self, i):
|
|
||||||
return self.morphs[i] if i < len(self.morphs) else set()
|
|
||||||
|
|
||||||
def get_lemma(self, i):
|
|
||||||
return self.lemmas[i] if i < len(self.lemmas) else ""
|
|
||||||
|
|
||||||
def get_sent_start(self, i):
|
def get_sent_start(self, i):
|
||||||
return self.sent_starts[i] if i < len(self.sent_starts) else None
|
return self.sent_starts[i] if i < len(self.sent_starts) else None
|
||||||
|
|
||||||
|
@ -756,12 +765,12 @@ cdef class Example:
|
||||||
self.goldparse = gold
|
self.goldparse = gold
|
||||||
return self.goldparse
|
return self.goldparse
|
||||||
|
|
||||||
def set_token_annotation(self, ids=None, words=None, tags=None, heads=None,
|
def set_token_annotation(self, ids=None, words=None, tags=None, pos=None,
|
||||||
deps=None, entities=None, morphs=None, lemmas=None,
|
morphs=None, lemmas=None, heads=None, deps=None,
|
||||||
sent_starts=None, brackets=None):
|
entities=None, sent_starts=None, brackets=None):
|
||||||
self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags,
|
self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags,
|
||||||
heads=heads, deps=deps, entities=entities,
|
pos=pos, morphs=morphs, lemmas=lemmas, heads=heads,
|
||||||
morphs=morphs, lemmas=lemmas,
|
deps=deps, entities=entities,
|
||||||
sent_starts=sent_starts, brackets=brackets)
|
sent_starts=sent_starts, brackets=brackets)
|
||||||
|
|
||||||
def set_doc_annotation(self, cats=None, links=None):
|
def set_doc_annotation(self, cats=None, links=None):
|
||||||
|
@ -774,8 +783,8 @@ cdef class Example:
|
||||||
""" Split the token annotations into multiple Examples based on
|
""" Split the token annotations into multiple Examples based on
|
||||||
sent_starts and return a list of the new Examples"""
|
sent_starts and return a list of the new Examples"""
|
||||||
s_example = Example(doc=None, doc_annotation=self.doc_annotation)
|
s_example = Example(doc=None, doc_annotation=self.doc_annotation)
|
||||||
s_ids, s_words, s_tags, s_heads = [], [], [], []
|
s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], []
|
||||||
s_deps, s_ents, s_morphs, s_lemmas, s_sent_starts = [], [], [], [], []
|
s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], []
|
||||||
s_brackets = []
|
s_brackets = []
|
||||||
sent_start_i = 0
|
sent_start_i = 0
|
||||||
t = self.token_annotation
|
t = self.token_annotation
|
||||||
|
@ -783,31 +792,33 @@ cdef class Example:
|
||||||
for i in range(len(t.words)):
|
for i in range(len(t.words)):
|
||||||
if i > 0 and t.sent_starts[i] == 1:
|
if i > 0 and t.sent_starts[i] == 1:
|
||||||
s_example.set_token_annotation(ids=s_ids,
|
s_example.set_token_annotation(ids=s_ids,
|
||||||
words=s_words, tags=s_tags, heads=s_heads, deps=s_deps,
|
words=s_words, tags=s_tags, pos=s_pos, morphs=s_morphs,
|
||||||
entities=s_ents, morphs=s_morphs, lemmas=s_lemmas,
|
lemmas=s_lemmas, heads=s_heads, deps=s_deps,
|
||||||
sent_starts=s_sent_starts, brackets=s_brackets)
|
entities=s_ents, sent_starts=s_sent_starts,
|
||||||
|
brackets=s_brackets)
|
||||||
split_examples.append(s_example)
|
split_examples.append(s_example)
|
||||||
s_example = Example(doc=None, doc_annotation=self.doc_annotation)
|
s_example = Example(doc=None, doc_annotation=self.doc_annotation)
|
||||||
s_ids, s_words, s_tags, s_heads = [], [], [], []
|
s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], []
|
||||||
s_deps, s_ents, s_morphs, s_lemmas = [], [], [], []
|
s_deps, s_ents, s_morphs, s_lemmas = [], [], [], []
|
||||||
s_sent_starts, s_brackets = [], []
|
s_sent_starts, s_brackets = [], []
|
||||||
sent_start_i = i
|
sent_start_i = i
|
||||||
s_ids.append(t.get_id(i))
|
s_ids.append(t.get_id(i))
|
||||||
s_words.append(t.get_word(i))
|
s_words.append(t.get_word(i))
|
||||||
s_tags.append(t.get_tag(i))
|
s_tags.append(t.get_tag(i))
|
||||||
|
s_pos.append(t.get_pos(i))
|
||||||
|
s_morphs.append(t.get_morph(i))
|
||||||
|
s_lemmas.append(t.get_lemma(i))
|
||||||
s_heads.append(t.get_head(i) - sent_start_i)
|
s_heads.append(t.get_head(i) - sent_start_i)
|
||||||
s_deps.append(t.get_dep(i))
|
s_deps.append(t.get_dep(i))
|
||||||
s_ents.append(t.get_entity(i))
|
s_ents.append(t.get_entity(i))
|
||||||
s_morphs.append(t.get_morph(i))
|
|
||||||
s_lemmas.append(t.get_lemma(i))
|
|
||||||
s_sent_starts.append(t.get_sent_start(i))
|
s_sent_starts.append(t.get_sent_start(i))
|
||||||
s_brackets.extend((b[0] - sent_start_i,
|
s_brackets.extend((b[0] - sent_start_i,
|
||||||
b[1] - sent_start_i, b[2])
|
b[1] - sent_start_i, b[2])
|
||||||
for b in t.brackets if b[0] == i)
|
for b in t.brackets if b[0] == i)
|
||||||
i += 1
|
i += 1
|
||||||
s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags,
|
s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags,
|
||||||
heads=s_heads, deps=s_deps, entities=s_ents,
|
pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads,
|
||||||
morphs=s_morphs, lemmas=s_lemmas, sent_starts=s_sent_starts,
|
deps=s_deps, entities=s_ents, sent_starts=s_sent_starts,
|
||||||
brackets=s_brackets)
|
brackets=s_brackets)
|
||||||
split_examples.append(s_example)
|
split_examples.append(s_example)
|
||||||
return split_examples
|
return split_examples
|
||||||
|
@ -911,11 +922,12 @@ cdef class GoldParse:
|
||||||
def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
|
def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
|
||||||
return cls(doc, words=token_annotation.words,
|
return cls(doc, words=token_annotation.words,
|
||||||
tags=token_annotation.tags,
|
tags=token_annotation.tags,
|
||||||
|
pos=token_annotation.pos,
|
||||||
|
morphs=token_annotation.morphs,
|
||||||
|
lemmas=token_annotation.lemmas,
|
||||||
heads=token_annotation.heads,
|
heads=token_annotation.heads,
|
||||||
deps=token_annotation.deps,
|
deps=token_annotation.deps,
|
||||||
entities=token_annotation.entities,
|
entities=token_annotation.entities,
|
||||||
morphs=token_annotation.morphs,
|
|
||||||
lemmas=token_annotation.lemmas,
|
|
||||||
sent_starts=token_annotation.sent_starts,
|
sent_starts=token_annotation.sent_starts,
|
||||||
cats=doc_annotation.cats,
|
cats=doc_annotation.cats,
|
||||||
links=doc_annotation.links,
|
links=doc_annotation.links,
|
||||||
|
@ -927,18 +939,25 @@ cdef class GoldParse:
|
||||||
ids = list(range(len(self.words)))
|
ids = list(range(len(self.words)))
|
||||||
|
|
||||||
return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
|
return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
|
||||||
heads=self.heads, deps=self.labels,
|
pos=self.pos, morphs=self.morphs,
|
||||||
entities=self.ner, morphs=self.morphs,
|
lemmas=self.lemmas, heads=self.heads,
|
||||||
sent_starts=self.sent_starts, lemmas=self.lemmas)
|
deps=self.labels, entities=self.ner,
|
||||||
|
sent_starts=self.sent_starts)
|
||||||
|
|
||||||
def __init__(self, doc, words=None, tags=None, morphs=None, lemmas=None,
|
def __init__(self, doc, words=None, tags=None, pos=None, morphs=None,
|
||||||
sent_starts=None, heads=None, deps=None, entities=None,
|
lemmas=None, heads=None, deps=None, entities=None,
|
||||||
make_projective=False, cats=None, links=None):
|
sent_starts=None, make_projective=False, cats=None,
|
||||||
|
links=None):
|
||||||
"""Create a GoldParse. The fields will not be initialized if len(doc) is zero.
|
"""Create a GoldParse. The fields will not be initialized if len(doc) is zero.
|
||||||
|
|
||||||
doc (Doc): The document the annotations refer to.
|
doc (Doc): The document the annotations refer to.
|
||||||
words (iterable): A sequence of unicode word strings.
|
words (iterable): A sequence of unicode word strings.
|
||||||
tags (iterable): A sequence of strings, representing tag annotations.
|
tags (iterable): A sequence of strings, representing tag annotations.
|
||||||
|
pos (iterable): A sequence of strings, representing UPOS annotations.
|
||||||
|
morphs (iterable): A sequence of strings, representing morph
|
||||||
|
annotations.
|
||||||
|
lemmas (iterable): A sequence of strings, representing lemma
|
||||||
|
annotations.
|
||||||
heads (iterable): A sequence of integers, representing syntactic
|
heads (iterable): A sequence of integers, representing syntactic
|
||||||
head offsets.
|
head offsets.
|
||||||
deps (iterable): A sequence of strings, representing the syntactic
|
deps (iterable): A sequence of strings, representing the syntactic
|
||||||
|
@ -978,14 +997,16 @@ cdef class GoldParse:
|
||||||
words = [token.text for token in doc]
|
words = [token.text for token in doc]
|
||||||
if not tags:
|
if not tags:
|
||||||
tags = [None for _ in words]
|
tags = [None for _ in words]
|
||||||
if not heads:
|
if not pos:
|
||||||
heads = [None for _ in words]
|
pos = [None for _ in words]
|
||||||
if not deps:
|
|
||||||
deps = [None for _ in words]
|
|
||||||
if not morphs:
|
if not morphs:
|
||||||
morphs = [None for _ in words]
|
morphs = [None for _ in words]
|
||||||
if not lemmas:
|
if not lemmas:
|
||||||
lemmas = [None for _ in words]
|
lemmas = [None for _ in words]
|
||||||
|
if not heads:
|
||||||
|
heads = [None for _ in words]
|
||||||
|
if not deps:
|
||||||
|
deps = [None for _ in words]
|
||||||
if not sent_starts:
|
if not sent_starts:
|
||||||
sent_starts = [None for _ in words]
|
sent_starts = [None for _ in words]
|
||||||
if entities is None:
|
if entities is None:
|
||||||
|
@ -1010,11 +1031,12 @@ cdef class GoldParse:
|
||||||
|
|
||||||
self.words = [None] * len(doc)
|
self.words = [None] * len(doc)
|
||||||
self.tags = [None] * len(doc)
|
self.tags = [None] * len(doc)
|
||||||
|
self.pos = [None] * len(doc)
|
||||||
|
self.morphs = [None] * len(doc)
|
||||||
|
self.lemmas = [None] * len(doc)
|
||||||
self.heads = [None] * len(doc)
|
self.heads = [None] * len(doc)
|
||||||
self.labels = [None] * len(doc)
|
self.labels = [None] * len(doc)
|
||||||
self.ner = [None] * len(doc)
|
self.ner = [None] * len(doc)
|
||||||
self.morphs = [None] * len(doc)
|
|
||||||
self.lemmas = [None] * len(doc)
|
|
||||||
self.sent_starts = [None] * len(doc)
|
self.sent_starts = [None] * len(doc)
|
||||||
|
|
||||||
# This needs to be done before we align the words
|
# This needs to be done before we align the words
|
||||||
|
@ -1034,24 +1056,26 @@ cdef class GoldParse:
|
||||||
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
|
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
|
||||||
|
|
||||||
self.orig = TokenAnnotation(ids=list(range(len(words))),
|
self.orig = TokenAnnotation(ids=list(range(len(words))),
|
||||||
words=words, tags=tags, heads=heads, deps=deps,
|
words=words, tags=tags, pos=pos, morphs=morphs,
|
||||||
entities=entities, morphs=morphs, lemmas=lemmas,
|
lemmas=lemmas, heads=heads, deps=deps, entities=entities,
|
||||||
sent_starts=sent_starts, brackets=[])
|
sent_starts=sent_starts, brackets=[])
|
||||||
|
|
||||||
for i, gold_i in enumerate(self.cand_to_gold):
|
for i, gold_i in enumerate(self.cand_to_gold):
|
||||||
if doc[i].text.isspace():
|
if doc[i].text.isspace():
|
||||||
self.words[i] = doc[i].text
|
self.words[i] = doc[i].text
|
||||||
self.tags[i] = "_SP"
|
self.tags[i] = "_SP"
|
||||||
|
self.pos[i] = "SPACE"
|
||||||
|
self.morphs[i] = None
|
||||||
|
self.lemmas[i] = None
|
||||||
self.heads[i] = None
|
self.heads[i] = None
|
||||||
self.labels[i] = None
|
self.labels[i] = None
|
||||||
self.ner[i] = None
|
self.ner[i] = None
|
||||||
self.morphs[i] = set()
|
|
||||||
self.lemmas[i] = None
|
|
||||||
self.sent_starts[i] = 0
|
self.sent_starts[i] = 0
|
||||||
if gold_i is None:
|
if gold_i is None:
|
||||||
if i in i2j_multi:
|
if i in i2j_multi:
|
||||||
self.words[i] = words[i2j_multi[i]]
|
self.words[i] = words[i2j_multi[i]]
|
||||||
self.tags[i] = tags[i2j_multi[i]]
|
self.tags[i] = tags[i2j_multi[i]]
|
||||||
|
self.pos[i] = pos[i2j_multi[i]]
|
||||||
self.morphs[i] = morphs[i2j_multi[i]]
|
self.morphs[i] = morphs[i2j_multi[i]]
|
||||||
self.lemmas[i] = lemmas[i2j_multi[i]]
|
self.lemmas[i] = lemmas[i2j_multi[i]]
|
||||||
self.sent_starts[i] = sent_starts[i2j_multi[i]]
|
self.sent_starts[i] = sent_starts[i2j_multi[i]]
|
||||||
|
@ -1093,6 +1117,7 @@ cdef class GoldParse:
|
||||||
else:
|
else:
|
||||||
self.words[i] = words[gold_i]
|
self.words[i] = words[gold_i]
|
||||||
self.tags[i] = tags[gold_i]
|
self.tags[i] = tags[gold_i]
|
||||||
|
self.pos[i] = pos[gold_i]
|
||||||
self.morphs[i] = morphs[gold_i]
|
self.morphs[i] = morphs[gold_i]
|
||||||
self.lemmas[i] = lemmas[gold_i]
|
self.lemmas[i] = lemmas[gold_i]
|
||||||
self.sent_starts[i] = sent_starts[gold_i]
|
self.sent_starts[i] = sent_starts[gold_i]
|
||||||
|
@ -1156,9 +1181,11 @@ def docs_to_json(docs, id=0, ner_missing_tag="O"):
|
||||||
json_sent = {"tokens": [], "brackets": []}
|
json_sent = {"tokens": [], "brackets": []}
|
||||||
for token in sent:
|
for token in sent:
|
||||||
json_token = {"id": token.i, "orth": token.text}
|
json_token = {"id": token.i, "orth": token.text}
|
||||||
json_token["lemma"] = token.lemma_
|
|
||||||
if doc.is_tagged:
|
if doc.is_tagged:
|
||||||
json_token["tag"] = token.tag_
|
json_token["tag"] = token.tag_
|
||||||
|
json_token["pos"] = token.pos_
|
||||||
|
json_token["morph"] = token.morph_
|
||||||
|
json_token["lemma"] = token.lemma_
|
||||||
if doc.is_parsed:
|
if doc.is_parsed:
|
||||||
json_token["head"] = token.head.i-token.i
|
json_token["head"] = token.head.i-token.i
|
||||||
json_token["dep"] = token.dep_
|
json_token["dep"] = token.dep_
|
||||||
|
|
|
@ -1,12 +1,7 @@
|
||||||
from spacy.errors import AlignmentError
|
from spacy.errors import AlignmentError
|
||||||
from spacy.gold import (
|
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
|
||||||
biluo_tags_from_offsets,
|
from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo, align
|
||||||
offsets_from_biluo_tags,
|
from spacy.gold import GoldCorpus, docs_to_json, Example, DocAnnotation
|
||||||
Example,
|
|
||||||
DocAnnotation,
|
|
||||||
)
|
|
||||||
from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo
|
|
||||||
from spacy.gold import GoldCorpus, docs_to_json, align
|
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.syntax.nonproj import is_nonproj_tree
|
from spacy.syntax.nonproj import is_nonproj_tree
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
|
@ -20,6 +15,30 @@ import srsly
|
||||||
def doc():
|
def doc():
|
||||||
text = "Sarah's sister flew to Silicon Valley via London."
|
text = "Sarah's sister flew to Silicon Valley via London."
|
||||||
tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
|
tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
|
||||||
|
pos = [
|
||||||
|
"PROPN",
|
||||||
|
"PART",
|
||||||
|
"NOUN",
|
||||||
|
"VERB",
|
||||||
|
"ADP",
|
||||||
|
"PROPN",
|
||||||
|
"PROPN",
|
||||||
|
"ADP",
|
||||||
|
"PROPN",
|
||||||
|
"PUNCT",
|
||||||
|
]
|
||||||
|
morphs = [
|
||||||
|
"NounType=prop|Number=sing",
|
||||||
|
"Poss=yes",
|
||||||
|
"Number=sing",
|
||||||
|
"Tense=past|VerbForm=fin",
|
||||||
|
"",
|
||||||
|
"NounType=prop|Number=sing",
|
||||||
|
"NounType=prop|Number=sing",
|
||||||
|
"",
|
||||||
|
"NounType=prop|Number=sing",
|
||||||
|
"PunctType=peri",
|
||||||
|
]
|
||||||
# head of '.' is intentionally nonprojective for testing
|
# head of '.' is intentionally nonprojective for testing
|
||||||
heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
|
heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
|
||||||
deps = [
|
deps = [
|
||||||
|
@ -52,9 +71,11 @@ def doc():
|
||||||
doc = nlp(text)
|
doc = nlp(text)
|
||||||
for i in range(len(tags)):
|
for i in range(len(tags)):
|
||||||
doc[i].tag_ = tags[i]
|
doc[i].tag_ = tags[i]
|
||||||
|
doc[i].pos_ = pos[i]
|
||||||
|
doc[i].morph_ = morphs[i]
|
||||||
|
doc[i].lemma_ = lemmas[i]
|
||||||
doc[i].dep_ = deps[i]
|
doc[i].dep_ = deps[i]
|
||||||
doc[i].head = doc[heads[i]]
|
doc[i].head = doc[heads[i]]
|
||||||
doc[i].lemma_ = lemmas[i]
|
|
||||||
doc.ents = spans_from_biluo_tags(doc, biluo_tags)
|
doc.ents = spans_from_biluo_tags(doc, biluo_tags)
|
||||||
doc.cats = cats
|
doc.cats = cats
|
||||||
doc.is_tagged = True
|
doc.is_tagged = True
|
||||||
|
@ -162,9 +183,11 @@ def test_roundtrip_docs_to_json(doc):
|
||||||
nlp = English()
|
nlp = English()
|
||||||
text = doc.text
|
text = doc.text
|
||||||
tags = [t.tag_ for t in doc]
|
tags = [t.tag_ for t in doc]
|
||||||
|
pos = [t.pos_ for t in doc]
|
||||||
|
morphs = [t.morph_ for t in doc]
|
||||||
|
lemmas = [t.lemma_ for t in doc]
|
||||||
deps = [t.dep_ for t in doc]
|
deps = [t.dep_ for t in doc]
|
||||||
heads = [t.head.i for t in doc]
|
heads = [t.head.i for t in doc]
|
||||||
lemmas = [t.lemma_ for t in doc]
|
|
||||||
biluo_tags = iob_to_biluo(
|
biluo_tags = iob_to_biluo(
|
||||||
[t.ent_iob_ + "-" + t.ent_type_ if t.ent_type_ else "O" for t in doc]
|
[t.ent_iob_ + "-" + t.ent_type_ if t.ent_type_ else "O" for t in doc]
|
||||||
)
|
)
|
||||||
|
@ -182,9 +205,11 @@ def test_roundtrip_docs_to_json(doc):
|
||||||
assert len(doc) == goldcorpus.count_train()
|
assert len(doc) == goldcorpus.count_train()
|
||||||
assert text == reloaded_example.text
|
assert text == reloaded_example.text
|
||||||
assert tags == goldparse.tags
|
assert tags == goldparse.tags
|
||||||
|
assert pos == goldparse.pos
|
||||||
|
assert morphs == goldparse.morphs
|
||||||
|
assert lemmas == goldparse.lemmas
|
||||||
assert deps == goldparse.labels
|
assert deps == goldparse.labels
|
||||||
assert heads == goldparse.heads
|
assert heads == goldparse.heads
|
||||||
assert lemmas == goldparse.lemmas
|
|
||||||
assert biluo_tags == goldparse.ner
|
assert biluo_tags == goldparse.ner
|
||||||
assert "TRAVEL" in goldparse.cats
|
assert "TRAVEL" in goldparse.cats
|
||||||
assert "BAKING" in goldparse.cats
|
assert "BAKING" in goldparse.cats
|
||||||
|
@ -203,9 +228,11 @@ def test_roundtrip_docs_to_json(doc):
|
||||||
assert len(doc) == goldcorpus.count_train()
|
assert len(doc) == goldcorpus.count_train()
|
||||||
assert text == reloaded_example.text
|
assert text == reloaded_example.text
|
||||||
assert tags == goldparse.tags
|
assert tags == goldparse.tags
|
||||||
|
assert pos == goldparse.pos
|
||||||
|
assert morphs == goldparse.morphs
|
||||||
|
assert lemmas == goldparse.lemmas
|
||||||
assert deps == goldparse.labels
|
assert deps == goldparse.labels
|
||||||
assert heads == goldparse.heads
|
assert heads == goldparse.heads
|
||||||
assert lemmas == goldparse.lemmas
|
|
||||||
assert biluo_tags == goldparse.ner
|
assert biluo_tags == goldparse.ner
|
||||||
assert "TRAVEL" in goldparse.cats
|
assert "TRAVEL" in goldparse.cats
|
||||||
assert "BAKING" in goldparse.cats
|
assert "BAKING" in goldparse.cats
|
||||||
|
|
Loading…
Reference in New Issue
Block a user