add support for MORPH in to/from_array, fix morphologizer overfitting test

svlandeg 2020-06-17 17:48:07 +02:00
parent 1a151b10d6
commit 10d396977e
6 changed files with 36 additions and 23 deletions


@@ -56,10 +56,8 @@ cdef class Example:
         if "ORTH" not in tok_dict:
             tok_dict["ORTH"] = [tok.text for tok in predicted]
             tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
-            print("added ORTH and SPACY to the tok_dict")
         if "SPACY" not in tok_dict:
             tok_dict["SPACY"] = None
-            print("added SPACY to the tok_dict")
         return Example(
             predicted,
             annotations2doc(predicted.vocab, tok_dict, doc_dict)
@@ -82,10 +80,6 @@ cdef class Example:
         j2i_multi = alignment.j2i_multi
         gold_to_cand = alignment.gold_to_cand
         cand_to_gold = alignment.cand_to_gold
-        print("i2j_multi", i2j_multi)
-        print("j2i_multi", j2i_multi)
-        print("gold_to_cand", gold_to_cand)
-        print("cand_to_gold", cand_to_gold)
 
         vocab = self.reference.vocab
         gold_values = self.reference.to_array([field])
@@ -101,7 +95,6 @@ cdef class Example:
             else:
                 output[i] = gold_values[gold_i]
-        print("output before:" , output)
         if field in ["ENT_IOB"]:
             # Fix many-to-one IOB codes
             prev_j = -1
@@ -116,23 +109,16 @@ cdef class Example:
                     prev_j = -1
                 prev_value = value
-        print("output in between:" , output)
         if field in ["ENT_IOB", "ENT_TYPE"]:
             # Assign one-to-many NER tags
             for j, cand_j in enumerate(gold_to_cand):
-                print()
-                print("j", j)
-                print("cand_j", cand_j)
                 if cand_j is None:
                     if j in j2i_multi:
                         i = j2i_multi[j]
                         if output[i] is None:
                             output[i] = gold_values[j]
-        print("output final:" , output)
         if as_string:
             output = [vocab.strings[o] if o is not None else o for o in output]
-        print("output as string:" , output)
         return output
 
     def to_dict(self):
@@ -182,6 +168,10 @@ cdef class Example:
         def __get__(self):
             return self.x.text
 
+    property doc:
+        def __get__(self):
+            return self.x
+
     def __str__(self):
         return str(self.to_dict())
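
For reference, a minimal sketch of how the cleaned-up `get_aligned` is used downstream. This is not part of the commit; it assumes `Example` is importable from `spacy.gold` on this branch and that `Example.from_dict` accepts the `"morphs"` annotation key the way the training tests do:

    import spacy
    from spacy.gold import Example  # assumed import path on this branch

    nlp = spacy.blank("en")
    predicted = nlp("Eat blue ham")
    # reference annotations supplied as a dict, as in the training tests
    eg = Example.from_dict(predicted, {"morphs": ["Feat=V", "Feat=J", "Feat=N"]})

    # get_aligned returns hash IDs by default; as_string=True resolves them
    # through vocab.strings, which is what the morphologizer now requests
    morph_ids = eg.get_aligned("MORPH")
    morph_strs = eg.get_aligned("MORPH", as_string=True)  # e.g. ["Feat=V", "Feat=J", "Feat=N"]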


@@ -92,8 +92,8 @@ class Morphologizer(Tagger):
         guesses = scores.argmax(axis=1)
         known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
         for eg in examples:
-            pos_tags = eg.get_aligned("POS")
-            morphs = eg.get_aligned("MORPH")
+            pos_tags = eg.get_aligned("POS", as_string=True)
+            morphs = eg.get_aligned("MORPH", as_string=True)
             for i in range(len(morphs)):
                 pos = pos_tags[i]
                 morph = morphs[i]


@@ -1,6 +1,6 @@
 import pytest
 from spacy.tokens import Doc
-from spacy.attrs import ORTH, SHAPE, POS, DEP
+from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH
 from ..util import get_doc
@@ -44,6 +44,20 @@ def test_doc_array_tag(en_vocab):
     assert feats_array[3][1] == doc[3].pos
 
 
+def test_doc_array_morph(en_vocab):
+    words = ["Eat", "blue", "ham"]
+    morph = ["Feat=V", "Feat=J", "Feat=N"]
+    doc = get_doc(en_vocab, words=words, morphs=morph)
+    assert morph[0] == doc[0].morph_
+    assert morph[1] == doc[1].morph_
+    assert morph[2] == doc[2].morph_
+
+    feats_array = doc.to_array((ORTH, MORPH))
+    assert feats_array[0][1] == doc[0].morph.key
+    assert feats_array[1][1] == doc[1].morph.key
+    assert feats_array[2][1] == doc[2].morph.key
+
+
 def test_doc_array_dep(en_vocab):
     words = ["A", "nice", "sentence", "."]
     deps = ["det", "amod", "ROOT", "punct"]
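
The new test only exercises the `to_array` direction; the `from_array` half of the commit title could be checked with a round-trip along these lines. This is a hypothetical follow-up sketch, not part of the diff, reusing the file's existing `get_doc` helper and `en_vocab` fixture:

    from spacy.attrs import MORPH
    from spacy.tokens import Doc

    def test_doc_array_morph_roundtrip(en_vocab):
        # hypothetical follow-up: copy the MORPH column into a fresh Doc
        doc = get_doc(en_vocab, words=["Eat", "blue", "ham"], morphs=["Feat=V", "Feat=J", "Feat=N"])
        morph_array = doc.to_array([MORPH])
        new_doc = Doc(en_vocab, words=[t.text for t in doc])
        new_doc.from_array([MORPH], morph_array)
        assert [t.morph_ for t in new_doc] == ["Feat=V", "Feat=J", "Feat=N"]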


@@ -39,7 +39,7 @@ def test_overfitting_IO():
     test_text = "I like blue eggs"
     doc = nlp(test_text)
     gold_morphs = ["Feat=N|POS=NOUN", "Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"]
-    assert gold_morphs == [t.morph_ for t in doc]
+    assert [t.morph_ for t in doc] == gold_morphs
 
     # Also test the results are still the same after IO
     with make_tempdir() as tmp_dir:


@@ -7,7 +7,7 @@ from pathlib import Path
 from spacy import Errors
 from spacy.tokens import Doc, Span
-from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA
+from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA, MORPH
 from spacy.vocab import Vocab
@@ -27,15 +27,15 @@ def make_tempdir():
 def get_doc(
-    vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None
+    vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None, morphs=None
 ):
     """Create Doc object from given vocab, words and annotations."""
     if deps and not heads:
         heads = [0] * len(deps)
     headings = []
     values = []
-    annotations = [pos, heads, deps, lemmas, tags]
-    possible_headings = [POS, HEAD, DEP, LEMMA, TAG]
+    annotations = [pos, heads, deps, lemmas, tags, morphs]
+    possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH]
     for a, annot in enumerate(annotations):
         if annot is not None:
             if len(annot) != len(words):
@@ -61,6 +61,13 @@ def get_doc(
                         attrs[i] = heads[i]
                     else:
                         attrs[i, j] = heads[i]
+                elif annot is morphs:
+                    for i in range(len(words)):
+                        morph_key = vocab.morphology.add(morphs[i])
+                        if attrs.ndim == 1:
+                            attrs[i] = morph_key
+                        else:
+                            attrs[i, j] = morph_key
                 else:
                     for i in range(len(words)):
                         if attrs.ndim == 1:
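
In effect the helper converts each feature string to a hash with `vocab.morphology.add` and presumably writes the filled column back with `Doc.from_array`, which is roughly the following. This is a standalone sketch of the assumed flow, not code from the diff, and it assumes `from_array` accepts a one-dimensional column for a single attribute:

    import numpy
    from spacy.attrs import MORPH
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()
    doc = Doc(vocab, words=["ham"])
    # the hash returned here is the same value exposed as Token.morph.key
    morph_key = vocab.morphology.add("Feat=N")
    # writing the column depends on the new MORPH handling in from_array
    doc.from_array([MORPH], numpy.asarray([morph_key], dtype="uint64"))
    assert doc[0].morph_ == "Feat=N"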


@@ -18,7 +18,7 @@ from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
-from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
+from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
 from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
@@ -52,6 +52,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
         return token.pos
     elif feat_name == TAG:
         return token.tag
+    elif feat_name == MORPH:
+        return token.morph
     elif feat_name == DEP:
         return token.dep
     elif feat_name == HEAD:
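
`Doc.to_array` reads each cell through `get_token_attr`, so with this branch the MORPH column carries the same hashes as `Token.morph.key`. A quick sanity-check sketch, not part of the diff:

    import numpy
    from spacy.attrs import ORTH, MORPH
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()
    doc = Doc(vocab, words=["Eat", "blue", "ham"])
    keys = [vocab.morphology.add(m) for m in ["Feat=V", "Feat=J", "Feat=N"]]
    doc.from_array([MORPH], numpy.asarray(keys, dtype="uint64"))
    # each MORPH cell produced by to_array should match token.morph.key
    arr = doc.to_array([ORTH, MORPH])
    assert all(arr[i][1] == doc[i].morph.key for i in range(len(doc)))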