From 10d396977e345e6372b151e672ac8fd0f91c8dbd Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 17 Jun 2020 17:48:07 +0200 Subject: [PATCH] add support for MORPH in to/from_array, fix morphologizer overfitting test --- spacy/gold/example.pyx | 18 ++++-------------- spacy/pipeline/morphologizer.pyx | 4 ++-- spacy/tests/doc/test_array.py | 16 +++++++++++++++- spacy/tests/pipeline/test_morphologizer.py | 2 +- spacy/tests/util.py | 15 +++++++++++---- spacy/tokens/doc.pyx | 4 +++- 6 files changed, 36 insertions(+), 23 deletions(-) diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 92b9beb0f..90b6dc85a 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -56,10 +56,8 @@ cdef class Example: if "ORTH" not in tok_dict: tok_dict["ORTH"] = [tok.text for tok in predicted] tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted] - print("added ORTH and SPACY to the tok_dict") if "SPACY" not in tok_dict: tok_dict["SPACY"] = None - print("added SPACY to the tok_dict") return Example( predicted, annotations2doc(predicted.vocab, tok_dict, doc_dict) @@ -82,10 +80,6 @@ cdef class Example: j2i_multi = alignment.j2i_multi gold_to_cand = alignment.gold_to_cand cand_to_gold = alignment.cand_to_gold - print("i2j_multi", i2j_multi) - print("j2i_multi", j2i_multi) - print("gold_to_cand", gold_to_cand) - print("cand_to_gold", cand_to_gold) vocab = self.reference.vocab gold_values = self.reference.to_array([field]) @@ -101,7 +95,6 @@ cdef class Example: else: output[i] = gold_values[gold_i] - print("output before:" , output) if field in ["ENT_IOB"]: # Fix many-to-one IOB codes prev_j = -1 @@ -116,23 +109,16 @@ cdef class Example: prev_j = -1 prev_value = value - print("output in between:" , output) if field in ["ENT_IOB", "ENT_TYPE"]: # Assign one-to-many NER tags for j, cand_j in enumerate(gold_to_cand): - print() - print("j", j) - print("cand_j", cand_j) if cand_j is None: if j in j2i_multi: i = j2i_multi[j] if output[i] is None: output[i] = gold_values[j] - - print("output final:" , output) if as_string: output = [vocab.strings[o] if o is not None else o for o in output] - print("output as string:" , output) return output def to_dict(self): @@ -182,6 +168,10 @@ cdef class Example: def __get__(self): return self.x.text + property doc: + def __get__(self): + return self.x + def __str__(self): return str(self.to_dict()) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index b693e4fd6..cc43506c9 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -92,8 +92,8 @@ class Morphologizer(Tagger): guesses = scores.argmax(axis=1) known_labels = numpy.ones((scores.shape[0], 1), dtype="f") for eg in examples: - pos_tags = eg.get_aligned("POS") - morphs = eg.get_aligned("MORPH") + pos_tags = eg.get_aligned("POS", as_string=True) + morphs = eg.get_aligned("MORPH", as_string=True) for i in range(len(morphs)): pos = pos_tags[i] morph = morphs[i] diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index f44ae1421..e721b3f09 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -1,6 +1,6 @@ import pytest from spacy.tokens import Doc -from spacy.attrs import ORTH, SHAPE, POS, DEP +from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH from ..util import get_doc @@ -44,6 +44,20 @@ def test_doc_array_tag(en_vocab): assert feats_array[3][1] == doc[3].pos +def test_doc_array_morph(en_vocab): + words = ["Eat", "blue", "ham"] + morph = ["Feat=V", "Feat=J", "Feat=N"] + doc = get_doc(en_vocab, words=words, morphs=morph) + assert morph[0] == doc[0].morph_ + assert morph[1] == doc[1].morph_ + assert morph[2] == doc[2].morph_ + + feats_array = doc.to_array((ORTH, MORPH)) + assert feats_array[0][1] == doc[0].morph.key + assert feats_array[1][1] == doc[1].morph.key + assert feats_array[2][1] == doc[2].morph.key + + def test_doc_array_dep(en_vocab): words = ["A", "nice", "sentence", "."] deps = ["det", "amod", "ROOT", "punct"] diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index f9307afc2..647e1a429 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -39,7 +39,7 @@ def test_overfitting_IO(): test_text = "I like blue eggs" doc = nlp(test_text) gold_morphs = ["Feat=N|POS=NOUN", "Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"] - assert gold_morphs == [t.morph_ for t in doc] + assert [t.morph_ for t in doc] == gold_morphs # Also test the results are still the same after IO with make_tempdir() as tmp_dir: diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 3d0a023c9..a5d1737f1 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -7,7 +7,7 @@ from pathlib import Path from spacy import Errors from spacy.tokens import Doc, Span -from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA +from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA, MORPH from spacy.vocab import Vocab @@ -27,15 +27,15 @@ def make_tempdir(): def get_doc( - vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None + vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None, morphs=None ): """Create Doc object from given vocab, words and annotations.""" if deps and not heads: heads = [0] * len(deps) headings = [] values = [] - annotations = [pos, heads, deps, lemmas, tags] - possible_headings = [POS, HEAD, DEP, LEMMA, TAG] + annotations = [pos, heads, deps, lemmas, tags, morphs] + possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH] for a, annot in enumerate(annotations): if annot is not None: if len(annot) != len(words): @@ -61,6 +61,13 @@ def get_doc( attrs[i] = heads[i] else: attrs[i, j] = heads[i] + elif annot is morphs: + for i in range(len(words)): + morph_key = vocab.morphology.add(morphs[i]) + if attrs.ndim == 1: + attrs[i] = morph_key + else: + attrs[i, j] = morph_key else: for i in range(len(words)): if attrs.ndim == 1: diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 601d4f4a7..f9e7c97dd 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -18,7 +18,7 @@ from .token cimport Token from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER -from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB +from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t @@ -52,6 +52,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: return token.pos elif feat_name == TAG: return token.tag + elif feat_name == MORPH: + return token.morph elif feat_name == DEP: return token.dep elif feat_name == HEAD: