Mirror of https://github.com/explosion/spaCy.git

add support for MORPH in to/from_array, fix morphologizer overfitting test

parent 1a151b10d6
commit 10d396977e

@@ -56,10 +56,8 @@ cdef class Example:
         if "ORTH" not in tok_dict:
             tok_dict["ORTH"] = [tok.text for tok in predicted]
             tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
-            print("added ORTH and SPACY to the tok_dict")
         if "SPACY" not in tok_dict:
             tok_dict["SPACY"] = None
-            print("added SPACY to the tok_dict")
         return Example(
             predicted,
             annotations2doc(predicted.vocab, tok_dict, doc_dict)

@@ -82,10 +80,6 @@ cdef class Example:
         j2i_multi = alignment.j2i_multi
         gold_to_cand = alignment.gold_to_cand
         cand_to_gold = alignment.cand_to_gold
-        print("i2j_multi", i2j_multi)
-        print("j2i_multi", j2i_multi)
-        print("gold_to_cand", gold_to_cand)
-        print("cand_to_gold", cand_to_gold)

         vocab = self.reference.vocab
         gold_values = self.reference.to_array([field])

@@ -101,7 +95,6 @@ cdef class Example:
             else:
                 output[i] = gold_values[gold_i]

-        print("output before:" , output)
         if field in ["ENT_IOB"]:
             # Fix many-to-one IOB codes
             prev_j = -1

@@ -116,23 +109,16 @@ cdef class Example:
                     prev_j = -1
                 prev_value = value

-        print("output in between:" , output)
         if field in ["ENT_IOB", "ENT_TYPE"]:
             # Assign one-to-many NER tags
             for j, cand_j in enumerate(gold_to_cand):
-                print()
-                print("j", j)
-                print("cand_j", cand_j)
                 if cand_j is None:
                     if j in j2i_multi:
                         i = j2i_multi[j]
                         if output[i] is None:
                             output[i] = gold_values[j]
-
-        print("output final:" , output)
         if as_string:
             output = [vocab.strings[o] if o is not None else o for o in output]
-            print("output as string:" , output)
         return output

     def to_dict(self):

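The block kept above (once the debug prints are gone) is the one-to-many alignment fallback: gold tokens that have no single candidate token are projected through the j2i_multi map. A standalone sketch of that logic in plain Python, with names copied from the diff; the surrounding Example/Alignment plumbing is omitted, so this is an illustration rather than the library code:

    def assign_one_to_many(output, gold_values, gold_to_cand, j2i_multi):
        # gold_to_cand[j] is the candidate index aligned to gold token j, or None
        # when several gold tokens collapse onto one candidate token.
        for j, cand_j in enumerate(gold_to_cand):
            if cand_j is None:
                # Fall back to the many-to-one map and copy the gold value onto
                # the grouped candidate token, keeping the first value assigned.
                if j in j2i_multi:
                    i = j2i_multi[j]
                    if output[i] is None:
                        output[i] = gold_values[j]
        return output
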
@@ -182,6 +168,10 @@ cdef class Example:
         def __get__(self):
             return self.x.text

+    property doc:
+        def __get__(self):
+            return self.x
+
     def __str__(self):
         return str(self.to_dict())


@@ -92,8 +92,8 @@ class Morphologizer(Tagger):
         guesses = scores.argmax(axis=1)
         known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
         for eg in examples:
-            pos_tags = eg.get_aligned("POS")
-            morphs = eg.get_aligned("MORPH")
+            pos_tags = eg.get_aligned("POS", as_string=True)
+            morphs = eg.get_aligned("MORPH", as_string=True)
             for i in range(len(morphs)):
                 pos = pos_tags[i]
                 morph = morphs[i]

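With as_string=True, get_aligned returns the aligned POS and MORPH annotations as strings (resolved through vocab.strings, as in the get_aligned hunk above) rather than integer hash IDs, which is what the per-token handling in this loop works with. The gold labels in the overfitting test further down ("Feat=N|POS=NOUN", ...) suggest the coarse POS tag is folded into the feature string; a rough sketch of that composition, offered as an assumption rather than the library's exact code:

    def combine_pos_and_morph(pos, morph):
        # Merge a coarse POS tag into a FEATS-style string, yielding labels of
        # the form "Feat=N|POS=NOUN" as seen in test_overfitting_IO below.
        feats = [f for f in morph.split("|") if f] if morph else []
        if pos:
            feats.append("POS=" + pos)
        return "|".join(sorted(feats))

    assert combine_pos_and_morph("NOUN", "Feat=N") == "Feat=N|POS=NOUN"
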
@@ -1,6 +1,6 @@
 import pytest
 from spacy.tokens import Doc
-from spacy.attrs import ORTH, SHAPE, POS, DEP
+from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH

 from ..util import get_doc


@@ -44,6 +44,20 @@ def test_doc_array_tag(en_vocab):
     assert feats_array[3][1] == doc[3].pos


+def test_doc_array_morph(en_vocab):
+    words = ["Eat", "blue", "ham"]
+    morph = ["Feat=V", "Feat=J", "Feat=N"]
+    doc = get_doc(en_vocab, words=words, morphs=morph)
+    assert morph[0] == doc[0].morph_
+    assert morph[1] == doc[1].morph_
+    assert morph[2] == doc[2].morph_
+
+    feats_array = doc.to_array((ORTH, MORPH))
+    assert feats_array[0][1] == doc[0].morph.key
+    assert feats_array[1][1] == doc[1].morph.key
+    assert feats_array[2][1] == doc[2].morph.key
+
+
 def test_doc_array_dep(en_vocab):
     words = ["A", "nice", "sentence", "."]
     deps = ["det", "amod", "ROOT", "punct"]

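The new test exercises the to_array direction; the commit title also mentions from_array. A hedged sketch of a round-trip check that the MORPH support should make possible, written against the same fixtures and get_doc helper as the test file above (this test is not part of the diff):

    from spacy.attrs import ORTH, MORPH
    from spacy.tokens import Doc


    def test_doc_array_morph_roundtrip(en_vocab):
        words = ["Eat", "blue", "ham"]
        morph = ["Feat=V", "Feat=J", "Feat=N"]
        doc = get_doc(en_vocab, words=words, morphs=morph)
        feats_array = doc.to_array((ORTH, MORPH))
        # Rebuild a bare Doc and restore the morphological analyses from the array.
        new_doc = Doc(en_vocab, words=words)
        new_doc.from_array((ORTH, MORPH), feats_array)
        assert [t.morph_ for t in new_doc] == morph
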
@@ -39,7 +39,7 @@ def test_overfitting_IO():
     test_text = "I like blue eggs"
     doc = nlp(test_text)
     gold_morphs = ["Feat=N|POS=NOUN", "Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"]
-    assert gold_morphs == [t.morph_ for t in doc]
+    assert [t.morph_ for t in doc] == gold_morphs

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:

@@ -7,7 +7,7 @@ from pathlib import Path

 from spacy import Errors
 from spacy.tokens import Doc, Span
-from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA
+from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA, MORPH

 from spacy.vocab import Vocab


@@ -27,15 +27,15 @@ def make_tempdir():


 def get_doc(
-    vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None
+    vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None, morphs=None
 ):
     """Create Doc object from given vocab, words and annotations."""
     if deps and not heads:
         heads = [0] * len(deps)
     headings = []
     values = []
-    annotations = [pos, heads, deps, lemmas, tags]
-    possible_headings = [POS, HEAD, DEP, LEMMA, TAG]
+    annotations = [pos, heads, deps, lemmas, tags, morphs]
+    possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH]
     for a, annot in enumerate(annotations):
         if annot is not None:
             if len(annot) != len(words):

@@ -61,6 +61,13 @@ def get_doc(
                     attrs[i] = heads[i]
                 else:
                     attrs[i, j] = heads[i]
+        elif annot is morphs:
+            for i in range(len(words)):
+                morph_key = vocab.morphology.add(morphs[i])
+                if attrs.ndim == 1:
+                    attrs[i] = morph_key
+                else:
+                    attrs[i, j] = morph_key
         else:
             for i in range(len(words)):
                 if attrs.ndim == 1:

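The helper relies on vocab.morphology.add interning a feature string and returning the integer key that MORPH columns carry, the same value the new test compares against Token.morph.key. A minimal sketch of that assumption (the idempotency check is an assumption, not something shown in the diff):

    from spacy.vocab import Vocab

    vocab = Vocab()
    key = vocab.morphology.add("Feat=V")
    # Assumption: re-adding the same feature string yields the same key, so a
    # value read out of a MORPH column can be written back without re-analysis.
    assert key == vocab.morphology.add("Feat=V")
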
@@ -18,7 +18,7 @@ from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
-from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
+from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
 from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t


@@ -52,6 +52,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
         return token.pos
     elif feat_name == TAG:
         return token.tag
+    elif feat_name == MORPH:
+        return token.morph
     elif feat_name == DEP:
         return token.dep
     elif feat_name == HEAD:

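Doc.to_array presumably reads each column through get_token_attr, so with the new branch a MORPH column carries the raw token.morph hash, the value Token.morph.key exposes, while Token.morph_ resolves it back to the readable feature string (as the new test asserts). A small illustrative helper, not part of the diff:

    from spacy.attrs import MORPH


    def show_morph_column(doc):
        # One value per token: the interned key for its morphological analysis.
        col = doc.to_array([MORPH])
        for token, value in zip(doc, col):
            # value == token.morph.key; token.morph_ is the readable FEATS string
            print(token.text, int(value), token.morph_)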