mirror of
https://github.com/explosion/spaCy.git
synced 2025-03-12 15:25:47 +03:00
add support for MORPH in to/from_array, fix morphologizer overfitting test
This commit is contained in:
parent
1a151b10d6
commit
10d396977e
|
@ -56,10 +56,8 @@ cdef class Example:
|
||||||
if "ORTH" not in tok_dict:
|
if "ORTH" not in tok_dict:
|
||||||
tok_dict["ORTH"] = [tok.text for tok in predicted]
|
tok_dict["ORTH"] = [tok.text for tok in predicted]
|
||||||
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
|
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
|
||||||
print("added ORTH and SPACY to the tok_dict")
|
|
||||||
if "SPACY" not in tok_dict:
|
if "SPACY" not in tok_dict:
|
||||||
tok_dict["SPACY"] = None
|
tok_dict["SPACY"] = None
|
||||||
print("added SPACY to the tok_dict")
|
|
||||||
return Example(
|
return Example(
|
||||||
predicted,
|
predicted,
|
||||||
annotations2doc(predicted.vocab, tok_dict, doc_dict)
|
annotations2doc(predicted.vocab, tok_dict, doc_dict)
|
||||||
|
@ -82,10 +80,6 @@ cdef class Example:
|
||||||
j2i_multi = alignment.j2i_multi
|
j2i_multi = alignment.j2i_multi
|
||||||
gold_to_cand = alignment.gold_to_cand
|
gold_to_cand = alignment.gold_to_cand
|
||||||
cand_to_gold = alignment.cand_to_gold
|
cand_to_gold = alignment.cand_to_gold
|
||||||
print("i2j_multi", i2j_multi)
|
|
||||||
print("j2i_multi", j2i_multi)
|
|
||||||
print("gold_to_cand", gold_to_cand)
|
|
||||||
print("cand_to_gold", cand_to_gold)
|
|
||||||
|
|
||||||
vocab = self.reference.vocab
|
vocab = self.reference.vocab
|
||||||
gold_values = self.reference.to_array([field])
|
gold_values = self.reference.to_array([field])
|
||||||
|
@ -101,7 +95,6 @@ cdef class Example:
|
||||||
else:
|
else:
|
||||||
output[i] = gold_values[gold_i]
|
output[i] = gold_values[gold_i]
|
||||||
|
|
||||||
print("output before:" , output)
|
|
||||||
if field in ["ENT_IOB"]:
|
if field in ["ENT_IOB"]:
|
||||||
# Fix many-to-one IOB codes
|
# Fix many-to-one IOB codes
|
||||||
prev_j = -1
|
prev_j = -1
|
||||||
|
@ -116,23 +109,16 @@ cdef class Example:
|
||||||
prev_j = -1
|
prev_j = -1
|
||||||
prev_value = value
|
prev_value = value
|
||||||
|
|
||||||
print("output in between:" , output)
|
|
||||||
if field in ["ENT_IOB", "ENT_TYPE"]:
|
if field in ["ENT_IOB", "ENT_TYPE"]:
|
||||||
# Assign one-to-many NER tags
|
# Assign one-to-many NER tags
|
||||||
for j, cand_j in enumerate(gold_to_cand):
|
for j, cand_j in enumerate(gold_to_cand):
|
||||||
print()
|
|
||||||
print("j", j)
|
|
||||||
print("cand_j", cand_j)
|
|
||||||
if cand_j is None:
|
if cand_j is None:
|
||||||
if j in j2i_multi:
|
if j in j2i_multi:
|
||||||
i = j2i_multi[j]
|
i = j2i_multi[j]
|
||||||
if output[i] is None:
|
if output[i] is None:
|
||||||
output[i] = gold_values[j]
|
output[i] = gold_values[j]
|
||||||
|
|
||||||
print("output final:" , output)
|
|
||||||
if as_string:
|
if as_string:
|
||||||
output = [vocab.strings[o] if o is not None else o for o in output]
|
output = [vocab.strings[o] if o is not None else o for o in output]
|
||||||
print("output as string:" , output)
|
|
||||||
return output
|
return output
|
||||||
|
|
||||||
def to_dict(self):
|
def to_dict(self):
|
||||||
|
@ -182,6 +168,10 @@ cdef class Example:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.x.text
|
return self.x.text
|
||||||
|
|
||||||
|
property doc:
|
||||||
|
def __get__(self):
|
||||||
|
return self.x
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return str(self.to_dict())
|
return str(self.to_dict())
|
||||||
|
|
||||||
|
|
|
@ -92,8 +92,8 @@ class Morphologizer(Tagger):
|
||||||
guesses = scores.argmax(axis=1)
|
guesses = scores.argmax(axis=1)
|
||||||
known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
|
known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
|
||||||
for eg in examples:
|
for eg in examples:
|
||||||
pos_tags = eg.get_aligned("POS")
|
pos_tags = eg.get_aligned("POS", as_string=True)
|
||||||
morphs = eg.get_aligned("MORPH")
|
morphs = eg.get_aligned("MORPH", as_string=True)
|
||||||
for i in range(len(morphs)):
|
for i in range(len(morphs)):
|
||||||
pos = pos_tags[i]
|
pos = pos_tags[i]
|
||||||
morph = morphs[i]
|
morph = morphs[i]
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import pytest
|
import pytest
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.attrs import ORTH, SHAPE, POS, DEP
|
from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH
|
||||||
|
|
||||||
from ..util import get_doc
|
from ..util import get_doc
|
||||||
|
|
||||||
|
@ -44,6 +44,20 @@ def test_doc_array_tag(en_vocab):
|
||||||
assert feats_array[3][1] == doc[3].pos
|
assert feats_array[3][1] == doc[3].pos
|
||||||
|
|
||||||
|
|
||||||
|
def test_doc_array_morph(en_vocab):
|
||||||
|
words = ["Eat", "blue", "ham"]
|
||||||
|
morph = ["Feat=V", "Feat=J", "Feat=N"]
|
||||||
|
doc = get_doc(en_vocab, words=words, morphs=morph)
|
||||||
|
assert morph[0] == doc[0].morph_
|
||||||
|
assert morph[1] == doc[1].morph_
|
||||||
|
assert morph[2] == doc[2].morph_
|
||||||
|
|
||||||
|
feats_array = doc.to_array((ORTH, MORPH))
|
||||||
|
assert feats_array[0][1] == doc[0].morph.key
|
||||||
|
assert feats_array[1][1] == doc[1].morph.key
|
||||||
|
assert feats_array[2][1] == doc[2].morph.key
|
||||||
|
|
||||||
|
|
||||||
def test_doc_array_dep(en_vocab):
|
def test_doc_array_dep(en_vocab):
|
||||||
words = ["A", "nice", "sentence", "."]
|
words = ["A", "nice", "sentence", "."]
|
||||||
deps = ["det", "amod", "ROOT", "punct"]
|
deps = ["det", "amod", "ROOT", "punct"]
|
||||||
|
|
|
@ -39,7 +39,7 @@ def test_overfitting_IO():
|
||||||
test_text = "I like blue eggs"
|
test_text = "I like blue eggs"
|
||||||
doc = nlp(test_text)
|
doc = nlp(test_text)
|
||||||
gold_morphs = ["Feat=N|POS=NOUN", "Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"]
|
gold_morphs = ["Feat=N|POS=NOUN", "Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"]
|
||||||
assert gold_morphs == [t.morph_ for t in doc]
|
assert [t.morph_ for t in doc] == gold_morphs
|
||||||
|
|
||||||
# Also test the results are still the same after IO
|
# Also test the results are still the same after IO
|
||||||
with make_tempdir() as tmp_dir:
|
with make_tempdir() as tmp_dir:
|
||||||
|
|
|
@ -7,7 +7,7 @@ from pathlib import Path
|
||||||
|
|
||||||
from spacy import Errors
|
from spacy import Errors
|
||||||
from spacy.tokens import Doc, Span
|
from spacy.tokens import Doc, Span
|
||||||
from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA
|
from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA, MORPH
|
||||||
|
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
|
|
||||||
|
@ -27,15 +27,15 @@ def make_tempdir():
|
||||||
|
|
||||||
|
|
||||||
def get_doc(
|
def get_doc(
|
||||||
vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None
|
vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None, morphs=None
|
||||||
):
|
):
|
||||||
"""Create Doc object from given vocab, words and annotations."""
|
"""Create Doc object from given vocab, words and annotations."""
|
||||||
if deps and not heads:
|
if deps and not heads:
|
||||||
heads = [0] * len(deps)
|
heads = [0] * len(deps)
|
||||||
headings = []
|
headings = []
|
||||||
values = []
|
values = []
|
||||||
annotations = [pos, heads, deps, lemmas, tags]
|
annotations = [pos, heads, deps, lemmas, tags, morphs]
|
||||||
possible_headings = [POS, HEAD, DEP, LEMMA, TAG]
|
possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH]
|
||||||
for a, annot in enumerate(annotations):
|
for a, annot in enumerate(annotations):
|
||||||
if annot is not None:
|
if annot is not None:
|
||||||
if len(annot) != len(words):
|
if len(annot) != len(words):
|
||||||
|
@ -61,6 +61,13 @@ def get_doc(
|
||||||
attrs[i] = heads[i]
|
attrs[i] = heads[i]
|
||||||
else:
|
else:
|
||||||
attrs[i, j] = heads[i]
|
attrs[i, j] = heads[i]
|
||||||
|
elif annot is morphs:
|
||||||
|
for i in range(len(words)):
|
||||||
|
morph_key = vocab.morphology.add(morphs[i])
|
||||||
|
if attrs.ndim == 1:
|
||||||
|
attrs[i] = morph_key
|
||||||
|
else:
|
||||||
|
attrs[i, j] = morph_key
|
||||||
else:
|
else:
|
||||||
for i in range(len(words)):
|
for i in range(len(words)):
|
||||||
if attrs.ndim == 1:
|
if attrs.ndim == 1:
|
||||||
|
|
|
@ -18,7 +18,7 @@ from .token cimport Token
|
||||||
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
||||||
from ..typedefs cimport attr_t, flags_t
|
from ..typedefs cimport attr_t, flags_t
|
||||||
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
|
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
|
||||||
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
|
from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
|
||||||
from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t
|
from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t
|
||||||
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
|
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
|
||||||
|
|
||||||
|
@ -52,6 +52,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
|
||||||
return token.pos
|
return token.pos
|
||||||
elif feat_name == TAG:
|
elif feat_name == TAG:
|
||||||
return token.tag
|
return token.tag
|
||||||
|
elif feat_name == MORPH:
|
||||||
|
return token.morph
|
||||||
elif feat_name == DEP:
|
elif feat_name == DEP:
|
||||||
return token.dep
|
return token.dep
|
||||||
elif feat_name == HEAD:
|
elif feat_name == HEAD:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user