Make morphology use int attributes internally

The morphology class was calling the lemmatizer inconsistently,
which some string-valued attributes. This caused Issue #903.
This commit is contained in:
Matthew Honnibal 2017-03-25 21:49:10 +01:00
parent 4454c1b23f
commit 850d35dcb3

View File

@ -32,12 +32,11 @@ def _normalize_props(props):
return out
cdef class Morphology:
def __init__(self, StringStore string_store, tag_map, lemmatizer):
self.mem = Pool()
self.strings = string_store
self.tag_map = tag_map
self.tag_map = {}
self.lemmatizer = lemmatizer
self.n_tags = len(tag_map) + 1
self.tag_names = tuple(sorted(tag_map.keys()))
@ -52,6 +51,7 @@ cdef class Morphology:
self.rich_tags[i].morph = 0
self.rich_tags[i].pos = attrs[POS]
self.reverse_index[self.rich_tags[i].name] = i
self.tag_map[tag_str] = attrs
self._cache = PreshMapArray(self.n_tags)
def __reduce__(self):
@ -74,10 +74,10 @@ cdef class Morphology:
# Related to Issue #220
if Lexeme.c_check_flag(token.lex, IS_SPACE):
tag_id = self.reverse_index[self.strings['SP']]
rich_tag = self.rich_tags[tag_id]
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
if analysis is NULL:
analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
analysis.tag = self.rich_tags[tag_id]
tag_str = self.strings[self.rich_tags[tag_id].name]
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
self.tag_map.get(tag_str, {}))
@ -126,8 +126,7 @@ cdef class Morphology:
else:
self.assign_feature(&cached.tag.morph, name_id, value_id)
if cached.lemma == 0:
cached.lemma = self.lemmatize(rich_tag.pos, orth,
self.tag_map.get(tag_str, {}))
cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs)
self._cache.set(tag_id, orth, <void*>cached)
def load_morph_exceptions(self, dict exc):