Restore previous morphology stuff

This commit is contained in:
Matthew Honnibal 2018-09-25 00:35:59 +02:00
parent 3bba8e9245
commit a3d2e616d5
2 changed files with 11 additions and 9 deletions

View File

@@ -1,5 +1,5 @@
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap from preshed.maps cimport PreshMap, PreshMapArray
from libc.stdint cimport uint64_t from libc.stdint cimport uint64_t
from murmurhash cimport mrmr from murmurhash cimport mrmr
@@ -17,14 +17,17 @@ cdef class Morphology:
cdef public object lemmatizer cdef public object lemmatizer
cdef readonly object tag_map cdef readonly object tag_map
cdef readonly object tag_names
cdef readonly object reverse_index
cdef readonly object exc
cdef readonly int n_tags
cdef hash_t insert(self, RichTagC tag) except 0 cdef hash_t insert(self, RichTagC tag) except 0
cdef int assign_untagged(self, TokenC* token) except -1 cdef int assign_untagged(self, TokenC* token) except -1
cdef int assign_tag(self, TokenC* token, tag) except -1 cdef int assign_tag(self, TokenC* token, tag) except -1
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
cdef update_token_morph(self, TokenC* token, features) cdef update_morph(self, hash_t morph, features)
cdef set_token_morph(self, TokenC* token, pos, features)
cdef enum univ_morph_t: cdef enum univ_morph_t:
NIL = 0 NIL = 0

View File

@@ -125,17 +125,17 @@ cdef class Morphology:
# figure out why the statistical model fails. Related to Issue #220 # figure out why the statistical model fails. Related to Issue #220
if Lexeme.c_check_flag(token.lex, IS_SPACE): if Lexeme.c_check_flag(token.lex, IS_SPACE):
tag_id = self.reverse_index[self.strings.add('_SP')] tag_id = self.reverse_index[self.strings.add('_SP')]
tag_str = self.tag_names[tag_id]
features = dict(self.tag_map.get(tag_str, {}))
lemma = <attr_t>self._cache.get(tag_id, token.lex.orth) lemma = <attr_t>self._cache.get(tag_id, token.lex.orth)
if lemma == 0: if lemma == 0 and features:
tag_str = self.tag_names[tag_id]
features = dict(self.tag_map.get(tag_str, {}))
pos = self.strings.as_int(features.pop('POS')) pos = self.strings.as_int(features.pop('POS'))
lemma = self.lemmatize(pos, token.lex.orth, features) lemma = self.lemmatize(pos, token.lex.orth, features)
self._cache.set(tag_id, token.lex.orth, lemma) self._cache.set(tag_id, token.lex.orth, lemma)
token.lemma = lemma token.lemma = lemma
token.pos = pos token.pos = pos
token.tag = self.strings[tag_str] token.tag = self.strings[tag_str]
token.morph = self.add(attrs) token.morph = self.add(features)
cdef update_morph(self, hash_t morph, features): cdef update_morph(self, hash_t morph, features):
"""Update a morphological analysis with new feature values.""" """Update a morphological analysis with new feature values."""
@@ -175,10 +175,9 @@ cpdef intify_features(StringStore strings, features):
cdef hash_t hash_tag(RichTagC tag) nogil: cdef hash_t hash_tag(RichTagC tag) nogil:
return mrmr.hash64(&tag, sizeof(tag), 0) return mrmr.hash64(&tag, sizeof(tag), 0)
cdef RichTagC create_rich_tag(pos_, features): cdef RichTagC create_rich_tag(features):
cdef RichTagC tag cdef RichTagC tag
cdef univ_morph_t feature cdef univ_morph_t feature
tag.pos = get_int_tag(pos_)
for feature in features: for feature in features:
set_feature(&tag, feature, 1) set_feature(&tag, feature, 1)
return tag return tag