Ratty implementation of morphology features

This commit is contained in:
Matthew Honnibal 2018-02-26 01:20:42 +01:00
parent 9c32388235
commit 9db60acd7c
2 changed files with 15 additions and 1 deletions

View File

@ -107,6 +107,7 @@ cdef class Morphology:
# justification is that this is where the specific word and the tag
# interact. Still, we should have a better way to enforce this rule, or
# figure out why the statistical model fails. Related to Issue #220
previous_features = self.get_features(token.morph)
if Lexeme.c_check_flag(token.lex, IS_SPACE):
tag_id = self.reverse_index[self.strings.add('_SP')]
rich_tag = self.rich_tags[tag_id]
@ -122,6 +123,8 @@ cdef class Morphology:
token.pos = analysis.tag.pos
token.tag = analysis.tag.name
token.morph = analysis.tag.morph
for feature in previous_features:
self.set_feature(&token.morph, feature, True)
cdef int assign_feature(self, uint64_t* morph, univ_morph_t flag_id, bint value) except -1:
# Deprecated
@ -146,7 +149,10 @@ cdef class Morphology:
self._morph2features[morph[0]] = new_features
def get_features(self, uint64_t morph):
# Return the feature set stored in self._morph2features for the key
# `morph`, or an empty frozenset when the key has no entry.
#
# NOTE(review): this text is a diff view with the +/- markers and the
# indentation stripped. The hunk header (-146,7 +149,10) and the commit's
# single deletion suggest the `.get(...)` line directly below is the removed
# pre-commit body and the if/else after it is its replacement — confirm
# against the actual file before relying on either form.
return self._morph2features.get(morph, frozenset())
# Explicit membership test: same lookup and same frozenset() fallback as the
# `.get(...)` form above.
if morph in self._morph2features:
return self._morph2features[morph]
else:
return frozenset()
def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
force=False):

View File

@ -484,9 +484,11 @@ class Tagger(Pipe):
new_tag_map[tag] = {POS: X}
cdef Vocab vocab = self.vocab
if new_tag_map:
morph_feats = self.vocab.morphology._morph2features
vocab.morphology = Morphology(vocab.strings, new_tag_map,
vocab.morphology.lemmatizer,
exc=vocab.morphology.exc)
vocab.morphology._morph2features = morph_feats
if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
@ -519,10 +521,12 @@ class Tagger(Pipe):
if values is None:
values = {POS: "X"}
tag_map[label] = values
morph_feats = Morphology._morph2features
self.vocab.morphology = Morphology(
self.vocab.strings, tag_map=tag_map,
lemmatizer=self.vocab.morphology.lemmatizer,
exc=self.vocab.morphology.exc)
self.vocab.morphology._morph2features = morph_feats
return 1
def use_params(self, params):
@ -554,10 +558,12 @@ class Tagger(Pipe):
def load_tag_map(b):
# Rebuild self.vocab.morphology from a msgpack-serialized tag map `b`.
# `self` is not a parameter of this helper; it comes from the enclosing scope.
#
# NOTE(review): newer msgpack releases dropped the `encoding` keyword in
# favor of `raw=False` — confirm against the pinned msgpack version.
tag_map = msgpack.loads(b, encoding='utf8')
# Hold on to the current morph -> features mapping before the Morphology
# object it lives on is replaced.
morph_feats = self.vocab.morphology._morph2features
# New Morphology built from the loaded tag map, reusing the existing
# lemmatizer and exceptions (`exc`).
self.vocab.morphology = Morphology(
self.vocab.strings, tag_map=tag_map,
lemmatizer=self.vocab.morphology.lemmatizer,
exc=self.vocab.morphology.exc)
# Re-attach the saved mapping to the freshly built Morphology.
self.vocab.morphology._morph2features = morph_feats
deserialize = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)),
@ -590,10 +596,12 @@ class Tagger(Pipe):
def load_tag_map(p):
# Rebuild self.vocab.morphology from a msgpack-serialized tag map read from
# `p`, which must provide `.open('rb')`. `self` is not a parameter of this
# helper; it comes from the enclosing scope.
with p.open('rb') as file_:
# NOTE(review): newer msgpack releases dropped the `encoding` keyword in
# favor of `raw=False` — confirm against the pinned msgpack version.
tag_map = msgpack.loads(file_.read(), encoding='utf8')
# Hold on to the current morph -> features mapping before the Morphology
# object it lives on is replaced.
morph_feats = self.vocab.morphology._morph2features
# New Morphology built from the loaded tag map, reusing the existing
# lemmatizer and exceptions (`exc`).
self.vocab.morphology = Morphology(
self.vocab.strings, tag_map=tag_map,
lemmatizer=self.vocab.morphology.lemmatizer,
exc=self.vocab.morphology.exc)
# Re-attach the saved mapping to the freshly built Morphology.
self.vocab.morphology._morph2features = morph_feats
deserialize = OrderedDict((
('cfg', lambda p: self.cfg.update(_load_cfg(p))),