Preserve morphological features when tags are reassigned or the tag map is
rebuilt.

spacy/morphology.pyx: the features recorded for token.morph are read before
the tag analysis overwrites token.morph and are re-applied afterwards with
set_feature(); get_features() now uses an explicit membership test.

spacy/pipeline.pyx: every site in Tagger that replaces vocab.morphology with a
new Morphology first saves the old instance's _morph2features and assigns it
to the replacement. The third Tagger hunk reads the table from
self.vocab.morphology, like the other three sites, rather than from the
Morphology class itself, which does not hold the per-instance table.

NOTE(review): the post-image blob id on the pipeline.pyx index line was not
recomputed after that one-token correction; regenerate the patch if the id
matters (e.g. for three-way application).

diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index a0397312c..8e83fbf70 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -107,6 +107,7 @@ cdef class Morphology:
         # justification is that this is where the specific word and the tag
         # interact. Still, we should have a better way to enforce this rule, or
         # figure out why the statistical model fails. Related to Issue #220
+        previous_features = self.get_features(token.morph)
         if Lexeme.c_check_flag(token.lex, IS_SPACE):
             tag_id = self.reverse_index[self.strings.add('_SP')]
             rich_tag = self.rich_tags[tag_id]
@@ -122,6 +123,8 @@ cdef class Morphology:
         token.pos = analysis.tag.pos
         token.tag = analysis.tag.name
         token.morph = analysis.tag.morph
+        for feature in previous_features:
+            self.set_feature(&token.morph, feature, True)
 
     cdef int assign_feature(self, uint64_t* morph, univ_morph_t flag_id, bint value) except -1:
         # Deprecated
@@ -146,7 +149,10 @@ cdef class Morphology:
         self._morph2features[morph[0]] = new_features
 
     def get_features(self, uint64_t morph):
-        return self._morph2features.get(morph, frozenset())
+        if morph in self._morph2features:
+            return self._morph2features[morph]
+        else:
+            return frozenset()
 
     def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
                          force=False):
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index 6fbf95eea..fe50b4cdd 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -484,9 +484,11 @@ class Tagger(Pipe):
                 new_tag_map[tag] = {POS: X}
         cdef Vocab vocab = self.vocab
         if new_tag_map:
+            morph_feats = self.vocab.morphology._morph2features
             vocab.morphology = Morphology(vocab.strings, new_tag_map,
                                           vocab.morphology.lemmatizer,
                                           exc=vocab.morphology.exc)
+            vocab.morphology._morph2features = morph_feats
         if self.model is True:
             self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
             self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
@@ -519,10 +521,12 @@ class Tagger(Pipe):
         if values is None:
             values = {POS: "X"}
         tag_map[label] = values
+        morph_feats = self.vocab.morphology._morph2features
         self.vocab.morphology = Morphology(
             self.vocab.strings, tag_map=tag_map,
             lemmatizer=self.vocab.morphology.lemmatizer,
             exc=self.vocab.morphology.exc)
+        self.vocab.morphology._morph2features = morph_feats
         return 1
 
     def use_params(self, params):
@@ -554,10 +558,12 @@ class Tagger(Pipe):
 
         def load_tag_map(b):
             tag_map = msgpack.loads(b, encoding='utf8')
+            morph_feats = self.vocab.morphology._morph2features
             self.vocab.morphology = Morphology(
                 self.vocab.strings, tag_map=tag_map,
                 lemmatizer=self.vocab.morphology.lemmatizer,
                 exc=self.vocab.morphology.exc)
+            self.vocab.morphology._morph2features = morph_feats
 
         deserialize = OrderedDict((
             ('vocab', lambda b: self.vocab.from_bytes(b)),
@@ -590,10 +596,12 @@ class Tagger(Pipe):
         def load_tag_map(p):
             with p.open('rb') as file_:
                 tag_map = msgpack.loads(file_.read(), encoding='utf8')
+            morph_feats = self.vocab.morphology._morph2features
             self.vocab.morphology = Morphology(
                 self.vocab.strings, tag_map=tag_map,
                 lemmatizer=self.vocab.morphology.lemmatizer,
                 exc=self.vocab.morphology.exc)
+            self.vocab.morphology._morph2features = morph_feats
 
         deserialize = OrderedDict((
             ('cfg', lambda p: self.cfg.update(_load_cfg(p))),