Ratty implementation of morphology features

2025-10-03 02:17:00 +03:00 · 2018-02-26 01:20:42 +01:00 · 2018-02-26 01:20:42 +01:00 · 9db60acd7c
commit 9db60acd7c
parent 9c32388235
2 changed files with 15 additions and 1 deletions
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -107,6 +107,7 @@ cdef class Morphology:
        # justification is that this is where the specific word and the tag
        # interact. Still, we should have a better way to enforce this rule, or
        # figure out why the statistical model fails. Related to Issue #220
        previous_features = self.get_features(token.morph)
        if Lexeme.c_check_flag(token.lex, IS_SPACE):
            tag_id = self.reverse_index[self.strings.add('_SP')]
        rich_tag = self.rich_tags[tag_id]
@@ -122,6 +123,8 @@ cdef class Morphology:
        token.pos = analysis.tag.pos
        token.tag = analysis.tag.name
        token.morph = analysis.tag.morph
        for feature in previous_features:
            self.set_feature(&token.morph, feature, True)
    cdef int assign_feature(self, uint64_t* morph, univ_morph_t flag_id, bint value) except -1:
        # Deprecated
@@ -146,7 +149,10 @@ cdef class Morphology:
            self._morph2features[morph[0]] = new_features
    def get_features(self, uint64_t morph):
        # Look up the features stored for `morph` in `_morph2features`;
        # a key with no entry yields an empty frozenset.
-        return self._morph2features.get(morph, frozenset())
+        if morph in self._morph2features:
            return self._morph2features[morph]
        else:
            return frozenset()
    def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
                         force=False):
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -484,9 +484,11 @@ class Tagger(Pipe):
                        new_tag_map[tag] = {POS: X}
        cdef Vocab vocab = self.vocab
        if new_tag_map:
            morph_feats = self.vocab.morphology._morph2features
            vocab.morphology = Morphology(vocab.strings, new_tag_map,
                                          vocab.morphology.lemmatizer,
                                          exc=vocab.morphology.exc)
            vocab.morphology._morph2features = morph_feats
        if self.model is True:
            self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
            self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
@@ -519,10 +521,12 @@ class Tagger(Pipe):
        if values is None:
            # No attributes supplied for the new label: fall back to POS "X".
            values = {POS: "X"}
        tag_map[label] = values
        # Take the feature table from the live morphology instance, not from
        # the Morphology class: a class-level lookup does not return the
        # instance's table, so the value restored below would be wrong.
        morph_feats = self.vocab.morphology._morph2features
        # Rebuild the morphology around the extended tag map, reusing the
        # current lemmatizer and exceptions.
        self.vocab.morphology = Morphology(
            self.vocab.strings, tag_map=tag_map,
            lemmatizer=self.vocab.morphology.lemmatizer,
            exc=self.vocab.morphology.exc)
        # Restore the feature table on the freshly built instance.
        self.vocab.morphology._morph2features = morph_feats
        return 1
    def use_params(self, params):
@@ -554,10 +558,12 @@ class Tagger(Pipe):
        def load_tag_map(b):
            """Rebuild the vocab's morphology from a msgpack-encoded tag map.

            The `_morph2features` table held by the current morphology is
            copied onto the replacement instance.
            """
            decoded_tags = msgpack.loads(b, encoding='utf8')
            current = self.vocab.morphology
            carried_features = current._morph2features
            self.vocab.morphology = Morphology(
                self.vocab.strings,
                tag_map=decoded_tags,
                lemmatizer=current.lemmatizer,
                exc=current.exc)
            self.vocab.morphology._morph2features = carried_features
        deserialize = OrderedDict((
            ('vocab', lambda b: self.vocab.from_bytes(b)),
@@ -590,10 +596,12 @@ class Tagger(Pipe):
        def load_tag_map(p):
            """Rebuild the vocab's morphology from a msgpack tag map file.

            `p` is opened in binary mode and its contents are decoded with
            msgpack. The `_morph2features` table held by the current
            morphology is copied onto the replacement instance.
            """
            with p.open('rb') as file_:
                payload = file_.read()
            decoded_tags = msgpack.loads(payload, encoding='utf8')
            current = self.vocab.morphology
            carried_features = current._morph2features
            self.vocab.morphology = Morphology(
                self.vocab.strings,
                tag_map=decoded_tags,
                lemmatizer=current.lemmatizer,
                exc=current.exc)
            self.vocab.morphology._morph2features = carried_features
        deserialize = OrderedDict((
            ('cfg', lambda p: self.cfg.update(_load_cfg(p))),