WIP on supporting morphology features

2025-12-24 10:33:17 +03:00 · 2018-09-24 23:57:41 +02:00 · 2018-09-24 23:57:41 +02:00 · 6ae645c4ef
commit 6ae645c4ef
parent ac5742223a
2 changed files with 366 additions and 117 deletions
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@ -1,48 +1,30 @@
 from cymem.cymem cimport Pool
-from preshed.maps cimport PreshMapArray
+from preshed.maps cimport PreshMap
 from libc.stdint cimport uint64_t
 from murmurhash cimport mrmr
 from .structs cimport TokenC
 from .strings cimport StringStore
-from .typedefs cimport attr_t, flags_t
+from .typedefs cimport hash_t, attr_t, flags_t
 from .parts_of_speech cimport univ_pos_t
 from . cimport symbols
 cdef struct RichTagC:
    # Per-tag record with a morphology value, a numeric id, a coarse
    # part-of-speech and the tag name (an attr_t).
    # NOTE(review): a second `cdef struct RichTagC` with a different field
    # layout appears later in this same .pxd -- confirm which one is meant
    # to survive.
    uint64_t morph
    int id
    univ_pos_t pos
    attr_t name
 cdef struct MorphAnalysisC:
    # Pairs a RichTagC with a lemma value (attr_t).
    RichTagC tag
    attr_t lemma
 cdef class Morphology:
    cdef readonly Pool mem
    cdef readonly StringStore strings
    cdef PreshMap tags # Keyed by hash, value is pointer to tag
    cdef public object lemmatizer
    cdef readonly object tag_map
    cdef public object n_tags
    cdef public object reverse_index
    cdef public object tag_names
    cdef public object exc
    cdef RichTagC* rich_tags
    cdef PreshMapArray _cache
    cdef hash_t insert(self, RichTagC tag) except 0
    cdef int assign_untagged(self, TokenC* token) except -1
    cdef int assign_tag(self, TokenC* token, tag) except -1
    cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
-
+    cdef update_token_morph(self, TokenC* token, features)
-    cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
+    cdef set_token_morph(self, TokenC* token, pos, features)
 cdef enum univ_morph_t:
    NIL = 0
@ -298,4 +280,47 @@ cdef enum univ_morph_t:
    VerbType_mod # U
    VerbType_light # U
-
+cdef struct RichTagC:
    # Morphological analysis record: the coarse part-of-speech plus one slot
    # per feature category. set_feature() in morphology.pyx writes either the
    # feature's own univ_morph_t id or NIL into the slot for its category.
    # hash_tag() hashes the raw bytes of this struct, so the field layout
    # determines the hash keys.
    univ_pos_t pos
    univ_morph_t abbr
    univ_morph_t adp_type
    univ_morph_t adv_type
    univ_morph_t animacy
    univ_morph_t aspect
    univ_morph_t case
    univ_morph_t conj_type
    univ_morph_t connegative
    univ_morph_t definite
    univ_morph_t degree
    univ_morph_t derivation
    univ_morph_t echo
    univ_morph_t foreign
    univ_morph_t gender
    univ_morph_t hyph
    univ_morph_t inf_form
    univ_morph_t mood
    univ_morph_t negative
    univ_morph_t number
    univ_morph_t name_type
    univ_morph_t num_form
    univ_morph_t num_type
    univ_morph_t num_value
    univ_morph_t part_form
    univ_morph_t part_type
    univ_morph_t person
    univ_morph_t polite
    univ_morph_t polarity
    univ_morph_t poss
    univ_morph_t prefix
    univ_morph_t prep_case
    univ_morph_t pron_type
    univ_morph_t punct_side
    univ_morph_t punct_type
    univ_morph_t reflex
    univ_morph_t style
    univ_morph_t style_variant
    univ_morph_t tense
    univ_morph_t verb_form
    univ_morph_t voice
    univ_morph_t verb_type
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -3,6 +3,7 @@
 from __future__ import unicode_literals
 from libc.string cimport memset
 import ujson as json
 from .attrs cimport POS, IS_SPACE
 from .attrs import LEMMA, intify_attrs
@ -12,6 +13,7 @@ from .lexeme cimport Lexeme
 from .errors import Errors
 def _normalize_props(props):
    """Transform deprecated string keys to correct names."""
    out = {}
@ -32,9 +34,17 @@ def _normalize_props(props):
 cdef class Morphology:
    '''Store the possible morphological analyses for a language, and index them
    by hash.
    To save space on each token, tokens only know the hash of their morphological
    analysis, so queries of morphological attributes are delegated
    to this class.
    '''
    def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
        self.mem = Pool()
        self.strings = string_store
        self.tags = PreshMap()
        # Add special space symbol. We prefix with underscore, to make sure it
        # always sorts to the end.
        space_attrs = tag_map.get('SP', {POS: SPACE})
@ -47,32 +57,46 @@ cdef class Morphology:
        self.lemmatizer = lemmatizer
        self.n_tags = len(tag_map)
        self.reverse_index = {}
        self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
        for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
            self.strings.add(tag_str)
            self.tag_map[tag_str] = dict(attrs)
-            attrs = _normalize_props(attrs)
+            self.reverse_index[i] = self.strings.add(tag_str)
            attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
            self.rich_tags[i].id = i
            self.rich_tags[i].name = self.strings.add(tag_str)
            self.rich_tags[i].morph = 0
            self.rich_tags[i].pos = attrs[POS]
            self.reverse_index[self.rich_tags[i].name] = i
        # Add a 'null' tag, which we can reference when assign morphology to
        # untagged tokens.
        self.rich_tags[self.n_tags].id = self.n_tags
        self._cache = PreshMapArray(self.n_tags)
        self.exc = {}
        if exc is not None:
            for (tag_str, orth_str), attrs in exc.items():
                self.add_special_case(tag_str, orth_str, attrs)
    def add(self, features):
        """Insert a morphological analysis in the morphology table, if not already
        present. Returns the hash of the new analysis.

        features: An iterable of feature names; each one is converted to an
            integer ID with `self.strings.as_int` (via `intify_features`).
        RETURNS: The hash key returned by `self.insert` for the built tag.
        """
        features = intify_features(self.strings, features)
        # NOTE(review): create_rich_tag is defined in this file as
        # create_rich_tag(pos_, features), but only one argument is passed
        # here -- confirm which signature is intended.
        cdef RichTagC tag = create_rich_tag(features)
        cdef hash_t key = self.insert(tag)
        return key
-    def __reduce__(self):
+    def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
-        return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
+        if orth not in self.strings:
-                             self.exc), None, None)
+            return orth
-
+        cdef unicode py_string = self.strings[orth]
        if self.lemmatizer is None:
            return self.strings.add(py_string.lower())
        cdef list lemma_strings
        cdef unicode lemma_string
        lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
        lemma_string = lemma_strings[0]
        lemma = self.strings.add(lemma_string)
        return lemma
    cdef hash_t insert(self, RichTagC tag) except 0:
        """Store `tag` in the table if it is new and return its hash key.

        The struct is copied into memory allocated from `self.mem` only when
        no entry is stored under its hash yet.
        """
        cdef hash_t tag_hash = hash_tag(tag)
        cdef RichTagC* stored
        # Already present: nothing to allocate, just hand back the key.
        if self.tags.get(tag_hash) != NULL:
            return tag_hash
        stored = <RichTagC*>self.mem.alloc(1, sizeof(RichTagC))
        stored[0] = tag
        self.tags.set(tag_hash, <void*>stored)
        return tag_hash
    cdef int assign_untagged(self, TokenC* token) except -1:
        """Set morphological attributes on a token without a POS tag. Uses
        the lemmatizer's lookup() method, which looks up the string in the
@ -101,84 +125,284 @@ cdef class Morphology:
        # figure out why the statistical model fails. Related to Issue #220
        if Lexeme.c_check_flag(token.lex, IS_SPACE):
            tag_id = self.reverse_index[self.strings.add('_SP')]
-        rich_tag = self.rich_tags[tag_id]
+        lemma = <attr_t>self._cache.get(tag_id, token.lex.orth)
-        analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
+        if lemma == 0:
-        if analysis is NULL:
+            tag_str = self.tag_names[tag_id]
-            analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
+            features = dict(self.tag_map.get(tag_str, {}))
-            tag_str = self.strings[self.rich_tags[tag_id].name]
+            pos = self.strings.as_int(features.pop('POS'))
-            analysis.tag = rich_tag
+            lemma = self.lemmatize(pos, token.lex.orth, features)
-            analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
+            self._cache.set(tag_id, token.lex.orth, lemma)
-                                            self.tag_map.get(tag_str, {}))
+        token.lemma = lemma
-            self._cache.set(tag_id, token.lex.orth, analysis)
+        token.pos = pos
-        token.lemma = analysis.lemma
+        token.tag = self.strings[tag_str]
-        token.pos = analysis.tag.pos
+        token.morph = self.add(attrs)
        token.tag = analysis.tag.name
        token.morph = analysis.tag.morph
-    cdef int assign_feature(self, uint64_t* flags, univ_morph_t flag_id, bint value) except -1:
+    cdef update_morph(self, hash_t morph, features):
-        cdef flags_t one = 1
+        """Update a morphological analysis with new feature values."""
-        if value:
+        tag = (<RichTagC*>self.tags.get(morph))[0]
-            flags[0] |= one << flag_id
+        cdef univ_morph_t feature
-        else:
+        cdef int value
-            flags[0] &= ~(one << flag_id)
+        for feature_, value in features.items():
            feature = self.strings.as_int(feature_)
            # NOTE(review): `value` from the mapping is unpacked but a
            # constant 1 is passed here -- confirm whether that is intended.
            set_feature(&tag, feature, 1)
        # The class defines `insert` (there is no `insert_tag`): it stores the
        # modified copy under its own hash and returns that key.
        morph = self.insert(tag)
        return morph
-    def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
+    def to_bytes(self):
-                         force=False):
+        json_tags = []
-        """Add a special-case rule to the morphological analyser. Tokens whose
+        for key in self.tags:
-        tag and orth match the rule will receive the specified properties.
+            tag_ptr = <RichTagC*>self.tags.get(key)
            if tag_ptr != NULL:
                json_tags.append(tag_to_json(tag_ptr[0]))
        # Return the serialised tags; `raise` on a string would itself fail
        # with a TypeError instead of handing the data back to the caller.
        return json.dumps(json_tags)
-        tag (unicode): The part-of-speech tag to key the exception.
+    def from_bytes(self, byte_string):
-        orth (unicode): The word-form to key the exception.
+        raise NotImplementedError
        """
        # TODO: Currently we've assumed that we know the number of tags --
        # RichTagC is an array, and _cache is a PreshMapArray
        # This is really bad: it makes the morphology typed to the tagger
        # classes, which is all wrong.
        self.exc[(tag_str, orth_str)] = dict(attrs)
        tag = self.strings.add(tag_str)
        if tag not in self.reverse_index:
            return
        tag_id = self.reverse_index[tag]
        orth = self.strings[orth_str]
        cdef RichTagC rich_tag = self.rich_tags[tag_id]
        attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
        cached = <MorphAnalysisC*>self._cache.get(tag_id, orth)
        if cached is NULL:
            cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
        elif force:
            memset(cached, 0, sizeof(cached[0]))
        else:
            raise ValueError(Errors.E015.format(tag=tag_str, orth=orth_str))
-        cached.tag = rich_tag
+    def to_disk(self, path):
-        # TODO: Refactor this to take arbitrary attributes.
+        raise NotImplementedError
-        for name_id, value_id in attrs.items():
+
-            if name_id == LEMMA:
+    def from_disk(self, path):
-                cached.lemma = value_id
+        raise NotImplementedError
-            else:
+
-                self.assign_feature(&cached.tag.morph, name_id, value_id)
+
-        if cached.lemma == 0:
+cpdef univ_pos_t get_int_tag(pos_):
-            cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs)
+    return <univ_pos_t>0
-        self._cache.set(tag_id, orth, <void*>cached)
+
 cpdef intify_features(StringStore strings, features):
    """Return the set of integer IDs that `strings.as_int` gives for `features`."""
    ids = set()
    for name in features:
        ids.add(strings.as_int(name))
    return ids
 cdef hash_t hash_tag(RichTagC tag) nogil:
    # Hash the raw bytes of the struct (sizeof(tag) bytes starting at &tag)
    # with mrmr.hash64, passing 0 as the third argument. Because every byte
    # takes part, two tags hash equal only if all of their bytes match.
    return mrmr.hash64(&tag, sizeof(tag), 0)
 cdef RichTagC create_rich_tag(pos_, features):
    # Build a RichTagC from a part-of-speech and an iterable of feature IDs.
    #
    # pos_: passed to get_int_tag() to obtain the univ_pos_t stored in tag.pos.
    # features: iterable of univ_morph_t values; each is switched on with
    #     set_feature(&tag, feature, 1).
    # Returns the populated struct by value.
    cdef RichTagC tag
    cdef univ_morph_t feature
    # A stack-allocated struct starts with indeterminate contents. hash_tag()
    # hashes every byte of the struct, so all fields must be zeroed before
    # the selected ones are filled in; otherwise equal analyses could hash
    # differently from call to call.
    memset(&tag, 0, sizeof(tag))
    tag.pos = get_int_tag(pos_)
    for feature in features:
        set_feature(&tag, feature, 1)
    return tag
 cdef tag_to_json(RichTagC tag):
    # Placeholder: ignores `tag` and always returns an empty dict.
    return {}
 cdef RichTagC tag_from_json(json_tag):
    # Placeholder: `json_tag` is not read yet. Returns a RichTagC by value.
    cdef RichTagC tag
    # Zero the struct so the placeholder returns a well-defined (all-zero)
    # value rather than whatever bytes happened to be on the stack.
    memset(&tag, 0, sizeof(tag))
    return tag
 cdef int set_feature(RichTagC* tag, univ_morph_t feature, int value) nogil:
    # Store or clear one morphological feature on `tag`.
    #
    # tag: struct to modify in place.
    # feature: the feature ID; the is_*_feature() predicates decide which
    #     field of the struct it belongs to.
    # value: when equal to True (1) the field is set to `feature` itself,
    #     otherwise the field is reset to NIL.
    #
    # NIL is accepted and leaves `tag` untouched. A feature matched by no
    # predicate reaches the final branch, which raises ValueError.
    # NOTE(review): the signature declares no `except` value, so that
    # ValueError presumably cannot propagate to callers -- confirm whether
    # `except -1` should be added.
    cdef univ_morph_t value_
    if value == True:
        value_ = feature
    else:
        value_ = NIL
    if feature == NIL:
        # Nothing to record. This must be part of the same if/elif chain,
        # otherwise NIL falls through to the "unknown feature" branch.
        pass
    elif is_abbr_feature(feature):
        tag.abbr = value_
    elif is_adp_type_feature(feature):
        tag.adp_type = value_
    elif is_adv_type_feature(feature):
        tag.adv_type = value_
    elif is_animacy_feature(feature):
        tag.animacy = value_
    elif is_aspect_feature(feature):
        tag.aspect = value_
    elif is_case_feature(feature):
        tag.case = value_
    elif is_conj_type_feature(feature):
        tag.conj_type = value_
    elif is_connegative_feature(feature):
        tag.connegative = value_
    elif is_definite_feature(feature):
        tag.definite = value_
    elif is_degree_feature(feature):
        tag.degree = value_
    elif is_derivation_feature(feature):
        tag.derivation = value_
    elif is_echo_feature(feature):
        tag.echo = value_
    elif is_foreign_feature(feature):
        tag.foreign = value_
    elif is_gender_feature(feature):
        tag.gender = value_
    elif is_hyph_feature(feature):
        tag.hyph = value_
    elif is_inf_form_feature(feature):
        tag.inf_form = value_
    elif is_mood_feature(feature):
        tag.mood = value_
    elif is_negative_feature(feature):
        tag.negative = value_
    elif is_number_feature(feature):
        tag.number = value_
    elif is_name_type_feature(feature):
        tag.name_type = value_
    elif is_num_form_feature(feature):
        tag.num_form = value_
    elif is_num_type_feature(feature):
        # RichTagC has a num_type field and a matching predicate exists;
        # without this branch the field could never be set.
        tag.num_type = value_
    elif is_num_value_feature(feature):
        tag.num_value = value_
    elif is_part_form_feature(feature):
        tag.part_form = value_
    elif is_part_type_feature(feature):
        tag.part_type = value_
    elif is_person_feature(feature):
        tag.person = value_
    elif is_polite_feature(feature):
        tag.polite = value_
    elif is_polarity_feature(feature):
        tag.polarity = value_
    elif is_poss_feature(feature):
        tag.poss = value_
    elif is_prefix_feature(feature):
        tag.prefix = value_
    elif is_prep_case_feature(feature):
        tag.prep_case = value_
    elif is_pron_type_feature(feature):
        tag.pron_type = value_
    elif is_punct_side_feature(feature):
        # punct_side features belong in punct_side, not punct_type.
        tag.punct_side = value_
    elif is_punct_type_feature(feature):
        tag.punct_type = value_
    elif is_reflex_feature(feature):
        tag.reflex = value_
    elif is_style_feature(feature):
        tag.style = value_
    elif is_style_variant_feature(feature):
        tag.style_variant = value_
    elif is_tense_feature(feature):
        tag.tense = value_
    elif is_verb_form_feature(feature):
        tag.verb_form = value_
    elif is_voice_feature(feature):
        tag.voice = value_
    elif is_verb_type_feature(feature):
        tag.verb_type = value_
    else:
        # Raising needs the GIL, which this nogil function does not hold.
        with gil:
            raise ValueError("Unknown feature: %d" % feature)
 # Feature-category predicates. set_feature() calls these in turn to decide
 # which RichTagC field a feature ID belongs to. Every predicate below is
 # still a placeholder that returns 0, so no feature is currently recognised
 # as belonging to any category.
 # NOTE(review): is_abbr_feature names its parameter `abbr` while all the
 # other predicates use `feature` -- probably worth making consistent.
 cdef int is_abbr_feature(univ_morph_t abbr) nogil:
    return 0
 cdef int is_adp_type_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_adv_type_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_animacy_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_aspect_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_case_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_conj_type_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_connegative_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_definite_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_degree_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_derivation_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_echo_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_foreign_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_gender_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_hyph_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_inf_form_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_mood_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_negative_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_number_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_name_type_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_num_form_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_num_type_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_num_value_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_part_form_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_part_type_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_person_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_polite_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_polarity_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_poss_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_prefix_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_prep_case_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_pron_type_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_punct_side_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_punct_type_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_reflex_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_style_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_style_variant_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_tense_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_verb_form_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_voice_feature(univ_morph_t feature) nogil:
    return 0
 cdef int is_verb_type_feature(univ_morph_t feature) nogil:
    return 0
    def load_morph_exceptions(self, dict exc):
        """Register every entry of `exc` as a special case.

        exc: dict mapping a tag string to a dict of {form string: attrs};
            each (tag, form, attrs) triple is passed to `add_special_case`.
        """
        for tag_name, forms in exc.items():
            for word_form, props in forms.items():
                self.add_special_case(tag_name, word_form, props)
    def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
        """Return the lemma ID for the word form `orth`.

        univ_pos: coarse part-of-speech forwarded to the lemmatizer.
        orth: ID of the word form; returned unchanged when it is not in
            `self.strings`.
        morphology: forwarded to the lemmatizer as its third argument.
        RETURNS: ID of the lower-cased form when no lemmatizer is set,
            otherwise the ID of the first string the lemmatizer returns.
        """
        cdef unicode py_string
        cdef list lemma_strings
        cdef unicode lemma_string
        if orth not in self.strings:
            return orth
        py_string = self.strings[orth]
        if self.lemmatizer is None:
            return self.strings.add(py_string.lower())
        lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
        # Only the first candidate is kept.
        lemma_string = lemma_strings[0]
        return self.strings.add(lemma_string)
 IDS = {