Wire up morphological features

This commit is contained in:
Matthew Honnibal 2018-02-25 21:22:45 +01:00
parent 9b406181cd
commit 9c32388235
9 changed files with 201 additions and 130 deletions

View File

@@ -143,8 +143,10 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
for name, value in stringy_attrs.items():
if isinstance(name, int):
int_key = name
else:
elif name.upper() in IDS:
int_key = IDS[name.upper()]
else:
continue
if strings_map is not None and isinstance(value, basestring):
if hasattr(strings_map, 'add'):
value = strings_map.add(value)

View File

@@ -2,6 +2,7 @@
from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
from ...morphology import Fused_begin, Fused_inside
_exc = {
@@ -47,7 +48,11 @@ _exc = {
"über'm": [
{ORTH: "über", LEMMA: "über"},
{ORTH: "'m", LEMMA: "der", NORM: "dem"}]
{ORTH: "'m", LEMMA: "der", NORM: "dem"}],
"zum": [
{ORTH: "zu", LEMMA: "zu", "morphology": [Fused_begin]},
{ORTH: "m", LEMMA: "der", "morphology": [Fused_inside]}
]
}

View File

@@ -31,6 +31,7 @@ cdef class Morphology:
cdef public object reverse_index
cdef public object tag_names
cdef public object exc
cdef public object _morph2features
cdef RichTagC* rich_tags
cdef PreshMapArray _cache
@@ -43,6 +44,8 @@ cdef class Morphology:
cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
cdef int set_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
cdef enum univ_morph_t:
NIL = 0
@@ -298,4 +301,7 @@ cdef enum univ_morph_t:
VerbType_mod # U
VerbType_light # U
Fused_begin
Fused_inside

View File

@@ -9,6 +9,7 @@ from .attrs import LEMMA, intify_attrs
from .parts_of_speech cimport SPACE
from .parts_of_speech import IDS as POS_IDS
from .lexeme cimport Lexeme
from .strings cimport hash_string
def _normalize_props(props):
@@ -29,6 +30,11 @@ def _normalize_props(props):
out[key] = value
return out
cdef uint64_t hash_features(features):
# TODO improve this
cdef unicode string = str(tuple(features))
return hash_string(string)
cdef class Morphology:
def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
@@ -36,7 +42,7 @@ cdef class Morphology:
self.strings = string_store
# Add special space symbol. We prefix with underscore, to make sure it
# always sorts to the end.
space_attrs = tag_map.get('SP', {POS: SPACE})
space_attrs = tag_map.get('_SP', tag_map.get('SP', {POS: SPACE}))
if '_SP' not in tag_map:
self.strings.add('_SP')
tag_map = dict(tag_map)
@@ -48,16 +54,19 @@
self.reverse_index = {}
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
self._morph2features = {}
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
features = attrs.get('morphology', frozenset())
self.strings.add(tag_str)
self.tag_map[tag_str] = dict(attrs)
attrs = _normalize_props(attrs)
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
self.rich_tags[i].id = i
self.rich_tags[i].name = self.strings.add(tag_str)
self.rich_tags[i].morph = 0
self.rich_tags[i].morph = hash_features(features)
self.rich_tags[i].pos = attrs[POS]
self.reverse_index[self.rich_tags[i].name] = i
self._morph2features[self.rich_tags[i].morph] = features
# Add a 'null' tag, which we can reference when assigning morphology to
# untagged tokens.
self.rich_tags[self.n_tags].id = self.n_tags
@@ -114,12 +123,30 @@ cdef class Morphology:
token.tag = analysis.tag.name
token.morph = analysis.tag.morph
cdef int assign_feature(self, uint64_t* flags, univ_morph_t flag_id, bint value) except -1:
cdef flags_t one = 1
cdef int assign_feature(self, uint64_t* morph, univ_morph_t flag_id, bint value) except -1:
# Deprecated
pass
cdef int set_feature(self, uint64_t* morph, univ_morph_t flag_id, bint value) except -1:
'''Update a morph attribute in-place, so that it indicates the given
feature.
'''
features = self._morph2features.get(morph[0], {})
cdef uint64_t key
cdef attr_t flag = flag_id
if (flag in features) != value:
new_features = set(features)
if value:
flags[0] |= one << flag_id
new_features.add(flag)
else:
flags[0] &= ~(one << flag_id)
new_features.remove(flag)
new_features = frozenset(new_features)
key = hash_features(new_features)
morph[0] = key
self._morph2features[morph[0]] = new_features
def get_features(self, uint64_t morph):
return self._morph2features.get(morph, frozenset())
def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
force=False):
@@ -140,6 +167,9 @@ cdef class Morphology:
tag_id = self.reverse_index[tag]
orth = self.strings[orth_str]
cdef RichTagC rich_tag = self.rich_tags[tag_id]
features = attrs.get('morphology', frozenset())
cdef uint64_t morph = hash_features(features)
self._morph2features[morph] = features
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
cached = <MorphAnalysisC*>self._cache.get(tag_id, orth)
if cached is NULL:
@@ -152,12 +182,11 @@
"force=True to overwrite." % (tag_str, orth_str))
cached.tag = rich_tag
cached.tag.morph = morph
# TODO: Refactor this to take arbitrary attributes.
for name_id, value_id in attrs.items():
if name_id == LEMMA:
cached.lemma = value_id
else:
self.assign_feature(&cached.tag.morph, name_id, value_id)
if cached.lemma == 0:
cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs)
self._cache.set(tag_id, orth, <void*>cached)
@@ -434,6 +463,9 @@ IDS = {
"VerbType_cop": VerbType_cop, # U,
"VerbType_mod": VerbType_mod, # U,
"VerbType_light": VerbType_light, # U,
"Fused_begin": Fused_begin, # Internal
"Fused_inside": Fused_inside # Internal
}

View File

@@ -385,6 +385,9 @@ cdef enum symbol_t:
VerbType_mod # U
VerbType_light # U
Fused_begin
Fused_inside
PERSON
NORP
FACILITY

View File

@@ -390,6 +390,9 @@ IDS = {
"VerbType_mod": VerbType_mod, # U,
"VerbType_light": VerbType_light, # U,
"Fused_begin": Fused_begin,
"Fused_inside": Fused_inside,
"PERSON": PERSON,
"NORP": NORP,
"FACILITY": FACILITY,

View File

@@ -5,6 +5,7 @@
from __future__ import unicode_literals
import pytest
from ....morphology import Fused_begin, Fused_inside
def test_tokenizer_handles_long_text(de_tokenizer):
@@ -22,9 +23,15 @@ Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen.
»Was ist mit mir geschehen?«, dachte er."""
tokens = de_tokenizer(text)
assert len(tokens) == 109
assert len(tokens) == 110
def test_fused(de_tokenizer):
doc = de_tokenizer('zum')
assert len(doc) == 2
assert doc[0].check_morph(Fused_begin)
assert doc[1].check_morph(Fused_inside)
@pytest.mark.parametrize('text', [
"Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten",
"Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",

View File

@@ -10,6 +10,7 @@ cimport numpy as np
np.import_array()
import numpy
from ..morphology cimport univ_morph_t
from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme
from .. import parts_of_speech
@@ -128,6 +129,15 @@ cdef class Token:
"""
return Lexeme.c_check_flag(self.c.lex, flag_id)
def set_morph(self, univ_morph_t feature, bint value):
'''Set a morphological feature'''
self.vocab.morphology.set_feature(&self.c.morph, feature, value)
def check_morph(self, univ_morph_t feature):
'''Check whether the token has the given morphological feature.'''
features = self.vocab.morphology.get_features(self.c.morph)
return feature in features
def nbor(self, int i=1):
"""Get a neighboring token.

View File

@@ -232,14 +232,17 @@ cdef class Vocab:
cdef int i
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
features = props.get('morphology', frozenset())
props = intify_attrs(props, strings_map=self.strings,
_do_deprecated=True)
_do_deprecated=False)
token = &tokens[i]
# Set the special tokens up to have arbitrary attributes
lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
token.lex = lex
if TAG in props:
self.morphology.assign_tag(token, props[TAG])
for feature in features:
self.morphology.set_feature(&token.morph, feature, True)
for attr_id, value in props.items():
Token.set_struct_attr(token, attr_id, value)
Lexeme.set_struct_attr(lex, attr_id, value)