Wire up morphological features

This commit is contained in:
Matthew Honnibal 2018-02-25 21:22:45 +01:00
parent 9b406181cd
commit 9c32388235
9 changed files with 201 additions and 130 deletions

View File

@@ -143,8 +143,10 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
for name, value in stringy_attrs.items():
if isinstance(name, int):
int_key = name
else:
elif name.upper() in IDS:
int_key = IDS[name.upper()]
else:
continue
if strings_map is not None and isinstance(value, basestring):
if hasattr(strings_map, 'add'):
value = strings_map.add(value)

View File

@@ -2,6 +2,7 @@
from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
from ...morphology import Fused_begin, Fused_inside
_exc = {
@@ -47,7 +48,11 @@ _exc = {
"über'm": [
{ORTH: "über", LEMMA: "über"},
{ORTH: "'m", LEMMA: "der", NORM: "dem"}]
{ORTH: "'m", LEMMA: "der", NORM: "dem"}],
"zum": [
{ORTH: "zu", LEMMA: "zu", "morphology": [Fused_begin]},
{ORTH: "m", LEMMA: "der", "morphology": [Fused_inside]}
]
}

View File

@@ -31,6 +31,7 @@ cdef class Morphology:
cdef public object reverse_index
cdef public object tag_names
cdef public object exc
cdef public object _morph2features
cdef RichTagC* rich_tags
cdef PreshMapArray _cache
@@ -42,6 +43,8 @@
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
cdef int set_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
cdef enum univ_morph_t:
@@ -298,4 +301,7 @@ cdef enum univ_morph_t:
VerbType_mod # U
VerbType_light # U
Fused_begin
Fused_inside

View File

@@ -9,6 +9,7 @@ from .attrs import LEMMA, intify_attrs
from .parts_of_speech cimport SPACE
from .parts_of_speech import IDS as POS_IDS
from .lexeme cimport Lexeme
from .strings cimport hash_string
def _normalize_props(props):
@@ -29,6 +30,11 @@ def _normalize_props(props):
out[key] = value
return out
cdef uint64_t hash_features(features):
    """Return a stable 64-bit hash key for a collection of morphological features.

    `features` is normally a (frozen)set of feature IDs (see the callers in
    Morphology.__init__ and set_feature). Set iteration order is arbitrary
    and, for string elements, varies between interpreter runs under hash
    randomisation, so hashing `str(tuple(features))` directly could assign
    different keys to equal feature sets across runs. Sorting by repr()
    first makes the key deterministic for equal sets regardless of
    insertion or iteration order.
    """
    # TODO: replace the string round-trip with a proper structural hash.
    cdef unicode string = str(tuple(sorted(features, key=repr)))
    return hash_string(string)
cdef class Morphology:
def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
@@ -36,7 +42,7 @@ cdef class Morphology:
self.strings = string_store
# Add special space symbol. We prefix with underscore, to make sure it
# always sorts to the end.
space_attrs = tag_map.get('SP', {POS: SPACE})
space_attrs = tag_map.get('_SP', tag_map.get('SP', {POS: SPACE}))
if '_SP' not in tag_map:
self.strings.add('_SP')
tag_map = dict(tag_map)
@@ -48,16 +54,19 @@
self.reverse_index = {}
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
self._morph2features = {}
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
features = attrs.get('morphology', frozenset())
self.strings.add(tag_str)
self.tag_map[tag_str] = dict(attrs)
attrs = _normalize_props(attrs)
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
self.rich_tags[i].id = i
self.rich_tags[i].name = self.strings.add(tag_str)
self.rich_tags[i].morph = 0
self.rich_tags[i].morph = hash_features(features)
self.rich_tags[i].pos = attrs[POS]
self.reverse_index[self.rich_tags[i].name] = i
self._morph2features[self.rich_tags[i].morph] = features
# Add a 'null' tag, which we can reference when assign morphology to
# untagged tokens.
self.rich_tags[self.n_tags].id = self.n_tags
@@ -114,12 +123,30 @@ cdef class Morphology:
token.tag = analysis.tag.name
token.morph = analysis.tag.morph
cdef int assign_feature(self, uint64_t* flags, univ_morph_t flag_id, bint value) except -1:
cdef flags_t one = 1
if value:
flags[0] |= one << flag_id
else:
flags[0] &= ~(one << flag_id)
cdef int assign_feature(self, uint64_t* morph, univ_morph_t flag_id, bint value) except -1:
    # Deprecated: intentionally a no-op, kept so existing call sites and the
    # declared C interface remain valid. Use set_feature() instead, which
    # maintains the _morph2features registry.
    pass
cdef int set_feature(self, uint64_t* morph, univ_morph_t flag_id, bint value) except -1:
    '''Update a morph attribute in-place, so that it indicates the given
    feature.

    `morph` points at a token's 64-bit morphology key. When the feature's
    presence actually changes, the key is rewritten to the hash of the
    updated feature set and the new set is registered in
    self._morph2features, so it can later be recovered via get_features().
    '''
    # NOTE(review): default here is {} while get_features() uses frozenset();
    # equivalent for membership tests, but worth unifying.
    features = self._morph2features.get(morph[0], {})
    cdef uint64_t key
    cdef attr_t flag = flag_id
    # Only rebuild, rehash and re-register when the membership test and the
    # requested value disagree; otherwise the key is already correct.
    if (flag in features) != value:
        new_features = set(features)
        if value:
            new_features.add(flag)
        else:
            new_features.remove(flag)
        new_features = frozenset(new_features)
        key = hash_features(new_features)
        morph[0] = key
        self._morph2features[morph[0]] = new_features
def get_features(self, uint64_t morph):
    '''Return the feature set registered for a morphology hash key.

    Unknown keys yield an empty frozenset rather than raising.
    '''
    try:
        return self._morph2features[morph]
    except KeyError:
        return frozenset()
def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
force=False):
@@ -140,6 +167,9 @@ cdef class Morphology:
tag_id = self.reverse_index[tag]
orth = self.strings[orth_str]
cdef RichTagC rich_tag = self.rich_tags[tag_id]
features = attrs.get('morphology', frozenset())
cdef uint64_t morph = hash_features(features)
self._morph2features[morph] = features
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
cached = <MorphAnalysisC*>self._cache.get(tag_id, orth)
if cached is NULL:
@@ -152,12 +182,11 @@
"force=True to overwrite." % (tag_str, orth_str))
cached.tag = rich_tag
cached.tag.morph = morph
# TODO: Refactor this to take arbitrary attributes.
for name_id, value_id in attrs.items():
if name_id == LEMMA:
cached.lemma = value_id
else:
self.assign_feature(&cached.tag.morph, name_id, value_id)
if cached.lemma == 0:
cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs)
self._cache.set(tag_id, orth, <void*>cached)
@@ -318,122 +347,125 @@ IDS = {
"AdvType_sta": AdvType_sta,
"AdvType_ex": AdvType_ex,
"AdvType_adadj": AdvType_adadj,
"ConjType_oper ": ConjType_oper, # cz, U,
"ConjType_comp ": ConjType_comp, # cz, U,
"Connegative_yes ": Connegative_yes, # fi,
"Derivation_minen ": Derivation_minen, # fi,
"Derivation_sti ": Derivation_sti, # fi,
"Derivation_inen ": Derivation_inen, # fi,
"Derivation_lainen ": Derivation_lainen, # fi,
"Derivation_ja ": Derivation_ja, # fi,
"Derivation_ton ": Derivation_ton, # fi,
"Derivation_vs ": Derivation_vs, # fi,
"Derivation_ttain ": Derivation_ttain, # fi,
"Derivation_ttaa ": Derivation_ttaa, # fi,
"Echo_rdp ": Echo_rdp, # U,
"Echo_ech ": Echo_ech, # U,
"Foreign_foreign ": Foreign_foreign, # cz, fi, U,
"Foreign_fscript ": Foreign_fscript, # cz, fi, U,
"Foreign_tscript ": Foreign_tscript, # cz, U,
"Foreign_yes ": Foreign_yes, # sl,
"Gender_dat_masc ": Gender_dat_masc, # bq, U,
"Gender_dat_fem ": Gender_dat_fem, # bq, U,
"Gender_erg_masc ": Gender_erg_masc, # bq,
"Gender_erg_fem ": Gender_erg_fem, # bq,
"Gender_psor_masc ": Gender_psor_masc, # cz, sl, U,
"Gender_psor_fem ": Gender_psor_fem, # cz, sl, U,
"Gender_psor_neut ": Gender_psor_neut, # sl,
"Hyph_yes ": Hyph_yes, # cz, U,
"InfForm_one ": InfForm_one, # fi,
"InfForm_two ": InfForm_two, # fi,
"InfForm_three ": InfForm_three, # fi,
"NameType_geo ": NameType_geo, # U, cz,
"NameType_prs ": NameType_prs, # U, cz,
"NameType_giv ": NameType_giv, # U, cz,
"NameType_sur ": NameType_sur, # U, cz,
"NameType_nat ": NameType_nat, # U, cz,
"NameType_com ": NameType_com, # U, cz,
"NameType_pro ": NameType_pro, # U, cz,
"NameType_oth ": NameType_oth, # U, cz,
"NounType_com ": NounType_com, # U,
"NounType_prop ": NounType_prop, # U,
"NounType_class ": NounType_class, # U,
"Number_abs_sing ": Number_abs_sing, # bq, U,
"Number_abs_plur ": Number_abs_plur, # bq, U,
"Number_dat_sing ": Number_dat_sing, # bq, U,
"Number_dat_plur ": Number_dat_plur, # bq, U,
"Number_erg_sing ": Number_erg_sing, # bq, U,
"Number_erg_plur ": Number_erg_plur, # bq, U,
"Number_psee_sing ": Number_psee_sing, # U,
"Number_psee_plur ": Number_psee_plur, # U,
"Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
"Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
"NumForm_digit ": NumForm_digit, # cz, sl, U,
"NumForm_roman ": NumForm_roman, # cz, sl, U,
"NumForm_word ": NumForm_word, # cz, sl, U,
"NumValue_one ": NumValue_one, # cz, U,
"NumValue_two ": NumValue_two, # cz, U,
"NumValue_three ": NumValue_three, # cz, U,
"PartForm_pres ": PartForm_pres, # fi,
"PartForm_past ": PartForm_past, # fi,
"PartForm_agt ": PartForm_agt, # fi,
"PartForm_neg ": PartForm_neg, # fi,
"PartType_mod ": PartType_mod, # U,
"PartType_emp ": PartType_emp, # U,
"PartType_res ": PartType_res, # U,
"PartType_inf ": PartType_inf, # U,
"PartType_vbp ": PartType_vbp, # U,
"Person_abs_one ": Person_abs_one, # bq, U,
"Person_abs_two ": Person_abs_two, # bq, U,
"Person_abs_three ": Person_abs_three, # bq, U,
"Person_dat_one ": Person_dat_one, # bq, U,
"Person_dat_two ": Person_dat_two, # bq, U,
"Person_dat_three ": Person_dat_three, # bq, U,
"Person_erg_one ": Person_erg_one, # bq, U,
"Person_erg_two ": Person_erg_two, # bq, U,
"Person_erg_three ": Person_erg_three, # bq, U,
"Person_psor_one ": Person_psor_one, # fi, U,
"Person_psor_two ": Person_psor_two, # fi, U,
"Person_psor_three ": Person_psor_three, # fi, U,
"Polite_inf ": Polite_inf, # bq, U,
"Polite_pol ": Polite_pol, # bq, U,
"Polite_abs_inf ": Polite_abs_inf, # bq, U,
"Polite_abs_pol ": Polite_abs_pol, # bq, U,
"Polite_erg_inf ": Polite_erg_inf, # bq, U,
"Polite_erg_pol ": Polite_erg_pol, # bq, U,
"Polite_dat_inf ": Polite_dat_inf, # bq, U,
"Polite_dat_pol ": Polite_dat_pol, # bq, U,
"Prefix_yes ": Prefix_yes, # U,
"PrepCase_npr ": PrepCase_npr, # cz,
"PrepCase_pre ": PrepCase_pre, # U,
"PunctSide_ini ": PunctSide_ini, # U,
"PunctSide_fin ": PunctSide_fin, # U,
"PunctType_peri ": PunctType_peri, # U,
"PunctType_qest ": PunctType_qest, # U,
"PunctType_excl ": PunctType_excl, # U,
"PunctType_quot ": PunctType_quot, # U,
"PunctType_brck ": PunctType_brck, # U,
"PunctType_comm ": PunctType_comm, # U,
"PunctType_colo ": PunctType_colo, # U,
"PunctType_semi ": PunctType_semi, # U,
"PunctType_dash ": PunctType_dash, # U,
"Style_arch ": Style_arch, # cz, fi, U,
"Style_rare ": Style_rare, # cz, fi, U,
"Style_poet ": Style_poet, # cz, U,
"Style_norm ": Style_norm, # cz, U,
"Style_coll ": Style_coll, # cz, U,
"Style_vrnc ": Style_vrnc, # cz, U,
"Style_sing ": Style_sing, # cz, U,
"Style_expr ": Style_expr, # cz, U,
"Style_derg ": Style_derg, # cz, U,
"Style_vulg ": Style_vulg, # cz, U,
"Style_yes ": Style_yes, # fi, U,
"StyleVariant_styleShort ": StyleVariant_styleShort, # cz,
"StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl,
"VerbType_aux ": VerbType_aux, # U,
"VerbType_cop ": VerbType_cop, # U,
"VerbType_mod ": VerbType_mod, # U,
"VerbType_light ": VerbType_light, # U,
"ConjType_oper": ConjType_oper, # cz, U,
"ConjType_comp": ConjType_comp, # cz, U,
"Connegative_yes": Connegative_yes, # fi,
"Derivation_minen": Derivation_minen, # fi,
"Derivation_sti": Derivation_sti, # fi,
"Derivation_inen": Derivation_inen, # fi,
"Derivation_lainen": Derivation_lainen, # fi,
"Derivation_ja": Derivation_ja, # fi,
"Derivation_ton": Derivation_ton, # fi,
"Derivation_vs": Derivation_vs, # fi,
"Derivation_ttain": Derivation_ttain, # fi,
"Derivation_ttaa": Derivation_ttaa, # fi,
"Echo_rdp": Echo_rdp, # U,
"Echo_ech": Echo_ech, # U,
"Foreign_foreign": Foreign_foreign, # cz, fi, U,
"Foreign_fscript": Foreign_fscript, # cz, fi, U,
"Foreign_tscript": Foreign_tscript, # cz, U,
"Foreign_yes": Foreign_yes, # sl,
"Gender_dat_masc": Gender_dat_masc, # bq, U,
"Gender_dat_fem": Gender_dat_fem, # bq, U,
"Gender_erg_masc": Gender_erg_masc, # bq,
"Gender_erg_fem": Gender_erg_fem, # bq,
"Gender_psor_masc": Gender_psor_masc, # cz, sl, U,
"Gender_psor_fem": Gender_psor_fem, # cz, sl, U,
"Gender_psor_neut": Gender_psor_neut, # sl,
"Hyph_yes": Hyph_yes, # cz, U,
"InfForm_one": InfForm_one, # fi,
"InfForm_two": InfForm_two, # fi,
"InfForm_three": InfForm_three, # fi,
"NameType_geo": NameType_geo, # U, cz,
"NameType_prs": NameType_prs, # U, cz,
"NameType_giv": NameType_giv, # U, cz,
"NameType_sur": NameType_sur, # U, cz,
"NameType_nat": NameType_nat, # U, cz,
"NameType_com": NameType_com, # U, cz,
"NameType_pro": NameType_pro, # U, cz,
"NameType_oth": NameType_oth, # U, cz,
"NounType_com": NounType_com, # U,
"NounType_prop": NounType_prop, # U,
"NounType_class": NounType_class, # U,
"Number_abs_sing": Number_abs_sing, # bq, U,
"Number_abs_plur": Number_abs_plur, # bq, U,
"Number_dat_sing": Number_dat_sing, # bq, U,
"Number_dat_plur": Number_dat_plur, # bq, U,
"Number_erg_sing": Number_erg_sing, # bq, U,
"Number_erg_plur": Number_erg_plur, # bq, U,
"Number_psee_sing": Number_psee_sing, # U,
"Number_psee_plur": Number_psee_plur, # U,
"Number_psor_sing": Number_psor_sing, # cz, fi, sl, U,
"Number_psor_plur": Number_psor_plur, # cz, fi, sl, U,
"NumForm_digit": NumForm_digit, # cz, sl, U,
"NumForm_roman": NumForm_roman, # cz, sl, U,
"NumForm_word": NumForm_word, # cz, sl, U,
"NumValue_one": NumValue_one, # cz, U,
"NumValue_two": NumValue_two, # cz, U,
"NumValue_three": NumValue_three, # cz, U,
"PartForm_pres": PartForm_pres, # fi,
"PartForm_past": PartForm_past, # fi,
"PartForm_agt": PartForm_agt, # fi,
"PartForm_neg": PartForm_neg, # fi,
"PartType_mod": PartType_mod, # U,
"PartType_emp": PartType_emp, # U,
"PartType_res": PartType_res, # U,
"PartType_inf": PartType_inf, # U,
"PartType_vbp": PartType_vbp, # U,
"Person_abs_one": Person_abs_one, # bq, U,
"Person_abs_two": Person_abs_two, # bq, U,
"Person_abs_three": Person_abs_three, # bq, U,
"Person_dat_one": Person_dat_one, # bq, U,
"Person_dat_two": Person_dat_two, # bq, U,
"Person_dat_three": Person_dat_three, # bq, U,
"Person_erg_one": Person_erg_one, # bq, U,
"Person_erg_two": Person_erg_two, # bq, U,
"Person_erg_three": Person_erg_three, # bq, U,
"Person_psor_one": Person_psor_one, # fi, U,
"Person_psor_two": Person_psor_two, # fi, U,
"Person_psor_three": Person_psor_three, # fi, U,
"Polite_inf": Polite_inf, # bq, U,
"Polite_pol": Polite_pol, # bq, U,
"Polite_abs_inf": Polite_abs_inf, # bq, U,
"Polite_abs_pol": Polite_abs_pol, # bq, U,
"Polite_erg_inf": Polite_erg_inf, # bq, U,
"Polite_erg_pol": Polite_erg_pol, # bq, U,
"Polite_dat_inf": Polite_dat_inf, # bq, U,
"Polite_dat_pol": Polite_dat_pol, # bq, U,
"Prefix_yes": Prefix_yes, # U,
"PrepCase_npr": PrepCase_npr, # cz,
"PrepCase_pre": PrepCase_pre, # U,
"PunctSide_ini": PunctSide_ini, # U,
"PunctSide_fin": PunctSide_fin, # U,
"PunctType_peri": PunctType_peri, # U,
"PunctType_qest": PunctType_qest, # U,
"PunctType_excl": PunctType_excl, # U,
"PunctType_quot": PunctType_quot, # U,
"PunctType_brck": PunctType_brck, # U,
"PunctType_comm": PunctType_comm, # U,
"PunctType_colo": PunctType_colo, # U,
"PunctType_semi": PunctType_semi, # U,
"PunctType_dash": PunctType_dash, # U,
"Style_arch": Style_arch, # cz, fi, U,
"Style_rare": Style_rare, # cz, fi, U,
"Style_poet": Style_poet, # cz, U,
"Style_norm": Style_norm, # cz, U,
"Style_coll": Style_coll, # cz, U,
"Style_vrnc": Style_vrnc, # cz, U,
"Style_sing": Style_sing, # cz, U,
"Style_expr": Style_expr, # cz, U,
"Style_derg": Style_derg, # cz, U,
"Style_vulg": Style_vulg, # cz, U,
"Style_yes": Style_yes, # fi, U,
"StyleVariant_styleShort": StyleVariant_styleShort, # cz,
"StyleVariant_styleBound": StyleVariant_styleBound, # cz, sl,
"VerbType_aux": VerbType_aux, # U,
"VerbType_cop": VerbType_cop, # U,
"VerbType_mod": VerbType_mod, # U,
"VerbType_light": VerbType_light, # U,
"Fused_begin": Fused_begin, # Internal
"Fused_inside": Fused_inside # Internal
}

View File

@@ -384,6 +384,9 @@ cdef enum symbol_t:
VerbType_cop # U
VerbType_mod # U
VerbType_light # U
Fused_begin
Fused_inside
PERSON
NORP

View File

@@ -389,6 +389,9 @@ IDS = {
"VerbType_cop": VerbType_cop, # U,
"VerbType_mod": VerbType_mod, # U,
"VerbType_light": VerbType_light, # U,
"Fused_begin": Fused_begin,
"Fused_inside": Fused_inside,
"PERSON": PERSON,
"NORP": NORP,

View File

@@ -5,6 +5,7 @@
from __future__ import unicode_literals
import pytest
from ....morphology import Fused_begin, Fused_inside
def test_tokenizer_handles_long_text(de_tokenizer):
@@ -22,9 +23,15 @@ Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen.
»Was ist mit mir geschehen?«, dachte er."""
tokens = de_tokenizer(text)
assert len(tokens) == 109
assert len(tokens) == 110
def test_fused(de_tokenizer):
    # "zum" is declared as a fused-form tokenizer exception ("zu" + "m"),
    # so it must be split into two tokens carrying the Fused_begin and
    # Fused_inside morphological features respectively.
    doc = de_tokenizer('zum')
    assert len(doc) == 2
    assert doc[0].check_morph(Fused_begin)
    assert doc[1].check_morph(Fused_inside)
@pytest.mark.parametrize('text', [
"Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten",
"Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",

View File

@@ -10,6 +10,7 @@ cimport numpy as np
np.import_array()
import numpy
from ..morphology cimport univ_morph_t
from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme
from .. import parts_of_speech
@@ -128,6 +129,15 @@ cdef class Token:
"""
return Lexeme.c_check_flag(self.c.lex, flag_id)
def set_morph(self, univ_morph_t feature, bint value):
    '''Set a morphological feature on this token.

    feature (univ_morph_t): The feature ID, e.g. Fused_begin.
    value (bint): True to add the feature, False to remove it.
    '''
    self.vocab.morphology.set_feature(&self.c.morph, feature, value)
def check_morph(self, univ_morph_t feature):
    '''Return True if this token's morphology includes the given feature.'''
    return feature in self.vocab.morphology.get_features(self.c.morph)
def nbor(self, int i=1):
"""Get a neighboring token.

View File

@@ -232,14 +232,17 @@ cdef class Vocab:
cdef int i
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
features = props.get('morphology', frozenset())
props = intify_attrs(props, strings_map=self.strings,
_do_deprecated=True)
_do_deprecated=False)
token = &tokens[i]
# Set the special tokens up to have arbitrary attributes
lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
token.lex = lex
if TAG in props:
self.morphology.assign_tag(token, props[TAG])
for feature in features:
self.morphology.set_feature(&token.morph, feature, True)
for attr_id, value in props.items():
Token.set_struct_attr(token, attr_id, value)
Lexeme.set_struct_attr(lex, attr_id, value)