From 9c32388235777c3bf9518757ca8304b8c4af997c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sun, 25 Feb 2018 21:22:45 +0100
Subject: [PATCH] Wire up morphological features

---
 spacy/attrs.pyx                       |   4 +-
 spacy/lang/de/tokenizer_exceptions.py |   7 +-
 spacy/morphology.pxd                  |   6 +
 spacy/morphology.pyx                  | 284 ++++++++++++++------------
 spacy/symbols.pxd                     |   3 +
 spacy/symbols.pyx                     |   3 +
 spacy/tests/lang/de/test_text.py      |   9 +-
 spacy/tokens/token.pyx                |  10 +
 spacy/vocab.pyx                       |   5 +-
 9 files changed, 201 insertions(+), 130 deletions(-)

diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index ed1f39a3f..ad012bc3d 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -143,8 +143,10 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
     for name, value in stringy_attrs.items():
         if isinstance(name, int):
             int_key = name
-        else:
+        elif name.upper() in IDS:
             int_key = IDS[name.upper()]
+        else:
+            continue
         if strings_map is not None and isinstance(value, basestring):
             if hasattr(strings_map, 'add'):
                 value = strings_map.add(value)
diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py
index 8e041a740..d3b35dfa2 100644
--- a/spacy/lang/de/tokenizer_exceptions.py
+++ b/spacy/lang/de/tokenizer_exceptions.py
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 
 from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
+from ...morphology import Fused_begin, Fused_inside
 
 
 _exc = {
@@ -47,7 +48,11 @@ _exc = {
 
     "über'm": [
         {ORTH: "über", LEMMA: "über"},
-        {ORTH: "'m", LEMMA: "der", NORM: "dem"}]
+        {ORTH: "'m", LEMMA: "der", NORM: "dem"}],
+    "zum": [
+        {ORTH: "zu", LEMMA: "zu", "morphology": [Fused_begin]},
+        {ORTH: "m", LEMMA: "der", "morphology": [Fused_inside]}
+    ]
 }
 
 
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index d0110b300..e93ebde59 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -31,6 +31,7 @@ cdef class Morphology:
     cdef public object reverse_index
     cdef public object tag_names
     cdef public object exc
+    cdef public object _morph2features
 
     cdef RichTagC* rich_tags
     cdef PreshMapArray _cache
@@ -42,6 +43,8 @@ cdef class Morphology:
     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
 
     cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
+    
+    cdef int set_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
 
 
 cdef enum univ_morph_t:
@@ -298,4 +301,7 @@ cdef enum univ_morph_t:
     VerbType_mod # U
     VerbType_light # U
 
+    Fused_begin
+    Fused_inside
+
 
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index ab48427ce..a0397312c 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -9,6 +9,7 @@ from .attrs import LEMMA, intify_attrs
 from .parts_of_speech cimport SPACE
 from .parts_of_speech import IDS as POS_IDS
 from .lexeme cimport Lexeme
+from .strings cimport hash_string
 
 
 def _normalize_props(props):
@@ -29,6 +30,11 @@ def _normalize_props(props):
             out[key] = value
     return out
 
+cdef uint64_t hash_features(features):
+    # Sort + dedupe so equal feature sets always share one key. TODO: cheaper than str().
+    cdef unicode string = str(tuple(sorted(set(features))))
+    return hash_string(string)
+
 
 cdef class Morphology:
     def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
@@ -36,7 +42,7 @@ cdef class Morphology:
         self.strings = string_store
         # Add special space symbol. We prefix with underscore, to make sure it
         # always sorts to the end.
-        space_attrs = tag_map.get('SP', {POS: SPACE})
+        space_attrs = tag_map.get('_SP', tag_map.get('SP', {POS: SPACE}))
         if '_SP' not in tag_map:
             self.strings.add('_SP')
             tag_map = dict(tag_map)
@@ -48,16 +54,19 @@ cdef class Morphology:
         self.reverse_index = {}
 
         self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
+        self._morph2features = {}
         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
+            features = attrs.get('morphology', frozenset())
             self.strings.add(tag_str)
             self.tag_map[tag_str] = dict(attrs)
             attrs = _normalize_props(attrs)
             attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
             self.rich_tags[i].id = i
             self.rich_tags[i].name = self.strings.add(tag_str)
-            self.rich_tags[i].morph = 0
+            self.rich_tags[i].morph = hash_features(features)
             self.rich_tags[i].pos = attrs[POS]
             self.reverse_index[self.rich_tags[i].name] = i
+            self._morph2features[self.rich_tags[i].morph] = features
         # Add a 'null' tag, which we can reference when assign morphology to
         # untagged tokens.
         self.rich_tags[self.n_tags].id = self.n_tags
@@ -114,12 +123,30 @@ cdef class Morphology:
         token.tag = analysis.tag.name
         token.morph = analysis.tag.morph
 
-    cdef int assign_feature(self, uint64_t* flags, univ_morph_t flag_id, bint value) except -1:
-        cdef flags_t one = 1
-        if value:
-            flags[0] |= one << flag_id
-        else:
-            flags[0] &= ~(one << flag_id)
+    cdef int assign_feature(self, uint64_t* morph, univ_morph_t flag_id, bint value) except -1:
+        # Deprecated: intentionally a no-op, `morph` is left unchanged. See set_feature().
+        pass
+
+    cdef int set_feature(self, uint64_t* morph, univ_morph_t flag_id, bint value) except -1:
+        '''Point `morph[0]` at the key for its current feature set with
+        `flag_id` switched on (`value` true) or off (`value` false).
+        '''
+        cdef uint64_t key
+        cdef attr_t flag = flag_id
+        current = self._morph2features.get(morph[0], {})
+        if (flag in current) == value:
+            # Already in the requested state: keep the existing key.
+            return 0
+        if value:
+            updated = frozenset(current) | frozenset([flag])
+        else:
+            updated = frozenset(current) - frozenset([flag])
+        key = hash_features(updated)
+        morph[0] = key
+        self._morph2features[key] = updated
+
+    def get_features(self, uint64_t morph):
+        return self._morph2features.get(morph, frozenset())  # empty set if key unseen
 
     def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
                          force=False):
@@ -140,6 +167,9 @@ cdef class Morphology:
         tag_id = self.reverse_index[tag]
         orth = self.strings[orth_str]
         cdef RichTagC rich_tag = self.rich_tags[tag_id]
+        features = attrs.get('morphology', frozenset())
+        cdef uint64_t morph = hash_features(features)
+        self._morph2features[morph] = features
         attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
         cached = <MorphAnalysisC*>self._cache.get(tag_id, orth)
         if cached is NULL:
@@ -152,12 +182,11 @@ cdef class Morphology:
                 "force=True to overwrite." % (tag_str, orth_str))
 
         cached.tag = rich_tag
+        cached.tag.morph = morph
         # TODO: Refactor this to take arbitrary attributes.
         for name_id, value_id in attrs.items():
             if name_id == LEMMA:
                 cached.lemma = value_id
-            else:
-                self.assign_feature(&cached.tag.morph, name_id, value_id)
         if cached.lemma == 0:
             cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs)
         self._cache.set(tag_id, orth, <void*>cached)
@@ -318,122 +347,125 @@ IDS = {
     "AdvType_sta": AdvType_sta,
     "AdvType_ex": AdvType_ex,
     "AdvType_adadj": AdvType_adadj,
-    "ConjType_oper ": ConjType_oper,  # cz, U,
-    "ConjType_comp ": ConjType_comp,  # cz, U,
-    "Connegative_yes ": Connegative_yes,  # fi,
-    "Derivation_minen ": Derivation_minen,  # fi,
-    "Derivation_sti ": Derivation_sti,  # fi,
-    "Derivation_inen ": Derivation_inen,  # fi,
-    "Derivation_lainen ": Derivation_lainen,  # fi,
-    "Derivation_ja ": Derivation_ja,  # fi,
-    "Derivation_ton ": Derivation_ton,  # fi,
-    "Derivation_vs ": Derivation_vs,  # fi,
-    "Derivation_ttain ": Derivation_ttain,  # fi,
-    "Derivation_ttaa ": Derivation_ttaa,  # fi,
-    "Echo_rdp ": Echo_rdp,  # U,
-    "Echo_ech ": Echo_ech,  # U,
-    "Foreign_foreign ": Foreign_foreign,  # cz, fi, U,
-    "Foreign_fscript ": Foreign_fscript,  # cz, fi, U,
-    "Foreign_tscript ": Foreign_tscript,  # cz, U,
-    "Foreign_yes ": Foreign_yes,  # sl,
-    "Gender_dat_masc ": Gender_dat_masc,  # bq, U,
-    "Gender_dat_fem ": Gender_dat_fem,  # bq, U,
-    "Gender_erg_masc ": Gender_erg_masc,  # bq,
-    "Gender_erg_fem ": Gender_erg_fem,  # bq,
-    "Gender_psor_masc ": Gender_psor_masc,  # cz, sl, U,
-    "Gender_psor_fem ": Gender_psor_fem,  # cz, sl, U,
-    "Gender_psor_neut ": Gender_psor_neut,  # sl,
-    "Hyph_yes ": Hyph_yes,  # cz, U,
-    "InfForm_one ": InfForm_one,  # fi,
-    "InfForm_two ": InfForm_two,  # fi,
-    "InfForm_three ": InfForm_three,  # fi,
-    "NameType_geo ": NameType_geo,  # U, cz,
-    "NameType_prs ": NameType_prs,  # U, cz,
-    "NameType_giv ": NameType_giv,  # U, cz,
-    "NameType_sur ": NameType_sur,  # U, cz,
-    "NameType_nat ": NameType_nat,  # U, cz,
-    "NameType_com ": NameType_com,  # U, cz,
-    "NameType_pro ": NameType_pro,  # U, cz,
-    "NameType_oth ": NameType_oth,  # U, cz,
-    "NounType_com ": NounType_com,  # U,
-    "NounType_prop ": NounType_prop,  # U,
-    "NounType_class ": NounType_class,  # U,
-    "Number_abs_sing ": Number_abs_sing,  # bq, U,
-    "Number_abs_plur ": Number_abs_plur,  # bq, U,
-    "Number_dat_sing ": Number_dat_sing,  # bq, U,
-    "Number_dat_plur ": Number_dat_plur,  # bq, U,
-    "Number_erg_sing ": Number_erg_sing,  # bq, U,
-    "Number_erg_plur ": Number_erg_plur,  # bq, U,
-    "Number_psee_sing ": Number_psee_sing,  # U,
-    "Number_psee_plur ": Number_psee_plur,  # U,
-    "Number_psor_sing ": Number_psor_sing,  # cz, fi, sl, U,
-    "Number_psor_plur ": Number_psor_plur,  # cz, fi, sl, U,
-    "NumForm_digit ": NumForm_digit,  # cz, sl, U,
-    "NumForm_roman ": NumForm_roman,  # cz, sl, U,
-    "NumForm_word ": NumForm_word,  # cz, sl, U,
-    "NumValue_one ": NumValue_one,  # cz, U,
-    "NumValue_two ": NumValue_two,  # cz, U,
-    "NumValue_three ": NumValue_three,  # cz, U,
-    "PartForm_pres ": PartForm_pres,  # fi,
-    "PartForm_past ": PartForm_past,  # fi,
-    "PartForm_agt ": PartForm_agt,  # fi,
-    "PartForm_neg ": PartForm_neg,  # fi,
-    "PartType_mod ": PartType_mod,  # U,
-    "PartType_emp ": PartType_emp,  # U,
-    "PartType_res ": PartType_res,  # U,
-    "PartType_inf ": PartType_inf,  # U,
-    "PartType_vbp ": PartType_vbp,  # U,
-    "Person_abs_one ": Person_abs_one,  # bq, U,
-    "Person_abs_two ": Person_abs_two,  # bq, U,
-    "Person_abs_three ": Person_abs_three,  # bq, U,
-    "Person_dat_one ": Person_dat_one,  # bq, U,
-    "Person_dat_two ": Person_dat_two,  # bq, U,
-    "Person_dat_three ": Person_dat_three,  # bq, U,
-    "Person_erg_one ": Person_erg_one,  # bq, U,
-    "Person_erg_two ": Person_erg_two,  # bq, U,
-    "Person_erg_three ": Person_erg_three,  # bq, U,
-    "Person_psor_one ": Person_psor_one,  # fi, U,
-    "Person_psor_two ": Person_psor_two,  # fi, U,
-    "Person_psor_three ": Person_psor_three,  # fi, U,
-    "Polite_inf ": Polite_inf,  # bq, U,
-    "Polite_pol ": Polite_pol,  # bq, U,
-    "Polite_abs_inf ": Polite_abs_inf,  # bq, U,
-    "Polite_abs_pol ": Polite_abs_pol,  # bq, U,
-    "Polite_erg_inf ": Polite_erg_inf,  # bq, U,
-    "Polite_erg_pol ": Polite_erg_pol,  # bq, U,
-    "Polite_dat_inf ": Polite_dat_inf,  # bq, U,
-    "Polite_dat_pol ": Polite_dat_pol,  # bq, U,
-    "Prefix_yes ": Prefix_yes,  # U,
-    "PrepCase_npr ": PrepCase_npr,  # cz,
-    "PrepCase_pre ": PrepCase_pre,  # U,
-    "PunctSide_ini ": PunctSide_ini,  # U,
-    "PunctSide_fin ": PunctSide_fin,  # U,
-    "PunctType_peri ": PunctType_peri,  # U,
-    "PunctType_qest ": PunctType_qest,  # U,
-    "PunctType_excl ": PunctType_excl,  # U,
-    "PunctType_quot ": PunctType_quot,  # U,
-    "PunctType_brck ": PunctType_brck,  # U,
-    "PunctType_comm ": PunctType_comm,  # U,
-    "PunctType_colo ": PunctType_colo,  # U,
-    "PunctType_semi ": PunctType_semi,  # U,
-    "PunctType_dash ": PunctType_dash,  # U,
-    "Style_arch ": Style_arch,  # cz, fi, U,
-    "Style_rare ": Style_rare,  # cz, fi, U,
-    "Style_poet ": Style_poet,  # cz, U,
-    "Style_norm ": Style_norm,  # cz, U,
-    "Style_coll ": Style_coll,  # cz, U,
-    "Style_vrnc ": Style_vrnc,  # cz, U,
-    "Style_sing ": Style_sing,  # cz, U,
-    "Style_expr ": Style_expr,  # cz, U,
-    "Style_derg ": Style_derg,  # cz, U,
-    "Style_vulg ": Style_vulg,  # cz, U,
-    "Style_yes ": Style_yes,  # fi, U,
-    "StyleVariant_styleShort ": StyleVariant_styleShort,  # cz,
-    "StyleVariant_styleBound ": StyleVariant_styleBound,  # cz, sl,
-    "VerbType_aux ": VerbType_aux,  # U,
-    "VerbType_cop ": VerbType_cop,  # U,
-    "VerbType_mod ": VerbType_mod,  # U,
-    "VerbType_light ": VerbType_light,  # U,
+    "ConjType_oper": ConjType_oper,  # cz, U,
+    "ConjType_comp": ConjType_comp,  # cz, U,
+    "Connegative_yes": Connegative_yes,  # fi,
+    "Derivation_minen": Derivation_minen,  # fi,
+    "Derivation_sti": Derivation_sti,  # fi,
+    "Derivation_inen": Derivation_inen,  # fi,
+    "Derivation_lainen": Derivation_lainen,  # fi,
+    "Derivation_ja": Derivation_ja,  # fi,
+    "Derivation_ton": Derivation_ton,  # fi,
+    "Derivation_vs": Derivation_vs,  # fi,
+    "Derivation_ttain": Derivation_ttain,  # fi,
+    "Derivation_ttaa": Derivation_ttaa,  # fi,
+    "Echo_rdp": Echo_rdp,  # U,
+    "Echo_ech": Echo_ech,  # U,
+    "Foreign_foreign": Foreign_foreign,  # cz, fi, U,
+    "Foreign_fscript": Foreign_fscript,  # cz, fi, U,
+    "Foreign_tscript": Foreign_tscript,  # cz, U,
+    "Foreign_yes": Foreign_yes,  # sl,
+    "Gender_dat_masc": Gender_dat_masc,  # bq, U,
+    "Gender_dat_fem": Gender_dat_fem,  # bq, U,
+    "Gender_erg_masc": Gender_erg_masc,  # bq,
+    "Gender_erg_fem": Gender_erg_fem,  # bq,
+    "Gender_psor_masc": Gender_psor_masc,  # cz, sl, U,
+    "Gender_psor_fem": Gender_psor_fem,  # cz, sl, U,
+    "Gender_psor_neut": Gender_psor_neut,  # sl,
+    "Hyph_yes": Hyph_yes,  # cz, U,
+    "InfForm_one": InfForm_one,  # fi,
+    "InfForm_two": InfForm_two,  # fi,
+    "InfForm_three": InfForm_three,  # fi,
+    "NameType_geo": NameType_geo,  # U, cz,
+    "NameType_prs": NameType_prs,  # U, cz,
+    "NameType_giv": NameType_giv,  # U, cz,
+    "NameType_sur": NameType_sur,  # U, cz,
+    "NameType_nat": NameType_nat,  # U, cz,
+    "NameType_com": NameType_com,  # U, cz,
+    "NameType_pro": NameType_pro,  # U, cz,
+    "NameType_oth": NameType_oth,  # U, cz,
+    "NounType_com": NounType_com,  # U,
+    "NounType_prop": NounType_prop,  # U,
+    "NounType_class": NounType_class,  # U,
+    "Number_abs_sing": Number_abs_sing,  # bq, U,
+    "Number_abs_plur": Number_abs_plur,  # bq, U,
+    "Number_dat_sing": Number_dat_sing,  # bq, U,
+    "Number_dat_plur": Number_dat_plur,  # bq, U,
+    "Number_erg_sing": Number_erg_sing,  # bq, U,
+    "Number_erg_plur": Number_erg_plur,  # bq, U,
+    "Number_psee_sing": Number_psee_sing,  # U,
+    "Number_psee_plur": Number_psee_plur,  # U,
+    "Number_psor_sing": Number_psor_sing,  # cz, fi, sl, U,
+    "Number_psor_plur": Number_psor_plur,  # cz, fi, sl, U,
+    "NumForm_digit": NumForm_digit,  # cz, sl, U,
+    "NumForm_roman": NumForm_roman,  # cz, sl, U,
+    "NumForm_word": NumForm_word,  # cz, sl, U,
+    "NumValue_one": NumValue_one,  # cz, U,
+    "NumValue_two": NumValue_two,  # cz, U,
+    "NumValue_three": NumValue_three,  # cz, U,
+    "PartForm_pres": PartForm_pres,  # fi,
+    "PartForm_past": PartForm_past,  # fi,
+    "PartForm_agt": PartForm_agt,  # fi,
+    "PartForm_neg": PartForm_neg,  # fi,
+    "PartType_mod": PartType_mod,  # U,
+    "PartType_emp": PartType_emp,  # U,
+    "PartType_res": PartType_res,  # U,
+    "PartType_inf": PartType_inf,  # U,
+    "PartType_vbp": PartType_vbp,  # U,
+    "Person_abs_one": Person_abs_one,  # bq, U,
+    "Person_abs_two": Person_abs_two,  # bq, U,
+    "Person_abs_three": Person_abs_three,  # bq, U,
+    "Person_dat_one": Person_dat_one,  # bq, U,
+    "Person_dat_two": Person_dat_two,  # bq, U,
+    "Person_dat_three": Person_dat_three,  # bq, U,
+    "Person_erg_one": Person_erg_one,  # bq, U,
+    "Person_erg_two": Person_erg_two,  # bq, U,
+    "Person_erg_three": Person_erg_three,  # bq, U,
+    "Person_psor_one": Person_psor_one,  # fi, U,
+    "Person_psor_two": Person_psor_two,  # fi, U,
+    "Person_psor_three": Person_psor_three,  # fi, U,
+    "Polite_inf": Polite_inf,  # bq, U,
+    "Polite_pol": Polite_pol,  # bq, U,
+    "Polite_abs_inf": Polite_abs_inf,  # bq, U,
+    "Polite_abs_pol": Polite_abs_pol,  # bq, U,
+    "Polite_erg_inf": Polite_erg_inf,  # bq, U,
+    "Polite_erg_pol": Polite_erg_pol,  # bq, U,
+    "Polite_dat_inf": Polite_dat_inf,  # bq, U,
+    "Polite_dat_pol": Polite_dat_pol,  # bq, U,
+    "Prefix_yes": Prefix_yes,  # U,
+    "PrepCase_npr": PrepCase_npr,  # cz,
+    "PrepCase_pre": PrepCase_pre,  # U,
+    "PunctSide_ini": PunctSide_ini,  # U,
+    "PunctSide_fin": PunctSide_fin,  # U,
+    "PunctType_peri": PunctType_peri,  # U,
+    "PunctType_qest": PunctType_qest,  # U,
+    "PunctType_excl": PunctType_excl,  # U,
+    "PunctType_quot": PunctType_quot,  # U,
+    "PunctType_brck": PunctType_brck,  # U,
+    "PunctType_comm": PunctType_comm,  # U,
+    "PunctType_colo": PunctType_colo,  # U,
+    "PunctType_semi": PunctType_semi,  # U,
+    "PunctType_dash": PunctType_dash,  # U,
+    "Style_arch": Style_arch,  # cz, fi, U,
+    "Style_rare": Style_rare,  # cz, fi, U,
+    "Style_poet": Style_poet,  # cz, U,
+    "Style_norm": Style_norm,  # cz, U,
+    "Style_coll": Style_coll,  # cz, U,
+    "Style_vrnc": Style_vrnc,  # cz, U,
+    "Style_sing": Style_sing,  # cz, U,
+    "Style_expr": Style_expr,  # cz, U,
+    "Style_derg": Style_derg,  # cz, U,
+    "Style_vulg": Style_vulg,  # cz, U,
+    "Style_yes": Style_yes,  # fi, U,
+    "StyleVariant_styleShort": StyleVariant_styleShort,  # cz,
+    "StyleVariant_styleBound": StyleVariant_styleBound,  # cz, sl,
+    "VerbType_aux": VerbType_aux,  # U,
+    "VerbType_cop": VerbType_cop,  # U,
+    "VerbType_mod": VerbType_mod,  # U,
+    "VerbType_light": VerbType_light,  # U,
+
+    "Fused_begin": Fused_begin,   # Internal
+    "Fused_inside": Fused_inside # Internal
 }
 
 
diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd
index 051b92edb..c89a7e06c 100644
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@@ -384,6 +384,9 @@ cdef enum symbol_t:
     VerbType_cop # U
     VerbType_mod # U
     VerbType_light # U
+    
+    Fused_begin
+    Fused_inside
 
     PERSON
     NORP
diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx
index 949621820..77ab6fba3 100644
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@@ -389,6 +389,9 @@ IDS = {
     "VerbType_cop": VerbType_cop, # U,
     "VerbType_mod": VerbType_mod, # U,
     "VerbType_light": VerbType_light, # U,
+    
+    "Fused_begin": Fused_begin,
+    "Fused_inside": Fused_inside,
 
     "PERSON": PERSON,
     "NORP": NORP,
diff --git a/spacy/tests/lang/de/test_text.py b/spacy/tests/lang/de/test_text.py
index 34180b982..65fc8a28a 100644
--- a/spacy/tests/lang/de/test_text.py
+++ b/spacy/tests/lang/de/test_text.py
@@ -5,6 +5,7 @@
 from __future__ import unicode_literals
 
 import pytest
+from ....morphology import Fused_begin, Fused_inside
 
 
 def test_tokenizer_handles_long_text(de_tokenizer):
@@ -22,9 +23,15 @@ Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen.
 »Was ist mit mir geschehen?«, dachte er."""
 
     tokens = de_tokenizer(text)
-    assert len(tokens) == 109
+    assert len(tokens) == 110
 
 
+def test_fused(de_tokenizer):
+    doc = de_tokenizer('zum')  # expected to split as "zu" + "m" via the tokenizer exception
+    assert len(doc) == 2
+    assert doc[0].check_morph(Fused_begin)
+    assert doc[1].check_morph(Fused_inside)
+
 @pytest.mark.parametrize('text', [
     "Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten",
     "Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 9e4b878cf..677cee463 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -10,6 +10,7 @@ cimport numpy as np
 np.import_array()
 import numpy
 
+from ..morphology cimport univ_morph_t
 from ..typedefs cimport hash_t
 from ..lexeme cimport Lexeme
 from .. import parts_of_speech
@@ -128,6 +129,15 @@ cdef class Token:
         """
         return Lexeme.c_check_flag(self.c.lex, flag_id)
 
+    def set_morph(self, univ_morph_t feature, bint value):
+        '''Switch `feature` on (`value` true) or off (`value` false) in this token's morph key.'''
+        self.vocab.morphology.set_feature(&self.c.morph, feature, value)
+
+    def check_morph(self, univ_morph_t feature):
+        '''Return True if `feature` is in the feature set recorded for this token's morph key.'''
+        features = self.vocab.morphology.get_features(self.c.morph)
+        return feature in features
+
     def nbor(self, int i=1):
         """Get a neighboring token.
 
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 0a675253b..580e527e4 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -232,14 +232,17 @@ cdef class Vocab:
         cdef int i
         tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
         for i, props in enumerate(substrings):
+            features = props.get('morphology', frozenset())
             props = intify_attrs(props, strings_map=self.strings,
-                                 _do_deprecated=True)
+                                 _do_deprecated=False)
             token = &tokens[i]
             # Set the special tokens up to have arbitrary attributes
             lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
             token.lex = lex
             if TAG in props:
                 self.morphology.assign_tag(token, props[TAG])
+            for feature in features:
+                self.morphology.set_feature(&token.morph, feature, True)
             for attr_id, value in props.items():
                 Token.set_struct_attr(token, attr_id, value)
                 Lexeme.set_struct_attr(lex, attr_id, value)