From 9c32388235777c3bf9518757ca8304b8c4af997c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 25 Feb 2018 21:22:45 +0100 Subject: [PATCH] Wire up morphological features --- spacy/attrs.pyx | 4 +- spacy/lang/de/tokenizer_exceptions.py | 7 +- spacy/morphology.pxd | 6 + spacy/morphology.pyx | 284 ++++++++++++++------------ spacy/symbols.pxd | 3 + spacy/symbols.pyx | 3 + spacy/tests/lang/de/test_text.py | 9 +- spacy/tokens/token.pyx | 10 + spacy/vocab.pyx | 5 +- 9 files changed, 201 insertions(+), 130 deletions(-) diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index ed1f39a3f..ad012bc3d 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -143,8 +143,10 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): for name, value in stringy_attrs.items(): if isinstance(name, int): int_key = name - else: + elif name.upper() in IDS: int_key = IDS[name.upper()] + else: + continue if strings_map is not None and isinstance(value, basestring): if hasattr(strings_map, 'add'): value = strings_map.add(value) diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py index 8e041a740..d3b35dfa2 100644 --- a/spacy/lang/de/tokenizer_exceptions.py +++ b/spacy/lang/de/tokenizer_exceptions.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA +from ...morphology import Fused_begin, Fused_inside _exc = { @@ -47,7 +48,11 @@ _exc = { "über'm": [ {ORTH: "über", LEMMA: "über"}, - {ORTH: "'m", LEMMA: "der", NORM: "dem"}] + {ORTH: "'m", LEMMA: "der", NORM: "dem"}], + "zum": [ + {ORTH: "zu", LEMMA: "zu", "morphology": [Fused_begin]}, + {ORTH: "m", LEMMA: "der", "morphology": [Fused_inside]} + ] } diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index d0110b300..e93ebde59 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -31,6 +31,7 @@ cdef class Morphology: cdef public object reverse_index cdef public object tag_names cdef public object exc + cdef public object _morph2features cdef RichTagC* rich_tags cdef PreshMapArray _cache @@ -42,6 +43,8 @@ cdef class Morphology: cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1 + + cdef int set_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1 cdef enum univ_morph_t: @@ -298,4 +301,7 @@ cdef enum univ_morph_t: VerbType_mod # U VerbType_light # U + Fused_begin + Fused_inside + diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index ab48427ce..a0397312c 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -9,6 +9,7 @@ from .attrs import LEMMA, intify_attrs from .parts_of_speech cimport SPACE from .parts_of_speech import IDS as POS_IDS from .lexeme cimport Lexeme +from .strings cimport hash_string def _normalize_props(props): @@ -29,6 +30,11 @@ def _normalize_props(props): out[key] = value return out +cdef uint64_t hash_features(features): + # TODO improve this + cdef unicode string = str(tuple(features)) + return hash_string(string) + cdef class Morphology: def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None): @@ -36,7 +42,7 @@ cdef class Morphology: self.strings = string_store # Add special space symbol. We prefix with underscore, to make sure it # always sorts to the end. - space_attrs = tag_map.get('SP', {POS: SPACE}) + space_attrs = tag_map.get('_SP', tag_map.get('SP', {POS: SPACE})) if '_SP' not in tag_map: self.strings.add('_SP') tag_map = dict(tag_map) @@ -48,16 +54,19 @@ cdef class Morphology: self.reverse_index = {} self.rich_tags = self.mem.alloc(self.n_tags+1, sizeof(RichTagC)) + self._morph2features = {} for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): + features = attrs.get('morphology', frozenset()) self.strings.add(tag_str) self.tag_map[tag_str] = dict(attrs) attrs = _normalize_props(attrs) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) self.rich_tags[i].id = i self.rich_tags[i].name = self.strings.add(tag_str) - self.rich_tags[i].morph = 0 + self.rich_tags[i].morph = hash_features(features) self.rich_tags[i].pos = attrs[POS] self.reverse_index[self.rich_tags[i].name] = i + self._morph2features[self.rich_tags[i].morph] = features # Add a 'null' tag, which we can reference when assign morphology to # untagged tokens. self.rich_tags[self.n_tags].id = self.n_tags @@ -114,12 +123,30 @@ cdef class Morphology: token.tag = analysis.tag.name token.morph = analysis.tag.morph - cdef int assign_feature(self, uint64_t* flags, univ_morph_t flag_id, bint value) except -1: - cdef flags_t one = 1 - if value: - flags[0] |= one << flag_id - else: - flags[0] &= ~(one << flag_id) + cdef int assign_feature(self, uint64_t* morph, univ_morph_t flag_id, bint value) except -1: + # Deprecated + pass + + cdef int set_feature(self, uint64_t* morph, univ_morph_t flag_id, bint value) except -1: + '''Update a morph attribute in-place, so that it indicates the given + feature. + ''' + features = self._morph2features.get(morph[0], {}) + cdef uint64_t key + cdef attr_t flag = flag_id + if (flag in features) != value: + new_features = set(features) + if value: + new_features.add(flag) + else: + new_features.remove(flag) + new_features = frozenset(new_features) + key = hash_features(new_features) + morph[0] = key + self._morph2features[morph[0]] = new_features + + def get_features(self, uint64_t morph): + return self._morph2features.get(morph, frozenset()) def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False): @@ -140,6 +167,9 @@ cdef class Morphology: tag_id = self.reverse_index[tag] orth = self.strings[orth_str] cdef RichTagC rich_tag = self.rich_tags[tag_id] + features = attrs.get('morphology', frozenset()) + cdef uint64_t morph = hash_features(features) + self._morph2features[morph] = features attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) cached = self._cache.get(tag_id, orth) if cached is NULL: @@ -152,12 +182,11 @@ cdef class Morphology: "force=True to overwrite." % (tag_str, orth_str)) cached.tag = rich_tag + cached.tag.morph = morph # TODO: Refactor this to take arbitrary attributes. for name_id, value_id in attrs.items(): if name_id == LEMMA: cached.lemma = value_id - else: - self.assign_feature(&cached.tag.morph, name_id, value_id) if cached.lemma == 0: cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs) self._cache.set(tag_id, orth, cached) @@ -318,122 +347,125 @@ IDS = { "AdvType_sta": AdvType_sta, "AdvType_ex": AdvType_ex, "AdvType_adadj": AdvType_adadj, - "ConjType_oper ": ConjType_oper, # cz, U, - "ConjType_comp ": ConjType_comp, # cz, U, - "Connegative_yes ": Connegative_yes, # fi, - "Derivation_minen ": Derivation_minen, # fi, - "Derivation_sti ": Derivation_sti, # fi, - "Derivation_inen ": Derivation_inen, # fi, - "Derivation_lainen ": Derivation_lainen, # fi, - "Derivation_ja ": Derivation_ja, # fi, - "Derivation_ton ": Derivation_ton, # fi, - "Derivation_vs ": Derivation_vs, # fi, - "Derivation_ttain ": Derivation_ttain, # fi, - "Derivation_ttaa ": Derivation_ttaa, # fi, - "Echo_rdp ": Echo_rdp, # U, - "Echo_ech ": Echo_ech, # U, - "Foreign_foreign ": Foreign_foreign, # cz, fi, U, - "Foreign_fscript ": Foreign_fscript, # cz, fi, U, - "Foreign_tscript ": Foreign_tscript, # cz, U, - "Foreign_yes ": Foreign_yes, # sl, - "Gender_dat_masc ": Gender_dat_masc, # bq, U, - "Gender_dat_fem ": Gender_dat_fem, # bq, U, - "Gender_erg_masc ": Gender_erg_masc, # bq, - "Gender_erg_fem ": Gender_erg_fem, # bq, - "Gender_psor_masc ": Gender_psor_masc, # cz, sl, U, - "Gender_psor_fem ": Gender_psor_fem, # cz, sl, U, - "Gender_psor_neut ": Gender_psor_neut, # sl, - "Hyph_yes ": Hyph_yes, # cz, U, - "InfForm_one ": InfForm_one, # fi, - "InfForm_two ": InfForm_two, # fi, - "InfForm_three ": InfForm_three, # fi, - "NameType_geo ": NameType_geo, # U, cz, - "NameType_prs ": NameType_prs, # U, cz, - "NameType_giv ": NameType_giv, # U, cz, - "NameType_sur ": NameType_sur, # U, cz, - "NameType_nat ": NameType_nat, # U, cz, - "NameType_com ": NameType_com, # U, cz, - "NameType_pro ": NameType_pro, # U, cz, - "NameType_oth ": NameType_oth, # U, cz, - "NounType_com ": NounType_com, # U, - "NounType_prop ": NounType_prop, # U, - "NounType_class ": NounType_class, # U, - "Number_abs_sing ": Number_abs_sing, # bq, U, - "Number_abs_plur ": Number_abs_plur, # bq, U, - "Number_dat_sing ": Number_dat_sing, # bq, U, - "Number_dat_plur ": Number_dat_plur, # bq, U, - "Number_erg_sing ": Number_erg_sing, # bq, U, - "Number_erg_plur ": Number_erg_plur, # bq, U, - "Number_psee_sing ": Number_psee_sing, # U, - "Number_psee_plur ": Number_psee_plur, # U, - "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U, - "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U, - "NumForm_digit ": NumForm_digit, # cz, sl, U, - "NumForm_roman ": NumForm_roman, # cz, sl, U, - "NumForm_word ": NumForm_word, # cz, sl, U, - "NumValue_one ": NumValue_one, # cz, U, - "NumValue_two ": NumValue_two, # cz, U, - "NumValue_three ": NumValue_three, # cz, U, - "PartForm_pres ": PartForm_pres, # fi, - "PartForm_past ": PartForm_past, # fi, - "PartForm_agt ": PartForm_agt, # fi, - "PartForm_neg ": PartForm_neg, # fi, - "PartType_mod ": PartType_mod, # U, - "PartType_emp ": PartType_emp, # U, - "PartType_res ": PartType_res, # U, - "PartType_inf ": PartType_inf, # U, - "PartType_vbp ": PartType_vbp, # U, - "Person_abs_one ": Person_abs_one, # bq, U, - "Person_abs_two ": Person_abs_two, # bq, U, - "Person_abs_three ": Person_abs_three, # bq, U, - "Person_dat_one ": Person_dat_one, # bq, U, - "Person_dat_two ": Person_dat_two, # bq, U, - "Person_dat_three ": Person_dat_three, # bq, U, - "Person_erg_one ": Person_erg_one, # bq, U, - "Person_erg_two ": Person_erg_two, # bq, U, - "Person_erg_three ": Person_erg_three, # bq, U, - "Person_psor_one ": Person_psor_one, # fi, U, - "Person_psor_two ": Person_psor_two, # fi, U, - "Person_psor_three ": Person_psor_three, # fi, U, - "Polite_inf ": Polite_inf, # bq, U, - "Polite_pol ": Polite_pol, # bq, U, - "Polite_abs_inf ": Polite_abs_inf, # bq, U, - "Polite_abs_pol ": Polite_abs_pol, # bq, U, - "Polite_erg_inf ": Polite_erg_inf, # bq, U, - "Polite_erg_pol ": Polite_erg_pol, # bq, U, - "Polite_dat_inf ": Polite_dat_inf, # bq, U, - "Polite_dat_pol ": Polite_dat_pol, # bq, U, - "Prefix_yes ": Prefix_yes, # U, - "PrepCase_npr ": PrepCase_npr, # cz, - "PrepCase_pre ": PrepCase_pre, # U, - "PunctSide_ini ": PunctSide_ini, # U, - "PunctSide_fin ": PunctSide_fin, # U, - "PunctType_peri ": PunctType_peri, # U, - "PunctType_qest ": PunctType_qest, # U, - "PunctType_excl ": PunctType_excl, # U, - "PunctType_quot ": PunctType_quot, # U, - "PunctType_brck ": PunctType_brck, # U, - "PunctType_comm ": PunctType_comm, # U, - "PunctType_colo ": PunctType_colo, # U, - "PunctType_semi ": PunctType_semi, # U, - "PunctType_dash ": PunctType_dash, # U, - "Style_arch ": Style_arch, # cz, fi, U, - "Style_rare ": Style_rare, # cz, fi, U, - "Style_poet ": Style_poet, # cz, U, - "Style_norm ": Style_norm, # cz, U, - "Style_coll ": Style_coll, # cz, U, - "Style_vrnc ": Style_vrnc, # cz, U, - "Style_sing ": Style_sing, # cz, U, - "Style_expr ": Style_expr, # cz, U, - "Style_derg ": Style_derg, # cz, U, - "Style_vulg ": Style_vulg, # cz, U, - "Style_yes ": Style_yes, # fi, U, - "StyleVariant_styleShort ": StyleVariant_styleShort, # cz, - "StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl, - "VerbType_aux ": VerbType_aux, # U, - "VerbType_cop ": VerbType_cop, # U, - "VerbType_mod ": VerbType_mod, # U, - "VerbType_light ": VerbType_light, # U, + "ConjType_oper": ConjType_oper, # cz, U, + "ConjType_comp": ConjType_comp, # cz, U, + "Connegative_yes": Connegative_yes, # fi, + "Derivation_minen": Derivation_minen, # fi, + "Derivation_sti": Derivation_sti, # fi, + "Derivation_inen": Derivation_inen, # fi, + "Derivation_lainen": Derivation_lainen, # fi, + "Derivation_ja": Derivation_ja, # fi, + "Derivation_ton": Derivation_ton, # fi, + "Derivation_vs": Derivation_vs, # fi, + "Derivation_ttain": Derivation_ttain, # fi, + "Derivation_ttaa": Derivation_ttaa, # fi, + "Echo_rdp": Echo_rdp, # U, + "Echo_ech": Echo_ech, # U, + "Foreign_foreign": Foreign_foreign, # cz, fi, U, + "Foreign_fscript": Foreign_fscript, # cz, fi, U, + "Foreign_tscript": Foreign_tscript, # cz, U, + "Foreign_yes": Foreign_yes, # sl, + "Gender_dat_masc": Gender_dat_masc, # bq, U, + "Gender_dat_fem": Gender_dat_fem, # bq, U, + "Gender_erg_masc": Gender_erg_masc, # bq, + "Gender_erg_fem": Gender_erg_fem, # bq, + "Gender_psor_masc": Gender_psor_masc, # cz, sl, U, + "Gender_psor_fem": Gender_psor_fem, # cz, sl, U, + "Gender_psor_neut": Gender_psor_neut, # sl, + "Hyph_yes": Hyph_yes, # cz, U, + "InfForm_one": InfForm_one, # fi, + "InfForm_two": InfForm_two, # fi, + "InfForm_three": InfForm_three, # fi, + "NameType_geo": NameType_geo, # U, cz, + "NameType_prs": NameType_prs, # U, cz, + "NameType_giv": NameType_giv, # U, cz, + "NameType_sur": NameType_sur, # U, cz, + "NameType_nat": NameType_nat, # U, cz, + "NameType_com": NameType_com, # U, cz, + "NameType_pro": NameType_pro, # U, cz, + "NameType_oth": NameType_oth, # U, cz, + "NounType_com": NounType_com, # U, + "NounType_prop": NounType_prop, # U, + "NounType_class": NounType_class, # U, + "Number_abs_sing": Number_abs_sing, # bq, U, + "Number_abs_plur": Number_abs_plur, # bq, U, + "Number_dat_sing": Number_dat_sing, # bq, U, + "Number_dat_plur": Number_dat_plur, # bq, U, + "Number_erg_sing": Number_erg_sing, # bq, U, + "Number_erg_plur": Number_erg_plur, # bq, U, + "Number_psee_sing": Number_psee_sing, # U, + "Number_psee_plur": Number_psee_plur, # U, + "Number_psor_sing": Number_psor_sing, # cz, fi, sl, U, + "Number_psor_plur": Number_psor_plur, # cz, fi, sl, U, + "NumForm_digit": NumForm_digit, # cz, sl, U, + "NumForm_roman": NumForm_roman, # cz, sl, U, + "NumForm_word": NumForm_word, # cz, sl, U, + "NumValue_one": NumValue_one, # cz, U, + "NumValue_two": NumValue_two, # cz, U, + "NumValue_three": NumValue_three, # cz, U, + "PartForm_pres": PartForm_pres, # fi, + "PartForm_past": PartForm_past, # fi, + "PartForm_agt": PartForm_agt, # fi, + "PartForm_neg": PartForm_neg, # fi, + "PartType_mod": PartType_mod, # U, + "PartType_emp": PartType_emp, # U, + "PartType_res": PartType_res, # U, + "PartType_inf": PartType_inf, # U, + "PartType_vbp": PartType_vbp, # U, + "Person_abs_one": Person_abs_one, # bq, U, + "Person_abs_two": Person_abs_two, # bq, U, + "Person_abs_three": Person_abs_three, # bq, U, + "Person_dat_one": Person_dat_one, # bq, U, + "Person_dat_two": Person_dat_two, # bq, U, + "Person_dat_three": Person_dat_three, # bq, U, + "Person_erg_one": Person_erg_one, # bq, U, + "Person_erg_two": Person_erg_two, # bq, U, + "Person_erg_three": Person_erg_three, # bq, U, + "Person_psor_one": Person_psor_one, # fi, U, + "Person_psor_two": Person_psor_two, # fi, U, + "Person_psor_three": Person_psor_three, # fi, U, + "Polite_inf": Polite_inf, # bq, U, + "Polite_pol": Polite_pol, # bq, U, + "Polite_abs_inf": Polite_abs_inf, # bq, U, + "Polite_abs_pol": Polite_abs_pol, # bq, U, + "Polite_erg_inf": Polite_erg_inf, # bq, U, + "Polite_erg_pol": Polite_erg_pol, # bq, U, + "Polite_dat_inf": Polite_dat_inf, # bq, U, + "Polite_dat_pol": Polite_dat_pol, # bq, U, + "Prefix_yes": Prefix_yes, # U, + "PrepCase_npr": PrepCase_npr, # cz, + "PrepCase_pre": PrepCase_pre, # U, + "PunctSide_ini": PunctSide_ini, # U, + "PunctSide_fin": PunctSide_fin, # U, + "PunctType_peri": PunctType_peri, # U, + "PunctType_qest": PunctType_qest, # U, + "PunctType_excl": PunctType_excl, # U, + "PunctType_quot": PunctType_quot, # U, + "PunctType_brck": PunctType_brck, # U, + "PunctType_comm": PunctType_comm, # U, + "PunctType_colo": PunctType_colo, # U, + "PunctType_semi": PunctType_semi, # U, + "PunctType_dash": PunctType_dash, # U, + "Style_arch": Style_arch, # cz, fi, U, + "Style_rare": Style_rare, # cz, fi, U, + "Style_poet": Style_poet, # cz, U, + "Style_norm": Style_norm, # cz, U, + "Style_coll": Style_coll, # cz, U, + "Style_vrnc": Style_vrnc, # cz, U, + "Style_sing": Style_sing, # cz, U, + "Style_expr": Style_expr, # cz, U, + "Style_derg": Style_derg, # cz, U, + "Style_vulg": Style_vulg, # cz, U, + "Style_yes": Style_yes, # fi, U, + "StyleVariant_styleShort": StyleVariant_styleShort, # cz, + "StyleVariant_styleBound": StyleVariant_styleBound, # cz, sl, + "VerbType_aux": VerbType_aux, # U, + "VerbType_cop": VerbType_cop, # U, + "VerbType_mod": VerbType_mod, # U, + "VerbType_light": VerbType_light, # U, + + "Fused_begin": Fused_begin, # Internal + "Fused_inside": Fused_inside # Internal } diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index 051b92edb..c89a7e06c 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -384,6 +384,9 @@ cdef enum symbol_t: VerbType_cop # U VerbType_mod # U VerbType_light # U + + Fused_begin + Fused_inside PERSON NORP diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 949621820..77ab6fba3 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -389,6 +389,9 @@ IDS = { "VerbType_cop": VerbType_cop, # U, "VerbType_mod": VerbType_mod, # U, "VerbType_light": VerbType_light, # U, + + "Fused_begin": Fused_begin, + "Fused_inside": Fused_inside, "PERSON": PERSON, "NORP": NORP, diff --git a/spacy/tests/lang/de/test_text.py b/spacy/tests/lang/de/test_text.py index 34180b982..65fc8a28a 100644 --- a/spacy/tests/lang/de/test_text.py +++ b/spacy/tests/lang/de/test_text.py @@ -5,6 +5,7 @@ from __future__ import unicode_literals import pytest +from ....morphology import Fused_begin, Fused_inside def test_tokenizer_handles_long_text(de_tokenizer): @@ -22,9 +23,15 @@ Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen. »Was ist mit mir geschehen?«, dachte er.""" tokens = de_tokenizer(text) - assert len(tokens) == 109 + assert len(tokens) == 110 +def test_fused(de_tokenizer): + doc = de_tokenizer('zum') + assert len(doc) == 2 + assert doc[0].check_morph(Fused_begin) + assert doc[1].check_morph(Fused_inside) + @pytest.mark.parametrize('text', [ "Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", "Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 9e4b878cf..677cee463 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -10,6 +10,7 @@ cimport numpy as np np.import_array() import numpy +from ..morphology cimport univ_morph_t from ..typedefs cimport hash_t from ..lexeme cimport Lexeme from .. import parts_of_speech @@ -128,6 +129,15 @@ cdef class Token: """ return Lexeme.c_check_flag(self.c.lex, flag_id) + def set_morph(self, univ_morph_t feature, bint value): + '''Set a morphological feature''' + self.vocab.morphology.set_feature(&self.c.morph, feature, value) + + def check_morph(self, univ_morph_t feature): + '''Check whether the token has the given morphological feature.''' + features = self.vocab.morphology.get_features(self.c.morph) + return feature in features + def nbor(self, int i=1): """Get a neighboring token. diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 0a675253b..580e527e4 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -232,14 +232,17 @@ cdef class Vocab: cdef int i tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) for i, props in enumerate(substrings): + features = props.get('morphology', frozenset()) props = intify_attrs(props, strings_map=self.strings, - _do_deprecated=True) + _do_deprecated=False) token = &tokens[i] # Set the special tokens up to have arbitrary attributes lex = self.get_by_orth(self.mem, props[ORTH]) token.lex = lex if TAG in props: self.morphology.assign_tag(token, props[TAG]) + for feature in features: + self.morphology.set_feature(&token.morph, feature, True) for attr_id, value in props.items(): Token.set_struct_attr(token, attr_id, value) Lexeme.set_struct_attr(lex, attr_id, value)