diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 24e54bdee..a057e8ed8 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -31,388 +31,3 @@ cdef class Morphology: cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1 - - -cdef enum univ_morph_t: - NIL = 0 - - begin_Abbr - Abbr_yes - end_Abbr - - begin_AdpType - AdpType_circ - AdpType_comprep - AdpType_prep - AdpType_post - AdpType_voc - end_AdpType - - begin_AdvType - AdvType_adadj - AdvType_cau - AdvType_deg - AdvType_ex - AdvType_loc - AdvType_man - AdvType_mod - AdvType_sta - AdvType_tim - end_AdvType - - begin_Animacy - Animacy_anim - Animacy_hum - Animacy_inan - Animacy_nhum - end_Animacy - - begin_Aspect - Aspect_freq - Aspect_imp - Aspect_mod - Aspect_none - Aspect_perf - end_Aspect - - begin_Case - Case_abe - Case_abl - Case_abs - Case_acc - Case_ade - Case_all - Case_cau - Case_com - Case_dat - Case_del - Case_dis - Case_ela - Case_ess - Case_gen - Case_ill - Case_ine - Case_ins - Case_loc - Case_lat - Case_nom - Case_par - Case_sub - Case_sup - Case_tem - Case_ter - Case_tra - Case_voc - end_Case - - begin_ConjType - ConjType_comp # cz, U - ConjType_oper # cz, U - end_ConjType - begin_Connegative - Connegative_yes # fi - end_Connegative - - begin_Definite - Definite_cons # U20 - Definite_def - Definite_ind - Definite_red - Definite_two - end_Definite - - begin_Degree - Degree_abs - Degree_cmp - Degree_comp - Degree_none - Degree_pos - Degree_sup - Degree_com - Degree_dim # du - end_Degree - - begin_Derivation - Derivation_minen # fi - Derivation_sti # fi - Derivation_inen # fi - Derivation_lainen # fi - Derivation_ja # fi - Derivation_ton # fi - Derivation_vs # fi - Derivation_ttain # fi - Derivation_ttaa # fi - end_Derivation - - begin_Echo - Echo_rdp # U - Echo_ech # U - end_Echo - - begin_Foreign - Foreign_foreign # cz, fi, U - Foreign_fscript # cz, fi, U - Foreign_tscript # cz, U - 
Foreign_yes # sl - end_Foreign - - begin_Gender - Gender_com - Gender_fem - Gender_masc - Gender_neut - Gender_dat_masc # bq, U - Gender_dat_fem # bq, U - Gender_erg_masc # bq - Gender_erg_fem # bq - Gender_psor_masc # cz, sl, U - Gender_psor_fem # cz, sl, U - Gender_psor_neut # sl - end_Gender - - begin_Hyph - Hyph_yes # cz, U - end_Hyph - - begin_InfForm - InfForm_one # fi - InfForm_two # fi - InfForm_three # fi - end_InfForm - - begin_Mood - Mood_cnd - Mood_imp - Mood_ind - Mood_n - Mood_pot - Mood_sub - Mood_opt - end_Mood - - begin_NameType - NameType_geo # U, cz - NameType_prs # U, cz - NameType_giv # U, cz - NameType_sur # U, cz - NameType_nat # U, cz - NameType_com # U, cz - NameType_pro # U, cz - NameType_oth # U, cz - end_NameType - - begin_Negative - Negative_neg - Negative_pos - Negative_yes - end_Negative - - begin_NounType - NounType_com # U - NounType_prop # U - NounType_class # U - end_NounType - - begin_Number - Number_com - Number_dual - Number_none - Number_plur - Number_sing - Number_ptan # bg - Number_count # bg - Number_abs_sing # bq, U - Number_abs_plur # bq, U - Number_dat_sing # bq, U - Number_dat_plur # bq, U - Number_erg_sing # bq, U - Number_erg_plur # bq, U - Number_psee_sing # U - Number_psee_plur # U - Number_psor_sing # cz, fi, sl, U - Number_psor_plur # cz, fi, sl, U - end_Number - - begin_NumForm - NumForm_digit # cz, sl, U - NumForm_roman # cz, sl, U - NumForm_word # cz, sl, U - end_NumForm - - begin_NumType - NumType_card - NumType_dist - NumType_frac - NumType_gen - NumType_mult - NumType_none - NumType_ord - NumType_sets - end_NumType - - begin_NumValue - NumValue_one # cz, U - NumValue_two # cz, U - NumValue_three # cz, U - end_NumValue - - begin_PartForm - PartForm_pres # fi - PartForm_past # fi - PartForm_agt # fi - PartForm_neg # fi - end_PartForm - - begin_PartType - PartType_mod # U - PartType_emp # U - PartType_res # U - PartType_inf # U - PartType_vbp # U - end_PartType - - begin_Person - Person_one - Person_two - 
Person_three - Person_none - Person_abs_one # bq, U - Person_abs_two # bq, U - Person_abs_three # bq, U - Person_dat_one # bq, U - Person_dat_two # bq, U - Person_dat_three # bq, U - Person_erg_one # bq, U - Person_erg_two # bq, U - Person_erg_three # bq, U - Person_psor_one # fi, U - Person_psor_two # fi, U - Person_psor_three # fi, U - end_Person - - begin_Polarity - Polarity_neg # U20 - Polarity_pos # U20 - end_Polarity - - begin_Polite - Polite_inf # bq, U - Polite_pol # bq, U - Polite_abs_inf # bq, U - Polite_abs_pol # bq, U - Polite_erg_inf # bq, U - Polite_erg_pol # bq, U - Polite_dat_inf # bq, U - Polite_dat_pol # bq, U - end_Polite - - begin_Poss - Poss_yes - end_Poss - - begin_Prefix - Prefix_yes # U - end_Prefix - - begin_PrepCase - PrepCase_npr # cz - PrepCase_pre # U - end_PrepCase - - begin_PronType - PronType_advPart - PronType_art - PronType_default - PronType_dem - PronType_ind - PronType_int - PronType_neg - PronType_prs - PronType_rcp - PronType_rel - PronType_tot - PronType_clit - PronType_exc # es, ca, it, fa - end_PronType - - begin_PunctSide - PunctSide_ini # U - PunctSide_fin # U - end_PunctSide - - begin_PunctType - PunctType_peri # U - PunctType_qest # U - PunctType_excl # U - PunctType_quot # U - PunctType_brck # U - PunctType_comm # U - PunctType_colo # U - PunctType_semi # U - PunctType_dash # U - end_PunctType - - begin_Reflex - Reflex_yes - end_Reflex - - begin_Style - Style_arch # cz, fi, U - Style_rare # cz, fi, U - Style_poet # cz, U - Style_norm # cz, U - Style_coll # cz, U - Style_vrnc # cz, U - Style_sing # cz, U - Style_expr # cz, U - Style_derg # cz, U - Style_vulg # cz, U - Style_yes # fi, U - end_Style - - begin_StyleVariant - StyleVariant_styleShort # cz - StyleVariant_styleBound # cz, sl - end_StyleVariant - - begin_Tense - Tense_fut - Tense_imp - Tense_past - Tense_pres - end_Tense - - begin_Typo - Typo_yes - end_Typo - - begin_VerbForm - VerbForm_fin - VerbForm_ger - VerbForm_inf - VerbForm_none - VerbForm_part - 
VerbForm_partFut - VerbForm_partPast - VerbForm_partPres - VerbForm_sup - VerbForm_trans - VerbForm_conv # U20 - VerbForm_gdv # la - end_VerbForm - - begin_VerbType - VerbType_aux # U - VerbType_cop # U - VerbType_mod # U - VerbType_light # U - end_VerbType - - begin_Voice - Voice_act - Voice_cau - Voice_pass - Voice_mid # gkc - Voice_int # hb - end_Voice - diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 52acfedfb..1157c2502 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -4,6 +4,7 @@ from __future__ import unicode_literals from libc.string cimport memset import srsly +from collections import Counter from .strings import get_string_id from . import symbols @@ -14,6 +15,50 @@ from .parts_of_speech import IDS as POS_IDS from .lexeme cimport Lexeme from .errors import Errors +cdef enum univ_field_t: + Field_Abbr + Field_AdpType + Field_AdvType + Field_Animacy + Field_Aspect + Field_Case + Field_ConjType + Field_Connegative + Field_Definite + Field_Degree + Field_Derivation + Field_Echo + Field_Foreign + Field_Gender + Field_Hyph + Field_InfForm + Field_Mood + Field_NameType + Field_Negative + Field_NounType + Field_Number + Field_NumForm + Field_NumType + Field_NumValue + Field_PartForm + Field_PartType + Field_Person + Field_Polite + Field_Polarity + Field_Poss + Field_Prefix + Field_PrepCase + Field_PronType + Field_PunctSide + Field_PunctType + Field_Reflex + Field_Style + Field_StyleVariant + Field_Tense + Field_Typo + Field_VerbForm + Field_Voice + Field_VerbType def _normalize_props(props): @@ -23,7 +68,7 @@ def _normalize_props(props): for key in FIELDS: if key in props: attr = '%s_%s' % (key, props[key]) - if attr in IDS: + if attr in FEATURES: props.pop(key) props[attr] = True for key, value in props.items(): @@ -43,21 +88,21 @@ def _normalize_props(props): def parse_feature(feature): - if not hasattr(feature, 'split'): - feature = NAMES[feature] - key, value = feature.split('_') - begin = 'begin_%s' % key - # Note that this 
includes a 0 offset for the field, for no entry - offset = IDS[feature] - IDS[begin] - field_id = FIELDS[key] - return (field_id, offset) + field = FEATURE_FIELDS[feature] + offset = FEATURE_OFFSETS[feature] + return (field, offset) + + +def get_field_id(feature): + return FEATURE_FIELDS[feature] def get_field_size(field): - begin = 'begin_%s' % field - end = 'end_%s' % field - # Extra field for no entry -- always 0 - return IDS[end] - IDS[begin] + return FIELD_SIZES[field] + + +def get_field_offset(field): + return FIELD_OFFSETS[field] cdef class Morphology: @@ -105,11 +150,9 @@ cdef class Morphology: present. Returns the hash of the new analysis. """ features = intify_features(features) - cdef univ_morph_t feature + cdef attr_t feature for feature in features: - if feature != 0 and feature not in NAMES: - print(list(NAMES.keys())[:10]) - print(NAMES.get(feature-1), NAMES.get(feature+1)) + if feature != 0 and feature not in FEATURE_NAMES: raise KeyError("Unknown feature: %d" % feature) cdef MorphAnalysisC tag tag = create_rich_tag(features) @@ -127,9 +170,10 @@ cdef class Morphology: """Update a morphological analysis with new feature values.""" tag = (self.tags.get(morph))[0] features = intify_features(features) - cdef univ_morph_t feature + cdef attr_t feature for feature in features: - set_feature(&tag, feature, 1) + field = get_field_id(feature) + set_feature(&tag, field, feature, 1) morph = self.insert(tag) return morph @@ -259,729 +303,531 @@ cpdef univ_pos_t get_int_tag(pos_): return 0 cpdef intify_features(features): - return {IDS.get(feature, feature) for feature in features} + return {get_string_id(feature) for feature in features} cdef hash_t hash_tag(MorphAnalysisC tag) nogil: return mrmr.hash64(&tag, sizeof(tag), 0) + +def get_feature_field(feature): + cdef attr_t key = get_string_id(feature) + return FEATURE_FIELDS[feature] + + cdef MorphAnalysisC create_rich_tag(features) except *: cdef MorphAnalysisC tag - cdef univ_morph_t feature + cdef attr_t 
feature memset(&tag, 0, sizeof(tag)) for feature in features: - set_feature(&tag, feature, 1) + field = get_field_id(feature) + set_feature(&tag, field, feature, 1) return tag + cdef tag_to_json(MorphAnalysisC tag): features = [] if tag.abbr != 0: - features.append(NAMES[tag.abbr]) + features.append(FEATURE_NAMES[tag.abbr]) if tag.adp_type != 0: - features.append(NAMES[tag.adp_type]) + features.append(FEATURE_NAMES[tag.adp_type]) if tag.adv_type != 0: - features.append(NAMES[tag.adv_type]) + features.append(FEATURE_NAMES[tag.adv_type]) if tag.animacy != 0: - features.append(NAMES[tag.animacy]) + features.append(FEATURE_NAMES[tag.animacy]) if tag.aspect != 0: - features.append(NAMES[tag.aspect]) + features.append(FEATURE_NAMES[tag.aspect]) if tag.case != 0: - features.append(NAMES[tag.case]) + features.append(FEATURE_NAMES[tag.case]) if tag.conj_type != 0: - features.append(NAMES[tag.conj_type]) + features.append(FEATURE_NAMES[tag.conj_type]) if tag.connegative != 0: - features.append(NAMES[tag.connegative]) + features.append(FEATURE_NAMES[tag.connegative]) if tag.definite != 0: - features.append(NAMES[tag.definite]) + features.append(FEATURE_NAMES[tag.definite]) if tag.degree != 0: - features.append(NAMES[tag.degree]) + features.append(FEATURE_NAMES[tag.degree]) if tag.derivation != 0: - features.append(NAMES[tag.derivation]) + features.append(FEATURE_NAMES[tag.derivation]) if tag.echo != 0: - features.append(NAMES[tag.echo]) + features.append(FEATURE_NAMES[tag.echo]) if tag.foreign != 0: - features.append(NAMES[tag.foreign]) + features.append(FEATURE_NAMES[tag.foreign]) if tag.gender != 0: - features.append(NAMES[tag.gender]) + features.append(FEATURE_NAMES[tag.gender]) if tag.hyph != 0: - features.append(NAMES[tag.hyph]) + features.append(FEATURE_NAMES[tag.hyph]) if tag.inf_form != 0: - features.append(NAMES[tag.inf_form]) + features.append(FEATURE_NAMES[tag.inf_form]) if tag.mood != 0: - features.append(NAMES[tag.mood]) + features.append(FEATURE_NAMES[tag.mood]) 
if tag.negative != 0: - features.append(NAMES[tag.negative]) + features.append(FEATURE_NAMES[tag.negative]) if tag.number != 0: - features.append(NAMES[tag.number]) + features.append(FEATURE_NAMES[tag.number]) if tag.name_type != 0: - features.append(NAMES[tag.name_type]) + features.append(FEATURE_NAMES[tag.name_type]) if tag.noun_type != 0: - features.append(NAMES[tag.noun_type]) + features.append(FEATURE_NAMES[tag.noun_type]) if tag.num_form != 0: - features.append(NAMES[tag.num_form]) + features.append(FEATURE_NAMES[tag.num_form]) if tag.num_type != 0: - features.append(NAMES[tag.num_type]) + features.append(FEATURE_NAMES[tag.num_type]) if tag.num_value != 0: - features.append(NAMES[tag.num_value]) + features.append(FEATURE_NAMES[tag.num_value]) if tag.part_form != 0: - features.append(NAMES[tag.part_form]) + features.append(FEATURE_NAMES[tag.part_form]) if tag.part_type != 0: - features.append(NAMES[tag.part_type]) + features.append(FEATURE_NAMES[tag.part_type]) if tag.person != 0: - features.append(NAMES[tag.person]) + features.append(FEATURE_NAMES[tag.person]) if tag.polite != 0: - features.append(NAMES[tag.polite]) + features.append(FEATURE_NAMES[tag.polite]) if tag.polarity != 0: - features.append(NAMES[tag.polarity]) + features.append(FEATURE_NAMES[tag.polarity]) if tag.poss != 0: - features.append(NAMES[tag.poss]) + features.append(FEATURE_NAMES[tag.poss]) if tag.prefix != 0: - features.append(NAMES[tag.prefix]) + features.append(FEATURE_NAMES[tag.prefix]) if tag.prep_case != 0: - features.append(NAMES[tag.prep_case]) + features.append(FEATURE_NAMES[tag.prep_case]) if tag.pron_type != 0: - features.append(NAMES[tag.pron_type]) + features.append(FEATURE_NAMES[tag.pron_type]) if tag.punct_side != 0: - features.append(NAMES[tag.punct_side]) + features.append(FEATURE_NAMES[tag.punct_side]) if tag.punct_type != 0: - features.append(NAMES[tag.punct_type]) + features.append(FEATURE_NAMES[tag.punct_type]) if tag.reflex != 0: - features.append(NAMES[tag.reflex]) + 
features.append(FEATURE_NAMES[tag.reflex]) if tag.style != 0: - features.append(NAMES[tag.style]) + features.append(FEATURE_NAMES[tag.style]) if tag.style_variant != 0: - features.append(NAMES[tag.style_variant]) + features.append(FEATURE_NAMES[tag.style_variant]) if tag.tense != 0: - features.append(NAMES[tag.tense]) + features.append(FEATURE_NAMES[tag.tense]) if tag.verb_form != 0: - features.append(NAMES[tag.verb_form]) + features.append(FEATURE_NAMES[tag.verb_form]) if tag.voice != 0: - features.append(NAMES[tag.voice]) + features.append(FEATURE_NAMES[tag.voice]) if tag.verb_type != 0: - features.append(NAMES[tag.verb_type]) + features.append(FEATURE_NAMES[tag.verb_type]) return features cdef MorphAnalysisC tag_from_json(json_tag): cdef MorphAnalysisC tag return tag -cdef int set_feature(MorphAnalysisC* tag, univ_morph_t feature, int value) except -1: +cdef int set_feature(MorphAnalysisC* tag, + univ_field_t field, attr_t feature, int value) except -1: if value == True: value_ = feature else: - value_ = NIL - if feature == NIL: + value_ = 0 + if feature == 0: pass - elif is_abbr_feature(feature): + elif field == Field_Abbr: tag.abbr = value_ - elif is_adp_type_feature(feature): + elif field == Field_AdpType: tag.adp_type = value_ - elif is_adv_type_feature(feature): + elif field == Field_AdvType: tag.adv_type = value_ - elif is_animacy_feature(feature): + elif field == Field_Animacy: tag.animacy = value_ - elif is_aspect_feature(feature): + elif field == Field_Aspect: tag.aspect = value_ - elif is_case_feature(feature): + elif field == Field_Case: tag.case = value_ - elif is_conj_type_feature(feature): + elif field == Field_ConjType: tag.conj_type = value_ - elif is_connegative_feature(feature): + elif field == Field_Connegative: tag.connegative = value_ - elif is_definite_feature(feature): + elif field == Field_Definite: tag.definite = value_ - elif is_degree_feature(feature): + elif field == Field_Degree: tag.degree = value_ - elif 
is_derivation_feature(feature): + elif field == Field_Derivation: tag.derivation = value_ - elif is_echo_feature(feature): + elif field == Field_Echo: tag.echo = value_ - elif is_foreign_feature(feature): + elif field == Field_Foreign: tag.foreign = value_ - elif is_gender_feature(feature): + elif field == Field_Gender: tag.gender = value_ - elif is_hyph_feature(feature): + elif field == Field_Hyph: tag.hyph = value_ - elif is_inf_form_feature(feature): + elif field == Field_InfForm: tag.inf_form = value_ - elif is_mood_feature(feature): + elif field == Field_Mood: tag.mood = value_ - elif is_negative_feature(feature): + elif field == Field_Negative: tag.negative = value_ - elif is_number_feature(feature): + elif field == Field_Number: tag.number = value_ - elif is_name_type_feature(feature): + elif field == Field_NameType: tag.name_type = value_ - elif is_noun_type_feature(feature): + elif field == Field_NounType: tag.noun_type = value_ - elif is_num_form_feature(feature): + elif field == Field_NumForm: tag.num_form = value_ - elif is_num_type_feature(feature): + elif field == Field_NumType: tag.num_type = value_ - elif is_num_value_feature(feature): + elif field == Field_NumValue: tag.num_value = value_ - elif is_part_form_feature(feature): + elif field == Field_PartForm: tag.part_form = value_ - elif is_part_type_feature(feature): + elif field == Field_PartType: tag.part_type = value_ - elif is_person_feature(feature): + elif field == Field_Person: tag.person = value_ - elif is_polite_feature(feature): + elif field == Field_Polite: tag.polite = value_ - elif is_polarity_feature(feature): + elif field == Field_Polarity: tag.polarity = value_ - elif is_poss_feature(feature): + elif field == Field_Poss: tag.poss = value_ - elif is_prefix_feature(feature): + elif field == Field_Prefix: tag.prefix = value_ - elif is_prep_case_feature(feature): + elif field == Field_PrepCase: tag.prep_case = value_ - elif is_pron_type_feature(feature): + elif field == Field_PronType: 
tag.pron_type = value_ - elif is_punct_side_feature(feature): + elif field == Field_PunctSide: tag.punct_side = value_ - elif is_punct_type_feature(feature): + elif field == Field_PunctType: tag.punct_type = value_ - elif is_reflex_feature(feature): + elif field == Field_Reflex: tag.reflex = value_ - elif is_style_feature(feature): + elif field == Field_Style: tag.style = value_ - elif is_style_variant_feature(feature): + elif field == Field_StyleVariant: tag.style_variant = value_ - elif is_tense_feature(feature): + elif field == Field_Tense: tag.tense = value_ - elif is_typo_feature(feature): + elif field == Field_Typo: tag.typo = value_ - elif is_verb_form_feature(feature): + elif field == Field_VerbForm: tag.verb_form = value_ - elif is_voice_feature(feature): + elif field == Field_Voice: tag.voice = value_ - elif is_verb_type_feature(feature): + elif field == Field_VerbType: tag.verb_type = value_ else: - raise ValueError("Unknown feature: %s (%d)" % (NAMES.get(feature), feature)) - -cdef int is_abbr_feature(univ_morph_t feature) nogil: - return feature >= begin_Abbr and feature <= end_Abbr - -cdef int is_adp_type_feature(univ_morph_t feature) nogil: - return feature >= begin_AdpType and feature <= end_AdpType - -cdef int is_adv_type_feature(univ_morph_t feature) nogil: - return feature >= begin_AdvType and feature <= end_AdvType - -cdef int is_animacy_feature(univ_morph_t feature) nogil: - return feature >= begin_Animacy and feature <= end_Animacy - -cdef int is_aspect_feature(univ_morph_t feature) nogil: - return feature >= begin_Aspect and feature <= end_Aspect - -cdef int is_case_feature(univ_morph_t feature) nogil: - return feature >= begin_Case and feature <= end_Case - -cdef int is_conj_type_feature(univ_morph_t feature) nogil: - return feature >= begin_ConjType and feature <= end_ConjType - -cdef int is_connegative_feature(univ_morph_t feature) nogil: - return feature >= begin_Connegative and feature <= end_Connegative - -cdef int 
is_definite_feature(univ_morph_t feature) nogil: - return feature >= begin_Definite and feature <= end_Definite - -cdef int is_degree_feature(univ_morph_t feature) nogil: - return feature >= begin_Degree and feature <= end_Degree - -cdef int is_derivation_feature(univ_morph_t feature) nogil: - return feature >= begin_Derivation and feature <= end_Derivation - -cdef int is_echo_feature(univ_morph_t feature) nogil: - return feature >= begin_Echo and feature <= end_Echo - -cdef int is_foreign_feature(univ_morph_t feature) nogil: - return feature >= begin_Foreign and feature <= end_Foreign - -cdef int is_gender_feature(univ_morph_t feature) nogil: - return feature >= begin_Gender and feature <= end_Gender - -cdef int is_hyph_feature(univ_morph_t feature) nogil: - return feature >= begin_Hyph and feature <= end_Hyph - -cdef int is_inf_form_feature(univ_morph_t feature) nogil: - return feature >= begin_InfForm and feature <= end_InfForm - -cdef int is_mood_feature(univ_morph_t feature) nogil: - return feature >= begin_Mood and feature <= end_Mood - -cdef int is_name_type_feature(univ_morph_t feature) nogil: - return feature >= begin_NameType and feature < end_NameType - -cdef int is_negative_feature(univ_morph_t feature) nogil: - return feature >= begin_Negative and feature <= end_Negative - -cdef int is_noun_type_feature(univ_morph_t feature) nogil: - return feature >= begin_NounType and feature <= end_NounType - -cdef int is_number_feature(univ_morph_t feature) nogil: - return feature >= begin_Number and feature <= end_Number - -cdef int is_num_form_feature(univ_morph_t feature) nogil: - return feature >= begin_NumForm and feature <= end_NumForm - -cdef int is_num_type_feature(univ_morph_t feature) nogil: - return feature >= begin_NumType and feature <= end_NumType - -cdef int is_num_value_feature(univ_morph_t feature) nogil: - return feature >= begin_NumValue and feature <= end_NumValue - -cdef int is_part_form_feature(univ_morph_t feature) nogil: - return feature >= 
begin_PartForm and feature <= end_PartForm - -cdef int is_part_type_feature(univ_morph_t feature) nogil: - return feature >= begin_PartType and feature <= end_PartType - -cdef int is_person_feature(univ_morph_t feature) nogil: - return feature >= begin_Person and feature <= end_Person - -cdef int is_polite_feature(univ_morph_t feature) nogil: - return feature >= begin_Polite and feature <= end_Polite - -cdef int is_polarity_feature(univ_morph_t feature) nogil: - return feature >= begin_Polarity and feature <= end_Polarity - -cdef int is_poss_feature(univ_morph_t feature) nogil: - return feature >= begin_Poss and feature <= end_Poss - -cdef int is_prefix_feature(univ_morph_t feature) nogil: - return feature >= begin_Prefix and feature <= end_Prefix - -cdef int is_prep_case_feature(univ_morph_t feature) nogil: - return feature >= begin_PrepCase and feature <= end_PrepCase - -cdef int is_pron_type_feature(univ_morph_t feature) nogil: - return feature >= begin_PronType and feature <= end_PronType - -cdef int is_punct_side_feature(univ_morph_t feature) nogil: - return feature >= begin_PunctSide and feature <= end_PunctSide - -cdef int is_punct_type_feature(univ_morph_t feature) nogil: - return feature >= begin_PunctType and feature <= end_PunctType - -cdef int is_reflex_feature(univ_morph_t feature) nogil: - return feature >= begin_Reflex and feature <= end_Reflex - -cdef int is_style_feature(univ_morph_t feature) nogil: - return feature >= begin_Style and feature <= end_Style - -cdef int is_style_variant_feature(univ_morph_t feature) nogil: - return feature >= begin_StyleVariant and feature <= end_StyleVariant - -cdef int is_tense_feature(univ_morph_t feature) nogil: - return feature >= begin_Tense and feature <= end_Tense - -cdef int is_typo_feature(univ_morph_t feature) nogil: - return feature >= begin_Typo and feature <= end_Typo - -cdef int is_verb_form_feature(univ_morph_t feature) nogil: - return feature >= begin_VerbForm and feature <= end_VerbForm - -cdef int 
is_voice_feature(univ_morph_t feature) nogil: - return feature >= begin_Voice and feature <= end_Voice - -cdef int is_verb_type_feature(univ_morph_t feature) nogil: - return feature >= begin_VerbType and feature <= end_VerbType + raise ValueError("Unknown feature: %s (%d)" % (FEATURE_NAMES.get(feature), feature)) FIELDS = { - 'Abbr': 0, - 'AdpType': 1, - 'AdvType': 2, - 'Animacy': 3, - 'Aspect': 4, - 'Case': 5, - 'ConjType': 6, - 'Connegative': 7, - 'Definite': 8, - 'Degree': 9, - 'Derivation': 10, - 'Echo': 11, - 'Foreign': 12, - 'Gender': 13, - 'Hyph': 14, - 'InfForm': 15, - 'Mood': 16, - 'NameType': 17, - 'Negative': 18, - 'Number': 19, - 'NumForm': 20, - 'NumType': 21, - 'NumValue': 22, - 'PartForm': 23, - 'PartType': 24, - 'Person': 25, - 'Polite': 26, - 'Polarity': 27, - 'Poss': 28, - 'Prefix': 29, - 'PrepCase': 30, - 'PronType': 31, - 'PunctSide': 32, - 'PunctType': 33, - 'Reflex': 34, - 'Style': 35, - 'StyleVariant': 36, - 'Tense': 37, - 'Typo': 38, - 'VerbForm': 39, - 'Voice': 40, - 'VerbType': 41 + 'Abbr': Field_Abbr, + 'AdpType': Field_AdpType, + 'AdvType': Field_AdvType, + 'Animacy': Field_Animacy, + 'Aspect': Field_Aspect, + 'Case': Field_Case, + 'ConjType': Field_ConjType, + 'Connegative': Field_Connegative, + 'Definite': Field_Definite, + 'Degree': Field_Degree, + 'Derivation': Field_Derivation, + 'Echo': Field_Echo, + 'Foreign': Field_Foreign, + 'Gender': Field_Gender, + 'Hyph': Field_Hyph, + 'InfForm': Field_InfForm, + 'Mood': Field_Mood, + 'NameType': Field_NameType, + 'Negative': Field_Negative, + 'NounType': Field_NounType, + 'Number': Field_Number, + 'NumForm': Field_NumForm, + 'NumType': Field_NumType, + 'NumValue': Field_NumValue, + 'PartForm': Field_PartForm, + 'PartType': Field_PartType, + 'Person': Field_Person, + 'Polite': Field_Polite, + 'Polarity': Field_Polarity, + 'Poss': Field_Poss, + 'Prefix': Field_Prefix, + 'PrepCase': Field_PrepCase, + 'PronType': Field_PronType, + 'PunctSide': Field_PunctSide, + 'PunctType': Field_PunctType, + 
'Reflex': Field_Reflex, + 'Style': Field_Style, + 'StyleVariant': Field_StyleVariant, + 'Tense': Field_Tense, + 'Typo': Field_Typo, + 'VerbForm': Field_VerbForm, + 'Voice': Field_Voice, + 'VerbType': Field_VerbType } -IDS = { - "begin_Abbr": begin_Abbr, - "Abbr_yes": Abbr_yes , - "end_Abbr": end_Abbr, - "begin_AdpType": begin_AdpType, - "AdpType_circ": AdpType_circ, - "AdpType_comprep": AdpType_comprep, - "AdpType_prep ": AdpType_prep , - "AdpType_post": AdpType_post, - "AdpType_voc": AdpType_voc, - "end_AdpType": end_AdpType, - "begin_AdvType": begin_AdvType, - "AdvType_adadj": AdvType_adadj, - "AdvType_cau": AdvType_cau, - "AdvType_deg": AdvType_deg, - "AdvType_ex": AdvType_ex, - "AdvType_loc": AdvType_loc, - "AdvType_man": AdvType_man, - "AdvType_mod": AdvType_mod, - "AdvType_sta": AdvType_sta, - "AdvType_tim": AdvType_tim, - "end_AdvType": end_AdvType, - "begin_Animacy": begin_Animacy, - "Animacy_anim": Animacy_anim, - "Animacy_hum": Animacy_hum, - "Animacy_inan": Animacy_inan, - "Animacy_nhum": Animacy_nhum, - "end_Animacy": end_Animacy, - "begin_Aspect": begin_Aspect, - "Aspect_freq": Aspect_freq, - "Aspect_imp": Aspect_imp, - "Aspect_mod": Aspect_mod, - "Aspect_none": Aspect_none, - "Aspect_perf": Aspect_perf, - "end_Aspect": end_Aspect, - "begin_Case": begin_Case, - "Case_abe": Case_abe, - "Case_abl": Case_abl, - "Case_abs": Case_abs, - "Case_acc": Case_acc, - "Case_ade": Case_ade, - "Case_all": Case_all, - "Case_cau": Case_cau, - "Case_com": Case_com, - "Case_dat": Case_dat, - "Case_del": Case_del, - "Case_dis": Case_dis, - "Case_ela": Case_ela, - "Case_ess": Case_ess, - "Case_gen": Case_gen, - "Case_ill": Case_ill, - "Case_ine": Case_ine, - "Case_ins": Case_ins, - "Case_loc": Case_loc, - "Case_lat": Case_lat, - "Case_nom": Case_nom, - "Case_par": Case_par, - "Case_sub": Case_sub, - "Case_sup": Case_sup, - "Case_tem": Case_tem, - "Case_ter": Case_ter, - "Case_tra": Case_tra, - "Case_voc": Case_voc, - "end_Case": end_Case, - "begin_ConjType": 
begin_ConjType, - "ConjType_comp ": ConjType_comp , - "ConjType_oper": ConjType_oper, - "end_ConjType": end_ConjType, - "begin_Connegative": begin_Connegative, - "Connegative_yes": Connegative_yes, - "end_Connegative": end_Connegative, - "begin_Definite": begin_Definite, - "Definite_cons": Definite_cons, - "Definite_def": Definite_def, - "Definite_ind": Definite_ind, - "Definite_red": Definite_red, - "Definite_two": Definite_two, - "end_Definite": end_Definite, - "begin_Degree": begin_Degree, - "Degree_abs": Degree_abs, - "Degree_cmp": Degree_cmp, - "Degree_comp": Degree_comp, - "Degree_none": Degree_none, - "Degree_pos": Degree_pos, - "Degree_sup": Degree_sup, - "Degree_com": Degree_com, - "Degree_dim": Degree_dim, - "end_Degree": end_Degree, - "begin_Derivation": begin_Derivation, - "Derivation_minen": Derivation_minen, - "Derivation_sti": Derivation_sti, - "Derivation_inen": Derivation_inen, - "Derivation_lainen": Derivation_lainen, - "Derivation_ja": Derivation_ja, - "Derivation_ton": Derivation_ton, - "Derivation_vs": Derivation_vs, - "Derivation_ttain": Derivation_ttain, - "Derivation_ttaa": Derivation_ttaa, - "end_Derivation": end_Derivation, - "begin_Echo": begin_Echo, - "Echo_rdp": Echo_rdp, - "Echo_ech": Echo_ech, - "end_Echo": end_Echo, - "begin_Foreign": begin_Foreign, - "Foreign_foreign": Foreign_foreign, - "Foreign_fscript": Foreign_fscript, - "Foreign_tscript": Foreign_tscript, - "Foreign_yes": Foreign_yes, - "end_Foreign": end_Foreign, - "begin_Gender": begin_Gender, - "Gender_com": Gender_com, - "Gender_fem": Gender_fem, - "Gender_masc": Gender_masc, - "Gender_neut": Gender_neut, - "Gender_dat_masc": Gender_dat_masc, - "Gender_dat_fem": Gender_dat_fem, - "Gender_erg_masc": Gender_erg_masc, - "Gender_erg_fem": Gender_erg_fem, - "Gender_psor_masc": Gender_psor_masc, - "Gender_psor_fem": Gender_psor_fem, - "Gender_psor_neut": Gender_psor_neut, - "end_Gender": end_Gender, - "begin_Hyph": begin_Hyph, - "Hyph_yes": Hyph_yes, - "end_Hyph": end_Hyph, - 
"begin_InfForm": begin_InfForm, - "InfForm_one": InfForm_one, - "InfForm_two": InfForm_two, - "InfForm_three": InfForm_three, - "end_InfForm": end_InfForm, - "begin_Mood": begin_Mood, - "Mood_cnd": Mood_cnd, - "Mood_imp": Mood_imp, - "Mood_ind": Mood_ind, - "Mood_n": Mood_n, - "Mood_pot": Mood_pot, - "Mood_sub": Mood_sub, - "Mood_opt": Mood_opt, - "end_Mood": end_Mood, - "begin_NameType": begin_NameType, - "NameType_geo": NameType_geo, - "NameType_prs": NameType_prs, - "NameType_giv": NameType_giv, - "NameType_sur": NameType_sur, - "NameType_nat": NameType_nat, - "NameType_com": NameType_com, - "NameType_pro": NameType_pro, - "NameType_oth": NameType_oth, - "end_NameType": end_NameType, - "begin_Negative": begin_Negative, - "Negative_neg": Negative_neg, - "Negative_pos": Negative_pos, - "Negative_yes": Negative_yes, - "end_Negative": end_Negative, - "begin_NounType": begin_NounType, - "NounType_com": NounType_com, - "NounType_prop": NounType_prop, - "NounType_class": NounType_class, - "end_NounType": end_NounType, - "begin_Number": begin_Number, - "Number_com": Number_com, - "Number_dual": Number_dual, - "Number_none": Number_none, - "Number_plur": Number_plur, - "Number_sing": Number_sing, - "Number_ptan": Number_ptan, - "Number_count": Number_count, - "Number_abs_sing": Number_abs_sing, - "Number_abs_plur": Number_abs_plur, - "Number_dat_sing": Number_dat_sing, - "Number_dat_plur": Number_dat_plur, - "Number_erg_sing": Number_erg_sing, - "Number_erg_plur": Number_erg_plur, - "Number_psee_sing": Number_psee_sing, - "Number_psee_plur": Number_psee_plur, - "Number_psor_sing": Number_psor_sing, - "Number_psor_plur": Number_psor_plur, - "end_Number": end_Number, - "begin_NumForm": begin_NumForm, - "NumForm_digit": NumForm_digit, - "NumForm_roman": NumForm_roman, - "NumForm_word": NumForm_word, - "end_NumForm": end_NumForm, - "begin_NumType": begin_NumType, - "NumType_card": NumType_card, - "NumType_dist": NumType_dist, - "NumType_frac": NumType_frac, - "NumType_gen": 
NumType_gen, - "NumType_mult": NumType_mult, - "NumType_none": NumType_none, - "NumType_ord": NumType_ord, - "NumType_sets": NumType_sets, - "end_NumType": end_NumType, - "begin_NumValue": begin_NumValue, - "NumValue_one": NumValue_one, - "NumValue_two": NumValue_two, - "NumValue_three": NumValue_three, - "end_NumValue": end_NumValue, - "begin_PartForm": begin_PartForm, - "PartForm_pres": PartForm_pres, - "PartForm_past": PartForm_past, - "PartForm_agt": PartForm_agt, - "PartForm_neg": PartForm_neg, - "end_PartForm": end_PartForm, - "begin_PartType": begin_PartType, - "PartType_mod": PartType_mod, - "PartType_emp": PartType_emp, - "PartType_res": PartType_res, - "PartType_inf": PartType_inf, - "PartType_vbp": PartType_vbp, - "end_PartType": end_PartType, +FEATURES = [ + "Abbr_yes", + "AdpType_circ", + "AdpType_comprep", + "AdpType_prep", + "AdpType_post", + "AdpType_voc", + "AdvType_adadj", + "AdvType_cau", + "AdvType_deg", + "AdvType_ex", + "AdvType_loc", + "AdvType_man", + "AdvType_mod", + "AdvType_sta", + "AdvType_tim", + "Animacy_anim", + "Animacy_hum", + "Animacy_inan", + "Animacy_nhum", + "Aspect_freq", + "Aspect_imp", + "Aspect_mod", + "Aspect_none", + "Aspect_perf", + "Case_abe", + "Case_abl", + "Case_abs", + "Case_acc", + "Case_ade", + "Case_all", + "Case_cau", + "Case_com", + "Case_dat", + "Case_del", + "Case_dis", + "Case_ela", + "Case_ess", + "Case_gen", + "Case_ill", + "Case_ine", + "Case_ins", + "Case_loc", + "Case_lat", + "Case_nom", + "Case_par", + "Case_sub", + "Case_sup", + "Case_tem", + "Case_ter", + "Case_tra", + "Case_voc", + "ConjType_comp", + "ConjType_oper", + "Connegative_yes", + "Definite_cons", + "Definite_def", + "Definite_ind", + "Definite_red", + "Definite_two", + "Degree_abs", + "Degree_cmp", + "Degree_comp", + "Degree_none", + "Degree_pos", + "Degree_sup", + "Degree_com", + "Degree_dim", + "Derivation_minen", + "Derivation_sti", + "Derivation_inen", + "Derivation_lainen", + "Derivation_ja", + "Derivation_ton", + "Derivation_vs", +
"Derivation_ttain", + "Derivation_ttaa", + "Echo_rdp", + "Echo_ech", + "Foreign_foreign", + "Foreign_fscript", + "Foreign_tscript", + "Foreign_yes", + "Gender_com", + "Gender_fem", + "Gender_masc", + "Gender_neut", + "Gender_dat_masc", + "Gender_dat_fem", + "Gender_erg_masc", + "Gender_erg_fem", + "Gender_psor_masc", + "Gender_psor_fem", + "Gender_psor_neut", + "Hyph_yes", + "InfForm_one", + "InfForm_two", + "InfForm_three", + "Mood_cnd", + "Mood_imp", + "Mood_ind", + "Mood_n", + "Mood_pot", + "Mood_sub", + "Mood_opt", + "NameType_geo", + "NameType_prs", + "NameType_giv", + "NameType_sur", + "NameType_nat", + "NameType_com", + "NameType_pro", + "NameType_oth", + "Negative_neg", + "Negative_pos", + "Negative_yes", + "NounType_com", + "NounType_prop", + "NounType_class", + "Number_com", + "Number_dual", + "Number_none", + "Number_plur", + "Number_sing", + "Number_ptan", + "Number_count", + "Number_abs_sing", + "Number_abs_plur", + "Number_dat_sing", + "Number_dat_plur", + "Number_erg_sing", + "Number_erg_plur", + "Number_psee_sing", + "Number_psee_plur", + "Number_psor_sing", + "Number_psor_plur", + "NumForm_digit", + "NumForm_roman", + "NumForm_word", + "NumType_card", + "NumType_dist", + "NumType_frac", + "NumType_gen", + "NumType_mult", + "NumType_none", + "NumType_ord", + "NumType_sets", + "NumValue_one", + "NumValue_two", + "NumValue_three", + "PartForm_pres", + "PartForm_past", + "PartForm_agt", + "PartForm_neg", + "PartType_mod", + "PartType_emp", + "PartType_res", + "PartType_inf", + "PartType_vbp", + "Person_one", + "Person_two", + "Person_three", + "Person_none", + "Person_abs_one", + "Person_abs_two", + "Person_abs_three", + "Person_dat_one", + "Person_dat_two", + "Person_dat_three", + "Person_erg_one", + "Person_erg_two", + "Person_erg_three", + "Person_psor_one", + "Person_psor_two", + "Person_psor_three", + "Polarity_neg", + "Polarity_pos", + "Polite_inf", + "Polite_pol", + "Polite_abs_inf", + "Polite_abs_pol", + "Polite_erg_inf", + "Polite_erg_pol", + 
"Polite_dat_inf", + "Polite_dat_pol", + "Poss_yes", + "Prefix_yes", + "PrepCase_npr", + "PrepCase_pre", + "PronType_advPart", + "PronType_art", + "PronType_default", + "PronType_dem", + "PronType_ind", + "PronType_int", + "PronType_neg", + "PronType_prs", + "PronType_rcp", + "PronType_rel", + "PronType_tot", + "PronType_clit", + "PronType_exc", + "PunctSide_ini", + "PunctSide_fin", + "PunctType_peri", + "PunctType_qest", + "PunctType_excl", + "PunctType_quot", + "PunctType_brck", + "PunctType_comm", + "PunctType_colo", + "PunctType_semi", + "PunctType_dash", + "Reflex_yes", + "Style_arch", + "Style_rare", + "Style_poet", + "Style_norm", + "Style_coll", + "Style_vrnc", + "Style_sing", + "Style_expr", + "Style_derg", + "Style_vulg", + "Style_yes", + "StyleVariant_styleShort", + "StyleVariant_styleBound", + "Tense_fut", + "Tense_imp", + "Tense_past", + "Tense_pres", + "Typo_yes", + "VerbForm_fin", + "VerbForm_ger", + "VerbForm_inf", + "VerbForm_none", + "VerbForm_part", + "VerbForm_partFut", + "VerbForm_partPast", + "VerbForm_partPres", + "VerbForm_sup", + "VerbForm_trans", + "VerbForm_conv", + "VerbForm_gdv", + "VerbType_aux", + "VerbType_cop", + "VerbType_mod", + "VerbType_light", + "Voice_act", + "Voice_cau", + "Voice_pass", + "Voice_mid", + "Voice_int", +] - "begin_Person": begin_Person, - "Person_one": Person_one, - "Person_two": Person_two, - "Person_three": Person_three, - "Person_none": Person_none, - "Person_abs_one": Person_abs_one, - "Person_abs_two": Person_abs_two, - "Person_abs_three": Person_abs_three, - "Person_dat_one": Person_dat_one, - "Person_dat_two": Person_dat_two, - "Person_dat_three": Person_dat_three, - "Person_erg_one": Person_erg_one, - "Person_erg_two": Person_erg_two, - "Person_erg_three": Person_erg_three, - "Person_psor_one": Person_psor_one, - "Person_psor_two": Person_psor_two, - "Person_psor_three": Person_psor_three, - "end_Person": end_Person, - "begin_Polarity": begin_Polarity, - "Polarity_neg": Polarity_neg, - "Polarity_pos": 
Polarity_pos, - "end_Polarity": end_Polarity, - "begin_Polite": begin_Polite, - "Polite_inf": Polite_inf, - "Polite_pol": Polite_pol, - "Polite_abs_inf": Polite_abs_inf, - "Polite_abs_pol": Polite_abs_pol, - "Polite_erg_inf": Polite_erg_inf, - "Polite_erg_pol": Polite_erg_pol, - "Polite_dat_inf": Polite_dat_inf, - "Polite_dat_pol": Polite_dat_pol, - "end_Polite": end_Polite, - "begin_Poss": begin_Poss, - "Poss_yes": Poss_yes, - "end_Poss": end_Poss, - "begin_Prefix": begin_Prefix, - "Prefix_yes": Prefix_yes, - "end_Prefix": end_Prefix, - "begin_PrepCase": begin_PrepCase, - "PrepCase_npr": PrepCase_npr, - "PrepCase_pre": PrepCase_pre, - "end_PrepCase": end_PrepCase, - "begin_PronType": begin_PronType, - "PronType_advPart": PronType_advPart, - "PronType_art": PronType_art, - "PronType_default": PronType_default, - "PronType_dem": PronType_dem, - "PronType_ind": PronType_ind, - "PronType_int": PronType_int, - "PronType_neg": PronType_neg, - "PronType_prs": PronType_prs, - "PronType_rcp": PronType_rcp, - "PronType_rel": PronType_rel, - "PronType_tot": PronType_tot, - "PronType_clit": PronType_clit, - "PronType_exc": PronType_exc, - "end_PronType": end_PronType, - "begin_PunctSide": begin_PunctSide, - "PunctSide_ini": PunctSide_ini, - "PunctSide_fin": PunctSide_fin, - "end_PunctSide": end_PunctSide, - "begin_PunctType": begin_PunctType, - "PunctType_peri": PunctType_peri, - "PunctType_qest": PunctType_qest, - "PunctType_excl": PunctType_excl, - "PunctType_quot": PunctType_quot, - "PunctType_brck": PunctType_brck, - "PunctType_comm": PunctType_comm, - "PunctType_colo": PunctType_colo, - "PunctType_semi": PunctType_semi, - "PunctType_dash": PunctType_dash, - "end_PunctType": end_PunctType, - "begin_Reflex": begin_Reflex, - "Reflex_yes": Reflex_yes, - "end_Reflex": end_Reflex, - "begin_Style": begin_Style, - "Style_arch": Style_arch, - "Style_rare": Style_rare, - "Style_poet": Style_poet, - "Style_norm": Style_norm, - "Style_coll": Style_coll, - "Style_vrnc": Style_vrnc, - 
"Style_sing": Style_sing, - "Style_expr": Style_expr, - "Style_derg": Style_derg, - "Style_vulg": Style_vulg, - "Style_yes": Style_yes, - "end_Style": end_Style, - "begin_StyleVariant": begin_StyleVariant, - "StyleVariant_styleShort": StyleVariant_styleShort, - "StyleVariant_styleBound": StyleVariant_styleBound, - "end_StyleVariant": end_StyleVariant, - "begin_Tense": begin_Tense, - "Tense_fut": Tense_fut, - "Tense_imp": Tense_imp, - "Tense_past": Tense_past, - "Tense_pres": Tense_pres, - "end_Tense": end_Tense, - "begin_Typo": begin_Typo, - "Typo_yes": Typo_yes, - "end_Typo": end_Typo, - "begin_VerbForm": begin_VerbForm, - "VerbForm_fin": VerbForm_fin, - "VerbForm_ger": VerbForm_ger, - "VerbForm_inf": VerbForm_inf, - "VerbForm_none": VerbForm_none, - "VerbForm_part": VerbForm_part, - "VerbForm_partFut": VerbForm_partFut, - "VerbForm_partPast": VerbForm_partPast, - "VerbForm_partPres": VerbForm_partPres, - "VerbForm_sup": VerbForm_sup, - "VerbForm_trans": VerbForm_trans, - "VerbForm_conv": VerbForm_conv, - "VerbForm_gdv": VerbForm_gdv, - "end_VerbForm": end_VerbForm, - "begin_VerbType": begin_VerbType, - "VerbType_aux": VerbType_aux, - "VerbType_cop": VerbType_cop, - "VerbType_mod": VerbType_mod, - "VerbType_light": VerbType_light, - "end_VerbType": end_VerbType, - "begin_Voice": begin_Voice, - "Voice_act": Voice_act, - "Voice_cau": Voice_cau, - "Voice_pass": Voice_pass, - "Voice_mid": Voice_mid, - "Voice_int": Voice_int, - "end_Voice": end_Voice, -} +FEATURE_NAMES = {get_string_id(name): name for name in FEATURES} +FEATURE_FIELDS = {feature: FIELDS[feature.split('_', 1)[0]] for feature in FEATURES} +for feat_id, name in FEATURE_NAMES.items(): + FEATURE_FIELDS[feat_id] = FEATURE_FIELDS[name] -FIELD_SIZES = [get_field_size(field) for field in FIELDS] - -NAMES = {value: key for key, value in IDS.items()} -# Unfortunate hack here, to work around problem with long cpdef enum -# (which is generating an enormous amount of C++ in Cython 0.24+) -# We keep the enum cdef, 
# Lookup tables derived from FEATURES.

# get_string_id(name) -> feature name, for resolving integer IDs to names.
FEATURE_NAMES = {get_string_id(name): name for name in FEATURES}

# Feature name -> its entry in FIELDS. The field name is the part of the
# feature name before the first underscore.
FEATURE_FIELDS = {feature: FIELDS[feature.split('_', 1)[0]] for feature in FEATURES}

# Number of features per field. This must be computed while FEATURE_FIELDS
# holds exactly one key per feature: once the integer-ID aliases below are
# added, every feature appears twice and each size would be doubled.
FIELD_SIZES = Counter(FEATURE_FIELDS.values())

# Also allow lookup of a feature's field by the feature's integer ID.
for feat_id, name in FEATURE_NAMES.items():
    FEATURE_FIELDS[feat_id] = FEATURE_FIELDS[name]

# FEATURE_OFFSETS: feature name -> how many features of the same field
#   precede it in FEATURES (its position within the field).
# FIELD_OFFSETS: field -> index in FEATURES of the field's first feature.
FEATURE_OFFSETS = {}
FIELD_OFFSETS = {}
_seen_fields = Counter()
for i, feature in enumerate(FEATURES):
    field = FEATURE_FIELDS[feature]
    FEATURE_OFFSETS[feature] = _seen_fields[field]
    # Compare the per-field count, not the Counter object itself: the
    # first time a field is encountered marks where it starts.
    if _seen_fields[field] == 0:
        FIELD_OFFSETS[field] = i
    _seen_fields[field] += 1
LexemeC: diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index 4d4a70e30..b62e69f6c 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -69,7 +69,6 @@ def test_doc_retokenize_retokenizer_attrs(en_tokenizer): assert doc[4].ent_type_ == "ORG" -@pytest.mark.xfail def test_doc_retokenize_lex_attrs(en_tokenizer): """Test that lexical attributes can be changed (see #2390).""" doc = en_tokenizer("WKRO played beach boys songs") diff --git a/spacy/tests/morphology/test_morph_features.py b/spacy/tests/morphology/test_morph_features.py index 32cc665af..dcb0b32ff 100644 --- a/spacy/tests/morphology/test_morph_features.py +++ b/spacy/tests/morphology/test_morph_features.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import pytest from ...morphology import Morphology -from ...strings import StringStore +from ...strings import StringStore, get_string_id from ...lemmatizer import Lemmatizer from ...morphology import * @@ -17,14 +17,14 @@ def test_add_morphology_with_string_names(morphology): morphology.add({"Case_gen", "Number_sing"}) def test_add_morphology_with_int_ids(morphology): - morphology.add({Case_gen, Number_sing}) + morphology.add({get_string_id("Case_gen"), get_string_id("Number_sing")}) def test_add_morphology_with_mix_strings_and_ints(morphology): - morphology.add({PunctSide_ini, 'VerbType_aux'}) + morphology.add({get_string_id("PunctSide_ini"), 'VerbType_aux'}) def test_morphology_tags_hash_distinctly(morphology): - tag1 = morphology.add({PunctSide_ini, 'VerbType_aux'}) + tag1 = morphology.add({"PunctSide_ini", 'VerbType_aux'}) tag2 = morphology.add({"Case_gen", 'Number_sing'}) assert tag1 != tag2 diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index df596ceb5..1b60a3271 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -22,6 +22,7 @@ from ..compat import is_config from ..errors import Errors, Warnings, user_warning, 
models_warning from .. import util from .underscore import Underscore, get_ext_args +from .morphanalysis cimport MorphAnalysis cdef class Token: @@ -176,6 +177,10 @@ cdef class Token: def __get__(self): return self.c.morph + property morph: + def __get__(self): + return MorphAnalysis.from_id(self.vocab, self.c.morph) + property lex_id: """RETURNS (int): Sequential ID of the token's lexical type.""" def __get__(self):