diff --git a/bin/init_model.py b/bin/init_model.py index 72d7a3aae..eb07f6494 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -42,7 +42,10 @@ import spacy.de import spacy.fi import spacy.it - +try: + unicode +except NameError: + unicode = str def setup_tokenizer(lang_data_dir, tok_dir): @@ -112,8 +115,12 @@ def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200): total += freq counts.smooth() log_total = math.log(total) + if str(loc).endswith('gz'): + file_ = gzip.open(str(loc)) + else: + file_ = loc.open() probs = {} - for line in loc.open(): + for line in file_: freq, doc_freq, key = line.split('\t', 2) doc_freq = int(doc_freq) freq = int(freq) @@ -158,7 +165,7 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir): clusters = _read_clusters(src_dir / 'clusters.txt') probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob') if not probs: - probs, oov_prob = _read_freqs(src_dir / 'freqs.txt') + probs, oov_prob = _read_freqs(src_dir / 'freqs.txt.gz') if not probs: oov_prob = -20 else: @@ -168,6 +175,11 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir): probs[word] = oov_prob lexicon = [] + for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])): + # First encode the strings into the StringStore. 
This way, we can map + # the orth IDs to frequency ranks + orth = vocab.strings[word] + # Now actually load the vocab for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])): lexeme = vocab[word] lexeme.prob = prob diff --git a/lang_data/en/morphs.json b/lang_data/en/morphs.json index 917cbc759..059381b27 100644 --- a/lang_data/en/morphs.json +++ b/lang_data/en/morphs.json @@ -56,5 +56,4 @@ "was": {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"}, "were": {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"} } - } diff --git a/lang_data/en/tag_map.json b/lang_data/en/tag_map.json index de3e2eb58..a38411bcf 100644 --- a/lang_data/en/tag_map.json +++ b/lang_data/en/tag_map.json @@ -22,7 +22,7 @@ "JJS": {"pos": "adj", "degree": "sup"}, "LS": {"pos": "punct", "numtype": "ord"}, "MD": {"pos": "verb", "verbtype": "mod"}, -"NIL": {"pos": "no_tag"}, +"NIL": {"pos": ""}, "NN": {"pos": "noun", "number": "sing"}, "NNP": {"pos": "noun", "nountype": "prop", "number": "sing"}, "NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"}, diff --git a/setup.py b/setup.py index ec394a2e3..fb05a9dbd 100644 --- a/setup.py +++ b/setup.py @@ -166,7 +166,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token', 'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits', 'spacy.cfile', 'spacy.matcher', - 'spacy.syntax.ner'] + 'spacy.syntax.ner', + 'spacy.symbols'] if __name__ == '__main__': diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index c810762ef..d0f476dcd 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -1,5 +1,6 @@ # Reserve 64 values for flag features cpdef enum attr_id_t: + NULL_ATTR IS_ALPHA IS_ASCII IS_DIGIT @@ -14,8 +15,7 @@ cpdef enum attr_id_t: IS_STOP IS_OOV - FLAG13 = 13 - FLAG14 + FLAG14 = 14 FLAG15 FLAG16 FLAG17 diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index e69de29bb..3595fbf22 100644 --- a/spacy/attrs.pyx +++ 
b/spacy/attrs.pyx @@ -0,0 +1,90 @@ +IDS = { + "": NULL_ATTR, + "IS_ALPHA": IS_ALPHA, + "IS_ASCII": IS_ASCII, + "IS_DIGIT": IS_DIGIT, + "IS_LOWER": IS_LOWER, + "IS_PUNCT": IS_PUNCT, + "IS_SPACE": IS_SPACE, + "IS_TITLE": IS_TITLE, + "IS_UPPER": IS_UPPER, + "LIKE_URL": LIKE_URL, + "LIKE_NUM": LIKE_NUM, + "LIKE_EMAIL": LIKE_EMAIL, + "IS_STOP": IS_STOP, + "IS_OOV": IS_OOV, + + "FLAG14": FLAG14, + "FLAG15": FLAG15, + "FLAG16": FLAG16, + "FLAG17": FLAG17, + "FLAG18": FLAG18, + "FLAG19": FLAG19, + "FLAG20": FLAG20, + "FLAG21": FLAG21, + "FLAG22": FLAG22, + "FLAG23": FLAG23, + "FLAG24": FLAG24, + "FLAG25": FLAG25, + "FLAG26": FLAG26, + "FLAG27": FLAG27, + "FLAG28": FLAG28, + "FLAG29": FLAG29, + "FLAG30": FLAG30, + "FLAG31": FLAG31, + "FLAG32": FLAG32, + "FLAG33": FLAG33, + "FLAG34": FLAG34, + "FLAG35": FLAG35, + "FLAG36": FLAG36, + "FLAG37": FLAG37, + "FLAG38": FLAG38, + "FLAG39": FLAG39, + "FLAG40": FLAG40, + "FLAG41": FLAG41, + "FLAG42": FLAG42, + "FLAG43": FLAG43, + "FLAG44": FLAG44, + "FLAG45": FLAG45, + "FLAG46": FLAG46, + "FLAG47": FLAG47, + "FLAG48": FLAG48, + "FLAG49": FLAG49, + "FLAG50": FLAG50, + "FLAG51": FLAG51, + "FLAG52": FLAG52, + "FLAG53": FLAG53, + "FLAG54": FLAG54, + "FLAG55": FLAG55, + "FLAG56": FLAG56, + "FLAG57": FLAG57, + "FLAG58": FLAG58, + "FLAG59": FLAG59, + "FLAG60": FLAG60, + "FLAG61": FLAG61, + "FLAG62": FLAG62, + "FLAG63": FLAG63, + + "ID": ID, + "ORTH": ORTH, + "LOWER": LOWER, + "NORM": NORM, + "SHAPE": SHAPE, + "PREFIX": PREFIX, + "SUFFIX": SUFFIX, + + "LENGTH": LENGTH, + "CLUSTER": CLUSTER, + "LEMMA": LEMMA, + "POS": POS, + "TAG": TAG, + "DEP": DEP, + "ENT_IOB": ENT_IOB, + "ENT_TYPE": ENT_TYPE, + "HEAD": HEAD, + "SPACY": SPACY, + "PROB": PROB, +} + +# ATTR IDs, in order of the symbol +NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index afafd3ddb..3ee825932 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -15,7 +15,7 @@ from libcpp.vector cimport vector 
from murmurhash.mrmr cimport hash64 from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE -from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25 +from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25 from .tokens.doc cimport get_token_attr from .tokens.doc cimport Doc from .vocab cimport Vocab diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 2229da0ad..62d3fccc1 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -7,6 +7,7 @@ from .strings cimport StringStore from .typedefs cimport attr_t from .parts_of_speech cimport univ_pos_t +from . cimport symbols cdef struct RichTagC: uint64_t morph @@ -36,720 +37,252 @@ cdef class Morphology: cdef int assign_feature(self, uint64_t* morph, feature, value) except -1 +cpdef enum univ_morph_t: + NIL = 0 + Animacy_anim = symbols.Animacy_anim + Animacy_inam + Aspect_freq + Aspect_imp + Aspect_mod + Aspect_none + Aspect_perf + Case_abe + Case_abl + Case_abs + Case_acc + Case_ade + Case_all + Case_cau + Case_com + Case_dat + Case_del + Case_dis + Case_ela + Case_ess + Case_gen + Case_ill + Case_ine + Case_ins + Case_loc + Case_lat + Case_nom + Case_par + Case_sub + Case_sup + Case_tem + Case_ter + Case_tra + Case_voc + Definite_two + Definite_def + Definite_red + Definite_ind + Degree_cmp + Degree_comp + Degree_none + Degree_pos + Degree_sup + Degree_abs + Degree_com + Degree_dim # du + Gender_com + Gender_fem + Gender_masc + Gender_neut + Mood_cnd + Mood_imp + Mood_ind + Mood_n + Mood_pot + Mood_sub + Mood_opt + Negative_neg + Negative_pos + Negative_yes + Number_com + Number_dual + Number_none + Number_plur + Number_sing + Number_ptan # bg + Number_count # bg + NumType_card + NumType_dist + NumType_frac + NumType_gen + NumType_mult + NumType_none + NumType_ord + NumType_sets + Person_one + Person_two + Person_three + Person_none + Poss_yes + PronType_advPart + 
PronType_art + PronType_default + PronType_dem + PronType_ind + PronType_int + PronType_neg + PronType_prs + PronType_rcp + PronType_rel + PronType_tot + PronType_clit + PronType_exc # es, ca, it, fa + Reflex_yes + Tense_fut + Tense_imp + Tense_past + Tense_pres + VerbForm_fin + VerbForm_ger + VerbForm_inf + VerbForm_none + VerbForm_part + VerbForm_partFut + VerbForm_partPast + VerbForm_partPres + VerbForm_sup + VerbForm_trans + VerbForm_gdv # la + Voice_act + Voice_cau + Voice_pass + Voice_mid # gkc + Voice_int # hb + Abbr_yes # cz, fi, sl, U + AdpType_prep # cz, U + AdpType_post # U + AdpType_voc # cz + AdpType_comprep # cz + AdpType_circ # U + AdvType_man + AdvType_loc + AdvType_tim + AdvType_deg + AdvType_cau + AdvType_mod + AdvType_sta + AdvType_ex + AdvType_adadj + ConjType_oper # cz, U + ConjType_comp # cz, U + Connegative_yes # fi + Derivation_minen # fi + Derivation_sti # fi + Derivation_inen # fi + Derivation_lainen # fi + Derivation_ja # fi + Derivation_ton # fi + Derivation_vs # fi + Derivation_ttain # fi + Derivation_ttaa # fi + Echo_rdp # U + Echo_ech # U + Foreign_foreign # cz, fi, U + Foreign_fscript # cz, fi, U + Foreign_tscript # cz, U + Foreign_yes # sl + Gender_dat_masc # bq, U + Gender_dat_fem # bq, U + Gender_erg_masc # bq + Gender_erg_fem # bq + Gender_psor_masc # cz, sl, U + Gender_psor_fem # cz, sl, U + Gender_psor_neut # sl + Hyph_yes # cz, U + InfForm_one # fi + InfForm_two # fi + InfForm_three # fi + NameType_geo # U, cz + NameType_prs # U, cz + NameType_giv # U, cz + NameType_sur # U, cz + NameType_nat # U, cz + NameType_com # U, cz + NameType_pro # U, cz + NameType_oth # U, cz + NounType_com # U + NounType_prop # U + NounType_class # U + Number_abs_sing # bq, U + Number_abs_plur # bq, U + Number_dat_sing # bq, U + Number_dat_plur # bq, U + Number_erg_sing # bq, U + Number_erg_plur # bq, U + Number_psee_sing # U + Number_psee_plur # U + Number_psor_sing # cz, fi, sl, U + Number_psor_plur # cz, fi, sl, U + NumForm_digit # cz, sl, U + 
NumForm_roman # cz, sl, U + NumForm_word # cz, sl, U + NumValue_one # cz, U + NumValue_two # cz, U + NumValue_three # cz, U + PartForm_pres # fi + PartForm_past # fi + PartForm_agt # fi + PartForm_neg # fi + PartType_mod # U + PartType_emp # U + PartType_res # U + PartType_inf # U + PartType_vbp # U + Person_abs_one # bq, U + Person_abs_two # bq, U + Person_abs_three # bq, U + Person_dat_one # bq, U + Person_dat_two # bq, U + Person_dat_three # bq, U + Person_erg_one # bq, U + Person_erg_two # bq, U + Person_erg_three # bq, U + Person_psor_one # fi, U + Person_psor_two # fi, U + Person_psor_three # fi, U + Polite_inf # bq, U + Polite_pol # bq, U + Polite_abs_inf # bq, U + Polite_abs_pol # bq, U + Polite_erg_inf # bq, U + Polite_erg_pol # bq, U + Polite_dat_inf # bq, U + Polite_dat_pol # bq, U + Prefix_yes # U + PrepCase_npr # cz + PrepCase_pre # U + PunctSide_ini # U + PunctSide_fin # U + PunctType_peri # U + PunctType_qest # U + PunctType_excl # U + PunctType_quot # U + PunctType_brck # U + PunctType_comm # U + PunctType_colo # U + PunctType_semi # U + PunctType_dash # U + Style_arch # cz, fi, U + Style_rare # cz, fi, U + Style_poet # cz, U + Style_norm # cz, U + Style_coll # cz, U + Style_vrnc # cz, U + Style_sing # cz, U + Style_expr # cz, U + Style_derg # cz, U + Style_vulg # cz, U + Style_yes # fi, U + StyleVariant_styleShort # cz + StyleVariant_styleBound # cz, sl + VerbType_aux # U + VerbType_cop # U + VerbType_mod # U + VerbType_light # U -# -#cpdef enum Feature_t: -# Abbr -# AdpType -# AdvType -# ConjType -# Connegative -# Derivation -# Echo -# Foreign -# Gender_dat -# Gender_erg -# Gender_psor -# Hyph -# InfForm -# NameType -# NounType -# NumberAbs -# NumberDat -# NumberErg -# NumberPsee -# NumberPsor -# NumForm -# NumValue -# PartForm -# PartType -# Person_abs -# Person_dat -# Person_psor -# Polite -# Polite_abs -# Polite_dat -# Prefix -# PrepCase -# PunctSide -# PunctType -# Style -# Typo -# Variant -# VerbType -# -# -#cpdef enum Animacy: -# Anim -# 
Inam -# -# -#cpdef enum Aspect: -# Freq -# Imp -# Mod -# None_ -# Perf -# -# -#cpdef enum Case1: -# Nom -# Gen -# Acc -# Dat -# Voc -# Abl -# -#cdef enum Case2: -# Abe -# Abs -# Ade -# All -# Cau -# Com -# Del -# Dis -# -#cdef enum Case3: -# Ela -# Ess -# Ill -# Ine -# Ins -# Loc -# Lat -# Par -# -#cdef enum Case4: -# Sub -# Sup -# Tem -# Ter -# Tra -# -# -#cpdef enum Definite: -# Two -# Def -# Red -# Ind -# -# -#cpdef enum Degree: -# Cmp -# Comp -# None_ -# Pos -# Sup -# Abs -# Com -# Degree # du -# -# -#cpdef enum Gender: -# Com -# Fem -# Masc -# Neut -# -# -#cpdef enum Mood: -# Cnd -# Imp -# Ind -# N -# Pot -# Sub -# Opt -# -# -#cpdef enum Negative: -# Neg -# Pos -# Yes -# -# -#cpdef enum Number: -# Com -# Dual -# None_ -# Plur -# Sing -# Ptan # bg -# Count # bg -# -# -#cpdef enum NumType: -# Card -# Dist -# Frac -# Gen -# Mult -# None_ -# Ord -# Sets -# -# -#cpdef enum Person: -# One -# Two -# Three -# None_ -# -# -#cpdef enum Poss: -# Yes -# -# -#cpdef enum PronType1: -# AdvPart -# Art -# Default -# Dem -# Ind -# Int -# Neg -# -#cpdef enum PronType2: -# Prs -# Rcp -# Rel -# Tot -# Clit -# Exc # es, ca, it, fa -# Clit # it -# -# -#cpdef enum Reflex: -# Yes -# -# -#cpdef enum Tense: -# Fut -# Imp -# Past -# Pres -# -#cpdef enum VerbForm1: -# Fin -# Ger -# Inf -# None_ -# Part -# PartFut -# PartPast -# -#cpdef enum VerbForm2: -# PartPres -# Sup -# Trans -# Gdv # la -# -# -#cpdef enum Voice: -# Act -# Cau -# Pass -# Mid # gkc -# Int # hb -# -# -#cpdef enum Abbr: -# Yes # cz, fi, sl, U -# -#cpdef enum AdpType: -# Prep # cz, U -# Post # U -# Voc # cz -# Comprep # cz -# Circ # U -# Voc # U -# -# -#cpdef enum AdvType1: -# # U -# Man -# Loc -# Tim -# Deg -# Cau -# Mod -# Sta -# Ex -# -#cpdef enum AdvType2: -# Adadj -# -#cpdef enum ConjType: -# Oper # cz, U -# Comp # cz, U -# -#cpdef enum Connegative: -# Yes # fi -# -# -#cpdef enum Derivation1: -# Minen # fi -# Sti # fi -# Inen # fi -# Lainen # fi -# Ja # fi -# Ton # fi -# Vs # fi -# Ttain # fi -# -#cpdef enum 
Derivation2: -# Ttaa -# -# -#cpdef enum Echo: -# Rdp # U -# Ech # U -# -# -#cpdef enum Foreign: -# Foreign # cz, fi, U -# Fscript # cz, fi, U -# Tscript # cz, U -# Yes # sl -# -# -#cpdef enum Gender_dat: -# Masc # bq, U -# Fem # bq, U -# -# -#cpdef enum Gender_erg: -# Masc # bq -# Fem # bq -# -# -#cpdef enum Gender_psor: -# Masc # cz, sl, U -# Fem # cz, sl, U -# Neut # sl -# -# -#cpdef enum Hyph: -# Yes # cz, U -# -# -#cpdef enum InfForm: -# One # fi -# Two # fi -# Three # fi -# -# -#cpdef enum NameType: -# Geo # U, cz -# Prs # U, cz -# Giv # U, cz -# Sur # U, cz -# Nat # U, cz -# Com # U, cz -# Pro # U, cz -# Oth # U, cz -# -# -#cpdef enum NounType: -# Com # U -# Prop # U -# Class # U -# -#cpdef enum Number_abs: -# Sing # bq, U -# Plur # bq, U -# -#cpdef enum Number_dat: -# Sing # bq, U -# Plur # bq, U -# -#cpdef enum Number_erg: -# Sing # bq, U -# Plur # bq, U -# -#cpdef enum Number_psee: -# Sing # U -# Plur # U -# -# -#cpdef enum Number_psor: -# Sing # cz, fi, sl, U -# Plur # cz, fi, sl, U -# -# -#cpdef enum NumForm: -# Digit # cz, sl, U -# Roman # cz, sl, U -# Word # cz, sl, U -# -# -#cpdef enum NumValue: -# One # cz, U -# Two # cz, U -# Three # cz, U -# -# -#cpdef enum PartForm: -# Pres # fi -# Past # fi -# Agt # fi -# Neg # fi -# -# -#cpdef enum PartType: -# Mod # U -# Emp # U -# Res # U -# Inf # U -# Vbp # U -# -#cpdef enum Person_abs: -# One # bq, U -# Two # bq, U -# Three # bq, U -# -# -#cpdef enum Person_dat: -# One # bq, U -# Two # bq, U -# Three # bq, U -# -# -#cpdef enum Person_erg: -# One # bq, U -# Two # bq, U -# Three # bq, U -# -# -#cpdef enum Person_psor: -# One # fi, U -# Two # fi, U -# Three # fi, U -# -# -#cpdef enum Polite: -# Inf # bq, U -# Pol # bq, U -# -# -#cpdef enum Polite_abs: -# Inf # bq, U -# Pol # bq, U -# -# -#cpdef enum Polite_erg: -# Inf # bq, U -# Pol # bq, U -# -# -#cpdef enum Polite_dat: -# Inf # bq, U -# Pol # bq, U -# -# -#cpdef enum Prefix: -# Yes # U -# -# -#cpdef enum PrepCase: -# Npr # cz -# Pre # U -# -# -#cpdef enum 
PunctSide: -# Ini # U -# Fin # U -# -#cpdef enum PunctType1: -# Peri # U -# Qest # U -# Excl # U -# Quot # U -# Brck # U -# Comm # U -# Colo # U -# Semi # U -# -#cpdef enum PunctType2: -# Dash # U -# -# -#cpdef enum Style1: -# Arch # cz, fi, U -# Rare # cz, fi, U -# Poet # cz, U -# Norm # cz, U -# Coll # cz, U -# Vrnc # cz, U -# Sing # cz, U -# Expr # cz, U -# -# -#cpdef enum Style2: -# Derg # cz, U -# Vulg # cz, U -# -# -#cpdef enum Typo: -# Yes # fi, U -# -# -#cpdef enum Variant: -# Short # cz -# Bound # cz, sl -# -# -#cpdef enum VerbType: -# Aux # U -# Cop # U -# Mod # U -# Light # U -# -cpdef enum Value_t: - Animacy_Anim - Animacy_Inam - Aspect_Freq - Aspect_Imp - Aspect_Mod - Aspect_None_ - Aspect_Perf - Case_Abe - Case_Abl - Case_Abs - Case_Acc - Case_Ade - Case_All - Case_Cau - Case_Com - Case_Dat - Case_Del - Case_Dis - Case_Ela - Case_Ess - Case_Gen - Case_Ill - Case_Ine - Case_Ins - Case_Loc - Case_Lat - Case_Nom - Case_Par - Case_Sub - Case_Sup - Case_Tem - Case_Ter - Case_Tra - Case_Voc - Definite_Two - Definite_Def - Definite_Red - Definite_Ind - Degree_Cmp - Degree_Comp - Degree_None - Degree_Pos - Degree_Sup - Degree_Abs - Degree_Com - Degree_Dim # du - Gender_Com - Gender_Fem - Gender_Masc - Gender_Neut - Mood_Cnd - Mood_Imp - Mood_Ind - Mood_N - Mood_Pot - Mood_Sub - Mood_Opt - Negative_Neg - Negative_Pos - Negative_Yes - Number_Com - Number_Dual - Number_None - Number_Plur - Number_Sing - Number_Ptan # bg - Number_Count # bg - NumType_Card - NumType_Dist - NumType_Frac - NumType_Gen - NumType_Mult - NumType_None - NumType_Ord - NumType_Sets - Person_One - Person_Two - Person_Three - Person_None - Poss_Yes - PronType_AdvPart - PronType_Art - PronType_Default - PronType_Dem - PronType_Ind - PronType_Int - PronType_Neg - PronType_Prs - PronType_Rcp - PronType_Rel - PronType_Tot - PronType_Clit - PronType_Exc # es, ca, it, fa - Reflex_Yes - Tense_Fut - Tense_Imp - Tense_Past - Tense_Pres - VerbForm_Fin - VerbForm_Ger - VerbForm_Inf - VerbForm_None - 
VerbForm_Part - VerbForm_PartFut - VerbForm_PartPast - VerbForm_PartPres - VerbForm_Sup - VerbForm_Trans - VerbForm_Gdv # la - Voice_Act - Voice_Cau - Voice_Pass - Voice_Mid # gkc - Voice_Int # hb - Abbr_Yes # cz, fi, sl, U - AdpType_Prep # cz, U - AdpType_Post # U - AdpType_Voc # cz - AdpType_Comprep # cz - AdpType_Circ # U - AdvType_Man - AdvType_Loc - AdvType_Tim - AdvType_Deg - AdvType_Cau - AdvType_Mod - AdvType_Sta - AdvType_Ex - AdvType_Adadj - ConjType_Oper # cz, U - ConjType_Comp # cz, U - Connegative_Yes # fi - Derivation_Minen # fi - Derivation_Sti # fi - Derivation_Inen # fi - Derivation_Lainen # fi - Derivation_Ja # fi - Derivation_Ton # fi - Derivation_Vs # fi - Derivation_Ttain # fi - Derivation_Ttaa # fi - Echo_Rdp # U - Echo_Ech # U - Foreign_Foreign # cz, fi, U - Foreign_Fscript # cz, fi, U - Foreign_Tscript # cz, U - Foreign_Yes # sl - Gender_dat_Masc # bq, U - Gender_dat_Fem # bq, U - Gender_erg_Masc # bq - Gender_erg_Fem # bq - Gender_psor_Masc # cz, sl, U - Gender_psor_Fem # cz, sl, U - Gender_psor_Neut # sl - Hyph_Yes # cz, U - InfForm_One # fi - InfForm_Two # fi - InfForm_Three # fi - NameType_Geo # U, cz - NameType_Prs # U, cz - NameType_Giv # U, cz - NameType_Sur # U, cz - NameType_Nat # U, cz - NameType_Com # U, cz - NameType_Pro # U, cz - NameType_Oth # U, cz - NounType_Com # U - NounType_Prop # U - NounType_Class # U - Number_abs_Sing # bq, U - Number_abs_Plur # bq, U - Number_dat_Sing # bq, U - Number_dat_Plur # bq, U - Number_erg_Sing # bq, U - Number_erg_Plur # bq, U - Number_psee_Sing # U - Number_psee_Plur # U - Number_psor_Sing # cz, fi, sl, U - Number_psor_Plur # cz, fi, sl, U - NumForm_Digit # cz, sl, U - NumForm_Roman # cz, sl, U - NumForm_Word # cz, sl, U - NumValue_One # cz, U - NumValue_Two # cz, U - NumValue_Three # cz, U - PartForm_Pres # fi - PartForm_Past # fi - PartForm_Agt # fi - PartForm_Neg # fi - PartType_Mod # U - PartType_Emp # U - PartType_Res # U - PartType_Inf # U - PartType_Vbp # U - Person_abs_One # bq, U - 
Person_abs_Two # bq, U - Person_abs_Three # bq, U - Person_dat_One # bq, U - Person_dat_Two # bq, U - Person_dat_Three # bq, U - Person_erg_One # bq, U - Person_erg_Two # bq, U - Person_erg_Three # bq, U - Person_psor_One # fi, U - Person_psor_Two # fi, U - Person_psor_Three # fi, U - Polite_Inf # bq, U - Polite_Pol # bq, U - Polite_abs_Inf # bq, U - Polite_abs_Pol # bq, U - Polite_erg_Inf # bq, U - Polite_erg_Pol # bq, U - Polite_dat_Inf # bq, U - Polite_dat_Pol # bq, U - Prefix_Yes # U - PrepCase_Npr # cz - PrepCase_Pre # U - PunctSide_Ini # U - PunctSide_Fin # U - PunctType_Peri # U - PunctType_Qest # U - PunctType_Excl # U - PunctType_Quot # U - PunctType_Brck # U - PunctType_Comm # U - PunctType_Colo # U - PunctType_Semi # U - PunctType_Dash # U - Style_Arch # cz, fi, U - Style_Rare # cz, fi, U - Style_Poet # cz, U - Style_Norm # cz, U - Style_Coll # cz, U - Style_Vrnc # cz, U - Style_Sing # cz, U - Style_Expr # cz, U - Style_Derg # cz, U - Style_Vulg # cz, U - Style_Yes # fi, U - StyleVariant_StyleShort # cz - StyleVariant_StyleBound # cz, sl - VerbType_Aux # U - VerbType_Cop # U - VerbType_Mod # U - VerbType_Light # U diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 534f64a59..c53e5f478 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -6,7 +6,7 @@ try: except ImportError: import json -from .parts_of_speech import UNIV_POS_NAMES +from .parts_of_speech import IDS as POS_IDS from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT @@ -24,7 +24,7 @@ cdef class Morphology: self.rich_tags[i].id = i self.rich_tags[i].name = self.strings[tag_str] self.rich_tags[i].morph = 0 - self.rich_tags[i].pos = UNIV_POS_NAMES[props['pos'].upper()] + self.rich_tags[i].pos = POS_IDS[props['pos'].upper()] self.reverse_index[self.rich_tags[i].name] = i self._cache = PreshMapArray(self.n_tags) @@ -89,3 +89,254 @@ cdef class Morphology: lemma_string = sorted(lemma_strings)[0] lemma = self.strings[lemma_string] return lemma + +IDS = { + "Animacy_anim": 
Animacy_anim, + "Animacy_inam": Animacy_inam, + "Aspect_freq": Aspect_freq, + "Aspect_imp": Aspect_imp, + "Aspect_mod": Aspect_mod, + "Aspect_none": Aspect_none, + "Aspect_perf": Aspect_perf, + "Case_abe": Case_abe, + "Case_abl": Case_abl, + "Case_abs": Case_abs, + "Case_acc": Case_acc, + "Case_ade": Case_ade, + "Case_all": Case_all, + "Case_cau": Case_cau, + "Case_com": Case_com, + "Case_dat": Case_dat, + "Case_del": Case_del, + "Case_dis": Case_dis, + "Case_ela": Case_ela, + "Case_ess": Case_ess, + "Case_gen": Case_gen, + "Case_ill": Case_ill, + "Case_ine": Case_ine, + "Case_ins": Case_ins, + "Case_loc": Case_loc, + "Case_lat": Case_lat, + "Case_nom": Case_nom, + "Case_par": Case_par, + "Case_sub": Case_sub, + "Case_sup": Case_sup, + "Case_tem": Case_tem, + "Case_ter": Case_ter, + "Case_tra": Case_tra, + "Case_voc": Case_voc, + "Definite_two": Definite_two, + "Definite_def": Definite_def, + "Definite_red": Definite_red, + "Definite_ind": Definite_ind, + "Degree_cmp": Degree_cmp, + "Degree_comp": Degree_comp, + "Degree_none": Degree_none, + "Degree_pos": Degree_pos, + "Degree_sup": Degree_sup, + "Degree_abs": Degree_abs, + "Degree_com": Degree_com, + "Degree_dim": Degree_dim, # du + "Gender_com": Gender_com, + "Gender_fem": Gender_fem, + "Gender_masc": Gender_masc, + "Gender_neut": Gender_neut, + "Mood_cnd": Mood_cnd, + "Mood_imp": Mood_imp, + "Mood_ind": Mood_ind, + "Mood_n": Mood_n, + "Mood_pot": Mood_pot, + "Mood_sub": Mood_sub, + "Mood_opt": Mood_opt, + "Negative_neg": Negative_neg, + "Negative_pos": Negative_pos, + "Negative_yes": Negative_yes, + "Number_com": Number_com, + "Number_dual": Number_dual, + "Number_none": Number_none, + "Number_plur": Number_plur, + "Number_sing": Number_sing, + "Number_ptan": Number_ptan, # bg + "Number_count": Number_count, # bg + "NumType_card": NumType_card, + "NumType_dist": NumType_dist, + "NumType_frac": NumType_frac, + "NumType_gen": NumType_gen, + "NumType_mult": NumType_mult, + "NumType_none": NumType_none, +
"NumType_ord": NumType_ord, + "NumType_sets": NumType_sets, + "Person_one": Person_one, + "Person_two": Person_two, + "Person_three": Person_three, + "Person_none": Person_none, + "Poss_yes": Poss_yes, + "PronType_advPart": PronType_advPart, + "PronType_art": PronType_art, + "PronType_default": PronType_default, + "PronType_dem": PronType_dem, + "PronType_ind": PronType_ind, + "PronType_int": PronType_int, + "PronType_neg": PronType_neg, + "PronType_prs": PronType_prs, + "PronType_rcp": PronType_rcp, + "PronType_rel": PronType_rel, + "PronType_tot": PronType_tot, + "PronType_clit": PronType_clit, + "PronType_exc": PronType_exc, # es, ca, it, fa, + "Reflex_yes": Reflex_yes, + "Tense_fut": Tense_fut, + "Tense_imp": Tense_imp, + "Tense_past": Tense_past, + "Tense_pres": Tense_pres, + "VerbForm_fin": VerbForm_fin, + "VerbForm_ger": VerbForm_ger, + "VerbForm_inf": VerbForm_inf, + "VerbForm_none": VerbForm_none, + "VerbForm_part": VerbForm_part, + "VerbForm_partFut": VerbForm_partFut, + "VerbForm_partPast": VerbForm_partPast, + "VerbForm_partPres": VerbForm_partPres, + "VerbForm_sup": VerbForm_sup, + "VerbForm_trans": VerbForm_trans, + "VerbForm_gdv": VerbForm_gdv, # la, + "Voice_act": Voice_act, + "Voice_cau": Voice_cau, + "Voice_pass": Voice_pass, + "Voice_mid": Voice_mid, # gkc, + "Voice_int": Voice_int, # hb, + "Abbr_yes": Abbr_yes, # cz, fi, sl, U, + "AdpType_prep": AdpType_prep, # cz, U, + "AdpType_post": AdpType_post, # U, + "AdpType_voc": AdpType_voc, # cz, + "AdpType_comprep": AdpType_comprep, # cz, + "AdpType_circ": AdpType_circ, # U, + "AdvType_man": AdvType_man, + "AdvType_loc": AdvType_loc, + "AdvType_tim": AdvType_tim, + "AdvType_deg": AdvType_deg, + "AdvType_cau": AdvType_cau, + "AdvType_mod": AdvType_mod, + "AdvType_sta": AdvType_sta, + "AdvType_ex": AdvType_ex, + "AdvType_adadj": AdvType_adadj, + "ConjType_oper": ConjType_oper, # cz, U, + "ConjType_comp": ConjType_comp, # cz, U, + "Connegative_yes": Connegative_yes, # fi, +
"Derivation_minen": Derivation_minen, # fi, + "Derivation_sti": Derivation_sti, # fi, + "Derivation_inen": Derivation_inen, # fi, + "Derivation_lainen": Derivation_lainen, # fi, + "Derivation_ja": Derivation_ja, # fi, + "Derivation_ton": Derivation_ton, # fi, + "Derivation_vs": Derivation_vs, # fi, + "Derivation_ttain": Derivation_ttain, # fi, + "Derivation_ttaa": Derivation_ttaa, # fi, + "Echo_rdp": Echo_rdp, # U, + "Echo_ech": Echo_ech, # U, + "Foreign_foreign": Foreign_foreign, # cz, fi, U, + "Foreign_fscript": Foreign_fscript, # cz, fi, U, + "Foreign_tscript": Foreign_tscript, # cz, U, + "Foreign_yes": Foreign_yes, # sl, + "Gender_dat_masc": Gender_dat_masc, # bq, U, + "Gender_dat_fem": Gender_dat_fem, # bq, U, + "Gender_erg_masc": Gender_erg_masc, # bq, + "Gender_erg_fem": Gender_erg_fem, # bq, + "Gender_psor_masc": Gender_psor_masc, # cz, sl, U, + "Gender_psor_fem": Gender_psor_fem, # cz, sl, U, + "Gender_psor_neut": Gender_psor_neut, # sl, + "Hyph_yes": Hyph_yes, # cz, U, + "InfForm_one": InfForm_one, # fi, + "InfForm_two": InfForm_two, # fi, + "InfForm_three": InfForm_three, # fi, + "NameType_geo": NameType_geo, # U, cz, + "NameType_prs": NameType_prs, # U, cz, + "NameType_giv": NameType_giv, # U, cz, + "NameType_sur": NameType_sur, # U, cz, + "NameType_nat": NameType_nat, # U, cz, + "NameType_com": NameType_com, # U, cz, + "NameType_pro": NameType_pro, # U, cz, + "NameType_oth": NameType_oth, # U, cz, + "NounType_com": NounType_com, # U, + "NounType_prop": NounType_prop, # U, + "NounType_class": NounType_class, # U, + "Number_abs_sing": Number_abs_sing, # bq, U, + "Number_abs_plur": Number_abs_plur, # bq, U, + "Number_dat_sing": Number_dat_sing, # bq, U, + "Number_dat_plur": Number_dat_plur, # bq, U, + "Number_erg_sing": Number_erg_sing, # bq, U, + "Number_erg_plur": Number_erg_plur, # bq, U, + "Number_psee_sing": Number_psee_sing, # U, + "Number_psee_plur": Number_psee_plur, # U, + "Number_psor_sing": Number_psor_sing, # cz,
fi, sl, U, + "Number_psor_plur": Number_psor_plur, # cz, fi, sl, U, + "NumForm_digit": NumForm_digit, # cz, sl, U, + "NumForm_roman": NumForm_roman, # cz, sl, U, + "NumForm_word": NumForm_word, # cz, sl, U, + "NumValue_one": NumValue_one, # cz, U, + "NumValue_two": NumValue_two, # cz, U, + "NumValue_three": NumValue_three, # cz, U, + "PartForm_pres": PartForm_pres, # fi, + "PartForm_past": PartForm_past, # fi, + "PartForm_agt": PartForm_agt, # fi, + "PartForm_neg": PartForm_neg, # fi, + "PartType_mod": PartType_mod, # U, + "PartType_emp": PartType_emp, # U, + "PartType_res": PartType_res, # U, + "PartType_inf": PartType_inf, # U, + "PartType_vbp": PartType_vbp, # U, + "Person_abs_one": Person_abs_one, # bq, U, + "Person_abs_two": Person_abs_two, # bq, U, + "Person_abs_three": Person_abs_three, # bq, U, + "Person_dat_one": Person_dat_one, # bq, U, + "Person_dat_two": Person_dat_two, # bq, U, + "Person_dat_three": Person_dat_three, # bq, U, + "Person_erg_one": Person_erg_one, # bq, U, + "Person_erg_two": Person_erg_two, # bq, U, + "Person_erg_three": Person_erg_three, # bq, U, + "Person_psor_one": Person_psor_one, # fi, U, + "Person_psor_two": Person_psor_two, # fi, U, + "Person_psor_three": Person_psor_three, # fi, U, + "Polite_inf": Polite_inf, # bq, U, + "Polite_pol": Polite_pol, # bq, U, + "Polite_abs_inf": Polite_abs_inf, # bq, U, + "Polite_abs_pol": Polite_abs_pol, # bq, U, + "Polite_erg_inf": Polite_erg_inf, # bq, U, + "Polite_erg_pol": Polite_erg_pol, # bq, U, + "Polite_dat_inf": Polite_dat_inf, # bq, U, + "Polite_dat_pol": Polite_dat_pol, # bq, U, + "Prefix_yes": Prefix_yes, # U, + "PrepCase_npr": PrepCase_npr, # cz, + "PrepCase_pre": PrepCase_pre, # U, + "PunctSide_ini": PunctSide_ini, # U, + "PunctSide_fin": PunctSide_fin, # U, + "PunctType_peri": PunctType_peri, # U, + "PunctType_qest": PunctType_qest, # U, + "PunctType_excl": PunctType_excl, # U, + "PunctType_quot": PunctType_quot, # U, + "PunctType_brck":
PunctType_brck, # U, + "PunctType_comm": PunctType_comm, # U, + "PunctType_colo": PunctType_colo, # U, + "PunctType_semi": PunctType_semi, # U, + "PunctType_dash": PunctType_dash, # U, + "Style_arch": Style_arch, # cz, fi, U, + "Style_rare": Style_rare, # cz, fi, U, + "Style_poet": Style_poet, # cz, U, + "Style_norm": Style_norm, # cz, U, + "Style_coll": Style_coll, # cz, U, + "Style_vrnc": Style_vrnc, # cz, U, + "Style_sing": Style_sing, # cz, U, + "Style_expr": Style_expr, # cz, U, + "Style_derg": Style_derg, # cz, U, + "Style_vulg": Style_vulg, # cz, U, + "Style_yes": Style_yes, # fi, U, + "StyleVariant_styleShort": StyleVariant_styleShort, # cz, + "StyleVariant_styleBound": StyleVariant_styleBound, # cz, sl, + "VerbType_aux": VerbType_aux, # U, + "VerbType_cop": VerbType_cop, # U, + "VerbType_mod": VerbType_mod, # U, + "VerbType_light": VerbType_light, # U, +} + + +NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd index e410c6971..c97673a69 100644 --- a/spacy/parts_of_speech.pxd +++ b/spacy/parts_of_speech.pxd @@ -1,7 +1,8 @@ -# Google universal tag set +from . 
cimport symbols + cpdef enum univ_pos_t: - NO_TAG - ADJ + NO_TAG = 0 + ADJ = symbols.ADJ ADP ADV AUX @@ -20,4 +21,3 @@ cpdef enum univ_pos_t: X EOL SPACE - N_UNIV_TAGS diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index 8c2348a47..14933480c 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -1,8 +1,8 @@ from __future__ import unicode_literals -UNIV_POS_NAMES = { - "NO_TAG": NO_TAG, +IDS = { + "": NO_TAG, "ADJ": ADJ, "ADP": ADP, "ADV": ADV, @@ -23,3 +23,6 @@ UNIV_POS_NAMES = { "EOL": EOL, "SPACE": SPACE } + + +NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd new file mode 100644 index 000000000..0c60f6f67 --- /dev/null +++ b/spacy/symbols.pxd @@ -0,0 +1,421 @@ +cpdef enum symbol_t: + NIL + IS_ALPHA + IS_ASCII + IS_DIGIT + IS_LOWER + IS_PUNCT + IS_SPACE + IS_TITLE + IS_UPPER + LIKE_URL + LIKE_NUM + LIKE_EMAIL + IS_STOP + IS_OOV + + FLAG14 + FLAG15 + FLAG16 + FLAG17 + FLAG18 + FLAG19 + FLAG20 + FLAG21 + FLAG22 + FLAG23 + FLAG24 + FLAG25 + FLAG26 + FLAG27 + FLAG28 + FLAG29 + FLAG30 + FLAG31 + FLAG32 + FLAG33 + FLAG34 + FLAG35 + FLAG36 + FLAG37 + FLAG38 + FLAG39 + FLAG40 + FLAG41 + FLAG42 + FLAG43 + FLAG44 + FLAG45 + FLAG46 + FLAG47 + FLAG48 + FLAG49 + FLAG50 + FLAG51 + FLAG52 + FLAG53 + FLAG54 + FLAG55 + FLAG56 + FLAG57 + FLAG58 + FLAG59 + FLAG60 + FLAG61 + FLAG62 + FLAG63 + + ID + ORTH + LOWER + NORM + SHAPE + PREFIX + SUFFIX + + LENGTH + CLUSTER + LEMMA + POS + TAG + DEP + ENT_IOB + ENT_TYPE + HEAD + SPACY + PROB + + ADJ + ADP + ADV + AUX + CONJ + DET + INTJ + NOUN + NUM + PART + PRON + PROPN + PUNCT + SCONJ + SYM + VERB + X + EOL + SPACE + + Animacy_anim + Animacy_inam + Aspect_freq + Aspect_imp + Aspect_mod + Aspect_none + Aspect_perf + Case_abe + Case_abl + Case_abs + Case_acc + Case_ade + Case_all + Case_cau + Case_com + Case_dat + Case_del + Case_dis + Case_ela + Case_ess + Case_gen + Case_ill + Case_ine + Case_ins + Case_loc + Case_lat + 
Case_nom + Case_par + Case_sub + Case_sup + Case_tem + Case_ter + Case_tra + Case_voc + Definite_two + Definite_def + Definite_red + Definite_ind + Degree_cmp + Degree_comp + Degree_none + Degree_pos + Degree_sup + Degree_abs + Degree_com + Degree_dim # du + Gender_com + Gender_fem + Gender_masc + Gender_neut + Mood_cnd + Mood_imp + Mood_ind + Mood_n + Mood_pot + Mood_sub + Mood_opt + Negative_neg + Negative_pos + Negative_yes + Number_com + Number_dual + Number_none + Number_plur + Number_sing + Number_ptan # bg + Number_count # bg + NumType_card + NumType_dist + NumType_frac + NumType_gen + NumType_mult + NumType_none + NumType_ord + NumType_sets + Person_one + Person_two + Person_three + Person_none + Poss_yes + PronType_advPart + PronType_art + PronType_default + PronType_dem + PronType_ind + PronType_int + PronType_neg + PronType_prs + PronType_rcp + PronType_rel + PronType_tot + PronType_clit + PronType_exc # es, ca, it, fa + Reflex_yes + Tense_fut + Tense_imp + Tense_past + Tense_pres + VerbForm_fin + VerbForm_ger + VerbForm_inf + VerbForm_none + VerbForm_part + VerbForm_partFut + VerbForm_partPast + VerbForm_partPres + VerbForm_sup + VerbForm_trans + VerbForm_gdv # la + Voice_act + Voice_cau + Voice_pass + Voice_mid # gkc + Voice_int # hb + Abbr_yes # cz, fi, sl, U + AdpType_prep # cz, U + AdpType_post # U + AdpType_voc # cz + AdpType_comprep # cz + AdpType_circ # U + AdvType_man + AdvType_loc + AdvType_tim + AdvType_deg + AdvType_cau + AdvType_mod + AdvType_sta + AdvType_ex + AdvType_adadj + ConjType_oper # cz, U + ConjType_comp # cz, U + Connegative_yes # fi + Derivation_minen # fi + Derivation_sti # fi + Derivation_inen # fi + Derivation_lainen # fi + Derivation_ja # fi + Derivation_ton # fi + Derivation_vs # fi + Derivation_ttain # fi + Derivation_ttaa # fi + Echo_rdp # U + Echo_ech # U + Foreign_foreign # cz, fi, U + Foreign_fscript # cz, fi, U + Foreign_tscript # cz, U + Foreign_yes # sl + Gender_dat_masc # bq, U + Gender_dat_fem # bq, U + 
Gender_erg_masc # bq + Gender_erg_fem # bq + Gender_psor_masc # cz, sl, U + Gender_psor_fem # cz, sl, U + Gender_psor_neut # sl + Hyph_yes # cz, U + InfForm_one # fi + InfForm_two # fi + InfForm_three # fi + NameType_geo # U, cz + NameType_prs # U, cz + NameType_giv # U, cz + NameType_sur # U, cz + NameType_nat # U, cz + NameType_com # U, cz + NameType_pro # U, cz + NameType_oth # U, cz + NounType_com # U + NounType_prop # U + NounType_class # U + Number_abs_sing # bq, U + Number_abs_plur # bq, U + Number_dat_sing # bq, U + Number_dat_plur # bq, U + Number_erg_sing # bq, U + Number_erg_plur # bq, U + Number_psee_sing # U + Number_psee_plur # U + Number_psor_sing # cz, fi, sl, U + Number_psor_plur # cz, fi, sl, U + NumForm_digit # cz, sl, U + NumForm_roman # cz, sl, U + NumForm_word # cz, sl, U + NumValue_one # cz, U + NumValue_two # cz, U + NumValue_three # cz, U + PartForm_pres # fi + PartForm_past # fi + PartForm_agt # fi + PartForm_neg # fi + PartType_mod # U + PartType_emp # U + PartType_res # U + PartType_inf # U + PartType_vbp # U + Person_abs_one # bq, U + Person_abs_two # bq, U + Person_abs_three # bq, U + Person_dat_one # bq, U + Person_dat_two # bq, U + Person_dat_three # bq, U + Person_erg_one # bq, U + Person_erg_two # bq, U + Person_erg_three # bq, U + Person_psor_one # fi, U + Person_psor_two # fi, U + Person_psor_three # fi, U + Polite_inf # bq, U + Polite_pol # bq, U + Polite_abs_inf # bq, U + Polite_abs_pol # bq, U + Polite_erg_inf # bq, U + Polite_erg_pol # bq, U + Polite_dat_inf # bq, U + Polite_dat_pol # bq, U + Prefix_yes # U + PrepCase_npr # cz + PrepCase_pre # U + PunctSide_ini # U + PunctSide_fin # U + PunctType_peri # U + PunctType_qest # U + PunctType_excl # U + PunctType_quot # U + PunctType_brck # U + PunctType_comm # U + PunctType_colo # U + PunctType_semi # U + PunctType_dash # U + Style_arch # cz, fi, U + Style_rare # cz, fi, U + Style_poet # cz, U + Style_norm # cz, U + Style_coll # cz, U + Style_vrnc # cz, U + Style_sing # cz, U + 
Style_expr # cz, U + Style_derg # cz, U + Style_vulg # cz, U + Style_yes # fi, U + StyleVariant_styleShort # cz + StyleVariant_styleBound # cz, sl + VerbType_aux # U + VerbType_cop # U + VerbType_mod # U + VerbType_light # U + + PERSON + NORP + FACILITY + ORG + GPE + LOC + PRODUCT + EVENT + WORK_OF_ART + LANGUAGE + + DATE + TIME + PERCENT + MONEY + QUANTITY + ORDINAL + CARDINAL + + acomp + advcl + advmod + agent + amod + appos + attr + aux + auxpass + cc + ccomp + complm + conj + csubj + csubjpass + dep + det + dobj + expl + hmod + hyph + infmod + intj + iobj + mark + meta + neg + nmod + nn + npadvmod + nsubj + nsubjpass + num + number + oprd + parataxis + partmod + pcomp + pobj + poss + possessive + preconj + prep + prt + punct + quantmod + rcmod + root + xcomp diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx new file mode 100644 index 000000000..31b01db98 --- /dev/null +++ b/spacy/symbols.pyx @@ -0,0 +1,424 @@ +IDS = { + "": NIL, + "IS_ALPHA": IS_ALPHA, + "IS_ASCII": IS_ASCII, + "IS_DIGIT": IS_DIGIT, + "IS_LOWER": IS_LOWER, + "IS_PUNCT": IS_PUNCT, + "IS_SPACE": IS_SPACE, + "IS_TITLE": IS_TITLE, + "IS_UPPER": IS_UPPER, + "LIKE_URL": LIKE_URL, + "LIKE_NUM": LIKE_NUM, + "LIKE_EMAIL": LIKE_EMAIL, + "IS_STOP": IS_STOP, + "IS_OOV": IS_OOV, + + "FLAG14": FLAG14, + "FLAG15": FLAG15, + "FLAG16": FLAG16, + "FLAG17": FLAG17, + "FLAG18": FLAG18, + "FLAG19": FLAG19, + "FLAG20": FLAG20, + "FLAG21": FLAG21, + "FLAG22": FLAG22, + "FLAG23": FLAG23, + "FLAG24": FLAG24, + "FLAG25": FLAG25, + "FLAG26": FLAG26, + "FLAG27": FLAG27, + "FLAG28": FLAG28, + "FLAG29": FLAG29, + "FLAG30": FLAG30, + "FLAG31": FLAG31, + "FLAG32": FLAG32, + "FLAG33": FLAG33, + "FLAG34": FLAG34, + "FLAG35": FLAG35, + "FLAG36": FLAG36, + "FLAG37": FLAG37, + "FLAG38": FLAG38, + "FLAG39": FLAG39, + "FLAG40": FLAG40, + "FLAG41": FLAG41, + "FLAG42": FLAG42, + "FLAG43": FLAG43, + "FLAG44": FLAG44, + "FLAG45": FLAG45, + "FLAG46": FLAG46, + "FLAG47": FLAG47, + "FLAG48": FLAG48, + "FLAG49": FLAG49, + "FLAG50": FLAG50, 
+    "FLAG51": FLAG51,
+    "FLAG52": FLAG52,
+    "FLAG53": FLAG53,
+    "FLAG54": FLAG54,
+    "FLAG55": FLAG55,
+    "FLAG56": FLAG56,
+    "FLAG57": FLAG57,
+    "FLAG58": FLAG58,
+    "FLAG59": FLAG59,
+    "FLAG60": FLAG60,
+    "FLAG61": FLAG61,
+    "FLAG62": FLAG62,
+    "FLAG63": FLAG63,
+
+    "ID": ID,
+    "ORTH": ORTH,
+    "LOWER": LOWER,
+    "NORM": NORM,
+    "SHAPE": SHAPE,
+    "PREFIX": PREFIX,
+    "SUFFIX": SUFFIX,
+
+    "LENGTH": LENGTH,
+    "CLUSTER": CLUSTER,
+    "LEMMA": LEMMA,
+    "POS": POS,
+    "TAG": TAG,
+    "DEP": DEP,
+    "ENT_IOB": ENT_IOB,
+    "ENT_TYPE": ENT_TYPE,
+    "HEAD": HEAD,
+    "SPACY": SPACY,
+    "PROB": PROB,
+
+    "ADJ": ADJ,
+    "ADP": ADP,
+    "ADV": ADV,
+    "AUX": AUX,
+    "CONJ": CONJ,
+    "DET": DET,
+    "INTJ": INTJ,
+    "NOUN": NOUN,
+    "NUM": NUM,
+    "PART": PART,
+    "PRON": PRON,
+    "PROPN": PROPN,
+    "PUNCT": PUNCT,
+    "SCONJ": SCONJ,
+    "SYM": SYM,
+    "VERB": VERB,
+    "X": X,
+    "EOL": EOL,
+    "SPACE": SPACE,
+
+    "Animacy_anim": Animacy_anim,
+    "Animacy_inam": Animacy_inam,
+    "Aspect_freq": Aspect_freq,
+    "Aspect_imp": Aspect_imp,
+    "Aspect_mod": Aspect_mod,
+    "Aspect_none": Aspect_none,
+    "Aspect_perf": Aspect_perf,
+    "Case_abe": Case_abe,
+    "Case_abl": Case_abl,
+    "Case_abs": Case_abs,
+    "Case_acc": Case_acc,
+    "Case_ade": Case_ade,
+    "Case_all": Case_all,
+    "Case_cau": Case_cau,
+    "Case_com": Case_com,
+    "Case_dat": Case_dat,
+    "Case_del": Case_del,
+    "Case_dis": Case_dis,
+    "Case_ela": Case_ela,
+    "Case_ess": Case_ess,
+    "Case_gen": Case_gen,
+    "Case_ill": Case_ill,
+    "Case_ine": Case_ine,
+    "Case_ins": Case_ins,
+    "Case_loc": Case_loc,
+    "Case_lat": Case_lat,
+    "Case_nom": Case_nom,
+    "Case_par": Case_par,
+    "Case_sub": Case_sub,
+    "Case_sup": Case_sup,
+    "Case_tem": Case_tem,
+    "Case_ter": Case_ter,
+    "Case_tra": Case_tra,
+    "Case_voc": Case_voc,
+    "Definite_two": Definite_two,
+    "Definite_def": Definite_def,
+    "Definite_red": Definite_red,
+    "Definite_ind": Definite_ind,
+    "Degree_cmp": Degree_cmp,
+    "Degree_comp": Degree_comp,
+    "Degree_none": Degree_none,
+    "Degree_pos": Degree_pos,
+    "Degree_sup": Degree_sup,
+    "Degree_abs": Degree_abs,
+    "Degree_com": Degree_com,
+    "Degree_dim": Degree_dim, # du
+    "Gender_com": Gender_com,
+    "Gender_fem": Gender_fem,
+    "Gender_masc": Gender_masc,
+    "Gender_neut": Gender_neut,
+    "Mood_cnd": Mood_cnd,
+    "Mood_imp": Mood_imp,
+    "Mood_ind": Mood_ind,
+    "Mood_n": Mood_n,
+    "Mood_pot": Mood_pot,
+    "Mood_sub": Mood_sub,
+    "Mood_opt": Mood_opt,
+    "Negative_neg": Negative_neg,
+    "Negative_pos": Negative_pos,
+    "Negative_yes": Negative_yes,
+    "Number_com": Number_com,
+    "Number_dual": Number_dual,
+    "Number_none": Number_none,
+    "Number_plur": Number_plur,
+    "Number_sing": Number_sing,
+    "Number_ptan": Number_ptan, # bg
+    "Number_count": Number_count, # bg
+    "NumType_card": NumType_card,
+    "NumType_dist": NumType_dist,
+    "NumType_frac": NumType_frac,
+    "NumType_gen": NumType_gen,
+    "NumType_mult": NumType_mult,
+    "NumType_none": NumType_none,
+    "NumType_ord": NumType_ord,
+    "NumType_sets": NumType_sets,
+    "Person_one": Person_one,
+    "Person_two": Person_two,
+    "Person_three": Person_three,
+    "Person_none": Person_none,
+    "Poss_yes": Poss_yes,
+    "PronType_advPart": PronType_advPart,
+    "PronType_art": PronType_art,
+    "PronType_default": PronType_default,
+    "PronType_dem": PronType_dem,
+    "PronType_ind": PronType_ind,
+    "PronType_int": PronType_int,
+    "PronType_neg": PronType_neg,
+    "PronType_prs": PronType_prs,
+    "PronType_rcp": PronType_rcp,
+    "PronType_rel": PronType_rel,
+    "PronType_tot": PronType_tot,
+    "PronType_clit": PronType_clit,
+    "PronType_exc": PronType_exc, # es, ca, it, fa,
+    "Reflex_yes": Reflex_yes,
+    "Tense_fut": Tense_fut,
+    "Tense_imp": Tense_imp,
+    "Tense_past": Tense_past,
+    "Tense_pres": Tense_pres,
+    "VerbForm_fin": VerbForm_fin,
+    "VerbForm_ger": VerbForm_ger,
+    "VerbForm_inf": VerbForm_inf,
+    "VerbForm_none": VerbForm_none,
+    "VerbForm_part": VerbForm_part,
+    "VerbForm_partFut": VerbForm_partFut,
+    "VerbForm_partPast": VerbForm_partPast,
+    "VerbForm_partPres": VerbForm_partPres,
+    "VerbForm_sup": VerbForm_sup,
+    "VerbForm_trans": VerbForm_trans,
+    "VerbForm_gdv": VerbForm_gdv, # la,
+    "Voice_act": Voice_act,
+    "Voice_cau": Voice_cau,
+    "Voice_pass": Voice_pass,
+    "Voice_mid": Voice_mid, # gkc,
+    "Voice_int": Voice_int, # hb,
+    "Abbr_yes": Abbr_yes, # cz, fi, sl, U,
+    "AdpType_prep": AdpType_prep, # cz, U,
+    "AdpType_post": AdpType_post, # U,
+    "AdpType_voc": AdpType_voc, # cz,
+    "AdpType_comprep": AdpType_comprep, # cz,
+    "AdpType_circ": AdpType_circ, # U,
+    "AdvType_man": AdvType_man,
+    "AdvType_loc": AdvType_loc,
+    "AdvType_tim": AdvType_tim,
+    "AdvType_deg": AdvType_deg,
+    "AdvType_cau": AdvType_cau,
+    "AdvType_mod": AdvType_mod,
+    "AdvType_sta": AdvType_sta,
+    "AdvType_ex": AdvType_ex,
+    "AdvType_adadj": AdvType_adadj,
+    "ConjType_oper": ConjType_oper, # cz, U,
+    "ConjType_comp": ConjType_comp, # cz, U,
+    "Connegative_yes": Connegative_yes, # fi,
+    "Derivation_minen": Derivation_minen, # fi,
+    "Derivation_sti": Derivation_sti, # fi,
+    "Derivation_inen": Derivation_inen, # fi,
+    "Derivation_lainen": Derivation_lainen, # fi,
+    "Derivation_ja": Derivation_ja, # fi,
+    "Derivation_ton": Derivation_ton, # fi,
+    "Derivation_vs": Derivation_vs, # fi,
+    "Derivation_ttain": Derivation_ttain, # fi,
+    "Derivation_ttaa": Derivation_ttaa, # fi,
+    "Echo_rdp": Echo_rdp, # U,
+    "Echo_ech": Echo_ech, # U,
+    "Foreign_foreign": Foreign_foreign, # cz, fi, U,
+    "Foreign_fscript": Foreign_fscript, # cz, fi, U,
+    "Foreign_tscript": Foreign_tscript, # cz, U,
+    "Foreign_yes": Foreign_yes, # sl,
+    "Gender_dat_masc": Gender_dat_masc, # bq, U,
+    "Gender_dat_fem": Gender_dat_fem, # bq, U,
+    "Gender_erg_masc": Gender_erg_masc, # bq,
+    "Gender_erg_fem": Gender_erg_fem, # bq,
+    "Gender_psor_masc": Gender_psor_masc, # cz, sl, U,
+    "Gender_psor_fem": Gender_psor_fem, # cz, sl, U,
+    "Gender_psor_neut": Gender_psor_neut, # sl,
+    "Hyph_yes": Hyph_yes, # cz, U,
+    "InfForm_one": InfForm_one, # fi,
+    "InfForm_two": InfForm_two, # fi,
+    "InfForm_three": InfForm_three, # fi,
+    "NameType_geo": NameType_geo, # U, cz,
+    "NameType_prs": NameType_prs, # U, cz,
+    "NameType_giv": NameType_giv, # U, cz,
+    "NameType_sur": NameType_sur, # U, cz,
+    "NameType_nat": NameType_nat, # U, cz,
+    "NameType_com": NameType_com, # U, cz,
+    "NameType_pro": NameType_pro, # U, cz,
+    "NameType_oth": NameType_oth, # U, cz,
+    "NounType_com": NounType_com, # U,
+    "NounType_prop": NounType_prop, # U,
+    "NounType_class": NounType_class, # U,
+    "Number_abs_sing": Number_abs_sing, # bq, U,
+    "Number_abs_plur": Number_abs_plur, # bq, U,
+    "Number_dat_sing": Number_dat_sing, # bq, U,
+    "Number_dat_plur": Number_dat_plur, # bq, U,
+    "Number_erg_sing": Number_erg_sing, # bq, U,
+    "Number_erg_plur": Number_erg_plur, # bq, U,
+    "Number_psee_sing": Number_psee_sing, # U,
+    "Number_psee_plur": Number_psee_plur, # U,
+    "Number_psor_sing": Number_psor_sing, # cz, fi, sl, U,
+    "Number_psor_plur": Number_psor_plur, # cz, fi, sl, U,
+    "NumForm_digit": NumForm_digit, # cz, sl, U,
+    "NumForm_roman": NumForm_roman, # cz, sl, U,
+    "NumForm_word": NumForm_word, # cz, sl, U,
+    "NumValue_one": NumValue_one, # cz, U,
+    "NumValue_two": NumValue_two, # cz, U,
+    "NumValue_three": NumValue_three, # cz, U,
+    "PartForm_pres": PartForm_pres, # fi,
+    "PartForm_past": PartForm_past, # fi,
+    "PartForm_agt": PartForm_agt, # fi,
+    "PartForm_neg": PartForm_neg, # fi,
+    "PartType_mod": PartType_mod, # U,
+    "PartType_emp": PartType_emp, # U,
+    "PartType_res": PartType_res, # U,
+    "PartType_inf": PartType_inf, # U,
+    "PartType_vbp": PartType_vbp, # U,
+    "Person_abs_one": Person_abs_one, # bq, U,
+    "Person_abs_two": Person_abs_two, # bq, U,
+    "Person_abs_three": Person_abs_three, # bq, U,
+    "Person_dat_one": Person_dat_one, # bq, U,
+    "Person_dat_two": Person_dat_two, # bq, U,
+    "Person_dat_three": Person_dat_three, # bq, U,
+    "Person_erg_one": Person_erg_one, # bq, U,
+    "Person_erg_two": Person_erg_two, # bq, U,
+    "Person_erg_three": Person_erg_three, # bq, U,
+    "Person_psor_one": Person_psor_one, # fi, U,
+    "Person_psor_two": Person_psor_two, # fi, U,
+    "Person_psor_three": Person_psor_three, # fi, U,
+    "Polite_inf": Polite_inf, # bq, U,
+    "Polite_pol": Polite_pol, # bq, U,
+    "Polite_abs_inf": Polite_abs_inf, # bq, U,
+    "Polite_abs_pol": Polite_abs_pol, # bq, U,
+    "Polite_erg_inf": Polite_erg_inf, # bq, U,
+    "Polite_erg_pol": Polite_erg_pol, # bq, U,
+    "Polite_dat_inf": Polite_dat_inf, # bq, U,
+    "Polite_dat_pol": Polite_dat_pol, # bq, U,
+    "Prefix_yes": Prefix_yes, # U,
+    "PrepCase_npr": PrepCase_npr, # cz,
+    "PrepCase_pre": PrepCase_pre, # U,
+    "PunctSide_ini": PunctSide_ini, # U,
+    "PunctSide_fin": PunctSide_fin, # U,
+    "PunctType_peri": PunctType_peri, # U,
+    "PunctType_qest": PunctType_qest, # U,
+    "PunctType_excl": PunctType_excl, # U,
+    "PunctType_quot": PunctType_quot, # U,
+    "PunctType_brck": PunctType_brck, # U,
+    "PunctType_comm": PunctType_comm, # U,
+    "PunctType_colo": PunctType_colo, # U,
+    "PunctType_semi": PunctType_semi, # U,
+    "PunctType_dash": PunctType_dash, # U,
+    "Style_arch": Style_arch, # cz, fi, U,
+    "Style_rare": Style_rare, # cz, fi, U,
+    "Style_poet": Style_poet, # cz, U,
+    "Style_norm": Style_norm, # cz, U,
+    "Style_coll": Style_coll, # cz, U,
+    "Style_vrnc": Style_vrnc, # cz, U,
+    "Style_sing": Style_sing, # cz, U,
+    "Style_expr": Style_expr, # cz, U,
+    "Style_derg": Style_derg, # cz, U,
+    "Style_vulg": Style_vulg, # cz, U,
+    "Style_yes": Style_yes, # fi, U,
+    "StyleVariant_styleShort": StyleVariant_styleShort, # cz,
+    "StyleVariant_styleBound": StyleVariant_styleBound, # cz, sl,
+    "VerbType_aux": VerbType_aux, # U,
+    "VerbType_cop": VerbType_cop, # U,
+    "VerbType_mod": VerbType_mod, # U,
+    "VerbType_light": VerbType_light, # U,
+
+    "PERSON": PERSON,
+    "NORP": NORP,
+    "FACILITY": FACILITY,
+    "ORG": ORG,
+    "GPE": GPE,
+    "LOC": LOC,
+    "PRODUCT": PRODUCT,
+    "EVENT": EVENT,
+    "WORK_OF_ART": WORK_OF_ART,
+    "LANGUAGE": LANGUAGE,
+ + "DATE": DATE, + "TIME": TIME, + "PERCENT": PERCENT, + "MONEY": MONEY, + "QUANTITY": QUANTITY, + "ORDINAL": ORDINAL, + "CARDINAL": CARDINAL, + + "acomp": acomp, + "advcl": advcl, + "advmod": advmod, + "agent": agent, + "amod": amod, + "appos": appos, + "attr": attr, + "aux": aux, + "auxpass": auxpass, + "cc": cc, + "ccomp": ccomp, + "complm": complm, + "conj": conj, + "csubj": csubj, + "csubjpass": csubjpass, + "dep": dep, + "det": det, + "dobj": dobj, + "expl": expl, + "hmod": hmod, + "hyph": hyph, + "infmod": infmod, + "intj": intj, + "iobj": iobj, + "mark": mark, + "meta": meta, + "neg": neg, + "nmod": nmod, + "nn": nn, + "npadvmod": npadvmod, + "nsubj": nsubj, + "nsubjpass": nsubjpass, + "num": num, + "number": number, + "oprd": oprd, + "parataxis": parataxis, + "partmod": partmod, + "pcomp": pcomp, + "pobj": pobj, + "poss": poss, + "possessive": possessive, + "preconj": preconj, + "prep": prep, + "prt": prt, + "punct": punct, + "quantmod": quantmod, + "rcmod": rcmod, + "root": root, + "xcomp": xcomp +} + +NAMES = [it[0] for it in sorted(IDS.items(), key=lambda it: it[1])] diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index eab6c044e..50b19d4c1 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -14,7 +14,6 @@ from ..typedefs cimport attr_t, flags_t from ..attrs cimport attr_id_t from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE -from ..parts_of_speech import UNIV_POS_NAMES from ..parts_of_speech cimport CONJ, PUNCT, NOUN from ..parts_of_speech cimport univ_pos_t from ..lexeme cimport Lexeme diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 25db3f47e..af80b5359 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -9,7 +9,7 @@ import numpy from ..lexeme cimport Lexeme -from ..parts_of_speech import UNIV_POS_NAMES +from .. 
import parts_of_speech from ..attrs cimport LEMMA from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER @@ -318,7 +318,7 @@ cdef class Token: property pos_: def __get__(self): - return _pos_id_to_string[self.c.pos] + return parts_of_speech.NAMES[self.c.pos] property tag_: def __get__(self): @@ -363,6 +363,3 @@ cdef class Token: property like_email: def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL) - - -_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()} diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index d79da8a79..0f43967bb 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -19,6 +19,9 @@ from .typedefs cimport attr_t from .cfile cimport CFile from .lemmatizer import Lemmatizer +from . import attrs +from . import symbols + from cymem.cymem cimport Address from . import util from .serialize.packer cimport Packer @@ -67,6 +70,14 @@ cdef class Vocab: self._by_hash = PreshMap() self._by_orth = PreshMap() self.strings = StringStore() + # Load strings in a special order, so that we have an onset number for + # the vocabulary. This way, when words are added in order, the orth ID + # is the frequency rank of the word, plus a certain offset. The structural + # strings are loaded first, because the vocab is open-class, and these + # symbols are closed class. 
+ for name in symbols.NAMES + list(sorted(tag_map.keys())): + if name: + _ = self.strings[name] self.get_lex_attr = get_lex_attr self.morphology = Morphology(self.strings, tag_map, lemmatizer) self.serializer_freqs = serializer_freqs diff --git a/tests/vocab/test_vocab.py b/tests/vocab/test_vocab.py index 7ad911626..153e0d546 100644 --- a/tests/vocab/test_vocab.py +++ b/tests/vocab/test_vocab.py @@ -1,6 +1,9 @@ from __future__ import unicode_literals import pytest +from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA +from spacy.parts_of_speech import NOUN, VERB + def test_neq(en_vocab): addr = en_vocab['Hello'] @@ -25,3 +28,13 @@ def test_punct_neq(en_vocab): def test_shape_attr(en_vocab): example = en_vocab['example'] assert example.orth != example.shape + + +def test_symbols(en_vocab): + assert en_vocab.strings['IS_ALPHA'] == IS_ALPHA + assert en_vocab.strings['NOUN'] == NOUN + assert en_vocab.strings['VERB'] == VERB + assert en_vocab.strings['LEMMA'] == LEMMA + assert en_vocab.strings['ORTH'] == ORTH + assert en_vocab.strings['PROB'] == PROB + diff --git a/tests/website/conftest.py b/tests/website/conftest.py index ade1bae2a..35c38d845 100644 --- a/tests/website/conftest.py +++ b/tests/website/conftest.py @@ -1,11 +1,13 @@ from __future__ import unicode_literals import pytest +import os @pytest.fixture(scope='session') def nlp(): - from spacy.en import English - return English() + from spacy.en import English, LOCAL_DATA_DIR + data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR) + return English(data_dir=data_dir) @pytest.fixture() diff --git a/tests/website/test_home.py b/tests/website/test_home.py index 4da61becf..3f7f7ea4c 100644 --- a/tests/website/test_home.py +++ b/tests/website/test_home.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals import pytest import spacy +import os @pytest.fixture() @@ -9,8 +10,9 @@ def token(doc): def test_load_resources_and_process_text(): - from spacy.en import English - nlp = English() + from spacy.en import 
English, LOCAL_DATA_DIR + data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR) + nlp = English(data_dir=data_dir) doc = nlp('Hello, world. Here are two sentences.')