mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	Add support for Universal Dependencies v2.0
This commit is contained in:
		
							parent
							
								
									8dff040032
								
							
						
					
					
						commit
						66e1109b53
					
				|  | @ -93,7 +93,7 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] | ||||||
| 
 | 
 | ||||||
| def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): | def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): | ||||||
|     '''Normalize a dictionary of attributes, converting them to ints. |     '''Normalize a dictionary of attributes, converting them to ints. | ||||||
|      | 
 | ||||||
|     Arguments: |     Arguments: | ||||||
|         stringy_attrs (dict): |         stringy_attrs (dict): | ||||||
|             Dictionary keyed by attribute string names. Values can be ints or strings. |             Dictionary keyed by attribute string names. Values can be ints or strings. | ||||||
|  | @ -125,7 +125,9 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): | ||||||
|             'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss', |             'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss', | ||||||
|             'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType', |             'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType', | ||||||
|             'Number', 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType', |             'Number', 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType', | ||||||
|             'Reflex', 'Negative', 'Mood', 'Aspect', 'Case'] |             'Reflex', 'Negative', 'Mood', 'Aspect', 'Case', | ||||||
|  |             'Polarity', # U20 | ||||||
|  |         ] | ||||||
|         for key in morph_keys: |         for key in morph_keys: | ||||||
|             if key in stringy_attrs: |             if key in stringy_attrs: | ||||||
|                 stringy_attrs.pop(key) |                 stringy_attrs.pop(key) | ||||||
|  |  | ||||||
|  | @ -41,7 +41,7 @@ TAG_MAP = { | ||||||
|     "PRF":      {POS: PRON, "PronType": "prs", "Reflex": "yes"}, |     "PRF":      {POS: PRON, "PronType": "prs", "Reflex": "yes"}, | ||||||
|     "PTKA":     {POS: PART}, |     "PTKA":     {POS: PART}, | ||||||
|     "PTKANT":   {POS: PART, "PartType": "res"}, |     "PTKANT":   {POS: PART, "PartType": "res"}, | ||||||
|     "PTKNEG":   {POS: PART, "Negative": "yes"}, |     "PTKNEG":   {POS: PART, "Polarity": "Neg"}, | ||||||
|     "PTKVZ":    {POS: PART, "PartType": "vbp"}, |     "PTKVZ":    {POS: PART, "PartType": "vbp"}, | ||||||
|     "PTKZU":    {POS: PART, "PartType": "inf"}, |     "PTKZU":    {POS: PART, "PartType": "inf"}, | ||||||
|     "PWAT":     {POS: DET, "PronType": "int"}, |     "PWAT":     {POS: DET, "PronType": "int"}, | ||||||
|  |  | ||||||
|  | @ -16,7 +16,7 @@ TAG_MAP = { | ||||||
|     "$":        {POS: SYM, "Other": {"SymType": "currency"}}, |     "$":        {POS: SYM, "Other": {"SymType": "currency"}}, | ||||||
|     "#":        {POS: SYM, "Other": {"SymType": "numbersign"}}, |     "#":        {POS: SYM, "Other": {"SymType": "numbersign"}}, | ||||||
|     "AFX":      {POS: ADJ,  "Hyph": "yes"}, |     "AFX":      {POS: ADJ,  "Hyph": "yes"}, | ||||||
|     "CC":       {POS: CONJ, "ConjType": "coor"}, |     "CC":       {POS: CCONJ, "ConjType": "coor"}, | ||||||
|     "CD":       {POS: NUM, "NumType": "card"}, |     "CD":       {POS: NUM, "NumType": "card"}, | ||||||
|     "DT":       {POS: DET}, |     "DT":       {POS: DET}, | ||||||
|     "EX":       {POS: ADV, "AdvType": "ex"}, |     "EX":       {POS: ADV, "AdvType": "ex"}, | ||||||
|  |  | ||||||
|  | @ -19,6 +19,7 @@ TAG_MAP = { | ||||||
|     "AUX":      {POS: AUX}, |     "AUX":      {POS: AUX}, | ||||||
|     "X":        {POS: X}, |     "X":        {POS: X}, | ||||||
|     "CONJ":     {POS: CONJ}, |     "CONJ":     {POS: CONJ}, | ||||||
|  |     "CCONJ":    {POS: CCONJ}, # U20 | ||||||
|     "ADJ":      {POS: ADJ}, |     "ADJ":      {POS: ADJ}, | ||||||
|     "VERB":     {POS: VERB}, |     "VERB":     {POS: VERB}, | ||||||
|     "PART":     {POS: PART} |     "PART":     {POS: PART} | ||||||
|  |  | ||||||
|  | @ -37,7 +37,7 @@ cdef class Morphology: | ||||||
|     cdef int assign_tag(self, TokenC* token, tag) except -1 |     cdef int assign_tag(self, TokenC* token, tag) except -1 | ||||||
| 
 | 
 | ||||||
|     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 |     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 | ||||||
|      | 
 | ||||||
|     cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1 |     cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -80,6 +80,7 @@ cpdef enum univ_morph_t: | ||||||
|     Definite_two |     Definite_two | ||||||
|     Definite_def |     Definite_def | ||||||
|     Definite_red |     Definite_red | ||||||
|  |     Definite_cons # U20 | ||||||
|     Definite_ind |     Definite_ind | ||||||
|     Degree_cmp |     Degree_cmp | ||||||
|     Degree_comp |     Degree_comp | ||||||
|  | @ -103,6 +104,8 @@ cpdef enum univ_morph_t: | ||||||
|     Negative_neg |     Negative_neg | ||||||
|     Negative_pos |     Negative_pos | ||||||
|     Negative_yes |     Negative_yes | ||||||
|  |     Polarity_neg # U20 | ||||||
|  |     Polarity_pos # U20 | ||||||
|     Number_com |     Number_com | ||||||
|     Number_dual |     Number_dual | ||||||
|     Number_none |     Number_none | ||||||
|  | @ -151,6 +154,7 @@ cpdef enum univ_morph_t: | ||||||
|     VerbForm_partPres |     VerbForm_partPres | ||||||
|     VerbForm_sup |     VerbForm_sup | ||||||
|     VerbForm_trans |     VerbForm_trans | ||||||
|  |     VerbForm_conv # U20 | ||||||
|     VerbForm_gdv # la |     VerbForm_gdv # la | ||||||
|     Voice_act |     Voice_act | ||||||
|     Voice_cau |     Voice_cau | ||||||
|  |  | ||||||
|  | @ -192,6 +192,7 @@ IDS = { | ||||||
|     "Definite_two": Definite_two, |     "Definite_two": Definite_two, | ||||||
|     "Definite_def": Definite_def, |     "Definite_def": Definite_def, | ||||||
|     "Definite_red": Definite_red, |     "Definite_red": Definite_red, | ||||||
|  |     "Definite_cons": Definite_cons, # U20 | ||||||
|     "Definite_ind": Definite_ind, |     "Definite_ind": Definite_ind, | ||||||
|     "Degree_cmp": Degree_cmp, |     "Degree_cmp": Degree_cmp, | ||||||
|     "Degree_comp": Degree_comp, |     "Degree_comp": Degree_comp, | ||||||
|  | @ -215,6 +216,8 @@ IDS = { | ||||||
|     "Negative_neg": Negative_neg, |     "Negative_neg": Negative_neg, | ||||||
|     "Negative_pos": Negative_pos, |     "Negative_pos": Negative_pos, | ||||||
|     "Negative_yes": Negative_yes, |     "Negative_yes": Negative_yes, | ||||||
|  |     "Polarity_neg": Polarity_neg, # U20 | ||||||
|  |     "Polarity_pos": Polarity_pos, # U20 | ||||||
|     "Number_com": Number_com, |     "Number_com": Number_com, | ||||||
|     "Number_dual": Number_dual, |     "Number_dual": Number_dual, | ||||||
|     "Number_none": Number_none, |     "Number_none": Number_none, | ||||||
|  | @ -263,6 +266,7 @@ IDS = { | ||||||
|     "VerbForm_partPres": VerbForm_partPres, |     "VerbForm_partPres": VerbForm_partPres, | ||||||
|     "VerbForm_sup": VerbForm_sup, |     "VerbForm_sup": VerbForm_sup, | ||||||
|     "VerbForm_trans": VerbForm_trans, |     "VerbForm_trans": VerbForm_trans, | ||||||
|  |     "VerbForm_conv": VerbForm_conv, # U20 | ||||||
|     "VerbForm_gdv ": VerbForm_gdv, # la, |     "VerbForm_gdv ": VerbForm_gdv, # la, | ||||||
|     "Voice_act": Voice_act, |     "Voice_act": Voice_act, | ||||||
|     "Voice_cau": Voice_cau, |     "Voice_cau": Voice_cau, | ||||||
|  |  | ||||||
|  | @ -7,6 +7,7 @@ cpdef enum univ_pos_t: | ||||||
|     ADV |     ADV | ||||||
|     AUX |     AUX | ||||||
|     CONJ |     CONJ | ||||||
|  |     CCONJ # U20 | ||||||
|     DET |     DET | ||||||
|     INTJ |     INTJ | ||||||
|     NOUN |     NOUN | ||||||
|  |  | ||||||
|  | @ -7,7 +7,8 @@ IDS = { | ||||||
|     "ADP": ADP, |     "ADP": ADP, | ||||||
|     "ADV": ADV, |     "ADV": ADV, | ||||||
|     "AUX": AUX, |     "AUX": AUX, | ||||||
|     "CONJ": CONJ, |     "CONJ": CONJ, # U20 | ||||||
|  |     "CCONJ": CCONJ, | ||||||
|     "DET": DET, |     "DET": DET, | ||||||
|     "INTJ": INTJ, |     "INTJ": INTJ, | ||||||
|     "NOUN": NOUN, |     "NOUN": NOUN, | ||||||
|  |  | ||||||
|  | @ -13,7 +13,7 @@ cpdef enum symbol_t: | ||||||
|     LIKE_EMAIL |     LIKE_EMAIL | ||||||
|     IS_STOP |     IS_STOP | ||||||
|     IS_OOV |     IS_OOV | ||||||
|      | 
 | ||||||
|     FLAG14 = 14 |     FLAG14 = 14 | ||||||
|     FLAG15 |     FLAG15 | ||||||
|     FLAG16 |     FLAG16 | ||||||
|  | @ -90,6 +90,7 @@ cpdef enum symbol_t: | ||||||
|     ADV |     ADV | ||||||
|     AUX |     AUX | ||||||
|     CONJ |     CONJ | ||||||
|  |     CCONJ # U20 | ||||||
|     DET |     DET | ||||||
|     INTJ |     INTJ | ||||||
|     NOUN |     NOUN | ||||||
|  | @ -107,11 +108,14 @@ cpdef enum symbol_t: | ||||||
| 
 | 
 | ||||||
|     Animacy_anim |     Animacy_anim | ||||||
|     Animacy_inam |     Animacy_inam | ||||||
|  |     Animacy_hum # U20 | ||||||
|     Aspect_freq |     Aspect_freq | ||||||
|     Aspect_imp |     Aspect_imp | ||||||
|     Aspect_mod |     Aspect_mod | ||||||
|     Aspect_none |     Aspect_none | ||||||
|     Aspect_perf |     Aspect_perf | ||||||
|  |     Aspect_iter # U20 | ||||||
|  |     Aspect_hab # U20 | ||||||
|     Case_abe |     Case_abe | ||||||
|     Case_abl |     Case_abl | ||||||
|     Case_abs |     Case_abs | ||||||
|  | @ -120,10 +124,12 @@ cpdef enum symbol_t: | ||||||
|     Case_all |     Case_all | ||||||
|     Case_cau |     Case_cau | ||||||
|     Case_com |     Case_com | ||||||
|  |     Case_cmp # U20 | ||||||
|     Case_dat |     Case_dat | ||||||
|     Case_del |     Case_del | ||||||
|     Case_dis |     Case_dis | ||||||
|     Case_ela |     Case_ela | ||||||
|  |     Case_equ # U20 | ||||||
|     Case_ess |     Case_ess | ||||||
|     Case_gen |     Case_gen | ||||||
|     Case_ill |     Case_ill | ||||||
|  | @ -142,7 +148,9 @@ cpdef enum symbol_t: | ||||||
|     Definite_two |     Definite_two | ||||||
|     Definite_def |     Definite_def | ||||||
|     Definite_red |     Definite_red | ||||||
|  |     Definite_cons # U20 | ||||||
|     Definite_ind |     Definite_ind | ||||||
|  |     Definite_spec # U20 | ||||||
|     Degree_cmp |     Degree_cmp | ||||||
|     Degree_comp |     Degree_comp | ||||||
|     Degree_none |     Degree_none | ||||||
|  | @ -151,6 +159,8 @@ cpdef enum symbol_t: | ||||||
|     Degree_abs |     Degree_abs | ||||||
|     Degree_com |     Degree_com | ||||||
|     Degree_dim # du |     Degree_dim # du | ||||||
|  |     Degree_equ # U20 | ||||||
|  |     Evident_nfh # U20 | ||||||
|     Gender_com |     Gender_com | ||||||
|     Gender_fem |     Gender_fem | ||||||
|     Gender_masc |     Gender_masc | ||||||
|  | @ -162,16 +172,21 @@ cpdef enum symbol_t: | ||||||
|     Mood_pot |     Mood_pot | ||||||
|     Mood_sub |     Mood_sub | ||||||
|     Mood_opt |     Mood_opt | ||||||
|  |     Mood_prp # U20 | ||||||
|  |     Mood_adm # U20 | ||||||
|     Negative_neg |     Negative_neg | ||||||
|     Negative_pos |     Negative_pos | ||||||
|     Negative_yes |     Negative_yes | ||||||
|  |     Polarity_neg # U20 | ||||||
|  |     Polarity_pos # U20 | ||||||
|     Number_com |     Number_com | ||||||
|     Number_dual |     Number_dual | ||||||
|     Number_none |     Number_none | ||||||
|     Number_plur |     Number_plur | ||||||
|     Number_sing |     Number_sing | ||||||
|     Number_ptan # bg |     Number_ptan # bg | ||||||
|     Number_count # bg |     Number_count # bg, U20 | ||||||
|  |     Number_tri # U20 | ||||||
|     NumType_card |     NumType_card | ||||||
|     NumType_dist |     NumType_dist | ||||||
|     NumType_frac |     NumType_frac | ||||||
|  | @ -197,7 +212,8 @@ cpdef enum symbol_t: | ||||||
|     PronType_rel |     PronType_rel | ||||||
|     PronType_tot |     PronType_tot | ||||||
|     PronType_clit |     PronType_clit | ||||||
|     PronType_exc # es, ca, it, fa |     PronType_exc # es, ca, it, fa, U20 | ||||||
|  |     PronType_emp # U20 | ||||||
|     Reflex_yes |     Reflex_yes | ||||||
|     Tense_fut |     Tense_fut | ||||||
|     Tense_imp |     Tense_imp | ||||||
|  | @ -213,12 +229,17 @@ cpdef enum symbol_t: | ||||||
|     VerbForm_partPres |     VerbForm_partPres | ||||||
|     VerbForm_sup |     VerbForm_sup | ||||||
|     VerbForm_trans |     VerbForm_trans | ||||||
|  |     VerbForm_conv # U20 | ||||||
|     VerbForm_gdv # la |     VerbForm_gdv # la | ||||||
|  |     VerbForm_vnoun # U20 | ||||||
|     Voice_act |     Voice_act | ||||||
|     Voice_cau |     Voice_cau | ||||||
|     Voice_pass |     Voice_pass | ||||||
|     Voice_mid # gkc |     Voice_mid # gkc, U20 | ||||||
|     Voice_int # hb |     Voice_int # hb | ||||||
|  |     Voice_antip # U20 | ||||||
|  |     Voice_dir # U20 | ||||||
|  |     Voice_inv # U20 | ||||||
|     Abbr_yes # cz, fi, sl, U |     Abbr_yes # cz, fi, sl, U | ||||||
|     AdpType_prep # cz, U |     AdpType_prep # cz, U | ||||||
|     AdpType_post # U |     AdpType_post # U | ||||||
|  | @ -284,6 +305,10 @@ cpdef enum symbol_t: | ||||||
|     Number_psee_plur # U |     Number_psee_plur # U | ||||||
|     Number_psor_sing # cz, fi, sl, U |     Number_psor_sing # cz, fi, sl, U | ||||||
|     Number_psor_plur # cz, fi, sl, U |     Number_psor_plur # cz, fi, sl, U | ||||||
|  |     Number_pauc # U20 | ||||||
|  |     Number_grpa # U20 | ||||||
|  |     Number_grpl # U20 | ||||||
|  |     Number_inv # U20 | ||||||
|     NumForm_digit # cz, sl, U |     NumForm_digit # cz, sl, U | ||||||
|     NumForm_roman # cz, sl, U |     NumForm_roman # cz, sl, U | ||||||
|     NumForm_word # cz, sl, U |     NumForm_word # cz, sl, U | ||||||
|  | @ -311,6 +336,8 @@ cpdef enum symbol_t: | ||||||
|     Person_psor_one # fi, U |     Person_psor_one # fi, U | ||||||
|     Person_psor_two # fi, U |     Person_psor_two # fi, U | ||||||
|     Person_psor_three # fi, U |     Person_psor_three # fi, U | ||||||
|  |     Person_zero # U20 | ||||||
|  |     Person_four # U20 | ||||||
|     Polite_inf # bq, U |     Polite_inf # bq, U | ||||||
|     Polite_pol # bq, U |     Polite_pol # bq, U | ||||||
|     Polite_abs_inf # bq, U |     Polite_abs_inf # bq, U | ||||||
|  | @ -319,6 +346,10 @@ cpdef enum symbol_t: | ||||||
|     Polite_erg_pol # bq, U |     Polite_erg_pol # bq, U | ||||||
|     Polite_dat_inf # bq, U |     Polite_dat_inf # bq, U | ||||||
|     Polite_dat_pol # bq, U |     Polite_dat_pol # bq, U | ||||||
|  |     Polite_infm # U20 | ||||||
|  |     Polite_form # U20 | ||||||
|  |     Polite_form_elev # U20 | ||||||
|  |     Polite_form_humb # U20 | ||||||
|     Prefix_yes # U |     Prefix_yes # U | ||||||
|     PrepCase_npr # cz |     PrepCase_npr # cz | ||||||
|     PrepCase_pre # U |     PrepCase_pre # U | ||||||
|  | @ -383,6 +414,7 @@ cpdef enum symbol_t: | ||||||
|     ccomp |     ccomp | ||||||
|     complm |     complm | ||||||
|     conj |     conj | ||||||
|  |     cop # U20 | ||||||
|     csubj |     csubj | ||||||
|     csubjpass |     csubjpass | ||||||
|     dep |     dep | ||||||
|  | @ -405,6 +437,8 @@ cpdef enum symbol_t: | ||||||
|     num |     num | ||||||
|     number |     number | ||||||
|     oprd |     oprd | ||||||
|  |     obj # U20 | ||||||
|  |     obl # U20 | ||||||
|     parataxis |     parataxis | ||||||
|     partmod |     partmod | ||||||
|     pcomp |     pcomp | ||||||
|  |  | ||||||
|  | @ -91,6 +91,7 @@ IDS = { | ||||||
|     "ADV": ADV, |     "ADV": ADV, | ||||||
|     "AUX": AUX, |     "AUX": AUX, | ||||||
|     "CONJ": CONJ, |     "CONJ": CONJ, | ||||||
|  |     "CCONJ": CCONJ, # U20 | ||||||
|     "DET": DET, |     "DET": DET, | ||||||
|     "INTJ": INTJ, |     "INTJ": INTJ, | ||||||
|     "NOUN": NOUN, |     "NOUN": NOUN, | ||||||
|  | @ -108,11 +109,14 @@ IDS = { | ||||||
| 
 | 
 | ||||||
|     "Animacy_anim": Animacy_anim, |     "Animacy_anim": Animacy_anim, | ||||||
|     "Animacy_inam": Animacy_inam, |     "Animacy_inam": Animacy_inam, | ||||||
|  |     "Animacy_hum": Animacy_hum, # U20 | ||||||
|     "Aspect_freq": Aspect_freq, |     "Aspect_freq": Aspect_freq, | ||||||
|     "Aspect_imp": Aspect_imp, |     "Aspect_imp": Aspect_imp, | ||||||
|     "Aspect_mod": Aspect_mod, |     "Aspect_mod": Aspect_mod, | ||||||
|     "Aspect_none": Aspect_none, |     "Aspect_none": Aspect_none, | ||||||
|     "Aspect_perf": Aspect_perf, |     "Aspect_perf": Aspect_perf, | ||||||
|  |     "Aspect_iter": Aspect_iter, # U20 | ||||||
|  |     "Aspect_hab": Aspect_hab, # U20 | ||||||
|     "Case_abe": Case_abe, |     "Case_abe": Case_abe, | ||||||
|     "Case_abl": Case_abl, |     "Case_abl": Case_abl, | ||||||
|     "Case_abs": Case_abs, |     "Case_abs": Case_abs, | ||||||
|  | @ -121,10 +125,12 @@ IDS = { | ||||||
|     "Case_all": Case_all, |     "Case_all": Case_all, | ||||||
|     "Case_cau": Case_cau, |     "Case_cau": Case_cau, | ||||||
|     "Case_com": Case_com, |     "Case_com": Case_com, | ||||||
|  |     "Case_cmp": Case_cmp, # U20 | ||||||
|     "Case_dat": Case_dat, |     "Case_dat": Case_dat, | ||||||
|     "Case_del": Case_del, |     "Case_del": Case_del, | ||||||
|     "Case_dis": Case_dis, |     "Case_dis": Case_dis, | ||||||
|     "Case_ela": Case_ela, |     "Case_ela": Case_ela, | ||||||
|  |     "Case_equ": Case_equ, # U20 | ||||||
|     "Case_ess": Case_ess, |     "Case_ess": Case_ess, | ||||||
|     "Case_gen": Case_gen, |     "Case_gen": Case_gen, | ||||||
|     "Case_ill": Case_ill, |     "Case_ill": Case_ill, | ||||||
|  | @ -143,7 +149,9 @@ IDS = { | ||||||
|     "Definite_two": Definite_two, |     "Definite_two": Definite_two, | ||||||
|     "Definite_def": Definite_def, |     "Definite_def": Definite_def, | ||||||
|     "Definite_red": Definite_red, |     "Definite_red": Definite_red, | ||||||
|  |     "Definite_cons": Definite_cons, # U20 | ||||||
|     "Definite_ind": Definite_ind, |     "Definite_ind": Definite_ind, | ||||||
|  |     "Definite_spec": Definite_spec, # U20 | ||||||
|     "Degree_cmp": Degree_cmp, |     "Degree_cmp": Degree_cmp, | ||||||
|     "Degree_comp": Degree_comp, |     "Degree_comp": Degree_comp, | ||||||
|     "Degree_none": Degree_none, |     "Degree_none": Degree_none, | ||||||
|  | @ -152,6 +160,8 @@ IDS = { | ||||||
|     "Degree_abs": Degree_abs, |     "Degree_abs": Degree_abs, | ||||||
|     "Degree_com": Degree_com, |     "Degree_com": Degree_com, | ||||||
|     "Degree_dim ": Degree_dim, # du |     "Degree_dim ": Degree_dim, # du | ||||||
|  |     "Degree_equ": Degree_equ, # U20 | ||||||
|  |     "Evident_nfh": Evident_nfh, # U20 | ||||||
|     "Gender_com": Gender_com, |     "Gender_com": Gender_com, | ||||||
|     "Gender_fem": Gender_fem, |     "Gender_fem": Gender_fem, | ||||||
|     "Gender_masc": Gender_masc, |     "Gender_masc": Gender_masc, | ||||||
|  | @ -163,16 +173,21 @@ IDS = { | ||||||
|     "Mood_pot": Mood_pot, |     "Mood_pot": Mood_pot, | ||||||
|     "Mood_sub": Mood_sub, |     "Mood_sub": Mood_sub, | ||||||
|     "Mood_opt": Mood_opt, |     "Mood_opt": Mood_opt, | ||||||
|  |     "Mood_prp": Mood_prp, # U20 | ||||||
|  |     "Mood_adm": Mood_adm, # U20 | ||||||
|     "Negative_neg": Negative_neg, |     "Negative_neg": Negative_neg, | ||||||
|     "Negative_pos": Negative_pos, |     "Negative_pos": Negative_pos, | ||||||
|     "Negative_yes": Negative_yes, |     "Negative_yes": Negative_yes, | ||||||
|  |     "Polarity_neg": Polarity_neg, # U20 | ||||||
|  |     "Polarity_pos": Polarity_pos, # U20 | ||||||
|     "Number_com": Number_com, |     "Number_com": Number_com, | ||||||
|     "Number_dual": Number_dual, |     "Number_dual": Number_dual, | ||||||
|     "Number_none": Number_none, |     "Number_none": Number_none, | ||||||
|     "Number_plur": Number_plur, |     "Number_plur": Number_plur, | ||||||
|     "Number_sing": Number_sing, |     "Number_sing": Number_sing, | ||||||
|     "Number_ptan ": Number_ptan, # bg |     "Number_ptan ": Number_ptan, # bg | ||||||
|     "Number_count ": Number_count, # bg |     "Number_count ": Number_count, # bg, U20 | ||||||
|  |     "Number_tri": Number_tri, # U20 | ||||||
|     "NumType_card": NumType_card, |     "NumType_card": NumType_card, | ||||||
|     "NumType_dist": NumType_dist, |     "NumType_dist": NumType_dist, | ||||||
|     "NumType_frac": NumType_frac, |     "NumType_frac": NumType_frac, | ||||||
|  | @ -198,7 +213,8 @@ IDS = { | ||||||
|     "PronType_rel": PronType_rel, |     "PronType_rel": PronType_rel, | ||||||
|     "PronType_tot": PronType_tot, |     "PronType_tot": PronType_tot, | ||||||
|     "PronType_clit": PronType_clit, |     "PronType_clit": PronType_clit, | ||||||
|     "PronType_exc ": PronType_exc, # es, ca, it, fa, |     "PronType_exc": PronType_exc, # es, ca, it, fa, U20 | ||||||
|  |     "PronType_emp": PronType_emp, # U20 | ||||||
|     "Reflex_yes": Reflex_yes, |     "Reflex_yes": Reflex_yes, | ||||||
|     "Tense_fut": Tense_fut, |     "Tense_fut": Tense_fut, | ||||||
|     "Tense_imp": Tense_imp, |     "Tense_imp": Tense_imp, | ||||||
|  | @ -214,12 +230,17 @@ IDS = { | ||||||
|     "VerbForm_partPres": VerbForm_partPres, |     "VerbForm_partPres": VerbForm_partPres, | ||||||
|     "VerbForm_sup": VerbForm_sup, |     "VerbForm_sup": VerbForm_sup, | ||||||
|     "VerbForm_trans": VerbForm_trans, |     "VerbForm_trans": VerbForm_trans, | ||||||
|  |     "VerbForm_conv": VerbForm_conv, # U20 | ||||||
|     "VerbForm_gdv ": VerbForm_gdv, # la, |     "VerbForm_gdv ": VerbForm_gdv, # la, | ||||||
|  |     "VerbForm_vnoun": VerbForm_vnoun, # U20 | ||||||
|     "Voice_act": Voice_act, |     "Voice_act": Voice_act, | ||||||
|     "Voice_cau": Voice_cau, |     "Voice_cau": Voice_cau, | ||||||
|     "Voice_pass": Voice_pass, |     "Voice_pass": Voice_pass, | ||||||
|     "Voice_mid ": Voice_mid, # gkc, |     "Voice_mid ": Voice_mid, # gkc, U20 | ||||||
|     "Voice_int ": Voice_int, # hb, |     "Voice_int ": Voice_int, # hb, | ||||||
|  |     "Voice_antip": Voice_antip, # U20 | ||||||
|  |     "Voice_dir": Voice_dir, # U20 | ||||||
|  |     "Voice_inv": Voice_inv, # U20 | ||||||
|     "Abbr_yes ": Abbr_yes, # cz, fi, sl, U, |     "Abbr_yes ": Abbr_yes, # cz, fi, sl, U, | ||||||
|     "AdpType_prep ": AdpType_prep, # cz, U, |     "AdpType_prep ": AdpType_prep, # cz, U, | ||||||
|     "AdpType_post ": AdpType_post, # U, |     "AdpType_post ": AdpType_post, # U, | ||||||
|  | @ -285,6 +306,10 @@ IDS = { | ||||||
|     "Number_psee_plur ": Number_psee_plur, # U, |     "Number_psee_plur ": Number_psee_plur, # U, | ||||||
|     "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U, |     "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U, | ||||||
|     "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U, |     "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U, | ||||||
|  |     "Number_pauc": Number_pauc, # U20 | ||||||
|  |     "Number_grpa": Number_grpa, # U20 | ||||||
|  |     "Number_grpl": Number_grpl, # U20 | ||||||
|  |     "Number_inv": Number_inv, # U20 | ||||||
|     "NumForm_digit ": NumForm_digit, # cz, sl, U, |     "NumForm_digit ": NumForm_digit, # cz, sl, U, | ||||||
|     "NumForm_roman ": NumForm_roman, # cz, sl, U, |     "NumForm_roman ": NumForm_roman, # cz, sl, U, | ||||||
|     "NumForm_word ": NumForm_word, # cz, sl, U, |     "NumForm_word ": NumForm_word, # cz, sl, U, | ||||||
|  | @ -312,6 +337,8 @@ IDS = { | ||||||
|     "Person_psor_one ": Person_psor_one, # fi, U, |     "Person_psor_one ": Person_psor_one, # fi, U, | ||||||
|     "Person_psor_two ": Person_psor_two, # fi, U, |     "Person_psor_two ": Person_psor_two, # fi, U, | ||||||
|     "Person_psor_three ": Person_psor_three, # fi, U, |     "Person_psor_three ": Person_psor_three, # fi, U, | ||||||
|  |     "Person_zero ": Person_zero, # U20 | ||||||
|  |     "Person_four ": Person_four, # U20 | ||||||
|     "Polite_inf ": Polite_inf, # bq, U, |     "Polite_inf ": Polite_inf, # bq, U, | ||||||
|     "Polite_pol ": Polite_pol, # bq, U, |     "Polite_pol ": Polite_pol, # bq, U, | ||||||
|     "Polite_abs_inf ": Polite_abs_inf, # bq, U, |     "Polite_abs_inf ": Polite_abs_inf, # bq, U, | ||||||
|  | @ -320,6 +347,10 @@ IDS = { | ||||||
|     "Polite_erg_pol ": Polite_erg_pol, # bq, U, |     "Polite_erg_pol ": Polite_erg_pol, # bq, U, | ||||||
|     "Polite_dat_inf ": Polite_dat_inf, # bq, U, |     "Polite_dat_inf ": Polite_dat_inf, # bq, U, | ||||||
|     "Polite_dat_pol ": Polite_dat_pol, # bq, U, |     "Polite_dat_pol ": Polite_dat_pol, # bq, U, | ||||||
|  |     "Polite_infm ": Polite_infm, # U20 | ||||||
|  |     "Polite_form ": Polite_form, # U20 | ||||||
|  |     "Polite_form_elev ": Polite_form_elev, # U20 | ||||||
|  |     "Polite_form_humb ": Polite_form_humb, # U20 | ||||||
|     "Prefix_yes ": Prefix_yes, # U, |     "Prefix_yes ": Prefix_yes, # U, | ||||||
|     "PrepCase_npr ": PrepCase_npr, # cz, |     "PrepCase_npr ": PrepCase_npr, # cz, | ||||||
|     "PrepCase_pre ": PrepCase_pre, # U, |     "PrepCase_pre ": PrepCase_pre, # U, | ||||||
|  | @ -384,6 +415,7 @@ IDS = { | ||||||
|     "ccomp": ccomp, |     "ccomp": ccomp, | ||||||
|     "complm": complm, |     "complm": complm, | ||||||
|     "conj": conj, |     "conj": conj, | ||||||
|  |     "cop": cop, # U20 | ||||||
|     "csubj": csubj, |     "csubj": csubj, | ||||||
|     "csubjpass": csubjpass, |     "csubjpass": csubjpass, | ||||||
|     "dep": dep, |     "dep": dep, | ||||||
|  | @ -406,6 +438,8 @@ IDS = { | ||||||
|     "num": num, |     "num": num, | ||||||
|     "number": number, |     "number": number, | ||||||
|     "oprd": oprd, |     "oprd": oprd, | ||||||
|  |     "obj": obj, # U20 | ||||||
|  |     "obl": obl, # U20 | ||||||
|     "parataxis": parataxis, |     "parataxis": parataxis, | ||||||
|     "partmod": partmod, |     "partmod": partmod, | ||||||
|     "pcomp": pcomp, |     "pcomp": pcomp, | ||||||
|  |  | ||||||
|  | @ -8,7 +8,7 @@ from spacy.attrs import DEP, HEAD | ||||||
| def ancestors(tokenid, heads): | def ancestors(tokenid, heads): | ||||||
|     # returns all words going from the word up the path to the root |     # returns all words going from the word up the path to the root | ||||||
|     # the path to root cannot be longer than the number of words in the sentence |     # the path to root cannot be longer than the number of words in the sentence | ||||||
|     # this function ends after at most len(heads) steps  |     # this function ends after at most len(heads) steps | ||||||
|     # because it would otherwise loop indefinitely on cycles |     # because it would otherwise loop indefinitely on cycles | ||||||
|     head = tokenid |     head = tokenid | ||||||
|     cnt = 0 |     cnt = 0 | ||||||
|  | @ -180,7 +180,7 @@ class PseudoProjectivity: | ||||||
|             next_queue = [] |             next_queue = [] | ||||||
|             for qtoken in queue: |             for qtoken in queue: | ||||||
|                 for child in qtoken.children: |                 for child in qtoken.children: | ||||||
|                     if child.is_space: continue                         |                     if child.is_space: continue | ||||||
|                     if child == token: continue |                     if child == token: continue | ||||||
|                     if child.dep_ == headlabel: |                     if child.dep_ == headlabel: | ||||||
|                         return child |                         return child | ||||||
|  |  | ||||||
|  | @ -13,13 +13,13 @@ from thinc.linalg cimport VecVec | ||||||
| from .typedefs cimport attr_t | from .typedefs cimport attr_t | ||||||
| from .tokens.doc cimport Doc | from .tokens.doc cimport Doc | ||||||
| from .attrs cimport TAG | from .attrs cimport TAG | ||||||
| from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON | from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON | ||||||
| from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE | from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE | ||||||
| from .gold cimport GoldParse | from .gold cimport GoldParse | ||||||
| 
 | 
 | ||||||
| from .attrs cimport * | from .attrs cimport * | ||||||
| 
 | 
 | ||||||
|   | 
 | ||||||
| cpdef enum: | cpdef enum: | ||||||
|     P2_orth |     P2_orth | ||||||
|     P2_cluster |     P2_cluster | ||||||
|  | @ -71,7 +71,7 @@ cpdef enum: | ||||||
| 
 | 
 | ||||||
| cdef class TaggerModel(AveragedPerceptron): | cdef class TaggerModel(AveragedPerceptron): | ||||||
|     cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *: |     cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *: | ||||||
|          | 
 | ||||||
|         _fill_from_token(&eg.atoms[P2_orth], &tokens[i-2]) |         _fill_from_token(&eg.atoms[P2_orth], &tokens[i-2]) | ||||||
|         _fill_from_token(&eg.atoms[P1_orth], &tokens[i-1]) |         _fill_from_token(&eg.atoms[P1_orth], &tokens[i-1]) | ||||||
|         _fill_from_token(&eg.atoms[W_orth], &tokens[i]) |         _fill_from_token(&eg.atoms[W_orth], &tokens[i]) | ||||||
|  | @ -191,7 +191,7 @@ cdef class Tagger: | ||||||
|                                   nr_class=self.vocab.morphology.n_tags, |                                   nr_class=self.vocab.morphology.n_tags, | ||||||
|                                   nr_feat=self.model.nr_feat) |                                   nr_feat=self.model.nr_feat) | ||||||
|         for i in range(tokens.length): |         for i in range(tokens.length): | ||||||
|             if tokens.c[i].pos == 0:                 |             if tokens.c[i].pos == 0: | ||||||
|                 self.model.set_featuresC(&eg.c, tokens.c, i) |                 self.model.set_featuresC(&eg.c, tokens.c, i) | ||||||
|                 self.model.set_scoresC(eg.c.scores, |                 self.model.set_scoresC(eg.c.scores, | ||||||
|                     eg.c.features, eg.c.nr_feat) |                     eg.c.features, eg.c.nr_feat) | ||||||
|  | @ -217,7 +217,7 @@ cdef class Tagger: | ||||||
|         for doc in stream: |         for doc in stream: | ||||||
|             self(doc) |             self(doc) | ||||||
|             yield doc |             yield doc | ||||||
|      | 
 | ||||||
|     def update(self, Doc tokens, GoldParse gold): |     def update(self, Doc tokens, GoldParse gold): | ||||||
|         """Update the statistical model, with tags supplied for the given document. |         """Update the statistical model, with tags supplied for the given document. | ||||||
| 
 | 
 | ||||||
|  | @ -251,7 +251,7 @@ cdef class Tagger: | ||||||
|             self.model.updateC(&eg.c) |             self.model.updateC(&eg.c) | ||||||
| 
 | 
 | ||||||
|             self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess) |             self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess) | ||||||
|              | 
 | ||||||
|             correct += eg.cost == 0 |             correct += eg.cost == 0 | ||||||
|             self.freqs[TAG][tokens.c[i].tag] += 1 |             self.freqs[TAG][tokens.c[i].tag] += 1 | ||||||
|             eg.fill_scores(0, eg.c.nr_class) |             eg.fill_scores(0, eg.c.nr_class) | ||||||
|  |  | ||||||
|  | @ -16,7 +16,7 @@ from ..typedefs cimport attr_t, flags_t | ||||||
| from ..attrs cimport attr_id_t | from ..attrs cimport attr_id_t | ||||||
| from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER | from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER | ||||||
| from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE | from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE | ||||||
| from ..parts_of_speech cimport CONJ, PUNCT, NOUN | from ..parts_of_speech cimport CCONJ, PUNCT, NOUN | ||||||
| from ..parts_of_speech cimport univ_pos_t | from ..parts_of_speech cimport univ_pos_t | ||||||
| from ..lexeme cimport Lexeme | from ..lexeme cimport Lexeme | ||||||
| from .span cimport Span | from .span cimport Span | ||||||
|  | @ -59,13 +59,13 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: | ||||||
| 
 | 
 | ||||||
| cdef class Doc: | cdef class Doc: | ||||||
|     """ |     """ | ||||||
|     A sequence of `Token` objects. Access sentences and named entities,  |     A sequence of `Token` objects. Access sentences and named entities, | ||||||
|     export annotations to numpy arrays, losslessly serialize to compressed  |     export annotations to numpy arrays, losslessly serialize to compressed | ||||||
|     binary strings. |     binary strings. | ||||||
| 
 | 
 | ||||||
|     Aside: Internals |     Aside: Internals | ||||||
|         The `Doc` object holds an array of `TokenC` structs.  |         The `Doc` object holds an array of `TokenC` structs. | ||||||
|         The Python-level `Token` and `Span` objects are views of this  |         The Python-level `Token` and `Span` objects are views of this | ||||||
|         array, i.e. they don't own the data themselves. |         array, i.e. they don't own the data themselves. | ||||||
| 
 | 
 | ||||||
|     Code: Construction 1 |     Code: Construction 1 | ||||||
|  | @ -80,13 +80,13 @@ cdef class Doc: | ||||||
|         Create a Doc object. |         Create a Doc object. | ||||||
| 
 | 
 | ||||||
|         Aside: Implementation |         Aside: Implementation | ||||||
|             This method of constructing a `Doc` object is usually only used  |             This method of constructing a `Doc` object is usually only used | ||||||
|             for deserialization. Standard usage is to construct the document via  |             for deserialization. Standard usage is to construct the document via | ||||||
|             a call to the language object. |             a call to the language object. | ||||||
| 
 | 
 | ||||||
|         Arguments: |         Arguments: | ||||||
|             vocab: |             vocab: | ||||||
|                 A Vocabulary object, which must match any models you want to  |                 A Vocabulary object, which must match any models you want to | ||||||
|                 use (e.g. tokenizer, parser, entity recognizer). |                 use (e.g. tokenizer, parser, entity recognizer). | ||||||
| 
 | 
 | ||||||
|             words: |             words: | ||||||
|  | @ -156,19 +156,19 @@ cdef class Doc: | ||||||
|         if self.length == 0: |         if self.length == 0: | ||||||
|             self.is_tagged = True |             self.is_tagged = True | ||||||
|             self.is_parsed = True |             self.is_parsed = True | ||||||
|      | 
 | ||||||
|     def __getitem__(self, object i): |     def __getitem__(self, object i): | ||||||
|         ''' |         ''' | ||||||
|         doc[i] |         doc[i] | ||||||
|             Get the Token object at position i, where i is an integer.  |             Get the Token object at position i, where i is an integer. | ||||||
|             Negative indexing is supported, and follows the usual Python  |             Negative indexing is supported, and follows the usual Python | ||||||
|             semantics, i.e. doc[-2] is doc[len(doc) - 2]. |             semantics, i.e. doc[-2] is doc[len(doc) - 2]. | ||||||
|         doc[start : end] |         doc[start : end] | ||||||
|             Get a `Span` object, starting at position `start` |             Get a `Span` object, starting at position `start` | ||||||
|             and ending at position `end`, where `start` and |             and ending at position `end`, where `start` and | ||||||
|             `end` are token indices. For instance, |             `end` are token indices. For instance, | ||||||
|             `doc[2:5]` produces a span consisting of  |             `doc[2:5]` produces a span consisting of | ||||||
|             tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`)  |             tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`) | ||||||
|             are not supported, as `Span` objects must be contiguous (cannot have gaps). |             are not supported, as `Span` objects must be contiguous (cannot have gaps). | ||||||
|             You can use negative indices and open-ended ranges, which have their |             You can use negative indices and open-ended ranges, which have their | ||||||
|             normal Python semantics. |             normal Python semantics. | ||||||
|  | @ -188,11 +188,11 @@ cdef class Doc: | ||||||
|     def __iter__(self): |     def __iter__(self): | ||||||
|         ''' |         ''' | ||||||
|         for token in doc |         for token in doc | ||||||
|             Iterate over `Token` objects, from which the annotations can  |             Iterate over `Token` objects, from which the annotations can | ||||||
|             be easily accessed. This is the main way of accessing Token  |             be easily accessed. This is the main way of accessing Token | ||||||
|             objects, which are the main way annotations are accessed from  |             objects, which are the main way annotations are accessed from | ||||||
|             Python. If faster-than-Python speeds are required, you can  |             Python. If faster-than-Python speeds are required, you can | ||||||
|             instead access the annotations as a numpy array, or access the  |             instead access the annotations as a numpy array, or access the | ||||||
|             underlying C data directly from Cython. |             underlying C data directly from Cython. | ||||||
|         ''' |         ''' | ||||||
|         cdef int i |         cdef int i | ||||||
|  | @ -251,13 +251,13 @@ cdef class Doc: | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             if 'has_vector' in self.user_hooks: |             if 'has_vector' in self.user_hooks: | ||||||
|                 return self.user_hooks['has_vector'](self) |                 return self.user_hooks['has_vector'](self) | ||||||
|   | 
 | ||||||
|             return any(token.has_vector for token in self) |             return any(token.has_vector for token in self) | ||||||
| 
 | 
 | ||||||
|     property vector: |     property vector: | ||||||
|         ''' |         ''' | ||||||
|         A real-valued meaning representation. Defaults to an average of the token vectors. |         A real-valued meaning representation. Defaults to an average of the token vectors. | ||||||
|          | 
 | ||||||
|         Type: numpy.ndarray[ndim=1, dtype='float32'] |         Type: numpy.ndarray[ndim=1, dtype='float32'] | ||||||
|         ''' |         ''' | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|  | @ -285,14 +285,14 @@ cdef class Doc: | ||||||
|                     norm += value * value |                     norm += value * value | ||||||
|                 self._vector_norm = sqrt(norm) if norm != 0 else 0 |                 self._vector_norm = sqrt(norm) if norm != 0 else 0 | ||||||
|             return self._vector_norm |             return self._vector_norm | ||||||
|          | 
 | ||||||
|         def __set__(self, value): |         def __set__(self, value): | ||||||
|             self._vector_norm = value  |             self._vector_norm = value | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|     def string(self): |     def string(self): | ||||||
|         return self.text |         return self.text | ||||||
|      | 
 | ||||||
|     property text: |     property text: | ||||||
|         '''A unicode representation of the document text.''' |         '''A unicode representation of the document text.''' | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|  | @ -306,7 +306,7 @@ cdef class Doc: | ||||||
|     property ents: |     property ents: | ||||||
|         ''' |         ''' | ||||||
|         Yields named-entity `Span` objects, if the entity recognizer |         Yields named-entity `Span` objects, if the entity recognizer | ||||||
|         has been applied to the document. Iterate over the span to get  |         has been applied to the document. Iterate over the span to get | ||||||
|         individual Token objects, or access the label: |         individual Token objects, or access the label: | ||||||
| 
 | 
 | ||||||
|         Example: |         Example: | ||||||
|  | @ -352,7 +352,7 @@ cdef class Doc: | ||||||
|             cdef int i |             cdef int i | ||||||
|             for i in range(self.length): |             for i in range(self.length): | ||||||
|                 self.c[i].ent_type = 0 |                 self.c[i].ent_type = 0 | ||||||
|                 # At this point we don't know whether the NER has run over the  |                 # At this point we don't know whether the NER has run over the | ||||||
|                 # Doc. If the ent_iob is missing, leave it missing. |                 # Doc. If the ent_iob is missing, leave it missing. | ||||||
|                 if self.c[i].ent_iob != 0: |                 if self.c[i].ent_iob != 0: | ||||||
|                     self.c[i].ent_iob = 2 # Means O. Non-O are set from ents. |                     self.c[i].ent_iob = 2 # Means O. Non-O are set from ents. | ||||||
|  | @ -384,9 +384,9 @@ cdef class Doc: | ||||||
|     property noun_chunks: |     property noun_chunks: | ||||||
|         ''' |         ''' | ||||||
|         Yields base noun-phrase #[code Span] objects, if the document |         Yields base noun-phrase #[code Span] objects, if the document | ||||||
|         has been syntactically parsed. A base noun phrase, or  |         has been syntactically parsed. A base noun phrase, or | ||||||
|         'NP chunk', is a noun phrase that does not permit other NPs to  |         'NP chunk', is a noun phrase that does not permit other NPs to | ||||||
|         be nested within it – so no NP-level coordination, no prepositional  |         be nested within it – so no NP-level coordination, no prepositional | ||||||
|         phrases, and no relative clauses. For example: |         phrases, and no relative clauses. For example: | ||||||
|         ''' |         ''' | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|  | @ -422,7 +422,7 @@ cdef class Doc: | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             if 'sents' in self.user_hooks: |             if 'sents' in self.user_hooks: | ||||||
|                 return self.user_hooks['sents'](self) |                 return self.user_hooks['sents'](self) | ||||||
|   | 
 | ||||||
|             if not self.is_parsed: |             if not self.is_parsed: | ||||||
|                 raise ValueError( |                 raise ValueError( | ||||||
|                     "sentence boundary detection requires the dependency parse, which " |                     "sentence boundary detection requires the dependency parse, which " | ||||||
|  | @ -465,8 +465,8 @@ cdef class Doc: | ||||||
|     @cython.boundscheck(False) |     @cython.boundscheck(False) | ||||||
|     cpdef np.ndarray to_array(self, object py_attr_ids): |     cpdef np.ndarray to_array(self, object py_attr_ids): | ||||||
|         """ |         """ | ||||||
|         Given a list of M attribute IDs, export the tokens to a numpy  |         Given a list of M attribute IDs, export the tokens to a numpy | ||||||
|         `ndarray` of shape (N, M), where `N` is the length  |         `ndarray` of shape (N, M), where `N` is the length | ||||||
|         of the document. The values will be 32-bit integers. |         of the document. The values will be 32-bit integers. | ||||||
| 
 | 
 | ||||||
|         Example: |         Example: | ||||||
|  | @ -474,7 +474,7 @@ cdef class Doc: | ||||||
|             doc = nlp(text) |             doc = nlp(text) | ||||||
|             # All strings mapped to integers, for easy export to numpy |             # All strings mapped to integers, for easy export to numpy | ||||||
|             np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA]) |             np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA]) | ||||||
|                  | 
 | ||||||
|         Arguments: |         Arguments: | ||||||
|             attr_ids (list[int]): A list of attribute ID ints. |             attr_ids (list[int]): A list of attribute ID ints. | ||||||
| 
 | 
 | ||||||
|  | @ -520,7 +520,7 @@ cdef class Doc: | ||||||
|         cdef int i |         cdef int i | ||||||
|         cdef attr_t attr |         cdef attr_t attr | ||||||
|         cdef size_t count |         cdef size_t count | ||||||
|          | 
 | ||||||
|         if counts is None: |         if counts is None: | ||||||
|             counts = PreshCounter() |             counts = PreshCounter() | ||||||
|             output_dict = True |             output_dict = True | ||||||
|  | @ -570,7 +570,7 @@ cdef class Doc: | ||||||
|         cdef TokenC* tokens = self.c |         cdef TokenC* tokens = self.c | ||||||
|         cdef int length = len(array) |         cdef int length = len(array) | ||||||
|         cdef attr_t[:] values |         cdef attr_t[:] values | ||||||
|         for col, attr_id in enumerate(attrs):  |         for col, attr_id in enumerate(attrs): | ||||||
|             values = array[:, col] |             values = array[:, col] | ||||||
|             if attr_id == HEAD: |             if attr_id == HEAD: | ||||||
|                 for i in range(length): |                 for i in range(length): | ||||||
|  | @ -612,11 +612,11 @@ cdef class Doc: | ||||||
|         '''Deserialize, loading from bytes.''' |         '''Deserialize, loading from bytes.''' | ||||||
|         self.vocab.serializer.unpack_into(data[4:], self) |         self.vocab.serializer.unpack_into(data[4:], self) | ||||||
|         return self |         return self | ||||||
|      | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def read_bytes(file_): |     def read_bytes(file_): | ||||||
|         ''' |         ''' | ||||||
|         A static method, used to read serialized #[code Doc] objects from  |         A static method, used to read serialized #[code Doc] objects from | ||||||
|         a file. For example: |         a file. For example: | ||||||
| 
 | 
 | ||||||
|         Example: |         Example: | ||||||
|  | @ -673,7 +673,7 @@ cdef class Doc: | ||||||
|                 "Expected either 3 arguments (deprecated), or 0 (use keyword arguments). " |                 "Expected either 3 arguments (deprecated), or 0 (use keyword arguments). " | ||||||
|                 "Arguments supplied:\n%s\n" |                 "Arguments supplied:\n%s\n" | ||||||
|                 "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes))) |                 "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes))) | ||||||
|   | 
 | ||||||
|         cdef int start = token_by_start(self.c, self.length, start_idx) |         cdef int start = token_by_start(self.c, self.length, start_idx) | ||||||
|         if start == -1: |         if start == -1: | ||||||
|             return None |             return None | ||||||
|  | @ -784,7 +784,7 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: | ||||||
|             if child.l_edge < head.l_edge: |             if child.l_edge < head.l_edge: | ||||||
|                 head.l_edge = child.l_edge |                 head.l_edge = child.l_edge | ||||||
|             head.l_kids += 1 |             head.l_kids += 1 | ||||||
|          | 
 | ||||||
|     # Set right edges --- same as above, but iterate in reverse |     # Set right edges --- same as above, but iterate in reverse | ||||||
|     for i in range(length-1, -1, -1): |     for i in range(length-1, -1, -1): | ||||||
|         child = &tokens[i] |         child = &tokens[i] | ||||||
|  | @ -798,4 +798,4 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: | ||||||
|     for i in range(length): |     for i in range(length): | ||||||
|         if tokens[i].head == 0 and tokens[i].dep != 0: |         if tokens[i].head == 0 and tokens[i].dep != 0: | ||||||
|             tokens[tokens[i].l_edge].sent_start = True |             tokens[tokens[i].l_edge].sent_start = True | ||||||
|              | 
 | ||||||
|  |  | ||||||
|  | @ -20,7 +20,7 @@ from .. import parts_of_speech | ||||||
| from ..attrs cimport LEMMA | from ..attrs cimport LEMMA | ||||||
| from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER | from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER | ||||||
| from ..attrs cimport POS, LEMMA, TAG, DEP | from ..attrs cimport POS, LEMMA, TAG, DEP | ||||||
| from ..parts_of_speech cimport CONJ, PUNCT | from ..parts_of_speech cimport CCONJ, PUNCT | ||||||
| 
 | 
 | ||||||
| from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE | from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE | ||||||
| from ..attrs cimport IS_BRACKET | from ..attrs cimport IS_BRACKET | ||||||
|  | @ -84,7 +84,7 @@ cdef class Token: | ||||||
| 
 | 
 | ||||||
|     cpdef bint check_flag(self, attr_id_t flag_id) except -1: |     cpdef bint check_flag(self, attr_id_t flag_id) except -1: | ||||||
|         '''Check the value of a boolean flag. |         '''Check the value of a boolean flag. | ||||||
|          | 
 | ||||||
|         Arguments: |         Arguments: | ||||||
|             flag_id (int): The ID of the flag attribute. |             flag_id (int): The ID of the flag attribute. | ||||||
|         Returns: |         Returns: | ||||||
|  | @ -225,7 +225,7 @@ cdef class Token: | ||||||
|     property vector: |     property vector: | ||||||
|         ''' |         ''' | ||||||
|         A real-valued meaning representation. |         A real-valued meaning representation. | ||||||
|          | 
 | ||||||
|         Type: numpy.ndarray[ndim=1, dtype='float32'] |         Type: numpy.ndarray[ndim=1, dtype='float32'] | ||||||
|         ''' |         ''' | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|  | @ -343,7 +343,7 @@ cdef class Token: | ||||||
|         ''' |         ''' | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             cdef const TokenC* head_ptr = self.c |             cdef const TokenC* head_ptr = self.c | ||||||
|             # guard against infinite loop, no token can have  |             # guard against infinite loop, no token can have | ||||||
|             # more ancestors than tokens in the tree |             # more ancestors than tokens in the tree | ||||||
|             cdef int i = 0 |             cdef int i = 0 | ||||||
|             while head_ptr.head != 0 and i < self.doc.length: |             while head_ptr.head != 0 and i < self.doc.length: | ||||||
|  | @ -370,7 +370,7 @@ cdef class Token: | ||||||
| 
 | 
 | ||||||
|     property head: |     property head: | ||||||
|         '''The syntactic parent, or "governor", of this token. |         '''The syntactic parent, or "governor", of this token. | ||||||
|          | 
 | ||||||
|         Returns: Token |         Returns: Token | ||||||
|         ''' |         ''' | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|  | @ -390,7 +390,7 @@ cdef class Token: | ||||||
| 
 | 
 | ||||||
|             # is the new head a descendant of the old head |             # is the new head a descendant of the old head | ||||||
|             cdef bint is_desc = old_head.is_ancestor_of(new_head) |             cdef bint is_desc = old_head.is_ancestor_of(new_head) | ||||||
|              | 
 | ||||||
|             cdef int new_edge |             cdef int new_edge | ||||||
|             cdef Token anc, child |             cdef Token anc, child | ||||||
| 
 | 
 | ||||||
|  | @ -420,7 +420,7 @@ cdef class Token: | ||||||
|                         if anc.c.l_edge <= new_edge: |                         if anc.c.l_edge <= new_edge: | ||||||
|                             break |                             break | ||||||
|                         anc.c.l_edge = new_edge |                         anc.c.l_edge = new_edge | ||||||
|              | 
 | ||||||
|             elif self.c.head < 0: # right dependent |             elif self.c.head < 0: # right dependent | ||||||
|                 old_head.c.r_kids -= 1 |                 old_head.c.r_kids -= 1 | ||||||
|                 # do the same thing as for l_edge |                 # do the same thing as for l_edge | ||||||
|  | @ -435,7 +435,7 @@ cdef class Token: | ||||||
|                             if child.c.r_edge > new_edge: |                             if child.c.r_edge > new_edge: | ||||||
|                                 new_edge = child.c.r_edge |                                 new_edge = child.c.r_edge | ||||||
|                         old_head.c.r_edge = new_edge |                         old_head.c.r_edge = new_edge | ||||||
|                      | 
 | ||||||
|                     for anc in old_head.ancestors: |                     for anc in old_head.ancestors: | ||||||
|                         if anc.c.r_edge >= new_edge: |                         if anc.c.r_edge >= new_edge: | ||||||
|                             break |                             break | ||||||
|  | @ -598,19 +598,19 @@ cdef class Token: | ||||||
|     property is_punct: |     property is_punct: | ||||||
|         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT) |         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT) | ||||||
| 
 | 
 | ||||||
|     property is_space:  |     property is_space: | ||||||
|         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE) |         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE) | ||||||
|      | 
 | ||||||
|     property is_bracket:  |     property is_bracket: | ||||||
|         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET) |         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET) | ||||||
| 
 | 
 | ||||||
|     property is_quote:  |     property is_quote: | ||||||
|         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE) |         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE) | ||||||
| 
 | 
 | ||||||
|     property is_left_punct:  |     property is_left_punct: | ||||||
|         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT) |         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT) | ||||||
| 
 | 
 | ||||||
|     property is_right_punct:  |     property is_right_punct: | ||||||
|         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT) |         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT) | ||||||
| 
 | 
 | ||||||
|     property like_url: |     property like_url: | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user