Add support for Universal Dependencies v2.0

Roman Inflianskas 2017-02-26 22:27:11 +01:00
parent 8dff040032
commit 66e1109b53
14 changed files with 155 additions and 74 deletions

View File

@@ -93,7 +93,7 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
 def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
     '''Normalize a dictionary of attributes, converting them to ints.

     Arguments:
         stringy_attrs (dict):
             Dictionary keyed by attribute string names. Values can be ints or strings.
@@ -125,7 +125,9 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
             'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
             'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
             'Number', 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
-            'Reflex', 'Negative', 'Mood', 'Aspect', 'Case']
+            'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
+            'Polarity', # U20
+        ]
         for key in morph_keys:
             if key in stringy_attrs:
                 stringy_attrs.pop(key)
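For reference, `intify_attrs` is the helper this hunk extends: morphological features that the attribute system cannot store, now including the UD v2.0 `Polarity` feature, are popped from the input dict instead of raising. A minimal usage sketch, assuming a build that includes this commit (the example dict is illustrative):

    # Sketch: 'Polarity' is on the morph_keys list, so it is dropped silently.
    from spacy.attrs import ORTH, intify_attrs

    stringy = {'ORTH': u'nicht', 'Polarity': 'Neg'}   # 'Polarity' is new in UD v2.0
    normalized = intify_attrs(stringy)

    # Only the known attribute survives, keyed by its int ID.
    assert list(normalized.keys()) == [ORTH]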

View File

@@ -41,7 +41,7 @@ TAG_MAP = {
     "PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"},
     "PTKA": {POS: PART},
     "PTKANT": {POS: PART, "PartType": "res"},
-    "PTKNEG": {POS: PART, "Negative": "yes"},
+    "PTKNEG": {POS: PART, "Polarity": "Neg"},
     "PTKVZ": {POS: PART, "PartType": "vbp"},
     "PTKZU": {POS: PART, "PartType": "inf"},
     "PWAT": {POS: DET, "PronType": "int"},

View File

@@ -16,7 +16,7 @@ TAG_MAP = {
     "$": {POS: SYM, "Other": {"SymType": "currency"}},
     "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
     "AFX": {POS: ADJ, "Hyph": "yes"},
-    "CC": {POS: CONJ, "ConjType": "coor"},
+    "CC": {POS: CCONJ, "ConjType": "coor"},
     "CD": {POS: NUM, "NumType": "card"},
     "DT": {POS: DET},
     "EX": {POS: ADV, "AdvType": "ex"},

View File

@@ -19,6 +19,7 @@ TAG_MAP = {
     "AUX": {POS: AUX},
     "X": {POS: X},
     "CONJ": {POS: CONJ},
+    "CCONJ": {POS: CCONJ}, # U20
     "ADJ": {POS: ADJ},
     "VERB": {POS: VERB},
     "PART": {POS: PART}

View File

@@ -37,7 +37,7 @@ cdef class Morphology:
     cdef int assign_tag(self, TokenC* token, tag) except -1
     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
     cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
@@ -80,6 +80,7 @@ cpdef enum univ_morph_t:
     Definite_two
     Definite_def
     Definite_red
+    Definite_cons # U20
     Definite_ind
     Degree_cmp
     Degree_comp
@@ -103,6 +104,8 @@ cpdef enum univ_morph_t:
     Negative_neg
     Negative_pos
     Negative_yes
+    Polarity_neg # U20
+    Polarity_pos # U20
     Number_com
     Number_dual
     Number_none
@@ -151,6 +154,7 @@ cpdef enum univ_morph_t:
     VerbForm_partPres
     VerbForm_sup
     VerbForm_trans
+    VerbForm_conv # U20
     VerbForm_gdv # la
     Voice_act
     Voice_cau

View File

@@ -192,6 +192,7 @@ IDS = {
     "Definite_two": Definite_two,
     "Definite_def": Definite_def,
     "Definite_red": Definite_red,
+    "Definite_cons": Definite_cons, # U20
     "Definite_ind": Definite_ind,
     "Degree_cmp": Degree_cmp,
     "Degree_comp": Degree_comp,
@@ -215,6 +216,8 @@ IDS = {
     "Negative_neg": Negative_neg,
     "Negative_pos": Negative_pos,
     "Negative_yes": Negative_yes,
+    "Polarity_neg": Polarity_neg, # U20
+    "Polarity_pos": Polarity_pos, # U20
     "Number_com": Number_com,
     "Number_dual": Number_dual,
     "Number_none": Number_none,
@@ -263,6 +266,7 @@ IDS = {
     "VerbForm_partPres": VerbForm_partPres,
     "VerbForm_sup": VerbForm_sup,
     "VerbForm_trans": VerbForm_trans,
+    "VerbForm_conv": VerbForm_conv, # U20
     "VerbForm_gdv ": VerbForm_gdv, # la,
     "Voice_act": Voice_act,
     "Voice_cau": Voice_cau,

View File

@@ -7,6 +7,7 @@ cpdef enum univ_pos_t:
     ADV
     AUX
     CONJ
+    CCONJ # U20
     DET
     INTJ
     NOUN

View File

@@ -7,7 +7,8 @@ IDS = {
     "ADP": ADP,
     "ADV": ADV,
     "AUX": AUX,
-    "CONJ": CONJ,
+    "CONJ": CONJ, # U20
+    "CCONJ": CCONJ,
     "DET": DET,
     "INTJ": INTJ,
     "NOUN": NOUN,

View File

@@ -13,7 +13,7 @@ cpdef enum symbol_t:
     LIKE_EMAIL
     IS_STOP
     IS_OOV
     FLAG14 = 14
     FLAG15
     FLAG16
@@ -90,6 +90,7 @@ cpdef enum symbol_t:
     ADV
     AUX
     CONJ
+    CCONJ # U20
     DET
     INTJ
     NOUN
@@ -107,11 +108,14 @@ cpdef enum symbol_t:
     Animacy_anim
     Animacy_inam
+    Animacy_hum # U20
     Aspect_freq
     Aspect_imp
     Aspect_mod
     Aspect_none
     Aspect_perf
+    Aspect_iter # U20
+    Aspect_hab # U20
     Case_abe
     Case_abl
     Case_abs
@@ -120,10 +124,12 @@ cpdef enum symbol_t:
     Case_all
     Case_cau
     Case_com
+    Case_cmp # U20
     Case_dat
     Case_del
     Case_dis
     Case_ela
+    Case_equ # U20
     Case_ess
     Case_gen
     Case_ill
@@ -142,7 +148,9 @@ cpdef enum symbol_t:
     Definite_two
     Definite_def
     Definite_red
+    Definite_cons # U20
     Definite_ind
+    Definite_spec # U20
     Degree_cmp
     Degree_comp
     Degree_none
@@ -151,6 +159,8 @@ cpdef enum symbol_t:
     Degree_abs
     Degree_com
     Degree_dim # du
+    Degree_equ # U20
+    Evident_nfh # U20
     Gender_com
     Gender_fem
     Gender_masc
@@ -162,16 +172,21 @@ cpdef enum symbol_t:
     Mood_pot
     Mood_sub
     Mood_opt
+    Mood_prp # U20
+    Mood_adm # U20
     Negative_neg
     Negative_pos
     Negative_yes
+    Polarity_neg # U20
+    Polarity_pos # U20
     Number_com
     Number_dual
     Number_none
     Number_plur
     Number_sing
     Number_ptan # bg
-    Number_count # bg
+    Number_count # bg, U20
+    Number_tri # U20
     NumType_card
     NumType_dist
     NumType_frac
@@ -197,7 +212,8 @@ cpdef enum symbol_t:
     PronType_rel
     PronType_tot
     PronType_clit
-    PronType_exc # es, ca, it, fa
+    PronType_exc # es, ca, it, fa, U20
+    PronType_emp # U20
     Reflex_yes
     Tense_fut
     Tense_imp
@@ -213,12 +229,17 @@ cpdef enum symbol_t:
     VerbForm_partPres
     VerbForm_sup
     VerbForm_trans
+    VerbForm_conv # U20
     VerbForm_gdv # la
+    VerbForm_vnoun # U20
     Voice_act
     Voice_cau
     Voice_pass
-    Voice_mid # gkc
+    Voice_mid # gkc, U20
     Voice_int # hb
+    Voice_antip # U20
+    Voice_dir # U20
+    Voice_inv # U20
     Abbr_yes # cz, fi, sl, U
     AdpType_prep # cz, U
     AdpType_post # U
@@ -284,6 +305,10 @@ cpdef enum symbol_t:
     Number_psee_plur # U
     Number_psor_sing # cz, fi, sl, U
     Number_psor_plur # cz, fi, sl, U
+    Number_pauc # U20
+    Number_grpa # U20
+    Number_grpl # U20
+    Number_inv # U20
     NumForm_digit # cz, sl, U
     NumForm_roman # cz, sl, U
     NumForm_word # cz, sl, U
@@ -311,6 +336,8 @@ cpdef enum symbol_t:
     Person_psor_one # fi, U
     Person_psor_two # fi, U
     Person_psor_three # fi, U
+    Person_zero # U20
+    Person_four # U20
     Polite_inf # bq, U
     Polite_pol # bq, U
     Polite_abs_inf # bq, U
@@ -319,6 +346,10 @@ cpdef enum symbol_t:
     Polite_erg_pol # bq, U
     Polite_dat_inf # bq, U
     Polite_dat_pol # bq, U
+    Polite_infm # U20
+    Polite_form # U20
+    Polite_form_elev # U20
+    Polite_form_humb # U20
     Prefix_yes # U
     PrepCase_npr # cz
     PrepCase_pre # U
@@ -383,6 +414,7 @@ cpdef enum symbol_t:
     ccomp
     complm
     conj
+    cop # U20
     csubj
     csubjpass
     dep
@@ -405,6 +437,8 @@ cpdef enum symbol_t:
     num
     number
     oprd
+    obj # U20
+    obl # U20
     parataxis
     partmod
     pcomp

View File

@@ -91,6 +91,7 @@ IDS = {
     "ADV": ADV,
     "AUX": AUX,
     "CONJ": CONJ,
+    "CCONJ": CCONJ, # U20
     "DET": DET,
     "INTJ": INTJ,
     "NOUN": NOUN,
@@ -108,11 +109,14 @@ IDS = {
     "Animacy_anim": Animacy_anim,
     "Animacy_inam": Animacy_inam,
+    "Animacy_hum": Animacy_hum, # U20
     "Aspect_freq": Aspect_freq,
     "Aspect_imp": Aspect_imp,
     "Aspect_mod": Aspect_mod,
     "Aspect_none": Aspect_none,
     "Aspect_perf": Aspect_perf,
+    "Aspect_iter": Aspect_iter, # U20
+    "Aspect_hab": Aspect_hab, # U20
     "Case_abe": Case_abe,
     "Case_abl": Case_abl,
     "Case_abs": Case_abs,
@@ -121,10 +125,12 @@ IDS = {
     "Case_all": Case_all,
     "Case_cau": Case_cau,
     "Case_com": Case_com,
+    "Case_cmp": Case_cmp, # U20
     "Case_dat": Case_dat,
     "Case_del": Case_del,
     "Case_dis": Case_dis,
     "Case_ela": Case_ela,
+    "Case_equ": Case_equ, # U20
     "Case_ess": Case_ess,
     "Case_gen": Case_gen,
     "Case_ill": Case_ill,
@@ -143,7 +149,9 @@ IDS = {
     "Definite_two": Definite_two,
     "Definite_def": Definite_def,
     "Definite_red": Definite_red,
+    "Definite_cons": Definite_cons, # U20
     "Definite_ind": Definite_ind,
+    "Definite_spec": Definite_spec, # U20
     "Degree_cmp": Degree_cmp,
     "Degree_comp": Degree_comp,
     "Degree_none": Degree_none,
@@ -152,6 +160,8 @@ IDS = {
     "Degree_abs": Degree_abs,
     "Degree_com": Degree_com,
     "Degree_dim ": Degree_dim, # du
+    "Degree_equ": Degree_equ, # U20
+    "Evident_nfh": Evident_nfh, # U20
     "Gender_com": Gender_com,
     "Gender_fem": Gender_fem,
     "Gender_masc": Gender_masc,
@@ -163,16 +173,21 @@ IDS = {
     "Mood_pot": Mood_pot,
     "Mood_sub": Mood_sub,
     "Mood_opt": Mood_opt,
+    "Mood_prp": Mood_prp, # U20
+    "Mood_adm": Mood_adm, # U20
     "Negative_neg": Negative_neg,
     "Negative_pos": Negative_pos,
     "Negative_yes": Negative_yes,
+    "Polarity_neg": Polarity_neg, # U20
+    "Polarity_pos": Polarity_pos, # U20
     "Number_com": Number_com,
     "Number_dual": Number_dual,
     "Number_none": Number_none,
     "Number_plur": Number_plur,
     "Number_sing": Number_sing,
     "Number_ptan ": Number_ptan, # bg
-    "Number_count ": Number_count, # bg
+    "Number_count ": Number_count, # bg, U20
+    "Number_tri": Number_tri, # U20
     "NumType_card": NumType_card,
     "NumType_dist": NumType_dist,
     "NumType_frac": NumType_frac,
@@ -198,7 +213,8 @@ IDS = {
     "PronType_rel": PronType_rel,
     "PronType_tot": PronType_tot,
     "PronType_clit": PronType_clit,
-    "PronType_exc ": PronType_exc, # es, ca, it, fa,
+    "PronType_exc": PronType_exc, # es, ca, it, fa, U20
+    "PronType_emp": PronType_emp, # U20
     "Reflex_yes": Reflex_yes,
     "Tense_fut": Tense_fut,
     "Tense_imp": Tense_imp,
@@ -214,12 +230,17 @@ IDS = {
     "VerbForm_partPres": VerbForm_partPres,
     "VerbForm_sup": VerbForm_sup,
     "VerbForm_trans": VerbForm_trans,
+    "VerbForm_conv": VerbForm_conv, # U20
     "VerbForm_gdv ": VerbForm_gdv, # la,
+    "VerbForm_vnoun": VerbForm_vnoun, # U20
     "Voice_act": Voice_act,
     "Voice_cau": Voice_cau,
     "Voice_pass": Voice_pass,
-    "Voice_mid ": Voice_mid, # gkc,
+    "Voice_mid ": Voice_mid, # gkc, U20
     "Voice_int ": Voice_int, # hb,
+    "Voice_antip": Voice_antip, # U20
+    "Voice_dir": Voice_dir, # U20
+    "Voice_inv": Voice_inv, # U20
     "Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
     "AdpType_prep ": AdpType_prep, # cz, U,
     "AdpType_post ": AdpType_post, # U,
@@ -285,6 +306,10 @@ IDS = {
     "Number_psee_plur ": Number_psee_plur, # U,
     "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
     "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
+    "Number_pauc": Number_pauc, # U20
+    "Number_grpa": Number_grpa, # U20
+    "Number_grpl": Number_grpl, # U20
+    "Number_inv": Number_inv, # U20
     "NumForm_digit ": NumForm_digit, # cz, sl, U,
     "NumForm_roman ": NumForm_roman, # cz, sl, U,
     "NumForm_word ": NumForm_word, # cz, sl, U,
@@ -312,6 +337,8 @@ IDS = {
     "Person_psor_one ": Person_psor_one, # fi, U,
     "Person_psor_two ": Person_psor_two, # fi, U,
     "Person_psor_three ": Person_psor_three, # fi, U,
+    "Person_zero ": Person_zero, # U20
+    "Person_four ": Person_four, # U20
     "Polite_inf ": Polite_inf, # bq, U,
     "Polite_pol ": Polite_pol, # bq, U,
     "Polite_abs_inf ": Polite_abs_inf, # bq, U,
@@ -320,6 +347,10 @@ IDS = {
     "Polite_erg_pol ": Polite_erg_pol, # bq, U,
     "Polite_dat_inf ": Polite_dat_inf, # bq, U,
     "Polite_dat_pol ": Polite_dat_pol, # bq, U,
+    "Polite_infm ": Polite_infm, # U20
+    "Polite_form ": Polite_form, # U20
+    "Polite_form_elev ": Polite_form_elev, # U20
+    "Polite_form_humb ": Polite_form_humb, # U20
     "Prefix_yes ": Prefix_yes, # U,
     "PrepCase_npr ": PrepCase_npr, # cz,
     "PrepCase_pre ": PrepCase_pre, # U,
@@ -384,6 +415,7 @@ IDS = {
     "ccomp": ccomp,
     "complm": complm,
     "conj": conj,
+    "cop": cop, # U20
     "csubj": csubj,
     "csubjpass": csubjpass,
     "dep": dep,
@@ -406,6 +438,8 @@ IDS = {
     "num": num,
     "number": number,
     "oprd": oprd,
+    "obj": obj, # U20
+    "obl": obl, # U20
     "parataxis": parataxis,
     "partmod": partmod,
     "pcomp": pcomp,

View File

@@ -8,7 +8,7 @@ from spacy.attrs import DEP, HEAD
 def ancestors(tokenid, heads):
     # returns all words going from the word up the path to the root
     # the path to root cannot be longer than the number of words in the sentence
     # this function ends after at most len(heads) steps
     # because it would otherwise loop indefinitely on cycles
     head = tokenid
     cnt = 0
@@ -180,7 +180,7 @@ class PseudoProjectivity:
         next_queue = []
         for qtoken in queue:
             for child in qtoken.children:
                 if child.is_space: continue
                 if child == token: continue
                 if child.dep_ == headlabel:
                     return child
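The comments above describe `ancestors` fully enough to reconstruct it: walk the `heads` array upward, yielding each head, and cap the walk at `len(heads)` steps so that a malformed, cyclic parse cannot loop forever. A pure-Python sketch of that contract (the root convention, a token heading itself, is an assumption here):

    def ancestors(tokenid, heads):
        # Walk from tokenid up to the root, yielding each head on the path.
        # The path cannot be longer than the sentence, so stop after at most
        # len(heads) steps; otherwise a cycle would loop indefinitely.
        head = tokenid
        cnt = 0
        while heads[head] != head and cnt < len(heads):
            head = heads[head]
            cnt += 1
            yield head

    heads = [2, 2, 2, 2, 5, 2]          # hypothetical parse; heads[i] heads token i
    print(list(ancestors(4, heads)))    # [5, 2]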

View File

@@ -13,13 +13,13 @@ from thinc.linalg cimport VecVec
 from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
 from .attrs cimport TAG
-from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
+from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON
 from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 from .gold cimport GoldParse
 from .attrs cimport *

 cpdef enum:
     P2_orth
     P2_cluster
@@ -71,7 +71,7 @@ cpdef enum:
 cdef class TaggerModel(AveragedPerceptron):
     cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *:
         _fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])
         _fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])
         _fill_from_token(&eg.atoms[W_orth], &tokens[i])
@@ -191,7 +191,7 @@ cdef class Tagger:
                 nr_class=self.vocab.morphology.n_tags,
                 nr_feat=self.model.nr_feat)
         for i in range(tokens.length):
             if tokens.c[i].pos == 0:
                 self.model.set_featuresC(&eg.c, tokens.c, i)
                 self.model.set_scoresC(eg.c.scores,
                     eg.c.features, eg.c.nr_feat)
@@ -217,7 +217,7 @@ cdef class Tagger:
         for doc in stream:
             self(doc)
             yield doc

     def update(self, Doc tokens, GoldParse gold):
         """Update the statistical model, with tags supplied for the given document.
@@ -251,7 +251,7 @@ cdef class Tagger:
             self.model.updateC(&eg.c)
             self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess)
             correct += eg.cost == 0
             self.freqs[TAG][tokens.c[i].tag] += 1
             eg.fill_scores(0, eg.c.nr_class)
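The `P2_*`/`P1_*`/`W_*` atoms touched above index a flat feature array that `set_featuresC` fills from a window around token `i` (two back, one back, current, and so on). A simplified Python rendering of that windowing, with dict tokens standing in for the C `TokenC` structs:

    # Illustrative only: the real code writes atom slots in an ExampleC struct.
    def fill_window_features(tokens, i):
        pad = {'orth': ''}                                  # out-of-range stand-in
        get = lambda j: tokens[j] if 0 <= j < len(tokens) else pad
        return {
            'P2_orth': get(i - 2)['orth'],
            'P1_orth': get(i - 1)['orth'],
            'W_orth':  get(i)['orth'],
        }

    tokens = [{'orth': u'cats'}, {'orth': u'and'}, {'orth': u'dogs'}]
    print(fill_window_features(tokens, 1))
    # {'P2_orth': '', 'P1_orth': u'cats', 'W_orth': u'and'}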

View File

@@ -16,7 +16,7 @@ from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport attr_id_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
-from ..parts_of_speech cimport CONJ, PUNCT, NOUN
+from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
 from .span cimport Span
@@ -59,13 +59,13 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
 cdef class Doc:
     """
     A sequence of `Token` objects. Access sentences and named entities,
     export annotations to numpy arrays, losslessly serialize to compressed
     binary strings.

     Aside: Internals
         The `Doc` object holds an array of `TokenC` structs.
         The Python-level `Token` and `Span` objects are views of this
         array, i.e. they don't own the data themselves.

     Code: Construction 1
@@ -80,13 +80,13 @@ cdef class Doc:
         Create a Doc object.

         Aside: Implementation
             This method of constructing a `Doc` object is usually only used
             for deserialization. Standard usage is to construct the document via
             a call to the language object.

         Arguments:
             vocab:
                 A Vocabulary object, which must match any models you want to
                 use (e.g. tokenizer, parser, entity recognizer).
             words:
@@ -156,19 +156,19 @@ cdef class Doc:
         if self.length == 0:
             self.is_tagged = True
             self.is_parsed = True

     def __getitem__(self, object i):
         '''
         doc[i]
             Get the Token object at position i, where i is an integer.
             Negative indexing is supported, and follows the usual Python
             semantics, i.e. doc[-2] is doc[len(doc) - 2].

         doc[start : end]
             Get a `Span` object, starting at position `start`
             and ending at position `end`, where `start` and
             `end` are token indices. For instance,
             `doc[2:5]` produces a span consisting of
             tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`)
             are not supported, as `Span` objects must be contiguous (cannot have gaps).
             You can use negative indices and open-ended ranges, which have their
             normal Python semantics.
@@ -188,11 +188,11 @@ cdef class Doc:
     def __iter__(self):
         '''
         for token in doc
             Iterate over `Token` objects, from which the annotations can
             be easily accessed. This is the main way of accessing Token
             objects, which are the main way annotations are accessed from
             Python. If faster-than-Python speeds are required, you can
             instead access the annotations as a numpy array, or access the
             underlying C data directly from Cython.
         '''
         cdef int i
@@ -251,13 +251,13 @@ cdef class Doc:
         def __get__(self):
             if 'has_vector' in self.user_hooks:
                 return self.user_hooks['has_vector'](self)
             return any(token.has_vector for token in self)

     property vector:
         '''
         A real-valued meaning representation. Defaults to an average of the token vectors.

         Type: numpy.ndarray[ndim=1, dtype='float32']
         '''
         def __get__(self):
@@ -285,14 +285,14 @@ cdef class Doc:
                 norm += value * value
             self._vector_norm = sqrt(norm) if norm != 0 else 0
             return self._vector_norm

         def __set__(self, value):
             self._vector_norm = value

     @property
     def string(self):
         return self.text

     property text:
         '''A unicode representation of the document text.'''
         def __get__(self):
@@ -306,7 +306,7 @@ cdef class Doc:
     property ents:
         '''
         Yields named-entity `Span` objects, if the entity recognizer
         has been applied to the document. Iterate over the span to get
         individual Token objects, or access the label:

         Example:
@@ -352,7 +352,7 @@ cdef class Doc:
         cdef int i
         for i in range(self.length):
             self.c[i].ent_type = 0
             # At this point we don't know whether the NER has run over the
             # Doc. If the ent_iob is missing, leave it missing.
             if self.c[i].ent_iob != 0:
                 self.c[i].ent_iob = 2  # Means O. Non-O are set from ents.
@@ -384,9 +384,9 @@ cdef class Doc:
     property noun_chunks:
         '''
         Yields base noun-phrase #[code Span] objects, if the document
         has been syntactically parsed. A base noun phrase, or
         'NP chunk', is a noun phrase that does not permit other NPs to
         be nested within it, so no NP-level coordination, no prepositional
         phrases, and no relative clauses. For example:
         '''
         def __get__(self):
@@ -422,7 +422,7 @@ cdef class Doc:
         def __get__(self):
             if 'sents' in self.user_hooks:
                 return self.user_hooks['sents'](self)
             if not self.is_parsed:
                 raise ValueError(
                     "sentence boundary detection requires the dependency parse, which "
@@ -465,8 +465,8 @@ cdef class Doc:
     @cython.boundscheck(False)
     cpdef np.ndarray to_array(self, object py_attr_ids):
         """
         Given a list of M attribute IDs, export the tokens to a numpy
         `ndarray` of shape (N, M), where `N` is the length
         of the document. The values will be 32-bit integers.

         Example:
@@ -474,7 +474,7 @@ cdef class Doc:
             doc = nlp(text)
             # All strings mapped to integers, for easy export to numpy
             np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA])

         Arguments:
             attr_ids (list[int]): A list of attribute ID ints.
@@ -520,7 +520,7 @@ cdef class Doc:
         cdef int i
         cdef attr_t attr
         cdef size_t count

         if counts is None:
             counts = PreshCounter()
             output_dict = True
@@ -570,7 +570,7 @@ cdef class Doc:
         cdef TokenC* tokens = self.c
         cdef int length = len(array)
         cdef attr_t[:] values

         for col, attr_id in enumerate(attrs):
             values = array[:, col]
             if attr_id == HEAD:
                 for i in range(length):
@@ -612,11 +612,11 @@ cdef class Doc:
         '''Deserialize, loading from bytes.'''
         self.vocab.serializer.unpack_into(data[4:], self)
         return self

     @staticmethod
     def read_bytes(file_):
         '''
         A static method, used to read serialized #[code Doc] objects from
         a file. For example:

         Example:
@@ -673,7 +673,7 @@ cdef class Doc:
                 "Expected either 3 arguments (deprecated), or 0 (use keyword arguments). "
                 "Arguments supplied:\n%s\n"
                 "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))

         cdef int start = token_by_start(self.c, self.length, start_idx)
         if start == -1:
             return None
@@ -784,7 +784,7 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
             if child.l_edge < head.l_edge:
                 head.l_edge = child.l_edge
             head.l_kids += 1

     # Set right edges --- same as above, but iterate in reverse
     for i in range(length-1, -1, -1):
         child = &tokens[i]
@@ -798,4 +798,4 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
     for i in range(length):
         if tokens[i].head == 0 and tokens[i].dep != 0:
             tokens[tokens[i].l_edge].sent_start = True
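The closing hunks show `set_children_from_heads` computing subtree edges: a left-to-right pass pulls each head's `l_edge` leftward from its children, a mirrored right-to-left pass extends `r_edge`, and a parsed root (`head == 0` with a non-zero `dep`) marks its subtree's left edge as a sentence start. A pure-Python sketch of the left-edge pass, with heads as relative offsets as in `TokenC` (the three-token parse is illustrative):

    def set_left_edges(heads):
        # heads[i] is a relative offset; 0 means token i is a root.
        l_edge = list(range(len(heads)))       # each token starts as its own edge
        for i, offset in enumerate(heads):     # children precede heads in this pass
            if offset != 0 and l_edge[i] < l_edge[i + offset]:
                l_edge[i + offset] = l_edge[i]
        return l_edge

    print(set_left_edges([2, 1, 0]))   # 'the quick fox' -> [0, 1, 0]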

View File

@@ -20,7 +20,7 @@ from .. import parts_of_speech
 from ..attrs cimport LEMMA
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport POS, LEMMA, TAG, DEP
-from ..parts_of_speech cimport CONJ, PUNCT
+from ..parts_of_speech cimport CCONJ, PUNCT
 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from ..attrs cimport IS_BRACKET
@@ -84,7 +84,7 @@ cdef class Token:
     cpdef bint check_flag(self, attr_id_t flag_id) except -1:
         '''Check the value of a boolean flag.

         Arguments:
             flag_id (int): The ID of the flag attribute.
         Returns:
@@ -225,7 +225,7 @@ cdef class Token:
     property vector:
         '''
         A real-valued meaning representation.

         Type: numpy.ndarray[ndim=1, dtype='float32']
         '''
         def __get__(self):
@@ -343,7 +343,7 @@ cdef class Token:
         '''
         def __get__(self):
             cdef const TokenC* head_ptr = self.c

             # guard against infinite loop, no token can have
             # more ancestors than tokens in the tree
             cdef int i = 0
             while head_ptr.head != 0 and i < self.doc.length:
@@ -370,7 +370,7 @@ cdef class Token:
     property head:
         '''The syntactic parent, or "governor", of this token.

         Returns: Token
         '''
         def __get__(self):
@@ -390,7 +390,7 @@ cdef class Token:
             # is the new head a descendant of the old head
             cdef bint is_desc = old_head.is_ancestor_of(new_head)

             cdef int new_edge
             cdef Token anc, child
@@ -420,7 +420,7 @@ cdef class Token:
                     if anc.c.l_edge <= new_edge:
                         break
                     anc.c.l_edge = new_edge

             elif self.c.head < 0:  # right dependent
                 old_head.c.r_kids -= 1
                 # do the same thing as for l_edge
@@ -435,7 +435,7 @@ cdef class Token:
                 if child.c.r_edge > new_edge:
                     new_edge = child.c.r_edge
             old_head.c.r_edge = new_edge

             for anc in old_head.ancestors:
                 if anc.c.r_edge >= new_edge:
                     break
@@ -598,19 +598,19 @@ cdef class Token:
     property is_punct:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)

     property is_space:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)

     property is_bracket:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)

     property is_quote:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)

     property is_left_punct:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)

     property is_right_punct:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)

     property like_url:
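The run of `is_*` properties above all share one shape: each is a single lexeme flag check via `Lexeme.c_check_flag`. From Python that surfaces as plain attribute access; a hedged example, with model name and text illustrative:

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'Hello ("world")!')
    for token in doc:
        # Each property is one boolean flag lookup on the underlying lexeme.
        print(token.text, token.is_punct, token.is_quote, token.is_bracket)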