Mirror of https://github.com/explosion/spaCy.git
Commit d03d6a13f1
Merge branch 'rominf-ud20' into develop
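At a glance, this merge moves spaCy's tag maps, morphology tables and symbol tables to Universal Dependencies v2.0 (U20) naming: the coarse POS tag CCONJ is added alongside CONJ, the morphological feature Negative is superseded by Polarity, and a batch of new U20 feature values and dependency labels (Definite_cons, Aspect_iter, Case_cmp, VerbForm_conv, Voice_antip, cop, obj, obl and others) is registered. A hedged sketch of the user-visible effect (not code from the commit; assumes a post-merge build where spacy.symbols re-exports the enum members):

    # Both the v1 and v2.0 names resolve to integer symbol IDs after this
    # merge; CONJ is kept so existing code does not break.
    from spacy.symbols import CONJ, CCONJ
    assert isinstance(CONJ, int) and isinstance(CCONJ, int)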
@@ -93,7 +93,7 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
 
 def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
     '''Normalize a dictionary of attributes, converting them to ints.
 
     Arguments:
         stringy_attrs (dict):
             Dictionary keyed by attribute string names. Values can be ints or strings.
@@ -125,7 +125,9 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
             'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
             'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
             'Number', 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
-            'Reflex', 'Negative', 'Mood', 'Aspect', 'Case']
+            'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
+            'Polarity', # U20
+        ]
         for key in morph_keys:
             if key in stringy_attrs:
                 stringy_attrs.pop(key)
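For orientation, here is a hedged plain-Python rendering of what the deprecated-key cleanup above does: any morphological feature name in morph_keys, now including the U20 'Polarity', is popped off the incoming attribute dict rather than interned as a lexical attribute. The list below is abbreviated and the values are illustrative.

    # Abbreviated stand-in for the morph_keys list in the hunk; 'Polarity'
    # is the newly added U20 entry.
    morph_keys = ['PunctType', 'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
                  'Polarity']
    stringy_attrs = {'POS': 'PART', 'Polarity': 'Neg', 'ORTH': 'nicht'}
    for key in morph_keys:
        if key in stringy_attrs:
            stringy_attrs.pop(key)
    assert stringy_attrs == {'POS': 'PART', 'ORTH': 'nicht'}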
@@ -41,7 +41,7 @@ TAG_MAP = {
     "PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"},
     "PTKA": {POS: PART},
     "PTKANT": {POS: PART, "PartType": "res"},
-    "PTKNEG": {POS: PART, "Negative": "yes"},
+    "PTKNEG": {POS: PART, "Polarity": "Neg"},
     "PTKVZ": {POS: PART, "PartType": "vbp"},
     "PTKZU": {POS: PART, "PartType": "inf"},
     "PWAT": {POS: DET, "PronType": "int"},
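The PTKNEG change is the Negative-to-Polarity rename in miniature; a side-by-side sketch, with plain string keys standing in for the cimported POS constants:

    # UD v1: negation was expressed as the feature Negative=yes.
    tag_map_v1 = {"PTKNEG": {"POS": "PART", "Negative": "yes"}}
    # UD v2.0: the same tag now carries Polarity=Neg instead.
    tag_map_v2 = {"PTKNEG": {"POS": "PART", "Polarity": "Neg"}}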
@@ -16,7 +16,7 @@ TAG_MAP = {
     "$": {POS: SYM, "Other": {"SymType": "currency"}},
     "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
     "AFX": {POS: ADJ, "Hyph": "yes"},
-    "CC": {POS: CONJ, "ConjType": "coor"},
+    "CC": {POS: CCONJ, "ConjType": "coor"},
     "CD": {POS: NUM, "NumType": "card"},
     "DT": {POS: DET},
     "EX": {POS: ADV, "AdvType": "ex"},

@@ -19,6 +19,7 @@ TAG_MAP = {
     "AUX": {POS: AUX},
     "X": {POS: X},
     "CONJ": {POS: CONJ},
+    "CCONJ": {POS: CCONJ}, # U20
     "ADJ": {POS: ADJ},
     "VERB": {POS: VERB},
     "PART": {POS: PART}
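Both tag-map hunks above make the same move for coordinating conjunctions: the fine-grained tags keep their names, but the coarse POS they map to is the U20 CCONJ, while the old CONJ entry is retained for UD v1 input. A hedged sketch of how a lookup changes, with strings standing in for the enum constants:

    tag_map = {"CC": {"POS": "CCONJ", "ConjType": "coor"},  # was POS: CONJ
               "CONJ": {"POS": "CONJ"},                     # kept for UD v1 data
               "CCONJ": {"POS": "CCONJ"}}                   # new U20 alias
    assert tag_map["CC"]["POS"] == "CCONJ"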
@@ -37,7 +37,7 @@ cdef class Morphology:
     cdef int assign_tag(self, TokenC* token, tag) except -1
 
     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
 
     cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
 
 
@@ -80,6 +80,7 @@ cpdef enum univ_morph_t:
     Definite_two
     Definite_def
     Definite_red
+    Definite_cons # U20
     Definite_ind
     Degree_cmp
     Degree_comp
@@ -103,6 +104,8 @@ cpdef enum univ_morph_t:
     Negative_neg
     Negative_pos
     Negative_yes
+    Polarity_neg # U20
+    Polarity_pos # U20
     Number_com
     Number_dual
     Number_none
@@ -151,6 +154,7 @@ cpdef enum univ_morph_t:
     VerbForm_partPres
     VerbForm_sup
     VerbForm_trans
+    VerbForm_conv # U20
     VerbForm_gdv # la
     Voice_act
     Voice_cau
@@ -192,6 +192,7 @@ IDS = {
     "Definite_two": Definite_two,
     "Definite_def": Definite_def,
     "Definite_red": Definite_red,
+    "Definite_cons": Definite_cons, # U20
     "Definite_ind": Definite_ind,
     "Degree_cmp": Degree_cmp,
     "Degree_comp": Degree_comp,
@@ -215,6 +216,8 @@ IDS = {
     "Negative_neg": Negative_neg,
     "Negative_pos": Negative_pos,
     "Negative_yes": Negative_yes,
+    "Polarity_neg": Polarity_neg, # U20
+    "Polarity_pos": Polarity_pos, # U20
     "Number_com": Number_com,
     "Number_dual": Number_dual,
     "Number_none": Number_none,
@@ -263,6 +266,7 @@ IDS = {
     "VerbForm_partPres": VerbForm_partPres,
     "VerbForm_sup": VerbForm_sup,
     "VerbForm_trans": VerbForm_trans,
+    "VerbForm_conv": VerbForm_conv, # U20
     "VerbForm_gdv ": VerbForm_gdv, # la,
     "Voice_act": Voice_act,
     "Voice_cau": Voice_cau,
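The enum and the IDS dict must stay in lockstep, because the reverse table is built exactly as the first hunk header of this commit shows: NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]. A runnable miniature with illustrative integer values:

    # Illustrative values; in the real code these are univ_morph_t enum members.
    IDS = {'Negative_yes': 0, 'Polarity_neg': 1, 'Polarity_pos': 2}
    NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
    assert NAMES[IDS['Polarity_neg']] == 'Polarity_neg'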
@@ -7,6 +7,7 @@ cpdef enum univ_pos_t:
     ADV
     AUX
     CONJ
+    CCONJ # U20
     DET
     INTJ
     NOUN

@@ -7,7 +7,8 @@ IDS = {
     "ADP": ADP,
     "ADV": ADV,
     "AUX": AUX,
-    "CONJ": CONJ,
+    "CONJ": CONJ, # U20
+    "CCONJ": CCONJ,
     "DET": DET,
     "INTJ": INTJ,
     "NOUN": NOUN,
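Note that CONJ is not removed: after this hunk the IDS table carries both names, so UD v1-era code and U20 code each resolve their tag string to a valid (and distinct) enum value. A miniature sketch with made-up integers:

    CONJ, CCONJ = 5, 6        # stand-ins for the univ_pos_t enum members
    IDS = {"CONJ": CONJ, "CCONJ": CCONJ}
    assert IDS["CCONJ"] != IDS["CONJ"]   # separate symbols, old name retained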
@@ -13,7 +13,7 @@ cpdef enum symbol_t:
     LIKE_EMAIL
     IS_STOP
     IS_OOV
 
     FLAG14 = 14
     FLAG15
     FLAG16
@@ -90,6 +90,7 @@ cpdef enum symbol_t:
     ADV
     AUX
     CONJ
+    CCONJ # U20
     DET
     INTJ
     NOUN
@@ -107,11 +108,14 @@ cpdef enum symbol_t:
 
     Animacy_anim
     Animacy_inam
+    Animacy_hum # U20
     Aspect_freq
     Aspect_imp
     Aspect_mod
     Aspect_none
     Aspect_perf
+    Aspect_iter # U20
+    Aspect_hab # U20
     Case_abe
     Case_abl
     Case_abs
@@ -120,10 +124,12 @@ cpdef enum symbol_t:
     Case_all
     Case_cau
     Case_com
+    Case_cmp # U20
     Case_dat
     Case_del
     Case_dis
     Case_ela
+    Case_equ # U20
     Case_ess
     Case_gen
     Case_ill
@@ -142,7 +148,9 @@ cpdef enum symbol_t:
     Definite_two
     Definite_def
     Definite_red
+    Definite_cons # U20
     Definite_ind
+    Definite_spec # U20
     Degree_cmp
     Degree_comp
     Degree_none
@@ -151,6 +159,8 @@ cpdef enum symbol_t:
     Degree_abs
     Degree_com
     Degree_dim # du
+    Degree_equ # U20
+    Evident_nfh # U20
     Gender_com
     Gender_fem
     Gender_masc
@@ -162,16 +172,21 @@ cpdef enum symbol_t:
     Mood_pot
     Mood_sub
     Mood_opt
+    Mood_prp # U20
+    Mood_adm # U20
     Negative_neg
     Negative_pos
     Negative_yes
+    Polarity_neg # U20
+    Polarity_pos # U20
     Number_com
     Number_dual
     Number_none
     Number_plur
     Number_sing
     Number_ptan # bg
-    Number_count # bg
+    Number_count # bg, U20
+    Number_tri # U20
     NumType_card
     NumType_dist
     NumType_frac
@@ -197,7 +212,8 @@ cpdef enum symbol_t:
     PronType_rel
     PronType_tot
     PronType_clit
-    PronType_exc # es, ca, it, fa
+    PronType_exc # es, ca, it, fa, U20
+    PronType_emp # U20
     Reflex_yes
     Tense_fut
     Tense_imp
@@ -213,12 +229,17 @@ cpdef enum symbol_t:
     VerbForm_partPres
     VerbForm_sup
     VerbForm_trans
+    VerbForm_conv # U20
     VerbForm_gdv # la
+    VerbForm_vnoun # U20
     Voice_act
     Voice_cau
     Voice_pass
-    Voice_mid # gkc
+    Voice_mid # gkc, U20
     Voice_int # hb
+    Voice_antip # U20
+    Voice_dir # U20
+    Voice_inv # U20
     Abbr_yes # cz, fi, sl, U
     AdpType_prep # cz, U
     AdpType_post # U
@@ -284,6 +305,10 @@ cpdef enum symbol_t:
     Number_psee_plur # U
     Number_psor_sing # cz, fi, sl, U
     Number_psor_plur # cz, fi, sl, U
+    Number_pauc # U20
+    Number_grpa # U20
+    Number_grpl # U20
+    Number_inv # U20
     NumForm_digit # cz, sl, U
     NumForm_roman # cz, sl, U
     NumForm_word # cz, sl, U
@@ -311,6 +336,8 @@ cpdef enum symbol_t:
     Person_psor_one # fi, U
     Person_psor_two # fi, U
     Person_psor_three # fi, U
+    Person_zero # U20
+    Person_four # U20
     Polite_inf # bq, U
     Polite_pol # bq, U
     Polite_abs_inf # bq, U
@@ -319,6 +346,10 @@ cpdef enum symbol_t:
     Polite_erg_pol # bq, U
     Polite_dat_inf # bq, U
     Polite_dat_pol # bq, U
+    Polite_infm # U20
+    Polite_form # U20
+    Polite_form_elev # U20
+    Polite_form_humb # U20
     Prefix_yes # U
     PrepCase_npr # cz
     PrepCase_pre # U
@@ -383,6 +414,7 @@ cpdef enum symbol_t:
     ccomp
     complm
     conj
+    cop # U20
     csubj
     csubjpass
     dep
@@ -405,6 +437,8 @@ cpdef enum symbol_t:
     num
     number
     oprd
+    obj # U20
+    obl # U20
     parataxis
     partmod
     pcomp

@@ -91,6 +91,7 @@ IDS = {
     "ADV": ADV,
     "AUX": AUX,
     "CONJ": CONJ,
+    "CCONJ": CCONJ, # U20
     "DET": DET,
     "INTJ": INTJ,
     "NOUN": NOUN,
@@ -108,11 +109,14 @@ IDS = {
 
     "Animacy_anim": Animacy_anim,
     "Animacy_inam": Animacy_inam,
+    "Animacy_hum": Animacy_hum, # U20
     "Aspect_freq": Aspect_freq,
     "Aspect_imp": Aspect_imp,
     "Aspect_mod": Aspect_mod,
     "Aspect_none": Aspect_none,
     "Aspect_perf": Aspect_perf,
+    "Aspect_iter": Aspect_iter, # U20
+    "Aspect_hab": Aspect_hab, # U20
     "Case_abe": Case_abe,
     "Case_abl": Case_abl,
     "Case_abs": Case_abs,
@@ -121,10 +125,12 @@ IDS = {
     "Case_all": Case_all,
     "Case_cau": Case_cau,
     "Case_com": Case_com,
+    "Case_cmp": Case_cmp, # U20
     "Case_dat": Case_dat,
     "Case_del": Case_del,
     "Case_dis": Case_dis,
     "Case_ela": Case_ela,
+    "Case_equ": Case_equ, # U20
     "Case_ess": Case_ess,
     "Case_gen": Case_gen,
     "Case_ill": Case_ill,
@@ -143,7 +149,9 @@ IDS = {
     "Definite_two": Definite_two,
     "Definite_def": Definite_def,
     "Definite_red": Definite_red,
+    "Definite_cons": Definite_cons, # U20
     "Definite_ind": Definite_ind,
+    "Definite_spec": Definite_spec, # U20
     "Degree_cmp": Degree_cmp,
     "Degree_comp": Degree_comp,
     "Degree_none": Degree_none,
@@ -152,6 +160,8 @@ IDS = {
     "Degree_abs": Degree_abs,
     "Degree_com": Degree_com,
     "Degree_dim ": Degree_dim, # du
+    "Degree_equ": Degree_equ, # U20
+    "Evident_nfh": Evident_nfh, # U20
     "Gender_com": Gender_com,
     "Gender_fem": Gender_fem,
     "Gender_masc": Gender_masc,
@@ -163,16 +173,21 @@ IDS = {
     "Mood_pot": Mood_pot,
     "Mood_sub": Mood_sub,
     "Mood_opt": Mood_opt,
+    "Mood_prp": Mood_prp, # U20
+    "Mood_adm": Mood_adm, # U20
     "Negative_neg": Negative_neg,
     "Negative_pos": Negative_pos,
     "Negative_yes": Negative_yes,
+    "Polarity_neg": Polarity_neg, # U20
+    "Polarity_pos": Polarity_pos, # U20
     "Number_com": Number_com,
     "Number_dual": Number_dual,
     "Number_none": Number_none,
     "Number_plur": Number_plur,
     "Number_sing": Number_sing,
     "Number_ptan ": Number_ptan, # bg
-    "Number_count ": Number_count, # bg
+    "Number_count ": Number_count, # bg, U20
+    "Number_tri": Number_tri, # U20
     "NumType_card": NumType_card,
     "NumType_dist": NumType_dist,
     "NumType_frac": NumType_frac,
@@ -198,7 +213,8 @@ IDS = {
     "PronType_rel": PronType_rel,
     "PronType_tot": PronType_tot,
     "PronType_clit": PronType_clit,
-    "PronType_exc ": PronType_exc, # es, ca, it, fa,
+    "PronType_exc": PronType_exc, # es, ca, it, fa, U20
+    "PronType_emp": PronType_emp, # U20
     "Reflex_yes": Reflex_yes,
     "Tense_fut": Tense_fut,
     "Tense_imp": Tense_imp,
@@ -214,12 +230,17 @@ IDS = {
     "VerbForm_partPres": VerbForm_partPres,
     "VerbForm_sup": VerbForm_sup,
     "VerbForm_trans": VerbForm_trans,
+    "VerbForm_conv": VerbForm_conv, # U20
     "VerbForm_gdv ": VerbForm_gdv, # la,
+    "VerbForm_vnoun": VerbForm_vnoun, # U20
     "Voice_act": Voice_act,
     "Voice_cau": Voice_cau,
     "Voice_pass": Voice_pass,
-    "Voice_mid ": Voice_mid, # gkc,
+    "Voice_mid ": Voice_mid, # gkc, U20
     "Voice_int ": Voice_int, # hb,
+    "Voice_antip": Voice_antip, # U20
+    "Voice_dir": Voice_dir, # U20
+    "Voice_inv": Voice_inv, # U20
     "Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
     "AdpType_prep ": AdpType_prep, # cz, U,
     "AdpType_post ": AdpType_post, # U,
@@ -285,6 +306,10 @@ IDS = {
     "Number_psee_plur ": Number_psee_plur, # U,
     "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
     "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
+    "Number_pauc": Number_pauc, # U20
+    "Number_grpa": Number_grpa, # U20
+    "Number_grpl": Number_grpl, # U20
+    "Number_inv": Number_inv, # U20
     "NumForm_digit ": NumForm_digit, # cz, sl, U,
     "NumForm_roman ": NumForm_roman, # cz, sl, U,
     "NumForm_word ": NumForm_word, # cz, sl, U,
@@ -312,6 +337,8 @@ IDS = {
     "Person_psor_one ": Person_psor_one, # fi, U,
     "Person_psor_two ": Person_psor_two, # fi, U,
     "Person_psor_three ": Person_psor_three, # fi, U,
+    "Person_zero ": Person_zero, # U20
+    "Person_four ": Person_four, # U20
     "Polite_inf ": Polite_inf, # bq, U,
     "Polite_pol ": Polite_pol, # bq, U,
     "Polite_abs_inf ": Polite_abs_inf, # bq, U,
@@ -320,6 +347,10 @@ IDS = {
     "Polite_erg_pol ": Polite_erg_pol, # bq, U,
     "Polite_dat_inf ": Polite_dat_inf, # bq, U,
     "Polite_dat_pol ": Polite_dat_pol, # bq, U,
+    "Polite_infm ": Polite_infm, # U20
+    "Polite_form ": Polite_form, # U20
+    "Polite_form_elev ": Polite_form_elev, # U20
+    "Polite_form_humb ": Polite_form_humb, # U20
     "Prefix_yes ": Prefix_yes, # U,
     "PrepCase_npr ": PrepCase_npr, # cz,
     "PrepCase_pre ": PrepCase_pre, # U,
@@ -384,6 +415,7 @@ IDS = {
     "ccomp": ccomp,
     "complm": complm,
     "conj": conj,
+    "cop": cop, # U20
     "csubj": csubj,
     "csubjpass": csubjpass,
     "dep": dep,
@@ -406,6 +438,8 @@ IDS = {
     "num": num,
     "number": number,
     "oprd": oprd,
+    "obj": obj, # U20
+    "obl": obl, # U20
     "parataxis": parataxis,
     "partmod": partmod,
     "pcomp": pcomp,
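With the symbol table extended, the new U20 feature values and dependency labels become ordinary spaCy symbols. A hedged usage sketch (assumes a post-merge build; only the import of pre-existing symbols such as nsubj is known to work beforehand):

    from spacy.symbols import nsubj          # existed before this commit
    from spacy.symbols import cop, obj, obl  # expected to work after it
    # Symbols are interned ints, so they can key dicts or feed matcher rules.
    label_names = {cop: 'cop', obj: 'obj', obl: 'obl'}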
@@ -8,7 +8,7 @@ from spacy.attrs import DEP, HEAD
 def ancestors(tokenid, heads):
     # returns all words going from the word up the path to the root
     # the path to root cannot be longer than the number of words in the sentence
     # this function ends after at most len(heads) steps
     # because it would otherwise loop indefinitely on cycles
     head = tokenid
     cnt = 0
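The comments in this hunk describe the cycle guard; below is a runnable plain-Python reconstruction of the helper as those comments describe it (hedged: the diff only shows the function's first lines, so the body is inferred, assuming heads holds absolute indices and a root points at itself):

    def ancestors(tokenid, heads):
        # Walk head indices upward; stop after at most len(heads) steps so
        # a cyclic (malformed) head array cannot loop forever.
        head = tokenid
        cnt = 0
        while heads[head] != head and cnt < len(heads):
            head = heads[head]
            cnt += 1
            yield head

    # 'The dog barks': The -> dog -> barks (root points at itself)
    assert list(ancestors(0, [1, 2, 2])) == [1, 2]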
@@ -180,7 +180,7 @@ class PseudoProjectivity:
         next_queue = []
         for qtoken in queue:
             for child in qtoken.children:
                 if child.is_space: continue
                 if child == token: continue
                 if child.dep_ == headlabel:
                     return child
@@ -13,13 +13,13 @@ from thinc.linalg cimport VecVec
 from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
 from .attrs cimport TAG
-from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
+from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON
 from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 from .gold cimport GoldParse
 
 from .attrs cimport *
 
 
 cpdef enum:
     P2_orth
     P2_cluster
@@ -71,7 +71,7 @@ cpdef enum:
 
 cdef class TaggerModel(AveragedPerceptron):
     cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *:
 
         _fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])
         _fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])
         _fill_from_token(&eg.atoms[W_orth], &tokens[i])
@@ -191,7 +191,7 @@ cdef class Tagger:
                 nr_class=self.vocab.morphology.n_tags,
                 nr_feat=self.model.nr_feat)
         for i in range(tokens.length):
             if tokens.c[i].pos == 0:
                 self.model.set_featuresC(&eg.c, tokens.c, i)
                 self.model.set_scoresC(eg.c.scores,
                     eg.c.features, eg.c.nr_feat)
@@ -217,7 +217,7 @@ cdef class Tagger:
         for doc in stream:
             self(doc)
             yield doc
 
     def update(self, Doc tokens, GoldParse gold):
         """Update the statistical model, with tags supplied for the given document.
 
@@ -251,7 +251,7 @@ cdef class Tagger:
                 self.model.updateC(&eg.c)
 
                 self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess)
 
                 correct += eg.cost == 0
                 self.freqs[TAG][tokens.c[i].tag] += 1
                 eg.fill_scores(0, eg.c.nr_class)
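One detail worth flagging in the tagging loop above: prediction only runs for tokens whose coarse POS is still unset (pos == 0), so pre-tagged tokens are left alone. A plain-Python rendering of that guard, with illustrative values:

    def predict(i):
        return 7  # stand-in for set_featuresC + set_scoresC + argmax

    pos = [0, 3, 0]  # 0 means "no tag assigned yet"; 3 was set upstream
    pos = [p if p != 0 else predict(i) for i, p in enumerate(pos)]
    assert pos == [7, 3, 7]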
@@ -16,7 +16,7 @@ from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport attr_id_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
-from ..parts_of_speech cimport CONJ, PUNCT, NOUN
+from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
 from .span cimport Span
@@ -59,13 +59,13 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
 
 cdef class Doc:
     """
     A sequence of `Token` objects. Access sentences and named entities,
     export annotations to numpy arrays, losslessly serialize to compressed
     binary strings.
 
     Aside: Internals
         The `Doc` object holds an array of `TokenC` structs.
         The Python-level `Token` and `Span` objects are views of this
         array, i.e. they don't own the data themselves.
 
     Code: Construction 1
@@ -80,13 +80,13 @@ cdef class Doc:
         Create a Doc object.
 
         Aside: Implementation
             This method of constructing a `Doc` object is usually only used
             for deserialization. Standard usage is to construct the document via
             a call to the language object.
 
         Arguments:
             vocab:
                 A Vocabulary object, which must match any models you want to
                 use (e.g. tokenizer, parser, entity recognizer).
 
             words:
@@ -156,19 +156,19 @@ cdef class Doc:
         if self.length == 0:
             self.is_tagged = True
             self.is_parsed = True
 
     def __getitem__(self, object i):
         '''
         doc[i]
             Get the Token object at position i, where i is an integer.
             Negative indexing is supported, and follows the usual Python
             semantics, i.e. doc[-2] is doc[len(doc) - 2].
         doc[start : end]
             Get a `Span` object, starting at position `start`
             and ending at position `end`, where `start` and
             `end` are token indices. For instance,
             `doc[2:5]` produces a span consisting of
             tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`)
             are not supported, as `Span` objects must be contiguous (cannot have gaps).
             You can use negative indices and open-ended ranges, which have their
             normal Python semantics.
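The indexing contract documented above can be checked against any Python sequence, since Doc follows standard semantics for everything except stepped slices:

    doc = list('abcdefg')                  # stand-in for a Doc's tokens
    assert doc[-2] == doc[len(doc) - 2]    # negative indexing
    assert doc[2:5] == ['c', 'd', 'e']     # tokens 2, 3 and 4, contiguous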
@@ -188,11 +188,11 @@ cdef class Doc:
     def __iter__(self):
         '''
         for token in doc
             Iterate over `Token` objects, from which the annotations can
             be easily accessed. This is the main way of accessing Token
             objects, which are the main way annotations are accessed from
             Python. If faster-than-Python speeds are required, you can
             instead access the annotations as a numpy array, or access the
             underlying C data directly from Cython.
         '''
         cdef int i
@@ -251,13 +251,13 @@ cdef class Doc:
         def __get__(self):
             if 'has_vector' in self.user_hooks:
                 return self.user_hooks['has_vector'](self)
 
             return any(token.has_vector for token in self)
 
     property vector:
         '''
         A real-valued meaning representation. Defaults to an average of the token vectors.
 
         Type: numpy.ndarray[ndim=1, dtype='float32']
         '''
         def __get__(self):
@@ -285,14 +285,14 @@ cdef class Doc:
                 norm += value * value
             self._vector_norm = sqrt(norm) if norm != 0 else 0
             return self._vector_norm
 
         def __set__(self, value):
             self._vector_norm = value
 
     @property
     def string(self):
         return self.text
 
     property text:
         '''A unicode representation of the document text.'''
         def __get__(self):
@@ -306,7 +306,7 @@ cdef class Doc:
     property ents:
         '''
         Yields named-entity `Span` objects, if the entity recognizer
         has been applied to the document. Iterate over the span to get
         individual Token objects, or access the label:
 
         Example:
@@ -352,7 +352,7 @@ cdef class Doc:
         cdef int i
         for i in range(self.length):
             self.c[i].ent_type = 0
             # At this point we don't know whether the NER has run over the
             # Doc. If the ent_iob is missing, leave it missing.
             if self.c[i].ent_iob != 0:
                 self.c[i].ent_iob = 2 # Means O. Non-O are set from ents.
@@ -384,9 +384,9 @@ cdef class Doc:
     property noun_chunks:
         '''
         Yields base noun-phrase #[code Span] objects, if the document
         has been syntactically parsed. A base noun phrase, or
         'NP chunk', is a noun phrase that does not permit other NPs to
         be nested within it – so no NP-level coordination, no prepositional
         phrases, and no relative clauses. For example:
         '''
         def __get__(self):
@@ -422,7 +422,7 @@ cdef class Doc:
         def __get__(self):
             if 'sents' in self.user_hooks:
                 return self.user_hooks['sents'](self)
 
             if not self.is_parsed:
                 raise ValueError(
                     "sentence boundary detection requires the dependency parse, which "
@@ -465,8 +465,8 @@ cdef class Doc:
     @cython.boundscheck(False)
     cpdef np.ndarray to_array(self, object py_attr_ids):
         """
         Given a list of M attribute IDs, export the tokens to a numpy
         `ndarray` of shape (N, M), where `N` is the length
         of the document. The values will be 32-bit integers.
 
         Example:
@@ -474,7 +474,7 @@ cdef class Doc:
             doc = nlp(text)
             # All strings mapped to integers, for easy export to numpy
             np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA])
 
         Arguments:
             attr_ids (list[int]): A list of attribute ID ints.
 
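Expanding the docstring's to_array() example into a runnable form (hedged: assumes the 1.x-era API shown in the diff and an installed English model):

    import spacy
    from spacy import attrs

    nlp = spacy.load('en')
    doc = nlp(u'The quick brown fox jumped')
    # All strings mapped to integers, for easy export to numpy.
    np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA])
    assert np_array.shape == (len(doc), 4)   # (N tokens, M attributes), int32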
@@ -520,7 +520,7 @@ cdef class Doc:
         cdef int i
         cdef attr_t attr
         cdef size_t count
 
         if counts is None:
             counts = PreshCounter()
             output_dict = True
@@ -570,7 +570,7 @@ cdef class Doc:
         cdef TokenC* tokens = self.c
         cdef int length = len(array)
         cdef attr_t[:] values
         for col, attr_id in enumerate(attrs):
             values = array[:, col]
             if attr_id == HEAD:
                 for i in range(length):
@@ -612,11 +612,11 @@ cdef class Doc:
         '''Deserialize, loading from bytes.'''
         self.vocab.serializer.unpack_into(data[4:], self)
         return self
 
     @staticmethod
     def read_bytes(file_):
         '''
         A static method, used to read serialized #[code Doc] objects from
         a file. For example:
 
         Example:
@@ -673,7 +673,7 @@ cdef class Doc:
                 "Expected either 3 arguments (deprecated), or 0 (use keyword arguments). "
                 "Arguments supplied:\n%s\n"
                 "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
 
         cdef int start = token_by_start(self.c, self.length, start_idx)
         if start == -1:
             return None
@@ -784,7 +784,7 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
         if child.l_edge < head.l_edge:
             head.l_edge = child.l_edge
         head.l_kids += 1
 
     # Set right edges --- same as above, but iterate in reverse
     for i in range(length-1, -1, -1):
         child = &tokens[i]
@@ -798,4 +798,4 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
     for i in range(length):
         if tokens[i].head == 0 and tokens[i].dep != 0:
             tokens[tokens[i].l_edge].sent_start = True
 
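The last hunk above encodes the sentence-boundary rule: a token whose head offset is 0 but whose dep label is set is a sentence root, and the sentence is marked as starting at that root's left edge. A dict-based sketch of the same rule (structure assumed from the hunk; TokenC fields modelled as dict keys):

    # head is a relative offset (0 = root), dep 0 means "no label",
    # l_edge is the index of the leftmost token in the subtree.
    tokens = [
        {'head': 1, 'dep': 2, 'l_edge': 0},   # 'The'   -> head is 'dog'
        {'head': 1, 'dep': 3, 'l_edge': 0},   # 'dog'   -> head is 'barks'
        {'head': 0, 'dep': 5, 'l_edge': 0},   # 'barks' -> root spans the sentence
    ]
    sent_start = [False] * len(tokens)
    for t in tokens:
        if t['head'] == 0 and t['dep'] != 0:
            sent_start[t['l_edge']] = True
    assert sent_start == [True, False, False]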
@@ -20,7 +20,7 @@ from .. import parts_of_speech
 from ..attrs cimport LEMMA
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport POS, LEMMA, TAG, DEP
-from ..parts_of_speech cimport CONJ, PUNCT
+from ..parts_of_speech cimport CCONJ, PUNCT
 
 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from ..attrs cimport IS_BRACKET
@@ -84,7 +84,7 @@ cdef class Token:
 
     cpdef bint check_flag(self, attr_id_t flag_id) except -1:
         '''Check the value of a boolean flag.
 
         Arguments:
             flag_id (int): The ID of the flag attribute.
         Returns:
@@ -225,7 +225,7 @@ cdef class Token:
     property vector:
         '''
         A real-valued meaning representation.
 
         Type: numpy.ndarray[ndim=1, dtype='float32']
         '''
         def __get__(self):
@@ -343,7 +343,7 @@ cdef class Token:
         '''
         def __get__(self):
             cdef const TokenC* head_ptr = self.c
             # guard against infinite loop, no token can have
             # more ancestors than tokens in the tree
             cdef int i = 0
             while head_ptr.head != 0 and i < self.doc.length:
@@ -370,7 +370,7 @@ cdef class Token:
 
     property head:
         '''The syntactic parent, or "governor", of this token.
 
         Returns: Token
         '''
         def __get__(self):
@@ -390,7 +390,7 @@ cdef class Token:
 
         # is the new head a descendant of the old head
         cdef bint is_desc = old_head.is_ancestor_of(new_head)
 
         cdef int new_edge
         cdef Token anc, child
 
@@ -420,7 +420,7 @@ cdef class Token:
                 if anc.c.l_edge <= new_edge:
                     break
                 anc.c.l_edge = new_edge
 
         elif self.c.head < 0: # right dependent
             old_head.c.r_kids -= 1
             # do the same thing as for l_edge
@@ -435,7 +435,7 @@ cdef class Token:
                 if child.c.r_edge > new_edge:
                     new_edge = child.c.r_edge
             old_head.c.r_edge = new_edge
 
             for anc in old_head.ancestors:
                 if anc.c.r_edge >= new_edge:
                     break
@@ -598,19 +598,19 @@ cdef class Token:
     property is_punct:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
 
     property is_space:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
 
     property is_bracket:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
 
     property is_quote:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
 
     property is_left_punct:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
 
     property is_right_punct:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
 
     property like_url: