From 66e1109b537c5f5900e528e3372271ec80e71f27 Mon Sep 17 00:00:00 2001
From: Roman Inflianskas
Date: Sun, 26 Feb 2017 22:27:11 +0100
Subject: [PATCH] Add support for Universal Dependencies v2.0

---
 spacy/attrs.pyx                |  6 ++-
 spacy/de/tag_map.py            |  2 +-
 spacy/en/tag_map.py            |  2 +-
 spacy/language_data/tag_map.py |  1 +
 spacy/morphology.pxd           |  6 ++-
 spacy/morphology.pyx           |  4 ++
 spacy/parts_of_speech.pxd      |  1 +
 spacy/parts_of_speech.pyx      |  3 +-
 spacy/symbols.pxd              | 42 ++++++++++++++++--
 spacy/symbols.pyx              | 40 +++++++++++++++--
 spacy/syntax/nonproj.pyx       |  4 +-
 spacy/tagger.pyx               | 12 +++---
 spacy/tokens/doc.pyx           | 78 +++++++++++++++++-----------------
 spacy/tokens/token.pyx         | 28 ++++++------
 14 files changed, 155 insertions(+), 74 deletions(-)

diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index 81554ecd3..07044ee2d 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -93,7 +93,7 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
 
 def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
     '''Normalize a dictionary of attributes, converting them to ints.
-
+
     Arguments:
         stringy_attrs (dict):
            Dictionary keyed by attribute string names. Values can be ints or strings.
@@ -125,7 +125,9 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
             'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
             'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
             'Number', 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
-            'Reflex', 'Negative', 'Mood', 'Aspect', 'Case']
+            'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
+            'Polarity', # U20
+        ]
         for key in morph_keys:
             if key in stringy_attrs:
                 stringy_attrs.pop(key)
diff --git a/spacy/de/tag_map.py b/spacy/de/tag_map.py
index e5996b38c..050bc8255 100644
--- a/spacy/de/tag_map.py
+++ b/spacy/de/tag_map.py
@@ -41,7 +41,7 @@ TAG_MAP = {
     "PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"},
     "PTKA": {POS: PART},
     "PTKANT": {POS: PART, "PartType": "res"},
-    "PTKNEG": {POS: PART, "Negative": "yes"},
+    "PTKNEG": {POS: PART, "Polarity": "Neg"},
     "PTKVZ": {POS: PART, "PartType": "vbp"},
     "PTKZU": {POS: PART, "PartType": "inf"},
     "PWAT": {POS: DET, "PronType": "int"},
diff --git a/spacy/en/tag_map.py b/spacy/en/tag_map.py
index 7a3589d0e..5884d8fd4 100644
--- a/spacy/en/tag_map.py
+++ b/spacy/en/tag_map.py
@@ -16,7 +16,7 @@ TAG_MAP = {
     "$": {POS: SYM, "Other": {"SymType": "currency"}},
     "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
     "AFX": {POS: ADJ, "Hyph": "yes"},
-    "CC": {POS: CONJ, "ConjType": "coor"},
+    "CC": {POS: CCONJ, "ConjType": "coor"},
     "CD": {POS: NUM, "NumType": "card"},
     "DT": {POS: DET},
     "EX": {POS: ADV, "AdvType": "ex"},
diff --git a/spacy/language_data/tag_map.py b/spacy/language_data/tag_map.py
index 966960721..b861f39f5 100644
--- a/spacy/language_data/tag_map.py
+++ b/spacy/language_data/tag_map.py
@@ -19,6 +19,7 @@ TAG_MAP = {
     "AUX": {POS: AUX},
     "X": {POS: X},
     "CONJ": {POS: CONJ},
+    "CCONJ": {POS: CCONJ}, # U20
     "ADJ": {POS: ADJ},
     "VERB": {POS: VERB},
     "PART": {POS: PART}
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index 5dc1ce529..4d981b30d 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -37,7 +37,7 @@ cdef class Morphology:
     cdef int assign_tag(self, TokenC* token, tag) except -1
 
     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
-
+
     cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id,
                             bint value) except -1
 
@@ -80,6 +80,7 @@ cpdef enum univ_morph_t:
     Definite_two
     Definite_def
     Definite_red
+    Definite_cons # U20
     Definite_ind
     Degree_cmp
     Degree_comp
@@ -103,6 +104,8 @@ cpdef enum univ_morph_t:
     Negative_neg
     Negative_pos
     Negative_yes
+    Polarity_neg # U20
+    Polarity_pos # U20
     Number_com
     Number_dual
     Number_none
@@ -151,6 +154,7 @@ cpdef enum univ_morph_t:
     VerbForm_partPres
     VerbForm_sup
     VerbForm_trans
+    VerbForm_conv # U20
     VerbForm_gdv # la
     Voice_act
     Voice_cau
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index c13ce1920..26405e988 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -192,6 +192,7 @@ IDS = {
     "Definite_two": Definite_two,
     "Definite_def": Definite_def,
     "Definite_red": Definite_red,
+    "Definite_cons": Definite_cons, # U20
     "Definite_ind": Definite_ind,
     "Degree_cmp": Degree_cmp,
     "Degree_comp": Degree_comp,
@@ -215,6 +216,8 @@ IDS = {
     "Negative_neg": Negative_neg,
     "Negative_pos": Negative_pos,
     "Negative_yes": Negative_yes,
+    "Polarity_neg": Polarity_neg, # U20
+    "Polarity_pos": Polarity_pos, # U20
     "Number_com": Number_com,
     "Number_dual": Number_dual,
     "Number_none": Number_none,
@@ -263,6 +266,7 @@ IDS = {
     "VerbForm_partPres": VerbForm_partPres,
     "VerbForm_sup": VerbForm_sup,
     "VerbForm_trans": VerbForm_trans,
+    "VerbForm_conv": VerbForm_conv, # U20
     "VerbForm_gdv ": VerbForm_gdv, # la,
     "Voice_act": Voice_act,
     "Voice_cau": Voice_cau,
diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd
index c97673a69..0bf5b4789 100644
--- a/spacy/parts_of_speech.pxd
+++ b/spacy/parts_of_speech.pxd
@@ -7,6 +7,7 @@ cpdef enum univ_pos_t:
     ADV
     AUX
     CONJ
+    CCONJ # U20
     DET
     INTJ
     NOUN
diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx
index 006a1f006..a5c770f61 100644
--- a/spacy/parts_of_speech.pyx
+++ b/spacy/parts_of_speech.pyx
@@ -7,7 +7,8 @@ IDS = {
     "ADP": ADP,
     "ADV": ADV,
     "AUX": AUX,
-    "CONJ": CONJ,
+    "CONJ": CONJ, # U20
+    "CCONJ": CCONJ,
     "DET": DET,
     "INTJ": INTJ,
     "NOUN": NOUN,
diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd
index ca1d1ed79..1a46f509f 100644
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@@ -13,7 +13,7 @@ cpdef enum symbol_t:
     LIKE_EMAIL
     IS_STOP
     IS_OOV
-
+
     FLAG14 = 14
     FLAG15
     FLAG16
@@ -90,6 +90,7 @@ cpdef enum symbol_t:
     ADV
     AUX
     CONJ
+    CCONJ # U20
     DET
     INTJ
     NOUN
@@ -107,11 +108,14 @@ cpdef enum symbol_t:
 
     Animacy_anim
     Animacy_inam
+    Animacy_hum # U20
     Aspect_freq
     Aspect_imp
     Aspect_mod
     Aspect_none
     Aspect_perf
+    Aspect_iter # U20
+    Aspect_hab # U20
     Case_abe
     Case_abl
     Case_abs
@@ -120,10 +124,12 @@ cpdef enum symbol_t:
     Case_all
     Case_cau
     Case_com
+    Case_cmp # U20
     Case_dat
     Case_del
     Case_dis
     Case_ela
+    Case_equ # U20
     Case_ess
     Case_gen
     Case_ill
@@ -142,7 +148,9 @@ cpdef enum symbol_t:
     Definite_two
     Definite_def
     Definite_red
+    Definite_cons # U20
     Definite_ind
+    Definite_spec # U20
     Degree_cmp
     Degree_comp
     Degree_none
@@ -151,6 +159,8 @@ cpdef enum symbol_t:
     Degree_abs
     Degree_com
     Degree_dim # du
+    Degree_equ # U20
+    Evident_nfh # U20
     Gender_com
     Gender_fem
     Gender_masc
@@ -162,16 +172,21 @@ cpdef enum symbol_t:
     Mood_pot
     Mood_sub
     Mood_opt
+    Mood_prp # U20
+    Mood_adm # U20
     Negative_neg
     Negative_pos
     Negative_yes
+    Polarity_neg # U20
+    Polarity_pos # U20
     Number_com
     Number_dual
     Number_none
     Number_plur
     Number_sing
     Number_ptan # bg
-    Number_count # bg
+    Number_count # bg, U20
+    Number_tri # U20
     NumType_card
     NumType_dist
     NumType_frac
@@ -197,7 +212,8 @@ cpdef enum symbol_t:
     PronType_rel
     PronType_tot
     PronType_clit
-    PronType_exc # es, ca, it, fa
+    PronType_exc # es, ca, it, fa, U20
+    PronType_emp # U20
     Reflex_yes
     Tense_fut
     Tense_imp
@@ -213,12 +229,17 @@ cpdef enum symbol_t:
     VerbForm_partPres
     VerbForm_sup
     VerbForm_trans
+    VerbForm_conv # U20
     VerbForm_gdv # la
+    VerbForm_vnoun # U20
     Voice_act
     Voice_cau
     Voice_pass
-    Voice_mid # gkc
+    Voice_mid # gkc, U20
     Voice_int # hb
+    Voice_antip # U20
+    Voice_dir # U20
+    Voice_inv # U20
     Abbr_yes # cz, fi, sl, U
     AdpType_prep # cz, U
     AdpType_post # U
@@ -284,6 +305,10 @@ cpdef enum symbol_t:
     Number_psee_plur # U
     Number_psor_sing # cz, fi, sl, U
     Number_psor_plur # cz, fi, sl, U
+    Number_pauc # U20
+    Number_grpa # U20
+    Number_grpl # U20
+    Number_inv # U20
     NumForm_digit # cz, sl, U
     NumForm_roman # cz, sl, U
     NumForm_word # cz, sl, U
@@ -311,6 +336,8 @@ cpdef enum symbol_t:
     Person_psor_one # fi, U
     Person_psor_two # fi, U
     Person_psor_three # fi, U
+    Person_zero # U20
+    Person_four # U20
     Polite_inf # bq, U
     Polite_pol # bq, U
     Polite_abs_inf # bq, U
@@ -319,6 +346,10 @@ cpdef enum symbol_t:
     Polite_erg_pol # bq, U
     Polite_dat_inf # bq, U
     Polite_dat_pol # bq, U
+    Polite_infm # U20
+    Polite_form # U20
+    Polite_form_elev # U20
+    Polite_form_humb # U20
     Prefix_yes # U
     PrepCase_npr # cz
     PrepCase_pre # U
@@ -383,6 +414,7 @@ cpdef enum symbol_t:
     ccomp
     complm
     conj
+    cop # U20
     csubj
     csubjpass
     dep
@@ -405,6 +437,8 @@ cpdef enum symbol_t:
     num
     number
     oprd
+    obj # U20
+    obl # U20
     parataxis
     partmod
     pcomp
diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx
index 7254297d4..56b27512e 100644
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@@ -91,6 +91,7 @@ IDS = {
     "ADV": ADV,
     "AUX": AUX,
     "CONJ": CONJ,
+    "CCONJ": CCONJ, # U20
     "DET": DET,
     "INTJ": INTJ,
     "NOUN": NOUN,
@@ -108,11 +109,14 @@ IDS = {
 
     "Animacy_anim": Animacy_anim,
     "Animacy_inam": Animacy_inam,
+    "Animacy_hum": Animacy_hum, # U20
     "Aspect_freq": Aspect_freq,
     "Aspect_imp": Aspect_imp,
     "Aspect_mod": Aspect_mod,
     "Aspect_none": Aspect_none,
     "Aspect_perf": Aspect_perf,
+    "Aspect_iter": Aspect_iter, # U20
+    "Aspect_hab": Aspect_hab, # U20
     "Case_abe": Case_abe,
     "Case_abl": Case_abl,
     "Case_abs": Case_abs,
@@ -121,10 +125,12 @@ IDS = {
     "Case_all": Case_all,
     "Case_cau": Case_cau,
     "Case_com": Case_com,
+    "Case_cmp": Case_cmp, # U20
     "Case_dat": Case_dat,
     "Case_del": Case_del,
     "Case_dis": Case_dis,
     "Case_ela": Case_ela,
+    "Case_equ": Case_equ, # U20
     "Case_ess": Case_ess,
     "Case_gen": Case_gen,
     "Case_ill": Case_ill,
@@ -143,7 +149,9 @@ IDS = {
     "Definite_two": Definite_two,
     "Definite_def": Definite_def,
     "Definite_red": Definite_red,
+    "Definite_cons": Definite_cons, # U20
     "Definite_ind": Definite_ind,
+    "Definite_spec": Definite_spec, # U20
     "Degree_cmp": Degree_cmp,
     "Degree_comp": Degree_comp,
     "Degree_none": Degree_none,
@@ -152,6 +160,8 @@ IDS = {
     "Degree_abs": Degree_abs,
     "Degree_com": Degree_com,
     "Degree_dim ": Degree_dim, # du
+    "Degree_equ": Degree_equ, # U20
+    "Evident_nfh": Evident_nfh, # U20
     "Gender_com": Gender_com,
     "Gender_fem": Gender_fem,
     "Gender_masc": Gender_masc,
@@ -163,16 +173,21 @@ IDS = {
     "Mood_pot": Mood_pot,
     "Mood_sub": Mood_sub,
     "Mood_opt": Mood_opt,
+    "Mood_prp": Mood_prp, # U20
+    "Mood_adm": Mood_adm, # U20
     "Negative_neg": Negative_neg,
     "Negative_pos": Negative_pos,
     "Negative_yes": Negative_yes,
+    "Polarity_neg": Polarity_neg, # U20
+    "Polarity_pos": Polarity_pos, # U20
     "Number_com": Number_com,
     "Number_dual": Number_dual,
     "Number_none": Number_none,
     "Number_plur": Number_plur,
     "Number_sing": Number_sing,
     "Number_ptan ": Number_ptan, # bg
-    "Number_count ": Number_count, # bg
+    "Number_count ": Number_count, # bg, U20
+    "Number_tri": Number_tri, # U20
     "NumType_card": NumType_card,
     "NumType_dist": NumType_dist,
     "NumType_frac": NumType_frac,
@@ -198,7 +213,8 @@ IDS = {
     "PronType_rel": PronType_rel,
     "PronType_tot": PronType_tot,
     "PronType_clit": PronType_clit,
-    "PronType_exc ": PronType_exc, # es, ca, it, fa,
+    "PronType_exc": PronType_exc, # es, ca, it, fa, U20
+    "PronType_emp": PronType_emp, # U20
     "Reflex_yes": Reflex_yes,
     "Tense_fut": Tense_fut,
     "Tense_imp": Tense_imp,
@@ -214,12 +230,17 @@ IDS = {
     "VerbForm_partPres": VerbForm_partPres,
     "VerbForm_sup": VerbForm_sup,
     "VerbForm_trans": VerbForm_trans,
+    "VerbForm_conv": VerbForm_conv, # U20
     "VerbForm_gdv ": VerbForm_gdv, # la,
+    "VerbForm_vnoun": VerbForm_vnoun, # U20
     "Voice_act": Voice_act,
     "Voice_cau": Voice_cau,
     "Voice_pass": Voice_pass,
-    "Voice_mid ": Voice_mid, # gkc,
+    "Voice_mid ": Voice_mid, # gkc, U20
     "Voice_int ": Voice_int, # hb,
+    "Voice_antip": Voice_antip, # U20
+    "Voice_dir": Voice_dir, # U20
+    "Voice_inv": Voice_inv, # U20
     "Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
     "AdpType_prep ": AdpType_prep, # cz, U,
     "AdpType_post ": AdpType_post, # U,
@@ -285,6 +306,10 @@ IDS = {
     "Number_psee_plur ": Number_psee_plur, # U,
     "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
     "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
+    "Number_pauc": Number_pauc, # U20
+    "Number_grpa": Number_grpa, # U20
+    "Number_grpl": Number_grpl, # U20
+    "Number_inv": Number_inv, # U20
     "NumForm_digit ": NumForm_digit, # cz, sl, U,
     "NumForm_roman ": NumForm_roman, # cz, sl, U,
     "NumForm_word ": NumForm_word, # cz, sl, U,
@@ -312,6 +337,8 @@ IDS = {
     "Person_psor_one ": Person_psor_one, # fi, U,
     "Person_psor_two ": Person_psor_two, # fi, U,
     "Person_psor_three ": Person_psor_three, # fi, U,
+    "Person_zero ": Person_zero, # U20
+    "Person_four ": Person_four, # U20
     "Polite_inf ": Polite_inf, # bq, U,
     "Polite_pol ": Polite_pol, # bq, U,
     "Polite_abs_inf ": Polite_abs_inf, # bq, U,
@@ -320,6 +347,10 @@ IDS = {
     "Polite_erg_pol ": Polite_erg_pol, # bq, U,
     "Polite_dat_inf ": Polite_dat_inf, # bq, U,
     "Polite_dat_pol ": Polite_dat_pol, # bq, U,
+    "Polite_infm ": Polite_infm, # U20
+    "Polite_form ": Polite_form, # U20
+    "Polite_form_elev ": Polite_form_elev, # U20
+    "Polite_form_humb ": Polite_form_humb, # U20
     "Prefix_yes ": Prefix_yes, # U,
     "PrepCase_npr ": PrepCase_npr, # cz,
     "PrepCase_pre ": PrepCase_pre, # U,
@@ -384,6 +415,7 @@ IDS = {
     "ccomp": ccomp,
     "complm": complm,
     "conj": conj,
+    "cop": cop, # U20
     "csubj": csubj,
     "csubjpass": csubjpass,
     "dep": dep,
@@ -406,6 +438,8 @@ IDS = {
     "num": num,
     "number": number,
     "oprd": oprd,
+    "obj": obj, # U20
+    "obl": obl, # U20
     "parataxis": parataxis,
     "partmod": partmod,
     "pcomp": pcomp,
diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx
index 566588da4..1f4878247 100644
--- a/spacy/syntax/nonproj.pyx
+++ b/spacy/syntax/nonproj.pyx
@@ -8,7 +8,7 @@ from spacy.attrs import DEP, HEAD
 def ancestors(tokenid, heads):
     # returns all words going from the word up the path to the root
     # the path to root cannot be longer than the number of words in the sentence
-    # this function ends after at most len(heads) steps
+    # this function ends after at most len(heads) steps
     # because it would otherwise loop indefinitely on cycles
     head = tokenid
     cnt = 0
@@ -180,7 +180,7 @@ class PseudoProjectivity:
             next_queue = []
             for qtoken in queue:
                 for child in qtoken.children:
-                    if child.is_space: continue
+                    if child.is_space: continue
                     if child == token: continue
                     if child.dep_ == headlabel:
                         return child
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index 7903c44fb..954bced53 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -13,13 +13,13 @@ from thinc.linalg cimport VecVec
 from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
 from .attrs cimport TAG
-from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
+from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON
 from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 from .gold cimport GoldParse
 from .attrs cimport *
 
 
-
+
 cpdef enum:
     P2_orth
     P2_cluster
@@ -71,7 +71,7 @@ cpdef enum:
 
 cdef class TaggerModel(AveragedPerceptron):
     cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *:
-
+
         _fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])
         _fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])
         _fill_from_token(&eg.atoms[W_orth], &tokens[i])
@@ -191,7 +191,7 @@ cdef class Tagger:
                           nr_class=self.vocab.morphology.n_tags,
                           nr_feat=self.model.nr_feat)
         for i in range(tokens.length):
-            if tokens.c[i].pos == 0:
+            if tokens.c[i].pos == 0:
                 self.model.set_featuresC(&eg.c, tokens.c, i)
                 self.model.set_scoresC(eg.c.scores,
                     eg.c.features, eg.c.nr_feat)
@@ -217,7 +217,7 @@ cdef class Tagger:
         for doc in stream:
             self(doc)
             yield doc
-
+
     def update(self, Doc tokens, GoldParse gold):
         """Update the statistical model, with tags supplied for the given
         document.
@@ -251,7 +251,7 @@ cdef class Tagger:
 
             self.model.updateC(&eg.c)
             self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess)
-
+
             correct += eg.cost == 0
             self.freqs[TAG][tokens.c[i].tag] += 1
             eg.fill_scores(0, eg.c.nr_class)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 805a5b30c..bda528383 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -16,7 +16,7 @@ from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport attr_id_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
-from ..parts_of_speech cimport CONJ, PUNCT, NOUN
+from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
 from .span cimport Span
@@ -59,13 +59,13 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
 
 cdef class Doc:
     """
-    A sequence of `Token` objects. Access sentences and named entities,
-    export annotations to numpy arrays, losslessly serialize to compressed
+    A sequence of `Token` objects. Access sentences and named entities,
+    export annotations to numpy arrays, losslessly serialize to compressed
     binary strings.
 
     Aside: Internals
-        The `Doc` object holds an array of `TokenC` structs.
-        The Python-level `Token` and `Span` objects are views of this
+        The `Doc` object holds an array of `TokenC` structs.
+        The Python-level `Token` and `Span` objects are views of this
         array, i.e. they don't own the data themselves.
 
     Code: Construction 1
@@ -80,13 +80,13 @@ cdef class Doc:
         Create a Doc object.
 
         Aside: Implementation
-            This method of constructing a `Doc` object is usually only used
-            for deserialization. Standard usage is to construct the document via
+            This method of constructing a `Doc` object is usually only used
+            for deserialization. Standard usage is to construct the document via
             a call to the language object.
 
         Arguments:
             vocab:
-                A Vocabulary object, which must match any models you want to
+                A Vocabulary object, which must match any models you want to
                 use (e.g. tokenizer, parser, entity recognizer).
 
             words:
@@ -156,19 +156,19 @@ cdef class Doc:
         if self.length == 0:
             self.is_tagged = True
             self.is_parsed = True
-
+
     def __getitem__(self, object i):
        '''
        doc[i]
-            Get the Token object at position i, where i is an integer.
-            Negative indexing is supported, and follows the usual Python
+            Get the Token object at position i, where i is an integer.
+            Negative indexing is supported, and follows the usual Python
            semantics, i.e. doc[-2] is doc[len(doc) - 2].
 
        doc[start : end]]
            Get a `Span` object, starting at position `start` and ending
            at position `end`, where `start` and `end` are token indices. For instance,
-            `doc[2:5]` produces a span consisting of
-            tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`)
+            `doc[2:5]` produces a span consisting of
+            tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`)
            are not supported, as `Span` objects must be contiguous (cannot have gaps).
            You can use negative indices and open-ended ranges, which have their
            normal Python semantics.
@@ -188,11 +188,11 @@ cdef class Doc:
     def __iter__(self):
         '''
         for token in doc
-            Iterate over `Token` objects, from which the annotations can
-            be easily accessed. This is the main way of accessing Token
-            objects, which are the main way annotations are accessed from
-            Python. If faster-than-Python speeds are required, you can
-            instead access the annotations as a numpy array, or access the
+            Iterate over `Token` objects, from which the annotations can
+            be easily accessed. This is the main way of accessing Token
+            objects, which are the main way annotations are accessed from
+            Python. If faster-than-Python speeds are required, you can
+            instead access the annotations as a numpy array, or access the
             underlying C data directly from Cython.
         '''
         cdef int i
@@ -251,13 +251,13 @@ cdef class Doc:
         def __get__(self):
             if 'has_vector' in self.user_hooks:
                 return self.user_hooks['has_vector'](self)
-
+
             return any(token.has_vector for token in self)
 
     property vector:
         '''
         A real-valued meaning representation. Defaults to an average of the token vectors.
-
+
         Type: numpy.ndarray[ndim=1, dtype='float32']
         '''
         def __get__(self):
@@ -285,14 +285,14 @@ cdef class Doc:
                 norm += value * value
             self._vector_norm = sqrt(norm) if norm != 0 else 0
             return self._vector_norm
-
+
         def __set__(self, value):
-            self._vector_norm = value
+            self._vector_norm = value
 
     @property
     def string(self):
         return self.text
-
+
     property text:
         '''A unicode representation of the document text.'''
         def __get__(self):
@@ -306,7 +306,7 @@ cdef class Doc:
     property ents:
         '''
         Yields named-entity `Span` objects, if the entity recognizer
-        has been applied to the document. Iterate over the span to get
+        has been applied to the document. Iterate over the span to get
         individual Token objects, or access the label:
 
         Example:
@@ -352,7 +352,7 @@ cdef class Doc:
         cdef int i
         for i in range(self.length):
             self.c[i].ent_type = 0
-            # At this point we don't know whether the NER has run over the
+            # At this point we don't know whether the NER has run over the
             # Doc. If the ent_iob is missing, leave it missing.
             if self.c[i].ent_iob != 0:
                 self.c[i].ent_iob = 2 # Means O. Non-O are set from ents.
@@ -384,9 +384,9 @@ cdef class Doc:
     property noun_chunks:
         '''
         Yields base noun-phrase #[code Span] objects, if the document
-        has been syntactically parsed. A base noun phrase, or
-        'NP chunk', is a noun phrase that does not permit other NPs to
-        be nested within it – so no NP-level coordination, no prepositional
+        has been syntactically parsed. A base noun phrase, or
+        'NP chunk', is a noun phrase that does not permit other NPs to
+        be nested within it – so no NP-level coordination, no prepositional
         phrases, and no relative clauses. For example:
         '''
         def __get__(self):
@@ -422,7 +422,7 @@ cdef class Doc:
         def __get__(self):
            if 'sents' in self.user_hooks:
                 return self.user_hooks['sents'](self)
-
+
             if not self.is_parsed:
                 raise ValueError(
                     "sentence boundary detection requires the dependency parse, which "
@@ -465,8 +465,8 @@ cdef class Doc:
     @cython.boundscheck(False)
     cpdef np.ndarray to_array(self, object py_attr_ids):
         """
-        Given a list of M attribute IDs, export the tokens to a numpy
-        `ndarray` of shape (N, M), where `N` is the length
+        Given a list of M attribute IDs, export the tokens to a numpy
+        `ndarray` of shape (N, M), where `N` is the length
         of the document. The values will be 32-bit integers.
 
         Example:
@@ -473,7 +473,7 @@ cdef class Doc:
            doc = nlp(text)
            # All strings mapped to integers, for easy export to numpy
            np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA])
-
+
         Arguments:
            attr_ids (list[int]): A list of attribute ID ints.
 
@@ -520,7 +520,7 @@ cdef class Doc:
         cdef int i
         cdef attr_t attr
         cdef size_t count
-
+
         if counts is None:
             counts = PreshCounter()
             output_dict = True
@@ -570,7 +570,7 @@ cdef class Doc:
         cdef TokenC* tokens = self.c
         cdef int length = len(array)
         cdef attr_t[:] values
-        for col, attr_id in enumerate(attrs):
+        for col, attr_id in enumerate(attrs):
             values = array[:, col]
             if attr_id == HEAD:
                 for i in range(length):
@@ -612,11 +612,11 @@ cdef class Doc:
         '''Deserialize, loading from bytes.'''
         self.vocab.serializer.unpack_into(data[4:], self)
         return self
-
+
     @staticmethod
     def read_bytes(file_):
         '''
-        A static method, used to read serialized #[code Doc] objects from
+        A static method, used to read serialized #[code Doc] objects from
         a file. For example:
 
         Example:
@@ -673,7 +673,7 @@ cdef class Doc:
                 "Expected either 3 arguments (deprecated), or 0 (use keyword arguments). "
                 "Arguments supplied:\n%s\n"
                 "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
-
+
         cdef int start = token_by_start(self.c, self.length, start_idx)
         if start == -1:
             return None
@@ -784,7 +784,7 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
             if child.l_edge < head.l_edge:
                 head.l_edge = child.l_edge
             head.l_kids += 1
-
+
     # Set right edges --- same as above, but iterate in reverse
     for i in range(length-1, -1, -1):
         child = &tokens[i]
@@ -798,4 +798,4 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
     for i in range(length):
         if tokens[i].head == 0 and tokens[i].dep != 0:
             tokens[tokens[i].l_edge].sent_start = True
-
+
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index fc84ba350..69bd9fa6e 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -20,7 +20,7 @@ from .. import parts_of_speech
 from ..attrs cimport LEMMA
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport POS, LEMMA, TAG, DEP
-from ..parts_of_speech cimport CONJ, PUNCT
+from ..parts_of_speech cimport CCONJ, PUNCT
 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from ..attrs cimport IS_BRACKET
 
@@ -84,7 +84,7 @@ cdef class Token:
 
     cpdef bint check_flag(self, attr_id_t flag_id) except -1:
         '''Check the value of a boolean flag.
-
+
         Arguments:
             flag_id (int): The ID of the flag attribute.
         Returns:
@@ -225,7 +225,7 @@ cdef class Token:
     property vector:
         '''
         A real-valued meaning representation.
-
+
         Type: numpy.ndarray[ndim=1, dtype='float32']
         '''
         def __get__(self):
@@ -343,7 +343,7 @@ cdef class Token:
         '''
         def __get__(self):
             cdef const TokenC* head_ptr = self.c
-            # guard against infinite loop, no token can have
+            # guard against infinite loop, no token can have
            # more ancestors than tokens in the tree
             cdef int i = 0
             while head_ptr.head != 0 and i < self.doc.length:
@@ -370,7 +370,7 @@ cdef class Token:
 
     property head:
         '''The syntactic parent, or "governor", of this token.
-
+
         Returns: Token
         '''
         def __get__(self):
@@ -390,7 +390,7 @@ cdef class Token:
 
             # is the new head a descendant of the old head
             cdef bint is_desc = old_head.is_ancestor_of(new_head)
-
+
             cdef int new_edge
             cdef Token anc, child
 
@@ -420,7 +420,7 @@ cdef class Token:
                     if anc.c.l_edge <= new_edge:
                         break
                     anc.c.l_edge = new_edge
-
+
             elif self.c.head < 0: # right dependent
                 old_head.c.r_kids -= 1
                 # do the same thing as for l_edge
@@ -435,7 +435,7 @@ cdef class Token:
                     if child.c.r_edge > new_edge:
                         new_edge = child.c.r_edge
                 old_head.c.r_edge = new_edge
-
+
                 for anc in old_head.ancestors:
                     if anc.c.r_edge >= new_edge:
                         break
@@ -598,19 +598,19 @@ cdef class Token:
     property is_punct:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
 
-    property is_space:
+    property is_space:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
-
-    property is_bracket:
+
+    property is_bracket:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
 
-    property is_quote:
+    property is_quote:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
 
-    property is_left_punct:
+    property is_left_punct:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
 
-    property is_right_punct:
+    property is_right_punct:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
 
     property like_url:
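
Usage sketch (not part of the diff): a minimal illustration, assuming a spaCy build with this patch applied, of how the Universal Dependencies v2.0 symbols added above can be referenced; only names introduced by the patch are used.

    # Illustrative only -- assumes a spaCy build that includes this patch.
    from spacy.symbols import CCONJ, cop, obj, obl   # new U20 POS tag and dependency labels
    from spacy.parts_of_speech import IDS as POS_IDS

    # "CCONJ" gets its own entry in the POS enum, while the legacy "CONJ"
    # key is kept for backwards compatibility, so both resolve to distinct IDs.
    assert POS_IDS["CCONJ"] != POS_IDS["CONJ"]
    print(CCONJ, cop, obj, obl)

The same IDS mapping is what the tag maps above rely on when, for example, the English "CC" tag is assigned {POS: CCONJ}.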