From 66e1109b537c5f5900e528e3372271ec80e71f27 Mon Sep 17 00:00:00 2001 From: Roman Inflianskas Date: Sun, 26 Feb 2017 22:27:11 +0100 Subject: [PATCH 01/21] Add support for Universal Dependencies v2.0 --- spacy/attrs.pyx | 6 ++- spacy/de/tag_map.py | 2 +- spacy/en/tag_map.py | 2 +- spacy/language_data/tag_map.py | 1 + spacy/morphology.pxd | 6 ++- spacy/morphology.pyx | 4 ++ spacy/parts_of_speech.pxd | 1 + spacy/parts_of_speech.pyx | 3 +- spacy/symbols.pxd | 42 ++++++++++++++++-- spacy/symbols.pyx | 40 +++++++++++++++-- spacy/syntax/nonproj.pyx | 4 +- spacy/tagger.pyx | 12 +++--- spacy/tokens/doc.pyx | 78 +++++++++++++++++----------------- spacy/tokens/token.pyx | 28 ++++++------ 14 files changed, 155 insertions(+), 74 deletions(-) diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 81554ecd3..07044ee2d 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -93,7 +93,7 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): '''Normalize a dictionary of attributes, converting them to ints. - + Arguments: stringy_attrs (dict): Dictionary keyed by attribute string names. Values can be ints or strings. @@ -125,7 +125,9 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): 'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss', 'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType', 'Number', 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType', - 'Reflex', 'Negative', 'Mood', 'Aspect', 'Case'] + 'Reflex', 'Negative', 'Mood', 'Aspect', 'Case', + 'Polarity', # U20 + ] for key in morph_keys: if key in stringy_attrs: stringy_attrs.pop(key) diff --git a/spacy/de/tag_map.py b/spacy/de/tag_map.py index e5996b38c..050bc8255 100644 --- a/spacy/de/tag_map.py +++ b/spacy/de/tag_map.py @@ -41,7 +41,7 @@ TAG_MAP = { "PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"}, "PTKA": {POS: PART}, "PTKANT": {POS: PART, "PartType": "res"}, - "PTKNEG": {POS: PART, "Negative": "yes"}, + "PTKNEG": {POS: PART, "Polarity": "Neg"}, "PTKVZ": {POS: PART, "PartType": "vbp"}, "PTKZU": {POS: PART, "PartType": "inf"}, "PWAT": {POS: DET, "PronType": "int"}, diff --git a/spacy/en/tag_map.py b/spacy/en/tag_map.py index 7a3589d0e..5884d8fd4 100644 --- a/spacy/en/tag_map.py +++ b/spacy/en/tag_map.py @@ -16,7 +16,7 @@ TAG_MAP = { "$": {POS: SYM, "Other": {"SymType": "currency"}}, "#": {POS: SYM, "Other": {"SymType": "numbersign"}}, "AFX": {POS: ADJ, "Hyph": "yes"}, - "CC": {POS: CONJ, "ConjType": "coor"}, + "CC": {POS: CCONJ, "ConjType": "coor"}, "CD": {POS: NUM, "NumType": "card"}, "DT": {POS: DET}, "EX": {POS: ADV, "AdvType": "ex"}, diff --git a/spacy/language_data/tag_map.py b/spacy/language_data/tag_map.py index 966960721..b861f39f5 100644 --- a/spacy/language_data/tag_map.py +++ b/spacy/language_data/tag_map.py @@ -19,6 +19,7 @@ TAG_MAP = { "AUX": {POS: AUX}, "X": {POS: X}, "CONJ": {POS: CONJ}, + "CCONJ": {POS: CCONJ}, # U20 "ADJ": {POS: ADJ}, "VERB": {POS: VERB}, "PART": {POS: PART} diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 5dc1ce529..4d981b30d 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -37,7 +37,7 @@ cdef class Morphology: cdef int assign_tag(self, TokenC* token, tag) except -1 cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 - + cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1 @@ -80,6 +80,7 @@ cpdef enum univ_morph_t: Definite_two Definite_def Definite_red + Definite_cons # U20 
Definite_ind Degree_cmp Degree_comp @@ -103,6 +104,8 @@ cpdef enum univ_morph_t: Negative_neg Negative_pos Negative_yes + Polarity_neg # U20 + Polarity_pos # U20 Number_com Number_dual Number_none @@ -151,6 +154,7 @@ cpdef enum univ_morph_t: VerbForm_partPres VerbForm_sup VerbForm_trans + VerbForm_conv # U20 VerbForm_gdv # la Voice_act Voice_cau diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index c13ce1920..26405e988 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -192,6 +192,7 @@ IDS = { "Definite_two": Definite_two, "Definite_def": Definite_def, "Definite_red": Definite_red, + "Definite_cons": Definite_cons, # U20 "Definite_ind": Definite_ind, "Degree_cmp": Degree_cmp, "Degree_comp": Degree_comp, @@ -215,6 +216,8 @@ IDS = { "Negative_neg": Negative_neg, "Negative_pos": Negative_pos, "Negative_yes": Negative_yes, + "Polarity_neg": Polarity_neg, # U20 + "Polarity_pos": Polarity_pos, # U20 "Number_com": Number_com, "Number_dual": Number_dual, "Number_none": Number_none, @@ -263,6 +266,7 @@ IDS = { "VerbForm_partPres": VerbForm_partPres, "VerbForm_sup": VerbForm_sup, "VerbForm_trans": VerbForm_trans, + "VerbForm_conv": VerbForm_conv, # U20 "VerbForm_gdv ": VerbForm_gdv, # la, "Voice_act": Voice_act, "Voice_cau": Voice_cau, diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd index c97673a69..0bf5b4789 100644 --- a/spacy/parts_of_speech.pxd +++ b/spacy/parts_of_speech.pxd @@ -7,6 +7,7 @@ cpdef enum univ_pos_t: ADV AUX CONJ + CCONJ # U20 DET INTJ NOUN diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index 006a1f006..a5c770f61 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -7,7 +7,8 @@ IDS = { "ADP": ADP, "ADV": ADV, "AUX": AUX, - "CONJ": CONJ, + "CONJ": CONJ, # U20 + "CCONJ": CCONJ, "DET": DET, "INTJ": INTJ, "NOUN": NOUN, diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index ca1d1ed79..1a46f509f 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -13,7 +13,7 @@ cpdef enum symbol_t: LIKE_EMAIL IS_STOP IS_OOV - + FLAG14 = 14 FLAG15 FLAG16 @@ -90,6 +90,7 @@ cpdef enum symbol_t: ADV AUX CONJ + CCONJ # U20 DET INTJ NOUN @@ -107,11 +108,14 @@ cpdef enum symbol_t: Animacy_anim Animacy_inam + Animacy_hum # U20 Aspect_freq Aspect_imp Aspect_mod Aspect_none Aspect_perf + Aspect_iter # U20 + Aspect_hab # U20 Case_abe Case_abl Case_abs @@ -120,10 +124,12 @@ cpdef enum symbol_t: Case_all Case_cau Case_com + Case_cmp # U20 Case_dat Case_del Case_dis Case_ela + Case_equ # U20 Case_ess Case_gen Case_ill @@ -142,7 +148,9 @@ cpdef enum symbol_t: Definite_two Definite_def Definite_red + Definite_cons # U20 Definite_ind + Definite_spec # U20 Degree_cmp Degree_comp Degree_none @@ -151,6 +159,8 @@ cpdef enum symbol_t: Degree_abs Degree_com Degree_dim # du + Degree_equ # U20 + Evident_nfh # U20 Gender_com Gender_fem Gender_masc @@ -162,16 +172,21 @@ cpdef enum symbol_t: Mood_pot Mood_sub Mood_opt + Mood_prp # U20 + Mood_adm # U20 Negative_neg Negative_pos Negative_yes + Polarity_neg # U20 + Polarity_pos # U20 Number_com Number_dual Number_none Number_plur Number_sing Number_ptan # bg - Number_count # bg + Number_count # bg, U20 + Number_tri # U20 NumType_card NumType_dist NumType_frac @@ -197,7 +212,8 @@ cpdef enum symbol_t: PronType_rel PronType_tot PronType_clit - PronType_exc # es, ca, it, fa + PronType_exc # es, ca, it, fa, U20 + PronType_emp # U20 Reflex_yes Tense_fut Tense_imp @@ -213,12 +229,17 @@ cpdef enum symbol_t: VerbForm_partPres VerbForm_sup VerbForm_trans + VerbForm_conv # U20 VerbForm_gdv # la + 
VerbForm_vnoun # U20 Voice_act Voice_cau Voice_pass - Voice_mid # gkc + Voice_mid # gkc, U20 Voice_int # hb + Voice_antip # U20 + Voice_dir # U20 + Voice_inv # U20 Abbr_yes # cz, fi, sl, U AdpType_prep # cz, U AdpType_post # U @@ -284,6 +305,10 @@ cpdef enum symbol_t: Number_psee_plur # U Number_psor_sing # cz, fi, sl, U Number_psor_plur # cz, fi, sl, U + Number_pauc # U20 + Number_grpa # U20 + Number_grpl # U20 + Number_inv # U20 NumForm_digit # cz, sl, U NumForm_roman # cz, sl, U NumForm_word # cz, sl, U @@ -311,6 +336,8 @@ cpdef enum symbol_t: Person_psor_one # fi, U Person_psor_two # fi, U Person_psor_three # fi, U + Person_zero # U20 + Person_four # U20 Polite_inf # bq, U Polite_pol # bq, U Polite_abs_inf # bq, U @@ -319,6 +346,10 @@ cpdef enum symbol_t: Polite_erg_pol # bq, U Polite_dat_inf # bq, U Polite_dat_pol # bq, U + Polite_infm # U20 + Polite_form # U20 + Polite_form_elev # U20 + Polite_form_humb # U20 Prefix_yes # U PrepCase_npr # cz PrepCase_pre # U @@ -383,6 +414,7 @@ cpdef enum symbol_t: ccomp complm conj + cop # U20 csubj csubjpass dep @@ -405,6 +437,8 @@ cpdef enum symbol_t: num number oprd + obj # U20 + obl # U20 parataxis partmod pcomp diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 7254297d4..56b27512e 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -91,6 +91,7 @@ IDS = { "ADV": ADV, "AUX": AUX, "CONJ": CONJ, + "CCONJ": CCONJ, # U20 "DET": DET, "INTJ": INTJ, "NOUN": NOUN, @@ -108,11 +109,14 @@ IDS = { "Animacy_anim": Animacy_anim, "Animacy_inam": Animacy_inam, + "Animacy_hum": Animacy_hum, # U20 "Aspect_freq": Aspect_freq, "Aspect_imp": Aspect_imp, "Aspect_mod": Aspect_mod, "Aspect_none": Aspect_none, "Aspect_perf": Aspect_perf, + "Aspect_iter": Aspect_iter, # U20 + "Aspect_hab": Aspect_hab, # U20 "Case_abe": Case_abe, "Case_abl": Case_abl, "Case_abs": Case_abs, @@ -121,10 +125,12 @@ IDS = { "Case_all": Case_all, "Case_cau": Case_cau, "Case_com": Case_com, + "Case_cmp": Case_cmp, # U20 "Case_dat": Case_dat, "Case_del": Case_del, "Case_dis": Case_dis, "Case_ela": Case_ela, + "Case_equ": Case_equ, # U20 "Case_ess": Case_ess, "Case_gen": Case_gen, "Case_ill": Case_ill, @@ -143,7 +149,9 @@ IDS = { "Definite_two": Definite_two, "Definite_def": Definite_def, "Definite_red": Definite_red, + "Definite_cons": Definite_cons, # U20 "Definite_ind": Definite_ind, + "Definite_spec": Definite_spec, # U20 "Degree_cmp": Degree_cmp, "Degree_comp": Degree_comp, "Degree_none": Degree_none, @@ -152,6 +160,8 @@ IDS = { "Degree_abs": Degree_abs, "Degree_com": Degree_com, "Degree_dim ": Degree_dim, # du + "Degree_equ": Degree_equ, # U20 + "Evident_nfh": Evident_nfh, # U20 "Gender_com": Gender_com, "Gender_fem": Gender_fem, "Gender_masc": Gender_masc, @@ -163,16 +173,21 @@ IDS = { "Mood_pot": Mood_pot, "Mood_sub": Mood_sub, "Mood_opt": Mood_opt, + "Mood_prp": Mood_prp, # U20 + "Mood_adm": Mood_adm, # U20 "Negative_neg": Negative_neg, "Negative_pos": Negative_pos, "Negative_yes": Negative_yes, + "Polarity_neg": Polarity_neg, # U20 + "Polarity_pos": Polarity_pos, # U20 "Number_com": Number_com, "Number_dual": Number_dual, "Number_none": Number_none, "Number_plur": Number_plur, "Number_sing": Number_sing, "Number_ptan ": Number_ptan, # bg - "Number_count ": Number_count, # bg + "Number_count ": Number_count, # bg, U20 + "Number_tri": Number_tri, # U20 "NumType_card": NumType_card, "NumType_dist": NumType_dist, "NumType_frac": NumType_frac, @@ -198,7 +213,8 @@ IDS = { "PronType_rel": PronType_rel, "PronType_tot": PronType_tot, "PronType_clit": PronType_clit, - "PronType_exc ": 
PronType_exc, # es, ca, it, fa, + "PronType_exc": PronType_exc, # es, ca, it, fa, U20 + "PronType_emp": PronType_emp, # U20 "Reflex_yes": Reflex_yes, "Tense_fut": Tense_fut, "Tense_imp": Tense_imp, @@ -214,12 +230,17 @@ IDS = { "VerbForm_partPres": VerbForm_partPres, "VerbForm_sup": VerbForm_sup, "VerbForm_trans": VerbForm_trans, + "VerbForm_conv": VerbForm_conv, # U20 "VerbForm_gdv ": VerbForm_gdv, # la, + "VerbForm_vnoun": VerbForm_vnoun, # U20 "Voice_act": Voice_act, "Voice_cau": Voice_cau, "Voice_pass": Voice_pass, - "Voice_mid ": Voice_mid, # gkc, + "Voice_mid ": Voice_mid, # gkc, U20 "Voice_int ": Voice_int, # hb, + "Voice_antip": Voice_antip, # U20 + "Voice_dir": Voice_dir, # U20 + "Voice_inv": Voice_inv, # U20 "Abbr_yes ": Abbr_yes, # cz, fi, sl, U, "AdpType_prep ": AdpType_prep, # cz, U, "AdpType_post ": AdpType_post, # U, @@ -285,6 +306,10 @@ IDS = { "Number_psee_plur ": Number_psee_plur, # U, "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U, "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U, + "Number_pauc": Number_pauc, # U20 + "Number_grpa": Number_grpa, # U20 + "Number_grpl": Number_grpl, # U20 + "Number_inv": Number_inv, # U20 "NumForm_digit ": NumForm_digit, # cz, sl, U, "NumForm_roman ": NumForm_roman, # cz, sl, U, "NumForm_word ": NumForm_word, # cz, sl, U, @@ -312,6 +337,8 @@ IDS = { "Person_psor_one ": Person_psor_one, # fi, U, "Person_psor_two ": Person_psor_two, # fi, U, "Person_psor_three ": Person_psor_three, # fi, U, + "Person_zero ": Person_zero, # U20 + "Person_four ": Person_four, # U20 "Polite_inf ": Polite_inf, # bq, U, "Polite_pol ": Polite_pol, # bq, U, "Polite_abs_inf ": Polite_abs_inf, # bq, U, @@ -320,6 +347,10 @@ IDS = { "Polite_erg_pol ": Polite_erg_pol, # bq, U, "Polite_dat_inf ": Polite_dat_inf, # bq, U, "Polite_dat_pol ": Polite_dat_pol, # bq, U, + "Polite_infm ": Polite_infm, # U20 + "Polite_form ": Polite_form, # U20 + "Polite_form_elev ": Polite_form_elev, # U20 + "Polite_form_humb ": Polite_form_humb, # U20 "Prefix_yes ": Prefix_yes, # U, "PrepCase_npr ": PrepCase_npr, # cz, "PrepCase_pre ": PrepCase_pre, # U, @@ -384,6 +415,7 @@ IDS = { "ccomp": ccomp, "complm": complm, "conj": conj, + "cop": cop, # U20 "csubj": csubj, "csubjpass": csubjpass, "dep": dep, @@ -406,6 +438,8 @@ IDS = { "num": num, "number": number, "oprd": oprd, + "obj": obj, # U20 + "obl": obl, # U20 "parataxis": parataxis, "partmod": partmod, "pcomp": pcomp, diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index 566588da4..1f4878247 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -8,7 +8,7 @@ from spacy.attrs import DEP, HEAD def ancestors(tokenid, heads): # returns all words going from the word up the path to the root # the path to root cannot be longer than the number of words in the sentence - # this function ends after at most len(heads) steps + # this function ends after at most len(heads) steps # because it would otherwise loop indefinitely on cycles head = tokenid cnt = 0 @@ -180,7 +180,7 @@ class PseudoProjectivity: next_queue = [] for qtoken in queue: for child in qtoken.children: - if child.is_space: continue + if child.is_space: continue if child == token: continue if child.dep_ == headlabel: return child diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 7903c44fb..954bced53 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -13,13 +13,13 @@ from thinc.linalg cimport VecVec from .typedefs cimport attr_t from .tokens.doc cimport Doc from .attrs cimport TAG -from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, 
CONJ, DET, NOUN, NUM, PRON +from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE from .gold cimport GoldParse from .attrs cimport * - + cpdef enum: P2_orth P2_cluster @@ -71,7 +71,7 @@ cpdef enum: cdef class TaggerModel(AveragedPerceptron): cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *: - + _fill_from_token(&eg.atoms[P2_orth], &tokens[i-2]) _fill_from_token(&eg.atoms[P1_orth], &tokens[i-1]) _fill_from_token(&eg.atoms[W_orth], &tokens[i]) @@ -191,7 +191,7 @@ cdef class Tagger: nr_class=self.vocab.morphology.n_tags, nr_feat=self.model.nr_feat) for i in range(tokens.length): - if tokens.c[i].pos == 0: + if tokens.c[i].pos == 0: self.model.set_featuresC(&eg.c, tokens.c, i) self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat) @@ -217,7 +217,7 @@ cdef class Tagger: for doc in stream: self(doc) yield doc - + def update(self, Doc tokens, GoldParse gold): """Update the statistical model, with tags supplied for the given document. @@ -251,7 +251,7 @@ cdef class Tagger: self.model.updateC(&eg.c) self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess) - + correct += eg.cost == 0 self.freqs[TAG][tokens.c[i].tag] += 1 eg.fill_scores(0, eg.c.nr_class) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 805a5b30c..bda528383 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -16,7 +16,7 @@ from ..typedefs cimport attr_t, flags_t from ..attrs cimport attr_id_t from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE -from ..parts_of_speech cimport CONJ, PUNCT, NOUN +from ..parts_of_speech cimport CCONJ, PUNCT, NOUN from ..parts_of_speech cimport univ_pos_t from ..lexeme cimport Lexeme from .span cimport Span @@ -59,13 +59,13 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: cdef class Doc: """ - A sequence of `Token` objects. Access sentences and named entities, - export annotations to numpy arrays, losslessly serialize to compressed + A sequence of `Token` objects. Access sentences and named entities, + export annotations to numpy arrays, losslessly serialize to compressed binary strings. Aside: Internals - The `Doc` object holds an array of `TokenC` structs. - The Python-level `Token` and `Span` objects are views of this + The `Doc` object holds an array of `TokenC` structs. + The Python-level `Token` and `Span` objects are views of this array, i.e. they don't own the data themselves. Code: Construction 1 @@ -80,13 +80,13 @@ cdef class Doc: Create a Doc object. Aside: Implementation - This method of constructing a `Doc` object is usually only used - for deserialization. Standard usage is to construct the document via + This method of constructing a `Doc` object is usually only used + for deserialization. Standard usage is to construct the document via a call to the language object. Arguments: vocab: - A Vocabulary object, which must match any models you want to + A Vocabulary object, which must match any models you want to use (e.g. tokenizer, parser, entity recognizer). words: @@ -156,19 +156,19 @@ cdef class Doc: if self.length == 0: self.is_tagged = True self.is_parsed = True - + def __getitem__(self, object i): ''' doc[i] - Get the Token object at position i, where i is an integer. - Negative indexing is supported, and follows the usual Python + Get the Token object at position i, where i is an integer. 
+ Negative indexing is supported, and follows the usual Python semantics, i.e. doc[-2] is doc[len(doc) - 2]. doc[start : end]] Get a `Span` object, starting at position `start` and ending at position `end`, where `start` and `end` are token indices. For instance, - `doc[2:5]` produces a span consisting of - tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`) + `doc[2:5]` produces a span consisting of + tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`) are not supported, as `Span` objects must be contiguous (cannot have gaps). You can use negative indices and open-ended ranges, which have their normal Python semantics. @@ -188,11 +188,11 @@ cdef class Doc: def __iter__(self): ''' for token in doc - Iterate over `Token` objects, from which the annotations can - be easily accessed. This is the main way of accessing Token - objects, which are the main way annotations are accessed from - Python. If faster-than-Python speeds are required, you can - instead access the annotations as a numpy array, or access the + Iterate over `Token` objects, from which the annotations can + be easily accessed. This is the main way of accessing Token + objects, which are the main way annotations are accessed from + Python. If faster-than-Python speeds are required, you can + instead access the annotations as a numpy array, or access the underlying C data directly from Cython. ''' cdef int i @@ -251,13 +251,13 @@ cdef class Doc: def __get__(self): if 'has_vector' in self.user_hooks: return self.user_hooks['has_vector'](self) - + return any(token.has_vector for token in self) property vector: ''' A real-valued meaning representation. Defaults to an average of the token vectors. - + Type: numpy.ndarray[ndim=1, dtype='float32'] ''' def __get__(self): @@ -285,14 +285,14 @@ cdef class Doc: norm += value * value self._vector_norm = sqrt(norm) if norm != 0 else 0 return self._vector_norm - + def __set__(self, value): - self._vector_norm = value + self._vector_norm = value @property def string(self): return self.text - + property text: '''A unicode representation of the document text.''' def __get__(self): @@ -306,7 +306,7 @@ cdef class Doc: property ents: ''' Yields named-entity `Span` objects, if the entity recognizer - has been applied to the document. Iterate over the span to get + has been applied to the document. Iterate over the span to get individual Token objects, or access the label: Example: @@ -352,7 +352,7 @@ cdef class Doc: cdef int i for i in range(self.length): self.c[i].ent_type = 0 - # At this point we don't know whether the NER has run over the + # At this point we don't know whether the NER has run over the # Doc. If the ent_iob is missing, leave it missing. if self.c[i].ent_iob != 0: self.c[i].ent_iob = 2 # Means O. Non-O are set from ents. @@ -384,9 +384,9 @@ cdef class Doc: property noun_chunks: ''' Yields base noun-phrase #[code Span] objects, if the document - has been syntactically parsed. A base noun phrase, or - 'NP chunk', is a noun phrase that does not permit other NPs to - be nested within it – so no NP-level coordination, no prepositional + has been syntactically parsed. A base noun phrase, or + 'NP chunk', is a noun phrase that does not permit other NPs to + be nested within it – so no NP-level coordination, no prepositional phrases, and no relative clauses. 
For example: ''' def __get__(self): @@ -422,7 +422,7 @@ cdef class Doc: def __get__(self): if 'sents' in self.user_hooks: return self.user_hooks['sents'](self) - + if not self.is_parsed: raise ValueError( "sentence boundary detection requires the dependency parse, which " @@ -465,8 +465,8 @@ cdef class Doc: @cython.boundscheck(False) cpdef np.ndarray to_array(self, object py_attr_ids): """ - Given a list of M attribute IDs, export the tokens to a numpy - `ndarray` of shape (N, M), where `N` is the length + Given a list of M attribute IDs, export the tokens to a numpy + `ndarray` of shape (N, M), where `N` is the length of the document. The values will be 32-bit integers. Example: @@ -474,7 +474,7 @@ cdef class Doc: doc = nlp(text) # All strings mapped to integers, for easy export to numpy np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA]) - + Arguments: attr_ids (list[int]): A list of attribute ID ints. @@ -520,7 +520,7 @@ cdef class Doc: cdef int i cdef attr_t attr cdef size_t count - + if counts is None: counts = PreshCounter() output_dict = True @@ -570,7 +570,7 @@ cdef class Doc: cdef TokenC* tokens = self.c cdef int length = len(array) cdef attr_t[:] values - for col, attr_id in enumerate(attrs): + for col, attr_id in enumerate(attrs): values = array[:, col] if attr_id == HEAD: for i in range(length): @@ -612,11 +612,11 @@ cdef class Doc: '''Deserialize, loading from bytes.''' self.vocab.serializer.unpack_into(data[4:], self) return self - + @staticmethod def read_bytes(file_): ''' - A static method, used to read serialized #[code Doc] objects from + A static method, used to read serialized #[code Doc] objects from a file. For example: Example: @@ -673,7 +673,7 @@ cdef class Doc: "Expected either 3 arguments (deprecated), or 0 (use keyword arguments). " "Arguments supplied:\n%s\n" "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes))) - + cdef int start = token_by_start(self.c, self.length, start_idx) if start == -1: return None @@ -784,7 +784,7 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: if child.l_edge < head.l_edge: head.l_edge = child.l_edge head.l_kids += 1 - + # Set right edges --- same as above, but iterate in reverse for i in range(length-1, -1, -1): child = &tokens[i] @@ -798,4 +798,4 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: for i in range(length): if tokens[i].head == 0 and tokens[i].dep != 0: tokens[tokens[i].l_edge].sent_start = True - + diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index fc84ba350..69bd9fa6e 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -20,7 +20,7 @@ from .. import parts_of_speech from ..attrs cimport LEMMA from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport POS, LEMMA, TAG, DEP -from ..parts_of_speech cimport CONJ, PUNCT +from ..parts_of_speech cimport CCONJ, PUNCT from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_BRACKET @@ -84,7 +84,7 @@ cdef class Token: cpdef bint check_flag(self, attr_id_t flag_id) except -1: '''Check the value of a boolean flag. - + Arguments: flag_id (int): The ID of the flag attribute. Returns: @@ -225,7 +225,7 @@ cdef class Token: property vector: ''' A real-valued meaning representation. 
- + Type: numpy.ndarray[ndim=1, dtype='float32'] ''' def __get__(self): @@ -343,7 +343,7 @@ cdef class Token: ''' def __get__(self): cdef const TokenC* head_ptr = self.c - # guard against infinite loop, no token can have + # guard against infinite loop, no token can have # more ancestors than tokens in the tree cdef int i = 0 while head_ptr.head != 0 and i < self.doc.length: @@ -370,7 +370,7 @@ cdef class Token: property head: '''The syntactic parent, or "governor", of this token. - + Returns: Token ''' def __get__(self): @@ -390,7 +390,7 @@ cdef class Token: # is the new head a descendant of the old head cdef bint is_desc = old_head.is_ancestor_of(new_head) - + cdef int new_edge cdef Token anc, child @@ -420,7 +420,7 @@ cdef class Token: if anc.c.l_edge <= new_edge: break anc.c.l_edge = new_edge - + elif self.c.head < 0: # right dependent old_head.c.r_kids -= 1 # do the same thing as for l_edge @@ -435,7 +435,7 @@ cdef class Token: if child.c.r_edge > new_edge: new_edge = child.c.r_edge old_head.c.r_edge = new_edge - + for anc in old_head.ancestors: if anc.c.r_edge >= new_edge: break @@ -598,19 +598,19 @@ cdef class Token: property is_punct: def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT) - property is_space: + property is_space: def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE) - - property is_bracket: + + property is_bracket: def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET) - property is_quote: + property is_quote: def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE) - property is_left_punct: + property is_left_punct: def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT) - property is_right_punct: + property is_right_punct: def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT) property like_url: From 6d67213b80350fe63e46ea2a18688f4a5a3f0d81 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 7 Mar 2017 15:55:28 +0100 Subject: [PATCH 02/21] Add test for 850: Matcher fails on zero-or-more. --- spacy/tests/regression/test_issue850.py | 29 +++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 spacy/tests/regression/test_issue850.py diff --git a/spacy/tests/regression/test_issue850.py b/spacy/tests/regression/test_issue850.py new file mode 100644 index 000000000..4113ec512 --- /dev/null +++ b/spacy/tests/regression/test_issue850.py @@ -0,0 +1,29 @@ +''' +Test Matcher matches with '*' operator and Boolean flag +''' +from __future__ import unicode_literals +import pytest + +from ...matcher import Matcher +from ...vocab import Vocab +from ...attrs import LOWER +from ...tokens import Doc + + +@pytest.mark.xfail +def test_issue850(): + matcher = Matcher(Vocab()) + IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True) + matcher.add_pattern( + "FarAway", + [ + {LOWER: "bob"}, + {'OP': '*', IS_ANY_TOKEN: True}, + {LOWER: 'frank'} + ]) + doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'cat', 'frank']) + match = matcher(doc) + assert len(match) == 1 + start, end, label, ent_id = match + assert start == 0 + assert end == 4 From 4e75e742475236cf7358b4481a29a54eb607dd4d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 7 Mar 2017 16:08:32 +0100 Subject: [PATCH 03/21] Update regression test for variable-length pattern problem in the matcher. 
--- spacy/tests/regression/test_issue850.py | 28 ++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/spacy/tests/regression/test_issue850.py b/spacy/tests/regression/test_issue850.py index 4113ec512..3b3952744 100644 --- a/spacy/tests/regression/test_issue850.py +++ b/spacy/tests/regression/test_issue850.py @@ -2,6 +2,7 @@ Test Matcher matches with '*' operator and Boolean flag ''' from __future__ import unicode_literals +from __future__ import print_function import pytest from ...matcher import Matcher @@ -10,9 +11,30 @@ from ...attrs import LOWER from ...tokens import Doc +def test_basic_case(): + matcher = Matcher(Vocab( + lex_attr_getters={LOWER: lambda string: string.lower()})) + IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True) + matcher.add_pattern( + "FarAway", + [ + {LOWER: "bob"}, + {'OP': '*', LOWER: 'and'}, + {LOWER: 'frank'} + ]) + doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank']) + match = matcher(doc) + assert len(match) == 1 + ent_id, label, start, end = match[0] + assert start == 0 + assert end == 4 + @pytest.mark.xfail def test_issue850(): - matcher = Matcher(Vocab()) + '''The problem here is that the variable-length pattern matches the + succeeding token. We then don't handle the ambiguity correctly.''' + matcher = Matcher(Vocab( + lex_attr_getters={LOWER: lambda string: string.lower()})) IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True) matcher.add_pattern( "FarAway", @@ -21,9 +43,9 @@ def test_issue850(): {'OP': '*', IS_ANY_TOKEN: True}, {LOWER: 'frank'} ]) - doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'cat', 'frank']) + doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank']) match = matcher(doc) assert len(match) == 1 - start, end, label, ent_id = match + ent_id, label, start, end = match[0] assert start == 0 assert end == 4 From 5de7e712b758829afbd0d9d000ec9139c474f737 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 7 Mar 2017 17:15:18 +0100 Subject: [PATCH 04/21] Add support for pickling StringStore. 
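The round trip this enables looks roughly like the following (a sketch mirroring
the new test in spacy/tests/test_pickles.py; not part of the diff below):

    from __future__ import unicode_literals
    import pickle
    from spacy.strings import StringStore

    sstore = StringStore()
    hello_id = sstore['hello']          # interning returns an integer ID
    data = pickle.dumps(sstore, protocol=-1)
    unpickled = pickle.loads(data)      # rebuilt via StringStore.__reduce__
    assert unpickled['hello'] == hello_id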
--- spacy/strings.pyx | 37 +++++++++++++++++++++---------------- spacy/tests/test_pickles.py | 17 +++++++++++++++++ 2 files changed, 38 insertions(+), 16 deletions(-) create mode 100644 spacy/tests/test_pickles.py diff --git a/spacy/strings.pyx b/spacy/strings.pyx index ddfddc29c..403ebd3c0 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -3,7 +3,7 @@ from __future__ import unicode_literals, absolute_import cimport cython from libc.string cimport memcpy -from libc.stdint cimport uint64_t +from libc.stdint cimport uint64_t, uint32_t from murmurhash.mrmr cimport hash64, hash32 @@ -12,22 +12,19 @@ from preshed.maps cimport map_iter, key_t from .typedefs cimport hash_t from libc.stdint cimport uint32_t -try: - import ujson as json -except ImportError: - import json +import ujson cpdef hash_t hash_string(unicode string) except 0: chars = string.encode('utf8') - return _hash_utf8(chars, len(chars)) + return hash_utf8(chars, len(chars)) -cdef hash_t _hash_utf8(char* utf8_string, int length): +cdef hash_t hash_utf8(char* utf8_string, int length) nogil: return hash64(utf8_string, length, 1) -cdef uint32_t _hash32_utf8(char* utf8_string, int length): +cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil: return hash32(utf8_string, length, 1) @@ -48,11 +45,11 @@ cdef unicode _decode(const Utf8Str* string): return string.p[i:length + i].decode('utf8') -cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except *: +cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *: cdef int n_length_bytes cdef int i cdef Utf8Str string - assert length != 0 + cdef uint32_t ulength = length if length < sizeof(string.s): string.s[0] = length memcpy(&string.s[1], chars, length) @@ -98,6 +95,14 @@ cdef class StringStore: def __get__(self): return self.size -1 + def __reduce__(self): + # TODO: OOV words, for the is_frozen stuff? + if self.is_frozen: + raise NotImplementedError( + "Currently missing support for pickling StringStore when " + "is_frozen=True") + return (StringStore, (list(self),)) + def __len__(self): """The number of strings in the store. @@ -149,7 +154,7 @@ cdef class StringStore: # pretty bad. # We could also get unlucky here, and hash into a value that # collides with the 'real' strings. - return _hash32_utf8(byte_string, len(byte_string)) + return hash32_utf8(byte_string, len(byte_string)) else: return utf8str - self.c @@ -200,7 +205,7 @@ cdef class StringStore: cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length): # TODO: This function's API/behaviour is an unholy mess... # 0 means missing, but we don't bother offsetting the index. - cdef hash_t key = _hash_utf8(utf8_string, length) + cdef hash_t key = hash_utf8(utf8_string, length) cdef Utf8Str* value = self._map.get(key) if value is not NULL: return value @@ -209,7 +214,7 @@ cdef class StringStore: return value if self.is_frozen: # OOV store uses 32 bit hashes. Pretty ugly :( - key32 = _hash32_utf8(utf8_string, length) + key32 = hash32_utf8(utf8_string, length) # Important: Make the OOV store own the memory. That way it's trivial # to flush them all. value = self._oov.mem.alloc(1, sizeof(Utf8Str)) @@ -232,7 +237,7 @@ cdef class StringStore: Returns: None """ - string_data = json.dumps(list(self)) + string_data = ujson.dumps(list(self)) if not isinstance(string_data, unicode): string_data = string_data.decode('utf8') # TODO: OOV? 
@@ -246,7 +251,7 @@ cdef class StringStore: Returns: None """ - strings = json.load(file_) + strings = ujson.load(file_) if strings == ['']: return None cdef unicode string @@ -271,7 +276,7 @@ cdef class StringStore: # Find array index with pointer arithmetic offset = ((value) - self.c) keys[offset] = key - + self._resize_at *= 2 cdef size_t new_size = self._resize_at * sizeof(Utf8Str) self.c = self.mem.realloc(self.c, new_size) diff --git a/spacy/tests/test_pickles.py b/spacy/tests/test_pickles.py new file mode 100644 index 000000000..46221fd8b --- /dev/null +++ b/spacy/tests/test_pickles.py @@ -0,0 +1,17 @@ +from __future__ import unicode_literals + +import io +import pickle + +from ..strings import StringStore + + +def test_pickle_string_store(): + sstore = StringStore() + hello = sstore['hello'] + bye = sstore['bye'] + bdata = pickle.dumps(sstore, protocol=-1) + unpickled = pickle.loads(bdata) + assert unpickled['hello'] == hello + assert unpickled['bye'] == bye + From 3edb8ae207a44fe9bf40f55d5e211c22cdf085f8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 7 Mar 2017 17:16:26 +0100 Subject: [PATCH 05/21] Whitespace --- spacy/tests/regression/test_issue850.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/regression/test_issue850.py b/spacy/tests/regression/test_issue850.py index 3b3952744..8237763ea 100644 --- a/spacy/tests/regression/test_issue850.py +++ b/spacy/tests/regression/test_issue850.py @@ -29,6 +29,7 @@ def test_basic_case(): assert start == 0 assert end == 4 + @pytest.mark.xfail def test_issue850(): '''The problem here is that the variable-length pattern matches the From 26614e028f94212810159995004a4330aca6ce43 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 7 Mar 2017 20:24:37 +0100 Subject: [PATCH 06/21] Add hacky support for StringCFile, to make pickling easier. 
--- spacy/cfile.pxd | 14 ++++++++++++++ spacy/cfile.pyx | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/spacy/cfile.pxd b/spacy/cfile.pxd index c9a6aec41..cb0077587 100644 --- a/spacy/cfile.pxd +++ b/spacy/cfile.pxd @@ -4,6 +4,20 @@ from cymem.cymem cimport Pool cdef class CFile: cdef FILE* fp cdef bint is_open + cdef Pool mem + cdef int size # For compatibility with subclass + cdef int _capacity # For compatibility with subclass + + cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 + + cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 + + cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * + + + +cdef class StringCFile(CFile): + cdef unsigned char* data cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 diff --git a/spacy/cfile.pyx b/spacy/cfile.pyx index b119d3b9b..95c61a468 100644 --- a/spacy/cfile.pyx +++ b/spacy/cfile.pyx @@ -1,4 +1,5 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE +from libc.string cimport memcpy cdef class CFile: @@ -9,6 +10,7 @@ cdef class CFile: mode_str = mode if hasattr(loc, 'as_posix'): loc = loc.as_posix() + self.mem = Pool() cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc self.fp = fopen(bytes_loc, mode_str) if self.fp == NULL: @@ -45,3 +47,43 @@ cdef class CFile: cdef bytes py_bytes = value.encode('utf8') cdef char* chars = py_bytes self.write(sizeof(char), len(py_bytes), chars) + + +cdef class StringCFile: + def __init__(self, mode, bytes data=b'', on_open_error=None): + self.mem = Pool() + self.is_open = 'w' in mode + self._capacity = max(len(data), 8) + self.size = len(data) + self.data = self.mem.alloc(1, self._capacity) + for i in range(len(data)): + self.data[i] = data + + def close(self): + self.is_open = False + + def string_data(self): + return (self.data-self.size)[:self.size] + + cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: + memcpy(dest, self.data, elem_size * number) + self.data += elem_size * number + + cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1: + write_size = number * elem_size + if (self.size + write_size) >= self._capacity: + self._capacity = (self.size + write_size) * 2 + self.data = self.mem.realloc(self.data, self._capacity) + memcpy(self.data, src, elem_size * number) + self.data += write_size + self.size += write_size + + cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: + cdef void* dest = mem.alloc(number, elem_size) + self.read_into(dest, number, elem_size) + return dest + + def write_unicode(self, unicode value): + cdef bytes py_bytes = value.encode('utf8') + cdef char* chars = py_bytes + self.write(sizeof(char), len(py_bytes), chars) From d814892805c364d9d52fb0eec2c97a8a1bdfea30 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 7 Mar 2017 20:25:12 +0100 Subject: [PATCH 07/21] Hackish pickle support for Vocab. 
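The intended round trip (a sketch mirroring the test added to
spacy/tests/test_pickles.py in the next commit; that test is later marked xfail,
so the support is still partial at the end of this series):

    from __future__ import unicode_literals
    import dill as pickle
    from spacy.vocab import Vocab
    from spacy.attrs import NORM

    vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]})
    dog_orth = vocab['dog'].orth
    unpickled = pickle.loads(pickle.dumps(vocab))  # uses pickle_vocab/unpickle_vocab
    assert unpickled['dog'].orth == dog_orth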
--- spacy/vocab.pyx | 116 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 107 insertions(+), 9 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index da3a67e56..e7994c127 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -9,11 +9,16 @@ import bz2 import ujson as json import re +try: + import cPickle as pickle +except ImportError: + import pickle + from .lexeme cimport EMPTY_LEXEME from .lexeme cimport Lexeme from .strings cimport hash_string from .typedefs cimport attr_t -from .cfile cimport CFile +from .cfile cimport CFile, StringCFile from .lemmatizer import Lemmatizer from .attrs import intify_attrs from .tokens.token cimport Token @@ -346,17 +351,18 @@ cdef class Vocab: Token.set_struct_attr(token, attr_id, value) return tokens - def dump(self, loc): - """Save the lexemes binary data to the given location. + def dump(self, loc=None): + """Save the lexemes binary data to the given location, or + return a byte-string with the data if loc is None. Arguments: - loc (Path): The path to save to. + loc (Path or None): The path to save to, or None. """ - if hasattr(loc, 'as_posix'): - loc = loc.as_posix() - cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc - - cdef CFile fp = CFile(bytes_loc, 'wb') + cdef CFile fp + if loc is None: + fp = StringCFile('wb') + else: + fp = CFile(loc, 'wb') cdef size_t st cdef size_t addr cdef hash_t key @@ -378,6 +384,8 @@ cdef class Vocab: fp.write_from(&lexeme.l2_norm, sizeof(lexeme.l2_norm), 1) fp.write_from(&lexeme.lang, sizeof(lexeme.lang), 1) fp.close() + if loc is None: + return fp.string_data() def load_lexemes(self, loc): '''Load the binary vocabulary data from the given location. @@ -427,6 +435,60 @@ cdef class Vocab: i += 1 fp.close() + def _deserialize_lexemes(self, CFile fp): + '''Load the binary vocabulary data from the given CFile. 
+ ''' + cdef LexemeC* lexeme + cdef hash_t key + cdef unicode py_str + cdef attr_t orth + assert sizeof(orth) == sizeof(lexeme.orth) + i = 0 + cdef int todo = fp.size + cdef int lex_size = sizeof(lexeme.flags) + lex_size += sizeof(lexeme.id) + lex_size += sizeof(lexeme.length) + lex_size += sizeof(lexeme.orth) + lex_size += sizeof(lexeme.lower) + lex_size += sizeof(lexeme.norm) + lex_size += sizeof(lexeme.shape) + lex_size += sizeof(lexeme.prefix) + lex_size += sizeof(lexeme.suffix) + lex_size += sizeof(lexeme.cluster) + lex_size += sizeof(lexeme.prob) + lex_size += sizeof(lexeme.sentiment) + lex_size += sizeof(lexeme.l2_norm) + lex_size += sizeof(lexeme.lang) + while True: + if todo < lex_size: + break + todo -= lex_size + lexeme = self.mem.alloc(sizeof(LexemeC), 1) + # Copy data from the file into the lexeme + fp.read_into(&lexeme.flags, 1, sizeof(lexeme.flags)) + fp.read_into(&lexeme.id, 1, sizeof(lexeme.id)) + fp.read_into(&lexeme.length, 1, sizeof(lexeme.length)) + fp.read_into(&lexeme.orth, 1, sizeof(lexeme.orth)) + fp.read_into(&lexeme.lower, 1, sizeof(lexeme.lower)) + fp.read_into(&lexeme.norm, 1, sizeof(lexeme.norm)) + fp.read_into(&lexeme.shape, 1, sizeof(lexeme.shape)) + fp.read_into(&lexeme.prefix, 1, sizeof(lexeme.prefix)) + fp.read_into(&lexeme.suffix, 1, sizeof(lexeme.suffix)) + fp.read_into(&lexeme.cluster, 1, sizeof(lexeme.cluster)) + fp.read_into(&lexeme.prob, 1, sizeof(lexeme.prob)) + fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment)) + fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm)) + fp.read_into(&lexeme.lang, 1, sizeof(lexeme.lang)) + + lexeme.vector = EMPTY_VEC + py_str = self.strings[lexeme.orth] + key = hash_string(py_str) + self._by_hash.set(key, lexeme) + self._by_orth.set(lexeme.orth, lexeme) + self.length += 1 + i += 1 + fp.close() + def dump_vectors(self, out_loc): '''Save the word vectors to a binary file. 
@@ -553,6 +615,42 @@ cdef class Vocab: return vec_len +def pickle_vocab(vocab): + sstore = vocab.strings + morph = vocab.morphology + length = vocab.length + serializer = vocab._serializer + data_dir = vocab.data_dir + lex_attr_getters = vocab.lex_attr_getters + + lexemes_data = vocab.dump() + vectors_length = vocab.vectors_length + + return (unpickle_vocab, + (sstore, morph, serializer, data_dir, lex_attr_getters, + lexemes_data, length, vectors_length)) + + +def unpickle_vocab(sstore, morphology, serializer, data_dir, + lex_attr_getters, bytes lexemes_data, int length, int vectors_length): + cdef Vocab vocab = Vocab() + vocab.length = length + vocab.vectors_length = vectors_length + vocab.strings = sstore + cdef CFile fp = StringCFile('r', data=lexemes_data) + vocab.morphology = morphology + vocab._serializer = serializer + vocab.data_dir = data_dir + vocab.lex_attr_getters = lex_attr_getters + vocab._deserialize_lexemes(fp) + vocab.length = length + vocab.vectors_length = vectors_length + return vocab + + +copy_reg.pickle(Vocab, pickle_vocab, unpickle_vocab) + + def write_binary_vectors(in_loc, out_loc): cdef CFile out_file = CFile(out_loc, 'wb') cdef Address mem From a89c3500f69e8c9aebaad6d2d137729eab1c8458 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 7 Mar 2017 20:58:55 +0100 Subject: [PATCH 08/21] Fixes to hacky vocab pickling --- spacy/cfile.pyx | 7 +++---- spacy/tests/test_pickles.py | 24 +++++++++++++++++++++++- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/spacy/cfile.pyx b/spacy/cfile.pyx index 95c61a468..ceebe2e59 100644 --- a/spacy/cfile.pyx +++ b/spacy/cfile.pyx @@ -57,7 +57,7 @@ cdef class StringCFile: self.size = len(data) self.data = self.mem.alloc(1, self._capacity) for i in range(len(data)): - self.data[i] = data + self.data[i] = data[i] def close(self): self.is_open = False @@ -69,13 +69,12 @@ cdef class StringCFile: memcpy(dest, self.data, elem_size * number) self.data += elem_size * number - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1: + cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1: write_size = number * elem_size if (self.size + write_size) >= self._capacity: self._capacity = (self.size + write_size) * 2 self.data = self.mem.realloc(self.data, self._capacity) - memcpy(self.data, src, elem_size * number) - self.data += write_size + memcpy(&self.data[self.size], src, elem_size * number) self.size += write_size cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: diff --git a/spacy/tests/test_pickles.py b/spacy/tests/test_pickles.py index 46221fd8b..4464b890e 100644 --- a/spacy/tests/test_pickles.py +++ b/spacy/tests/test_pickles.py @@ -1,9 +1,12 @@ from __future__ import unicode_literals import io -import pickle +import pytest +import dill as pickle from ..strings import StringStore +from ..vocab import Vocab +from ..attrs import NORM def test_pickle_string_store(): @@ -14,4 +17,23 @@ def test_pickle_string_store(): unpickled = pickle.loads(bdata) assert unpickled['hello'] == hello assert unpickled['bye'] == bye + assert len(sstore) == len(unpickled) + +def test_pickle_vocab(): + vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]}) + dog = vocab[u'dog'] + cat = vocab[u'cat'] + assert dog.norm_ == 'do' + assert cat.norm_ == 'ca' + + bdata = pickle.dumps(vocab) + unpickled = pickle.loads(bdata) + + assert unpickled[u'dog'].orth == dog.orth + assert unpickled[u'cat'].orth == cat.orth + assert unpickled[u'dog'].norm == 
dog.norm + assert unpickled[u'cat'].norm == cat.norm + dog_ = unpickled[u'dog'] + cat_ = unpickled[u'cat'] + assert dog_.norm != cat_.norm From 16670d325121bb183d818d0061e572ac7f962cef Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 7 Mar 2017 21:43:28 +0100 Subject: [PATCH 09/21] Xfail the vocab pickling for now --- spacy/tests/test_pickles.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/test_pickles.py b/spacy/tests/test_pickles.py index 4464b890e..2e7fc6bf7 100644 --- a/spacy/tests/test_pickles.py +++ b/spacy/tests/test_pickles.py @@ -20,6 +20,7 @@ def test_pickle_string_store(): assert len(sstore) == len(unpickled) +@pytest.mark.xfail def test_pickle_vocab(): vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]}) dog = vocab[u'dog'] From 04a51dab623eb06a0b5cdcc5c8142ccd4cfdc8b8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 8 Mar 2017 01:37:19 +0100 Subject: [PATCH 10/21] Print active parser features during training --- bin/parser/train.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index 574797ba5..24484f7cf 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -66,8 +66,8 @@ def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False): def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg, n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0): - print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %") - format_str = '{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}' + print("Itn.\tP.Loss\tN feats\tUAS\tNER F.\tTag %\tToken %") + format_str = '{:d}\t{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}' with Language.train(model_dir, train_data, tagger_cfg, parser_cfg, entity_cfg) as trainer: loss = 0 @@ -76,7 +76,8 @@ def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, ent for doc, gold in epoch: trainer.update(doc, gold) dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc) - print(format_str.format(itn, loss, **dev_scores.scores)) + print(format_str.format(itn, loss, + trainer.nlp.parser.model.nr_active_feat, **dev_scores.scores)) def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, @@ -160,6 +161,7 @@ def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc= if not eval_only: gold_train = list(read_json_file(train_loc)) gold_dev = list(read_json_file(dev_loc)) + gold_train = gold_train[:n_sents] train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg, n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level, n_iter=n_iter) From d108534dc289f4f1342194be8cc0151bad769153 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 8 Mar 2017 01:37:52 +0100 Subject: [PATCH 11/21] Fix 2/3 problems for training --- spacy/en/__init__.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 56cf4d184..f39faf308 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals, print_function from os import path +from pathlib import Path from ..util import match_best_version from ..util import get_data_path @@ -13,6 +14,11 @@ from ..attrs import LANG from .language_data import * +try: + basestring +except NameError: + basestring = str + class English(Language): lang = 'en' @@ -43,14 +49,15 @@ def 
_fix_deprecated_glove_vectors_loading(overrides): data_path = get_data_path() else: path = overrides['path'] + if isinstance(path, basestring): + path = Path(path) data_path = path.parent vec_path = None if 'add_vectors' not in overrides: if 'vectors' in overrides: vec_path = match_best_version(overrides['vectors'], None, data_path) if vec_path is None: - raise IOError( - 'Could not load data pack %s from %s' % (overrides['vectors'], data_path)) + return overrides else: vec_path = match_best_version('en_glove_cc_300_1m_vectors', None, data_path) if vec_path is not None: From 40703988bc01ea6c41d5e8a920634d4692231c05 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 8 Mar 2017 01:38:51 +0100 Subject: [PATCH 12/21] Use FTRL training in parser --- spacy/syntax/parser.pyx | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index eb094fa97..34ee920c6 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -124,6 +124,8 @@ cdef class Parser: elif 'features' not in cfg: cfg['features'] = self.feature_templates self.model = ParserModel(cfg['features']) + self.model.l1_penalty = 1e-7 + self.cfg = cfg def __reduce__(self): @@ -258,15 +260,20 @@ cdef class Parser: self.model.set_featuresC(&eg.c, stcls.c) self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold) self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat) - self.model.updateC(&eg.c) + self.model.time += 1 guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class) - - action = self.moves.c[eg.guess] + if eg.c.costs[guess] > 0: + best = VecVec.arg_max_if_zero(eg.c.scores, eg.c.costs, eg.c.nr_class) + for feat in eg.c.features[:eg.c.nr_feat]: + self.model.update_weight_ftrl(feat.key, best, -feat.value * eg.costs[guess]) + self.model.update_weight_ftrl(feat.key, guess, feat.value * eg.costs[guess]) + + action = self.moves.c[guess] action.do(stcls.c, action.label) - loss += eg.costs[eg.guess] - eg.fill_scores(0, eg.nr_class) - eg.fill_costs(0, eg.nr_class) - eg.fill_is_valid(1, eg.nr_class) + loss += eg.costs[guess] + eg.fill_scores(0, eg.c.nr_class) + eg.fill_costs(0, eg.c.nr_class) + eg.fill_is_valid(1, eg.c.nr_class) return loss def step_through(self, Doc doc): @@ -296,7 +303,7 @@ cdef class Parser: # Doesn't set label into serializer -- subclasses override it to do that. 
for action in self.moves.action_types: self.moves.add_action(action, label) - + cdef class StepwiseState: cdef readonly StateClass stcls From cd33b39a04c52e288c9a6e9a1043a29f72cf6527 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 8 Mar 2017 01:39:13 +0100 Subject: [PATCH 13/21] Fix 2/3 problem for json save/load --- spacy/language.py | 47 +++++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index bebdeab20..9f8cc49e1 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -5,7 +5,7 @@ import pathlib from contextlib import contextmanager import shutil -import ujson as json +import ujson try: @@ -13,6 +13,10 @@ try: except NameError: basestring = str +try: + unicode +except NameError: + unicode = str from .tokenizer import Tokenizer from .vocab import Vocab @@ -226,12 +230,21 @@ class Language(object): parser_cfg['actions'] = ArcEager.get_actions(gold_parses=gold_tuples) entity_cfg['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples) - with (dep_model_dir / 'config.json').open('w') as file_: - json.dump(parser_cfg, file_) - with (ner_model_dir / 'config.json').open('w') as file_: - json.dump(entity_cfg, file_) - with (pos_model_dir / 'config.json').open('w') as file_: - json.dump(tagger_cfg, file_) + with (dep_model_dir / 'config.json').open('wb') as file_: + data = ujson.dumps(parser_cfg) + if isinstance(data, unicode): + data = data.encode('utf8') + file_.write(data) + with (ner_model_dir / 'config.json').open('wb') as file_: + data = ujson.dumps(entity_cfg) + if isinstance(data, unicode): + data = data.encode('utf8') + file_.write(data) + with (pos_model_dir / 'config.json').open('wb') as file_: + data = ujson.dumps(tagger_cfg) + if isinstance(data, unicode): + data = data.encode('utf8') + file_.write(data) self = cls( path=path, @@ -391,12 +404,14 @@ class Language(object): else: entity_iob_freqs = [] entity_type_freqs = [] - with (path / 'vocab' / 'serializer.json').open('w') as file_: - file_.write( - json.dumps([ - (TAG, tagger_freqs), - (DEP, dep_freqs), - (ENT_IOB, entity_iob_freqs), - (ENT_TYPE, entity_type_freqs), - (HEAD, head_freqs) - ])) + with (path / 'vocab' / 'serializer.json').open('wb') as file_: + data = ujson.dumps([ + (TAG, tagger_freqs), + (DEP, dep_freqs), + (ENT_IOB, entity_iob_freqs), + (ENT_TYPE, entity_type_freqs), + (HEAD, head_freqs) + ]) + if isinstance(data, unicode): + data = data.encode('utf8') + file_.write(data) From ffe0f0c6c4be01aa356cc127e2df8103ba4cbf74 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 8 Mar 2017 14:11:54 +0100 Subject: [PATCH 14/21] Add dill to requirements --- requirements.txt | 1 + setup.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 538862aed..4a75f6be3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,4 @@ six ujson>=1.35 cloudpickle sputnik>=0.9.2,<0.10.0 +dill>=0.2,<0.3 diff --git a/setup.py b/setup.py index fc316e72f..49ea639e2 100644 --- a/setup.py +++ b/setup.py @@ -241,7 +241,8 @@ def setup_package(): 'cloudpickle', 'pathlib', 'sputnik>=0.9.2,<0.10.0', - 'ujson>=1.35'], + 'ujson>=1.35', + 'dill>=0.2,<0.3'], classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Console', From 0a6d7ca2006d520883361d9922282679c4d2d6cc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 8 Mar 2017 14:33:32 +0100 Subject: [PATCH 15/21] Fix spacing after token_match The boolean flag indicating a space after the token was 
being set incorrectly after the token_match regex was applied. Fixes #859. --- spacy/tokenizer.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 8f2f111e7..1b74431ff 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -275,7 +275,10 @@ cdef class Tokenizer: if cache_hit: pass elif self.token_match and self.token_match(string): - tokens.push_back(self.vocab.get(tokens.mem, string), not suffixes.size()) + # We're always saying 'no' to spaces here -- the caller will + # fix up the outermost one, with reference to the original. + # See Issue #859 + tokens.push_back(self.vocab.get(tokens.mem, string), False) else: matches = self.find_infix(string) if not matches: From c2e3e651b84f519f6ef021e064c161bc4ba5e89a Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 8 Mar 2017 14:36:09 +0100 Subject: [PATCH 16/21] Re-add regression test for #859 --- spacy/tests/regression/test_issue859.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 spacy/tests/regression/test_issue859.py diff --git a/spacy/tests/regression/test_issue859.py b/spacy/tests/regression/test_issue859.py new file mode 100644 index 000000000..4a2d08df7 --- /dev/null +++ b/spacy/tests/regression/test_issue859.py @@ -0,0 +1,12 @@ +# encoding: utf8 +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', ["aaabbb@ccc.com\nThank you!", + "aaabbb@ccc.com \nThank you!"]) +def test_issue859(en_tokenizer, text): + """Test that no extra space is added in doc.text method.""" + doc = en_tokenizer(text) + assert doc.text == text From 0ac3d2768991521205a6d0e365303560521b6108 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 8 Mar 2017 15:01:40 +0100 Subject: [PATCH 17/21] Fix handling of trailing whitespace Fix off-by-one error that meant trailing spaces were being dropped. 
Closes #792 --- spacy/tests/regression/test_issue792.py | 12 +++++++++--- spacy/tokenizer.pyx | 1 - 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/spacy/tests/regression/test_issue792.py b/spacy/tests/regression/test_issue792.py index 563e061a6..df8b5ef50 100644 --- a/spacy/tests/regression/test_issue792.py +++ b/spacy/tests/regression/test_issue792.py @@ -4,9 +4,15 @@ from __future__ import unicode_literals import pytest -@pytest.mark.xfail @pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"]) def test_issue792(en_tokenizer, text): - """Test for Issue #792: Trailing whitespace is removed after parsing.""" + """Test for Issue #792: Trailing whitespace is removed after tokenization.""" doc = en_tokenizer(text) - assert doc.text_with_ws == text + assert ''.join([token.text_with_ws for token in doc]) == text + + +@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"]) +def test_control_issue792(en_tokenizer, text): + """Test base case for Issue #792: Non-trailing whitespace""" + doc = en_tokenizer(text) + assert ''.join([token.text_with_ws for token in doc]) == text diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 1b74431ff..5a4eb844a 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -163,7 +163,6 @@ cdef class Tokenizer: start = i in_ws = not in_ws i += 1 - i += 1 if start < i: span = string[start:] key = hash_string(span) From f71eeef9bb620032aa6c83cead39ded983a8be3f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 9 Mar 2017 18:42:40 -0600 Subject: [PATCH 18/21] Pass path argument to end_training --- spacy/language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 9f8cc49e1..66acec781 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -265,7 +265,7 @@ class Language(object): self.entity = self.Defaults.create_entity(self) self.pipeline = self.Defaults.create_pipeline(self) yield Trainer(self, gold_tuples) - self.end_training() + self.end_training(path=path) def __init__(self, **overrides): if 'data_dir' in overrides and 'path' not in overrides: From c62da0234480cc30a2c41dfc2e054d72db3015ee Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 9 Mar 2017 18:43:21 -0600 Subject: [PATCH 19/21] Use ftrl training, to learn compressed model. --- spacy/syntax/parser.pyx | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 34ee920c6..093186518 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -68,7 +68,7 @@ def get_templates(name): cdef class ParserModel(AveragedPerceptron): - cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil: + cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil: fill_context(eg.atoms, state) eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms) @@ -124,7 +124,7 @@ cdef class Parser: elif 'features' not in cfg: cfg['features'] = self.feature_templates self.model = ParserModel(cfg['features']) - self.model.l1_penalty = 1e-7 + self.model.l1_penalty = cfg.get('L1', 0.0) self.cfg = cfg @@ -234,7 +234,7 @@ cdef class Parser: free(eg.scores) free(eg.is_valid) return 0 - + def update(self, Doc tokens, GoldParse gold): """Update the statistical model. 
@@ -263,11 +263,11 @@ cdef class Parser: self.model.time += 1 guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class) if eg.c.costs[guess] > 0: - best = VecVec.arg_max_if_zero(eg.c.scores, eg.c.costs, eg.c.nr_class) + best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class) for feat in eg.c.features[:eg.c.nr_feat]: - self.model.update_weight_ftrl(feat.key, best, -feat.value * eg.costs[guess]) - self.model.update_weight_ftrl(feat.key, guess, feat.value * eg.costs[guess]) - + self.model.update_weight_ftrl(feat.key, best, -feat.value * eg.c.costs[guess]) + self.model.update_weight_ftrl(feat.key, guess, feat.value * eg.c.costs[guess]) + action = self.moves.c[guess] action.do(stcls.c, action.label) loss += eg.costs[guess] @@ -392,6 +392,14 @@ class ParserStateError(ValueError): "Please include the text that the parser failed on, which is:\n" "%s" % repr(doc.text)) +cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs, int n) nogil: + cdef int best = -1 + for i in range(n): + if costs[i] <= 0: + if best == -1 or scores[i] > scores[best]: + best = i + return best + cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions, int nr_class) except -1: From 798450136dc30068f81ffb88bbd947596a931b32 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 9 Mar 2017 18:43:47 -0600 Subject: [PATCH 20/21] Set L1 penalty to 0 in tagger. --- spacy/tagger.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 954bced53..eab0d1126 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -152,6 +152,7 @@ cdef class Tagger: model = TaggerModel(cfg.get('features', self.feature_templates)) self.vocab = vocab self.model = model + self.model.l1_penalty = 0.0 # TODO: Move this to tag map self.freqs = {TAG: defaultdict(int)} for tag in self.tag_names: From 35124b144a4b25f8377fcbbf0ab32fbffc3320eb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 9 Mar 2017 18:44:53 -0600 Subject: [PATCH 21/21] Add L1 penalty option to parser --- bin/parser/train.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index 24484f7cf..26b545b6d 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -66,7 +66,7 @@ def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False): def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg, n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0): - print("Itn.\tP.Loss\tN feats\tUAS\tNER F.\tTag %\tToken %") + print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %") format_str = '{:d}\t{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}' with Language.train(model_dir, train_data, tagger_cfg, parser_cfg, entity_cfg) as trainer: @@ -76,12 +76,13 @@ def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, ent for doc, gold in epoch: trainer.update(doc, gold) dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc) - print(format_str.format(itn, loss, + print(format_str.format(itn, trainer.nlp.parser.model.nr_weight, trainer.nlp.parser.model.nr_active_feat, **dev_scores.scores)) def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, beam_width=None, cand_preproc=None): + print("Load parser", model_dir) nlp = Language(path=model_dir) if nlp.lang == 'de': nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string]) @@ -146,22 +147,25 @@ def write_parses(Language, dev_loc, 
model_dir, out_loc): verbose=("Verbose error reporting", "flag", "v", bool), debug=("Debug mode", "flag", "d", bool), pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool), + L1=("L1 regularization penalty", "option", "L", float), ) def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, - debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False): + debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False, + L1=1e-6): parser_cfg = dict(locals()) tagger_cfg = dict(locals()) entity_cfg = dict(locals()) lang = spacy.util.get_lang_class(language) - + parser_cfg['features'] = lang.Defaults.parser_features entity_cfg['features'] = lang.Defaults.entity_features if not eval_only: gold_train = list(read_json_file(train_loc)) gold_dev = list(read_json_file(dev_loc)) - gold_train = gold_train[:n_sents] + if n_sents > 0: + gold_train = gold_train[:n_sents] train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg, n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level, n_iter=n_iter)
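
Editor's note on patch 19: switching the parser's cost-sensitive update to `update_weight_ftrl` is what makes a non-zero L1 penalty useful -- FTRL-proximal applies per-weight L1 regularization, driving rarely useful weights to exactly zero so the saved model is smaller, which matches the "compressed model" intent of the commit. The exact behaviour of `update_weight_ftrl` lives in thinc's `AveragedPerceptron` and is not shown in this diff. The snippet below is only a plain-Python restatement of the `arg_max_if_gold` helper added in that patch, for readers who don't want to parse the Cython:

```python
def arg_max_if_gold(scores, costs):
    """Return the index of the best-scoring action among the zero-cost
    ("gold") actions, or -1 if no zero-cost action exists.

    Plain-Python restatement of the Cython helper added in patch 19. It is
    used when the current guess has non-zero cost: the FTRL update then moves
    weights toward this action and away from the guess, scaled by the cost.
    """
    best = -1
    for i, (score, cost) in enumerate(zip(scores, costs)):
        if cost <= 0 and (best == -1 or score > scores[best]):
            best = i
    return best


# Four candidate actions; only actions 1 and 3 are consistent with the gold
# parse (cost 0), so the update targets the better-scoring of those two.
assert arg_max_if_gold([0.9, 0.1, 0.5, 0.4], [1.0, 0.0, 2.0, 0.0]) == 3
```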
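
And on patches 20-21: the L1 strength is no longer hard-coded in the parser; it travels from the training CLI into `parser_cfg` and is pinned to zero for the tagger. A hedged sketch of that flow follows -- the corpus and model paths are placeholders, the `-L` short flag is read off the plac annotation above (plac's long-option spelling is not shown in the diff), and `build_configs` is a hypothetical stand-in for the `dict(locals())` wiring in `main()`:

```python
# Approximate invocation (placeholder paths):
#   bin/parser/train.py en train.json dev.json /models/en-parser -L 1e-6

def build_configs(L1=1e-6, n_sents=0, **kwargs):
    """Hypothetical stand-in for main(): CLI arguments, including L1, land in the cfg dicts."""
    return dict(L1=L1, n_sents=n_sents, **kwargs)

parser_cfg = build_configs()
# Parser.__init__ (patch 19): previously hard-coded to 1e-7, now configurable.
l1_penalty = parser_cfg.get('L1', 0.0)        # 1e-6 when trained via the CLI default
# Tagger.__init__ (patch 20) pins its penalty to 0.0 regardless of the CLI.

# Related guard added in patch 21: slicing the training data is now gated,
# because gold_train[:0] would otherwise discard the whole corpus when the
# default n_sents=0 ("use everything") is left in place.
gold_train = list(range(100))                 # stand-in for read_json_file(train_loc)
if parser_cfg['n_sents'] > 0:
    gold_train = gold_train[:parser_cfg['n_sents']]
assert len(gold_train) == 100
```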