diff --git a/bin/init_model.py b/bin/init_model.py
index 72d7a3aae..6e44fd444 100644
--- a/bin/init_model.py
+++ b/bin/init_model.py
@@ -168,6 +168,11 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
             probs[word] = oov_prob
 
     lexicon = []
+    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
+        # First encode the strings into the StringStore. This way, we can map
+        # the orth IDs to frequency ranks.
+        orth = vocab.strings[word]
+    # Now actually load the vocab
     for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
         lexeme = vocab[word]
         lexeme.prob = prob
diff --git a/lang_data/en/morphs.json b/lang_data/en/morphs.json
index 917cbc759..059381b27 100644
--- a/lang_data/en/morphs.json
+++ b/lang_data/en/morphs.json
@@ -56,5 +56,4 @@
     "was": {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"},
     "were": {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"}
 }
-
 }
diff --git a/lang_data/en/tag_map.json b/lang_data/en/tag_map.json
index de3e2eb58..a38411bcf 100644
--- a/lang_data/en/tag_map.json
+++ b/lang_data/en/tag_map.json
@@ -22,7 +22,7 @@
 "JJS": {"pos": "adj", "degree": "sup"},
 "LS": {"pos": "punct", "numtype": "ord"},
 "MD": {"pos": "verb", "verbtype": "mod"},
-"NIL": {"pos": "no_tag"},
+"NIL": {"pos": ""},
 "NN": {"pos": "noun", "number": "sing"},
 "NNP": {"pos": "noun", "nountype": "prop", "number": "sing"},
 "NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"},
diff --git a/setup.py b/setup.py
index ec394a2e3..fb05a9dbd 100644
--- a/setup.py
+++ b/setup.py
@@ -166,7 +166,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
              'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
              'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
              'spacy.cfile', 'spacy.matcher',
-             'spacy.syntax.ner']
+             'spacy.syntax.ner',
+             'spacy.symbols']
 
 
 if __name__ == '__main__':
diff --git a/spacy/_ml.pxd b/spacy/_ml.pxd
index c2c7ffded..b9a190b67 100644
--- a/spacy/_ml.pxd
+++ b/spacy/_ml.pxd
@@ -29,5 +29,6 @@ cdef class Model:
     cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1
 
     cdef object model_loc
+    cdef object _templates
     cdef Extractor _extractor
     cdef LinearModel _model
diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx
index 56c080fa6..bc789e7d6 100644
--- a/spacy/_ml.pyx
+++ b/spacy/_ml.pyx
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 from __future__ import division
 
 from os import path
+import tempfile
 import os
 import shutil
 import json
@@ -52,6 +53,7 @@ cdef class Model:
     def __init__(self, n_classes, templates, model_loc=None):
         if model_loc is not None and path.isdir(model_loc):
             model_loc = path.join(model_loc, 'model')
+        self._templates = templates
         self.n_classes = n_classes
         self._extractor = Extractor(templates)
         self.n_feats = self._extractor.n_templ
@@ -60,6 +62,18 @@ cdef class Model:
         if self.model_loc and path.exists(self.model_loc):
             self._model.load(self.model_loc, freq_thresh=0)
 
+    def __reduce__(self):
+        _, model_loc = tempfile.mkstemp()
+        # TODO: This is a potentially buggy implementation. We're not really
+        # given a good guarantee that all internal state is saved correctly here,
+        # since there are learning parameters for e.g. the model averaging in
+        # the averaged perceptron, the gradient calculations in AdaGrad, etc.
+        # that aren't necessarily saved. So, if we're partway through training
+        # the model and then pickle it, we won't recover the state correctly.
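+        # __reduce__ returns (callable, args): unpickling re-runs
+        # Model(n_classes, templates, model_loc), and __init__ re-loads the
+        # weights dumped to the temp file below. Note mkstemp() returns an
+        # (fd, path) pair; only the path is needed here.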
+        self._model.dump(model_loc)
+        return (Model, (self.n_classes, self._templates, model_loc),
+                None, None)
+
     def predict(self, Example eg):
         self.set_scores(eg.c.scores, eg.c.atoms)
         eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd
index c810762ef..d0f476dcd 100644
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@@ -1,5 +1,6 @@
 # Reserve 64 values for flag features
 cpdef enum attr_id_t:
+    NULL_ATTR
     IS_ALPHA
     IS_ASCII
     IS_DIGIT
@@ -14,8 +15,7 @@ cpdef enum attr_id_t:
     IS_STOP
     IS_OOV
 
-    FLAG13 = 13
-    FLAG14
+    FLAG14 = 14
     FLAG15
     FLAG16
     FLAG17
diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index e69de29bb..3595fbf22 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -0,0 +1,90 @@
+IDS = {
+    "": NULL_ATTR,
+    "IS_ALPHA": IS_ALPHA,
+    "IS_ASCII": IS_ASCII,
+    "IS_DIGIT": IS_DIGIT,
+    "IS_LOWER": IS_LOWER,
+    "IS_PUNCT": IS_PUNCT,
+    "IS_SPACE": IS_SPACE,
+    "IS_TITLE": IS_TITLE,
+    "IS_UPPER": IS_UPPER,
+    "LIKE_URL": LIKE_URL,
+    "LIKE_NUM": LIKE_NUM,
+    "LIKE_EMAIL": LIKE_EMAIL,
+    "IS_STOP": IS_STOP,
+    "IS_OOV": IS_OOV,
+
+    "FLAG14": FLAG14,
+    "FLAG15": FLAG15,
+    "FLAG16": FLAG16,
+    "FLAG17": FLAG17,
+    "FLAG18": FLAG18,
+    "FLAG19": FLAG19,
+    "FLAG20": FLAG20,
+    "FLAG21": FLAG21,
+    "FLAG22": FLAG22,
+    "FLAG23": FLAG23,
+    "FLAG24": FLAG24,
+    "FLAG25": FLAG25,
+    "FLAG26": FLAG26,
+    "FLAG27": FLAG27,
+    "FLAG28": FLAG28,
+    "FLAG29": FLAG29,
+    "FLAG30": FLAG30,
+    "FLAG31": FLAG31,
+    "FLAG32": FLAG32,
+    "FLAG33": FLAG33,
+    "FLAG34": FLAG34,
+    "FLAG35": FLAG35,
+    "FLAG36": FLAG36,
+    "FLAG37": FLAG37,
+    "FLAG38": FLAG38,
+    "FLAG39": FLAG39,
+    "FLAG40": FLAG40,
+    "FLAG41": FLAG41,
+    "FLAG42": FLAG42,
+    "FLAG43": FLAG43,
+    "FLAG44": FLAG44,
+    "FLAG45": FLAG45,
+    "FLAG46": FLAG46,
+    "FLAG47": FLAG47,
+    "FLAG48": FLAG48,
+    "FLAG49": FLAG49,
+    "FLAG50": FLAG50,
+    "FLAG51": FLAG51,
+    "FLAG52": FLAG52,
+    "FLAG53": FLAG53,
+    "FLAG54": FLAG54,
+    "FLAG55": FLAG55,
+    "FLAG56": FLAG56,
+    "FLAG57": FLAG57,
+    "FLAG58": FLAG58,
+    "FLAG59": FLAG59,
+    "FLAG60": FLAG60,
+    "FLAG61": FLAG61,
+    "FLAG62": FLAG62,
+    "FLAG63": FLAG63,
+
+    "ID": ID,
+    "ORTH": ORTH,
+    "LOWER": LOWER,
+    "NORM": NORM,
+    "SHAPE": SHAPE,
+    "PREFIX": PREFIX,
+    "SUFFIX": SUFFIX,
+
+    "LENGTH": LENGTH,
+    "CLUSTER": CLUSTER,
+    "LEMMA": LEMMA,
+    "POS": POS,
+    "TAG": TAG,
+    "DEP": DEP,
+    "ENT_IOB": ENT_IOB,
+    "ENT_TYPE": ENT_TYPE,
+    "HEAD": HEAD,
+    "SPACY": SPACY,
+    "PROB": PROB,
+}
+
+# ATTR IDs, in order of the symbol
+NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
diff --git a/spacy/language.py b/spacy/language.py
index ba4c048d7..65425bc45 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -207,6 +207,12 @@ class Language(object):
         self.entity = entity
         self.matcher = matcher
 
+    def __reduce__(self):
+        return (self.__class__,
+            (None, self.vocab, self.tokenizer, self.tagger, self.parser,
+             self.entity, self.matcher, None),
+            None, None)
+
     def __call__(self, text, tag=True, parse=True, entity=True):
         """Apply the pipeline to some text. The text can span multiple sentences,
         and can contain arbitrary whitespace.
Alignment into the original string diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index afafd3ddb..2bf8370b5 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -15,7 +15,7 @@ from libcpp.vector cimport vector from murmurhash.mrmr cimport hash64 from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE -from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25 +from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25 from .tokens.doc cimport get_token_attr from .tokens.doc cimport Doc from .vocab cimport Vocab @@ -168,13 +168,7 @@ cdef class Matcher: cdef Pool mem cdef vector[Pattern*] patterns cdef readonly Vocab vocab - - def __init__(self, vocab, patterns): - self.vocab = vocab - self.mem = Pool() - self.vocab = vocab - for entity_key, (etype, attrs, specs) in sorted(patterns.items()): - self.add(entity_key, etype, attrs, specs) + cdef object _patterns @classmethod def from_dir(cls, data_dir, Vocab vocab): @@ -186,10 +180,22 @@ cdef class Matcher: else: return cls(vocab, {}) + def __init__(self, vocab, patterns): + self.vocab = vocab + self.mem = Pool() + self.vocab = vocab + self._patterns = dict(patterns) + for entity_key, (etype, attrs, specs) in sorted(patterns.items()): + self.add(entity_key, etype, attrs, specs) + + def __reduce__(self): + return (self.__class__, (self.vocab, self._patterns), None, None) + property n_patterns: def __get__(self): return self.patterns.size() def add(self, entity_key, etype, attrs, specs): + self._patterns[entity_key] = (etype, dict(attrs), list(specs)) if isinstance(entity_key, basestring): entity_key = self.vocab.strings[entity_key] if isinstance(etype, basestring): diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 2229da0ad..847626158 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -7,6 +7,7 @@ from .strings cimport StringStore from .typedefs cimport attr_t from .parts_of_speech cimport univ_pos_t +from . 
cimport symbols cdef struct RichTagC: uint64_t morph @@ -24,6 +25,7 @@ cdef class Morphology: cdef readonly Pool mem cdef readonly StringStore strings cdef public object lemmatizer + cdef readonly object tag_map cdef public object n_tags cdef public object reverse_index cdef public object tag_names @@ -36,720 +38,252 @@ cdef class Morphology: cdef int assign_feature(self, uint64_t* morph, feature, value) except -1 +cpdef enum univ_morph_t: + NIL = 0 + Animacy_anim = symbols.Animacy_anim + Animacy_inam + Aspect_freq + Aspect_imp + Aspect_mod + Aspect_none + Aspect_perf + Case_abe + Case_abl + Case_abs + Case_acc + Case_ade + Case_all + Case_cau + Case_com + Case_dat + Case_del + Case_dis + Case_ela + Case_ess + Case_gen + Case_ill + Case_ine + Case_ins + Case_loc + Case_lat + Case_nom + Case_par + Case_sub + Case_sup + Case_tem + Case_ter + Case_tra + Case_voc + Definite_two + Definite_def + Definite_red + Definite_ind + Degree_cmp + Degree_comp + Degree_none + Degree_pos + Degree_sup + Degree_abs + Degree_com + Degree_dim # du + Gender_com + Gender_fem + Gender_masc + Gender_neut + Mood_cnd + Mood_imp + Mood_ind + Mood_n + Mood_pot + Mood_sub + Mood_opt + Negative_neg + Negative_pos + Negative_yes + Number_com + Number_dual + Number_none + Number_plur + Number_sing + Number_ptan # bg + Number_count # bg + NumType_card + NumType_dist + NumType_frac + NumType_gen + NumType_mult + NumType_none + NumType_ord + NumType_sets + Person_one + Person_two + Person_three + Person_none + Poss_yes + PronType_advPart + PronType_art + PronType_default + PronType_dem + PronType_ind + PronType_int + PronType_neg + PronType_prs + PronType_rcp + PronType_rel + PronType_tot + PronType_clit + PronType_exc # es, ca, it, fa + Reflex_yes + Tense_fut + Tense_imp + Tense_past + Tense_pres + VerbForm_fin + VerbForm_ger + VerbForm_inf + VerbForm_none + VerbForm_part + VerbForm_partFut + VerbForm_partPast + VerbForm_partPres + VerbForm_sup + VerbForm_trans + VerbForm_gdv # la + Voice_act + Voice_cau + Voice_pass + Voice_mid # gkc + Voice_int # hb + Abbr_yes # cz, fi, sl, U + AdpType_prep # cz, U + AdpType_post # U + AdpType_voc # cz + AdpType_comprep # cz + AdpType_circ # U + AdvType_man + AdvType_loc + AdvType_tim + AdvType_deg + AdvType_cau + AdvType_mod + AdvType_sta + AdvType_ex + AdvType_adadj + ConjType_oper # cz, U + ConjType_comp # cz, U + Connegative_yes # fi + Derivation_minen # fi + Derivation_sti # fi + Derivation_inen # fi + Derivation_lainen # fi + Derivation_ja # fi + Derivation_ton # fi + Derivation_vs # fi + Derivation_ttain # fi + Derivation_ttaa # fi + Echo_rdp # U + Echo_ech # U + Foreign_foreign # cz, fi, U + Foreign_fscript # cz, fi, U + Foreign_tscript # cz, U + Foreign_yes # sl + Gender_dat_masc # bq, U + Gender_dat_fem # bq, U + Gender_erg_masc # bq + Gender_erg_fem # bq + Gender_psor_masc # cz, sl, U + Gender_psor_fem # cz, sl, U + Gender_psor_neut # sl + Hyph_yes # cz, U + InfForm_one # fi + InfForm_two # fi + InfForm_three # fi + NameType_geo # U, cz + NameType_prs # U, cz + NameType_giv # U, cz + NameType_sur # U, cz + NameType_nat # U, cz + NameType_com # U, cz + NameType_pro # U, cz + NameType_oth # U, cz + NounType_com # U + NounType_prop # U + NounType_class # U + Number_abs_sing # bq, U + Number_abs_plur # bq, U + Number_dat_sing # bq, U + Number_dat_plur # bq, U + Number_erg_sing # bq, U + Number_erg_plur # bq, U + Number_psee_sing # U + Number_psee_plur # U + Number_psor_sing # cz, fi, sl, U + Number_psor_plur # cz, fi, sl, U + NumForm_digit # cz, sl, U + NumForm_roman # cz, sl, U + 
NumForm_word # cz, sl, U + NumValue_one # cz, U + NumValue_two # cz, U + NumValue_three # cz, U + PartForm_pres # fi + PartForm_past # fi + PartForm_agt # fi + PartForm_neg # fi + PartType_mod # U + PartType_emp # U + PartType_res # U + PartType_inf # U + PartType_vbp # U + Person_abs_one # bq, U + Person_abs_two # bq, U + Person_abs_three # bq, U + Person_dat_one # bq, U + Person_dat_two # bq, U + Person_dat_three # bq, U + Person_erg_one # bq, U + Person_erg_two # bq, U + Person_erg_three # bq, U + Person_psor_one # fi, U + Person_psor_two # fi, U + Person_psor_three # fi, U + Polite_inf # bq, U + Polite_pol # bq, U + Polite_abs_inf # bq, U + Polite_abs_pol # bq, U + Polite_erg_inf # bq, U + Polite_erg_pol # bq, U + Polite_dat_inf # bq, U + Polite_dat_pol # bq, U + Prefix_yes # U + PrepCase_npr # cz + PrepCase_pre # U + PunctSide_ini # U + PunctSide_fin # U + PunctType_peri # U + PunctType_qest # U + PunctType_excl # U + PunctType_quot # U + PunctType_brck # U + PunctType_comm # U + PunctType_colo # U + PunctType_semi # U + PunctType_dash # U + Style_arch # cz, fi, U + Style_rare # cz, fi, U + Style_poet # cz, U + Style_norm # cz, U + Style_coll # cz, U + Style_vrnc # cz, U + Style_sing # cz, U + Style_expr # cz, U + Style_derg # cz, U + Style_vulg # cz, U + Style_yes # fi, U + StyleVariant_styleShort # cz + StyleVariant_styleBound # cz, sl + VerbType_aux # U + VerbType_cop # U + VerbType_mod # U + VerbType_light # U -# -#cpdef enum Feature_t: -# Abbr -# AdpType -# AdvType -# ConjType -# Connegative -# Derivation -# Echo -# Foreign -# Gender_dat -# Gender_erg -# Gender_psor -# Hyph -# InfForm -# NameType -# NounType -# NumberAbs -# NumberDat -# NumberErg -# NumberPsee -# NumberPsor -# NumForm -# NumValue -# PartForm -# PartType -# Person_abs -# Person_dat -# Person_psor -# Polite -# Polite_abs -# Polite_dat -# Prefix -# PrepCase -# PunctSide -# PunctType -# Style -# Typo -# Variant -# VerbType -# -# -#cpdef enum Animacy: -# Anim -# Inam -# -# -#cpdef enum Aspect: -# Freq -# Imp -# Mod -# None_ -# Perf -# -# -#cpdef enum Case1: -# Nom -# Gen -# Acc -# Dat -# Voc -# Abl -# -#cdef enum Case2: -# Abe -# Abs -# Ade -# All -# Cau -# Com -# Del -# Dis -# -#cdef enum Case3: -# Ela -# Ess -# Ill -# Ine -# Ins -# Loc -# Lat -# Par -# -#cdef enum Case4: -# Sub -# Sup -# Tem -# Ter -# Tra -# -# -#cpdef enum Definite: -# Two -# Def -# Red -# Ind -# -# -#cpdef enum Degree: -# Cmp -# Comp -# None_ -# Pos -# Sup -# Abs -# Com -# Degree # du -# -# -#cpdef enum Gender: -# Com -# Fem -# Masc -# Neut -# -# -#cpdef enum Mood: -# Cnd -# Imp -# Ind -# N -# Pot -# Sub -# Opt -# -# -#cpdef enum Negative: -# Neg -# Pos -# Yes -# -# -#cpdef enum Number: -# Com -# Dual -# None_ -# Plur -# Sing -# Ptan # bg -# Count # bg -# -# -#cpdef enum NumType: -# Card -# Dist -# Frac -# Gen -# Mult -# None_ -# Ord -# Sets -# -# -#cpdef enum Person: -# One -# Two -# Three -# None_ -# -# -#cpdef enum Poss: -# Yes -# -# -#cpdef enum PronType1: -# AdvPart -# Art -# Default -# Dem -# Ind -# Int -# Neg -# -#cpdef enum PronType2: -# Prs -# Rcp -# Rel -# Tot -# Clit -# Exc # es, ca, it, fa -# Clit # it -# -# -#cpdef enum Reflex: -# Yes -# -# -#cpdef enum Tense: -# Fut -# Imp -# Past -# Pres -# -#cpdef enum VerbForm1: -# Fin -# Ger -# Inf -# None_ -# Part -# PartFut -# PartPast -# -#cpdef enum VerbForm2: -# PartPres -# Sup -# Trans -# Gdv # la -# -# -#cpdef enum Voice: -# Act -# Cau -# Pass -# Mid # gkc -# Int # hb -# -# -#cpdef enum Abbr: -# Yes # cz, fi, sl, U -# -#cpdef enum AdpType: -# Prep # cz, U -# Post # U -# Voc # cz -# Comprep 
# cz -# Circ # U -# Voc # U -# -# -#cpdef enum AdvType1: -# # U -# Man -# Loc -# Tim -# Deg -# Cau -# Mod -# Sta -# Ex -# -#cpdef enum AdvType2: -# Adadj -# -#cpdef enum ConjType: -# Oper # cz, U -# Comp # cz, U -# -#cpdef enum Connegative: -# Yes # fi -# -# -#cpdef enum Derivation1: -# Minen # fi -# Sti # fi -# Inen # fi -# Lainen # fi -# Ja # fi -# Ton # fi -# Vs # fi -# Ttain # fi -# -#cpdef enum Derivation2: -# Ttaa -# -# -#cpdef enum Echo: -# Rdp # U -# Ech # U -# -# -#cpdef enum Foreign: -# Foreign # cz, fi, U -# Fscript # cz, fi, U -# Tscript # cz, U -# Yes # sl -# -# -#cpdef enum Gender_dat: -# Masc # bq, U -# Fem # bq, U -# -# -#cpdef enum Gender_erg: -# Masc # bq -# Fem # bq -# -# -#cpdef enum Gender_psor: -# Masc # cz, sl, U -# Fem # cz, sl, U -# Neut # sl -# -# -#cpdef enum Hyph: -# Yes # cz, U -# -# -#cpdef enum InfForm: -# One # fi -# Two # fi -# Three # fi -# -# -#cpdef enum NameType: -# Geo # U, cz -# Prs # U, cz -# Giv # U, cz -# Sur # U, cz -# Nat # U, cz -# Com # U, cz -# Pro # U, cz -# Oth # U, cz -# -# -#cpdef enum NounType: -# Com # U -# Prop # U -# Class # U -# -#cpdef enum Number_abs: -# Sing # bq, U -# Plur # bq, U -# -#cpdef enum Number_dat: -# Sing # bq, U -# Plur # bq, U -# -#cpdef enum Number_erg: -# Sing # bq, U -# Plur # bq, U -# -#cpdef enum Number_psee: -# Sing # U -# Plur # U -# -# -#cpdef enum Number_psor: -# Sing # cz, fi, sl, U -# Plur # cz, fi, sl, U -# -# -#cpdef enum NumForm: -# Digit # cz, sl, U -# Roman # cz, sl, U -# Word # cz, sl, U -# -# -#cpdef enum NumValue: -# One # cz, U -# Two # cz, U -# Three # cz, U -# -# -#cpdef enum PartForm: -# Pres # fi -# Past # fi -# Agt # fi -# Neg # fi -# -# -#cpdef enum PartType: -# Mod # U -# Emp # U -# Res # U -# Inf # U -# Vbp # U -# -#cpdef enum Person_abs: -# One # bq, U -# Two # bq, U -# Three # bq, U -# -# -#cpdef enum Person_dat: -# One # bq, U -# Two # bq, U -# Three # bq, U -# -# -#cpdef enum Person_erg: -# One # bq, U -# Two # bq, U -# Three # bq, U -# -# -#cpdef enum Person_psor: -# One # fi, U -# Two # fi, U -# Three # fi, U -# -# -#cpdef enum Polite: -# Inf # bq, U -# Pol # bq, U -# -# -#cpdef enum Polite_abs: -# Inf # bq, U -# Pol # bq, U -# -# -#cpdef enum Polite_erg: -# Inf # bq, U -# Pol # bq, U -# -# -#cpdef enum Polite_dat: -# Inf # bq, U -# Pol # bq, U -# -# -#cpdef enum Prefix: -# Yes # U -# -# -#cpdef enum PrepCase: -# Npr # cz -# Pre # U -# -# -#cpdef enum PunctSide: -# Ini # U -# Fin # U -# -#cpdef enum PunctType1: -# Peri # U -# Qest # U -# Excl # U -# Quot # U -# Brck # U -# Comm # U -# Colo # U -# Semi # U -# -#cpdef enum PunctType2: -# Dash # U -# -# -#cpdef enum Style1: -# Arch # cz, fi, U -# Rare # cz, fi, U -# Poet # cz, U -# Norm # cz, U -# Coll # cz, U -# Vrnc # cz, U -# Sing # cz, U -# Expr # cz, U -# -# -#cpdef enum Style2: -# Derg # cz, U -# Vulg # cz, U -# -# -#cpdef enum Typo: -# Yes # fi, U -# -# -#cpdef enum Variant: -# Short # cz -# Bound # cz, sl -# -# -#cpdef enum VerbType: -# Aux # U -# Cop # U -# Mod # U -# Light # U -# -cpdef enum Value_t: - Animacy_Anim - Animacy_Inam - Aspect_Freq - Aspect_Imp - Aspect_Mod - Aspect_None_ - Aspect_Perf - Case_Abe - Case_Abl - Case_Abs - Case_Acc - Case_Ade - Case_All - Case_Cau - Case_Com - Case_Dat - Case_Del - Case_Dis - Case_Ela - Case_Ess - Case_Gen - Case_Ill - Case_Ine - Case_Ins - Case_Loc - Case_Lat - Case_Nom - Case_Par - Case_Sub - Case_Sup - Case_Tem - Case_Ter - Case_Tra - Case_Voc - Definite_Two - Definite_Def - Definite_Red - Definite_Ind - Degree_Cmp - Degree_Comp - Degree_None - Degree_Pos - Degree_Sup - Degree_Abs - 
Degree_Com - Degree_Dim # du - Gender_Com - Gender_Fem - Gender_Masc - Gender_Neut - Mood_Cnd - Mood_Imp - Mood_Ind - Mood_N - Mood_Pot - Mood_Sub - Mood_Opt - Negative_Neg - Negative_Pos - Negative_Yes - Number_Com - Number_Dual - Number_None - Number_Plur - Number_Sing - Number_Ptan # bg - Number_Count # bg - NumType_Card - NumType_Dist - NumType_Frac - NumType_Gen - NumType_Mult - NumType_None - NumType_Ord - NumType_Sets - Person_One - Person_Two - Person_Three - Person_None - Poss_Yes - PronType_AdvPart - PronType_Art - PronType_Default - PronType_Dem - PronType_Ind - PronType_Int - PronType_Neg - PronType_Prs - PronType_Rcp - PronType_Rel - PronType_Tot - PronType_Clit - PronType_Exc # es, ca, it, fa - Reflex_Yes - Tense_Fut - Tense_Imp - Tense_Past - Tense_Pres - VerbForm_Fin - VerbForm_Ger - VerbForm_Inf - VerbForm_None - VerbForm_Part - VerbForm_PartFut - VerbForm_PartPast - VerbForm_PartPres - VerbForm_Sup - VerbForm_Trans - VerbForm_Gdv # la - Voice_Act - Voice_Cau - Voice_Pass - Voice_Mid # gkc - Voice_Int # hb - Abbr_Yes # cz, fi, sl, U - AdpType_Prep # cz, U - AdpType_Post # U - AdpType_Voc # cz - AdpType_Comprep # cz - AdpType_Circ # U - AdvType_Man - AdvType_Loc - AdvType_Tim - AdvType_Deg - AdvType_Cau - AdvType_Mod - AdvType_Sta - AdvType_Ex - AdvType_Adadj - ConjType_Oper # cz, U - ConjType_Comp # cz, U - Connegative_Yes # fi - Derivation_Minen # fi - Derivation_Sti # fi - Derivation_Inen # fi - Derivation_Lainen # fi - Derivation_Ja # fi - Derivation_Ton # fi - Derivation_Vs # fi - Derivation_Ttain # fi - Derivation_Ttaa # fi - Echo_Rdp # U - Echo_Ech # U - Foreign_Foreign # cz, fi, U - Foreign_Fscript # cz, fi, U - Foreign_Tscript # cz, U - Foreign_Yes # sl - Gender_dat_Masc # bq, U - Gender_dat_Fem # bq, U - Gender_erg_Masc # bq - Gender_erg_Fem # bq - Gender_psor_Masc # cz, sl, U - Gender_psor_Fem # cz, sl, U - Gender_psor_Neut # sl - Hyph_Yes # cz, U - InfForm_One # fi - InfForm_Two # fi - InfForm_Three # fi - NameType_Geo # U, cz - NameType_Prs # U, cz - NameType_Giv # U, cz - NameType_Sur # U, cz - NameType_Nat # U, cz - NameType_Com # U, cz - NameType_Pro # U, cz - NameType_Oth # U, cz - NounType_Com # U - NounType_Prop # U - NounType_Class # U - Number_abs_Sing # bq, U - Number_abs_Plur # bq, U - Number_dat_Sing # bq, U - Number_dat_Plur # bq, U - Number_erg_Sing # bq, U - Number_erg_Plur # bq, U - Number_psee_Sing # U - Number_psee_Plur # U - Number_psor_Sing # cz, fi, sl, U - Number_psor_Plur # cz, fi, sl, U - NumForm_Digit # cz, sl, U - NumForm_Roman # cz, sl, U - NumForm_Word # cz, sl, U - NumValue_One # cz, U - NumValue_Two # cz, U - NumValue_Three # cz, U - PartForm_Pres # fi - PartForm_Past # fi - PartForm_Agt # fi - PartForm_Neg # fi - PartType_Mod # U - PartType_Emp # U - PartType_Res # U - PartType_Inf # U - PartType_Vbp # U - Person_abs_One # bq, U - Person_abs_Two # bq, U - Person_abs_Three # bq, U - Person_dat_One # bq, U - Person_dat_Two # bq, U - Person_dat_Three # bq, U - Person_erg_One # bq, U - Person_erg_Two # bq, U - Person_erg_Three # bq, U - Person_psor_One # fi, U - Person_psor_Two # fi, U - Person_psor_Three # fi, U - Polite_Inf # bq, U - Polite_Pol # bq, U - Polite_abs_Inf # bq, U - Polite_abs_Pol # bq, U - Polite_erg_Inf # bq, U - Polite_erg_Pol # bq, U - Polite_dat_Inf # bq, U - Polite_dat_Pol # bq, U - Prefix_Yes # U - PrepCase_Npr # cz - PrepCase_Pre # U - PunctSide_Ini # U - PunctSide_Fin # U - PunctType_Peri # U - PunctType_Qest # U - PunctType_Excl # U - PunctType_Quot # U - PunctType_Brck # U - PunctType_Comm # U - PunctType_Colo 
# U - PunctType_Semi # U - PunctType_Dash # U - Style_Arch # cz, fi, U - Style_Rare # cz, fi, U - Style_Poet # cz, U - Style_Norm # cz, U - Style_Coll # cz, U - Style_Vrnc # cz, U - Style_Sing # cz, U - Style_Expr # cz, U - Style_Derg # cz, U - Style_Vulg # cz, U - Style_Yes # fi, U - StyleVariant_StyleShort # cz - StyleVariant_StyleBound # cz, sl - VerbType_Aux # U - VerbType_Cop # U - VerbType_Mod # U - VerbType_Light # U diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 534f64a59..e8b1f3520 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -6,7 +6,7 @@ try: except ImportError: import json -from .parts_of_speech import UNIV_POS_NAMES +from .parts_of_speech import IDS as POS_IDS from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT @@ -14,6 +14,7 @@ cdef class Morphology: def __init__(self, StringStore string_store, tag_map, lemmatizer): self.mem = Pool() self.strings = string_store + self.tag_map = tag_map self.lemmatizer = lemmatizer self.n_tags = len(tag_map) + 1 self.tag_names = tuple(sorted(tag_map.keys())) @@ -24,10 +25,13 @@ cdef class Morphology: self.rich_tags[i].id = i self.rich_tags[i].name = self.strings[tag_str] self.rich_tags[i].morph = 0 - self.rich_tags[i].pos = UNIV_POS_NAMES[props['pos'].upper()] + self.rich_tags[i].pos = POS_IDS[props['pos'].upper()] self.reverse_index[self.rich_tags[i].name] = i self._cache = PreshMapArray(self.n_tags) + def __reduce__(self): + return (Morphology, (self.strings, self.tag_map, self.lemmatizer), None, None) + cdef int assign_tag(self, TokenC* token, tag) except -1: cdef int tag_id if isinstance(tag, basestring): @@ -89,3 +93,254 @@ cdef class Morphology: lemma_string = sorted(lemma_strings)[0] lemma = self.strings[lemma_string] return lemma + +IDS = { + "Animacy_anim": Animacy_anim, + "Animacy_inam": Animacy_inam, + "Aspect_freq": Aspect_freq, + "Aspect_imp": Aspect_imp, + "Aspect_mod": Aspect_mod, + "Aspect_none": Aspect_none, + "Aspect_perf": Aspect_perf, + "Case_abe": Case_abe, + "Case_abl": Case_abl, + "Case_abs": Case_abs, + "Case_acc": Case_acc, + "Case_ade": Case_ade, + "Case_all": Case_all, + "Case_cau": Case_cau, + "Case_com": Case_com, + "Case_dat": Case_dat, + "Case_del": Case_del, + "Case_dis": Case_dis, + "Case_ela": Case_ela, + "Case_ess": Case_ess, + "Case_gen": Case_gen, + "Case_ill": Case_ill, + "Case_ine": Case_ine, + "Case_ins": Case_ins, + "Case_loc": Case_loc, + "Case_lat": Case_lat, + "Case_nom": Case_nom, + "Case_par": Case_par, + "Case_sub": Case_sub, + "Case_sup": Case_sup, + "Case_tem": Case_tem, + "Case_ter": Case_ter, + "Case_tra": Case_tra, + "Case_voc": Case_voc, + "Definite_two": Definite_two, + "Definite_def": Definite_def, + "Definite_red": Definite_red, + "Definite_ind": Definite_ind, + "Degree_cmp": Degree_cmp, + "Degree_comp": Degree_comp, + "Degree_none": Degree_none, + "Degree_pos": Degree_pos, + "Degree_sup": Degree_sup, + "Degree_abs": Degree_abs, + "Degree_com": Degree_com, + "Degree_dim ": Degree_dim, # du + "Gender_com": Gender_com, + "Gender_fem": Gender_fem, + "Gender_masc": Gender_masc, + "Gender_neut": Gender_neut, + "Mood_cnd": Mood_cnd, + "Mood_imp": Mood_imp, + "Mood_ind": Mood_ind, + "Mood_n": Mood_n, + "Mood_pot": Mood_pot, + "Mood_sub": Mood_sub, + "Mood_opt": Mood_opt, + "Negative_neg": Negative_neg, + "Negative_pos": Negative_pos, + "Negative_yes": Negative_yes, + "Number_com": Number_com, + "Number_dual": Number_dual, + "Number_none": Number_none, + "Number_plur": Number_plur, + "Number_sing": Number_sing, + "Number_ptan ": Number_ptan, # bg + "Number_count 
": Number_count, # bg + "NumType_card": NumType_card, + "NumType_dist": NumType_dist, + "NumType_frac": NumType_frac, + "NumType_gen": NumType_gen, + "NumType_mult": NumType_mult, + "NumType_none": NumType_none, + "NumType_ord": NumType_ord, + "NumType_sets": NumType_sets, + "Person_one": Person_one, + "Person_two": Person_two, + "Person_three": Person_three, + "Person_none": Person_none, + "Poss_yes": Poss_yes, + "PronType_advPart": PronType_advPart, + "PronType_art": PronType_art, + "PronType_default": PronType_default, + "PronType_dem": PronType_dem, + "PronType_ind": PronType_ind, + "PronType_int": PronType_int, + "PronType_neg": PronType_neg, + "PronType_prs": PronType_prs, + "PronType_rcp": PronType_rcp, + "PronType_rel": PronType_rel, + "PronType_tot": PronType_tot, + "PronType_clit": PronType_clit, + "PronType_exc ": PronType_exc, # es, ca, it, fa, + "Reflex_yes": Reflex_yes, + "Tense_fut": Tense_fut, + "Tense_imp": Tense_imp, + "Tense_past": Tense_past, + "Tense_pres": Tense_pres, + "VerbForm_fin": VerbForm_fin, + "VerbForm_ger": VerbForm_ger, + "VerbForm_inf": VerbForm_inf, + "VerbForm_none": VerbForm_none, + "VerbForm_part": VerbForm_part, + "VerbForm_partFut": VerbForm_partFut, + "VerbForm_partPast": VerbForm_partPast, + "VerbForm_partPres": VerbForm_partPres, + "VerbForm_sup": VerbForm_sup, + "VerbForm_trans": VerbForm_trans, + "VerbForm_gdv ": VerbForm_gdv, # la, + "Voice_act": Voice_act, + "Voice_cau": Voice_cau, + "Voice_pass": Voice_pass, + "Voice_mid ": Voice_mid, # gkc, + "Voice_int ": Voice_int, # hb, + "Abbr_yes ": Abbr_yes, # cz, fi, sl, U, + "AdpType_prep ": AdpType_prep, # cz, U, + "AdpType_post ": AdpType_post, # U, + "AdpType_voc ": AdpType_voc, # cz, + "AdpType_comprep ": AdpType_comprep, # cz, + "AdpType_circ ": AdpType_circ, # U, + "AdvType_man": AdvType_man, + "AdvType_loc": AdvType_loc, + "AdvType_tim": AdvType_tim, + "AdvType_deg": AdvType_deg, + "AdvType_cau": AdvType_cau, + "AdvType_mod": AdvType_mod, + "AdvType_sta": AdvType_sta, + "AdvType_ex": AdvType_ex, + "AdvType_adadj": AdvType_adadj, + "ConjType_oper ": ConjType_oper, # cz, U, + "ConjType_comp ": ConjType_comp, # cz, U, + "Connegative_yes ": Connegative_yes, # fi, + "Derivation_minen ": Derivation_minen, # fi, + "Derivation_sti ": Derivation_sti, # fi, + "Derivation_inen ": Derivation_inen, # fi, + "Derivation_lainen ": Derivation_lainen, # fi, + "Derivation_ja ": Derivation_ja, # fi, + "Derivation_ton ": Derivation_ton, # fi, + "Derivation_vs ": Derivation_vs, # fi, + "Derivation_ttain ": Derivation_ttain, # fi, + "Derivation_ttaa ": Derivation_ttaa, # fi, + "Echo_rdp ": Echo_rdp, # U, + "Echo_ech ": Echo_ech, # U, + "Foreign_foreign ": Foreign_foreign, # cz, fi, U, + "Foreign_fscript ": Foreign_fscript, # cz, fi, U, + "Foreign_tscript ": Foreign_tscript, # cz, U, + "Foreign_yes ": Foreign_yes, # sl, + "Gender_dat_masc ": Gender_dat_masc, # bq, U, + "Gender_dat_fem ": Gender_dat_fem, # bq, U, + "Gender_erg_masc ": Gender_erg_masc, # bq, + "Gender_erg_fem ": Gender_erg_fem, # bq, + "Gender_psor_masc ": Gender_psor_masc, # cz, sl, U, + "Gender_psor_fem ": Gender_psor_fem, # cz, sl, U, + "Gender_psor_neut ": Gender_psor_neut, # sl, + "Hyph_yes ": Hyph_yes, # cz, U, + "InfForm_one ": InfForm_one, # fi, + "InfForm_two ": InfForm_two, # fi, + "InfForm_three ": InfForm_three, # fi, + "NameType_geo ": NameType_geo, # U, cz, + "NameType_prs ": NameType_prs, # U, cz, + "NameType_giv ": NameType_giv, # U, cz, + "NameType_sur ": NameType_sur, # U, cz, + "NameType_nat ": NameType_nat, # U, cz, + "NameType_com 
": NameType_com, # U, cz, + "NameType_pro ": NameType_pro, # U, cz, + "NameType_oth ": NameType_oth, # U, cz, + "NounType_com ": NounType_com, # U, + "NounType_prop ": NounType_prop, # U, + "NounType_class ": NounType_class, # U, + "Number_abs_sing ": Number_abs_sing, # bq, U, + "Number_abs_plur ": Number_abs_plur, # bq, U, + "Number_dat_sing ": Number_dat_sing, # bq, U, + "Number_dat_plur ": Number_dat_plur, # bq, U, + "Number_erg_sing ": Number_erg_sing, # bq, U, + "Number_erg_plur ": Number_erg_plur, # bq, U, + "Number_psee_sing ": Number_psee_sing, # U, + "Number_psee_plur ": Number_psee_plur, # U, + "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U, + "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U, + "NumForm_digit ": NumForm_digit, # cz, sl, U, + "NumForm_roman ": NumForm_roman, # cz, sl, U, + "NumForm_word ": NumForm_word, # cz, sl, U, + "NumValue_one ": NumValue_one, # cz, U, + "NumValue_two ": NumValue_two, # cz, U, + "NumValue_three ": NumValue_three, # cz, U, + "PartForm_pres ": PartForm_pres, # fi, + "PartForm_past ": PartForm_past, # fi, + "PartForm_agt ": PartForm_agt, # fi, + "PartForm_neg ": PartForm_neg, # fi, + "PartType_mod ": PartType_mod, # U, + "PartType_emp ": PartType_emp, # U, + "PartType_res ": PartType_res, # U, + "PartType_inf ": PartType_inf, # U, + "PartType_vbp ": PartType_vbp, # U, + "Person_abs_one ": Person_abs_one, # bq, U, + "Person_abs_two ": Person_abs_two, # bq, U, + "Person_abs_three ": Person_abs_three, # bq, U, + "Person_dat_one ": Person_dat_one, # bq, U, + "Person_dat_two ": Person_dat_two, # bq, U, + "Person_dat_three ": Person_dat_three, # bq, U, + "Person_erg_one ": Person_erg_one, # bq, U, + "Person_erg_two ": Person_erg_two, # bq, U, + "Person_erg_three ": Person_erg_three, # bq, U, + "Person_psor_one ": Person_psor_one, # fi, U, + "Person_psor_two ": Person_psor_two, # fi, U, + "Person_psor_three ": Person_psor_three, # fi, U, + "Polite_inf ": Polite_inf, # bq, U, + "Polite_pol ": Polite_pol, # bq, U, + "Polite_abs_inf ": Polite_abs_inf, # bq, U, + "Polite_abs_pol ": Polite_abs_pol, # bq, U, + "Polite_erg_inf ": Polite_erg_inf, # bq, U, + "Polite_erg_pol ": Polite_erg_pol, # bq, U, + "Polite_dat_inf ": Polite_dat_inf, # bq, U, + "Polite_dat_pol ": Polite_dat_pol, # bq, U, + "Prefix_yes ": Prefix_yes, # U, + "PrepCase_npr ": PrepCase_npr, # cz, + "PrepCase_pre ": PrepCase_pre, # U, + "PunctSide_ini ": PunctSide_ini, # U, + "PunctSide_fin ": PunctSide_fin, # U, + "PunctType_peri ": PunctType_peri, # U, + "PunctType_qest ": PunctType_qest, # U, + "PunctType_excl ": PunctType_excl, # U, + "PunctType_quot ": PunctType_quot, # U, + "PunctType_brck ": PunctType_brck, # U, + "PunctType_comm ": PunctType_comm, # U, + "PunctType_colo ": PunctType_colo, # U, + "PunctType_semi ": PunctType_semi, # U, + "PunctType_dash ": PunctType_dash, # U, + "Style_arch ": Style_arch, # cz, fi, U, + "Style_rare ": Style_rare, # cz, fi, U, + "Style_poet ": Style_poet, # cz, U, + "Style_norm ": Style_norm, # cz, U, + "Style_coll ": Style_coll, # cz, U, + "Style_vrnc ": Style_vrnc, # cz, U, + "Style_sing ": Style_sing, # cz, U, + "Style_expr ": Style_expr, # cz, U, + "Style_derg ": Style_derg, # cz, U, + "Style_vulg ": Style_vulg, # cz, U, + "Style_yes ": Style_yes, # fi, U, + "StyleVariant_styleShort ": StyleVariant_styleShort, # cz, + "StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl, + "VerbType_aux ": VerbType_aux, # U, + "VerbType_cop ": VerbType_cop, # U, + "VerbType_mod ": VerbType_mod, # U, + "VerbType_light ": VerbType_light, # U, +} + 
+ +NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd index e410c6971..c97673a69 100644 --- a/spacy/parts_of_speech.pxd +++ b/spacy/parts_of_speech.pxd @@ -1,7 +1,8 @@ -# Google universal tag set +from . cimport symbols + cpdef enum univ_pos_t: - NO_TAG - ADJ + NO_TAG = 0 + ADJ = symbols.ADJ ADP ADV AUX @@ -20,4 +21,3 @@ cpdef enum univ_pos_t: X EOL SPACE - N_UNIV_TAGS diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index 8c2348a47..14933480c 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -1,8 +1,8 @@ from __future__ import unicode_literals -UNIV_POS_NAMES = { - "NO_TAG": NO_TAG, +IDS = { + "": NO_TAG, "ADJ": ADJ, "ADP": ADP, "ADV": ADV, @@ -23,3 +23,6 @@ UNIV_POS_NAMES = { "EOL": EOL, "SPACE": SPACE } + + +NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] diff --git a/spacy/strings.pyx b/spacy/strings.pyx index a4a470158..2208d3bdf 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -69,12 +69,15 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except cdef class StringStore: '''Map strings to and from integer IDs.''' - def __init__(self): + def __init__(self, strings=None): self.mem = Pool() self._map = PreshMap() self._resize_at = 10000 self.c = self.mem.alloc(self._resize_at, sizeof(Utf8Str)) self.size = 1 + if strings is not None: + for string in strings: + _ = self[string] property size: def __get__(self): @@ -113,6 +116,14 @@ cdef class StringStore: for i in range(self.size): yield self[i] + def __reduce__(self): + strings = [""] + for i in range(1, self.size): + string = &self.c[i] + py_string = _decode(string) + strings.append(py_string) + return (StringStore, (strings,), None, None, None) + cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL: # 0 means missing, but we don't bother offsetting the index. 
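+        # The key is a 64-bit MurmurHash of the raw UTF-8 bytes, seed 0.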
key = hash64(chars, length * sizeof(char), 0) diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd new file mode 100644 index 000000000..0c60f6f67 --- /dev/null +++ b/spacy/symbols.pxd @@ -0,0 +1,421 @@ +cpdef enum symbol_t: + NIL + IS_ALPHA + IS_ASCII + IS_DIGIT + IS_LOWER + IS_PUNCT + IS_SPACE + IS_TITLE + IS_UPPER + LIKE_URL + LIKE_NUM + LIKE_EMAIL + IS_STOP + IS_OOV + + FLAG14 + FLAG15 + FLAG16 + FLAG17 + FLAG18 + FLAG19 + FLAG20 + FLAG21 + FLAG22 + FLAG23 + FLAG24 + FLAG25 + FLAG26 + FLAG27 + FLAG28 + FLAG29 + FLAG30 + FLAG31 + FLAG32 + FLAG33 + FLAG34 + FLAG35 + FLAG36 + FLAG37 + FLAG38 + FLAG39 + FLAG40 + FLAG41 + FLAG42 + FLAG43 + FLAG44 + FLAG45 + FLAG46 + FLAG47 + FLAG48 + FLAG49 + FLAG50 + FLAG51 + FLAG52 + FLAG53 + FLAG54 + FLAG55 + FLAG56 + FLAG57 + FLAG58 + FLAG59 + FLAG60 + FLAG61 + FLAG62 + FLAG63 + + ID + ORTH + LOWER + NORM + SHAPE + PREFIX + SUFFIX + + LENGTH + CLUSTER + LEMMA + POS + TAG + DEP + ENT_IOB + ENT_TYPE + HEAD + SPACY + PROB + + ADJ + ADP + ADV + AUX + CONJ + DET + INTJ + NOUN + NUM + PART + PRON + PROPN + PUNCT + SCONJ + SYM + VERB + X + EOL + SPACE + + Animacy_anim + Animacy_inam + Aspect_freq + Aspect_imp + Aspect_mod + Aspect_none + Aspect_perf + Case_abe + Case_abl + Case_abs + Case_acc + Case_ade + Case_all + Case_cau + Case_com + Case_dat + Case_del + Case_dis + Case_ela + Case_ess + Case_gen + Case_ill + Case_ine + Case_ins + Case_loc + Case_lat + Case_nom + Case_par + Case_sub + Case_sup + Case_tem + Case_ter + Case_tra + Case_voc + Definite_two + Definite_def + Definite_red + Definite_ind + Degree_cmp + Degree_comp + Degree_none + Degree_pos + Degree_sup + Degree_abs + Degree_com + Degree_dim # du + Gender_com + Gender_fem + Gender_masc + Gender_neut + Mood_cnd + Mood_imp + Mood_ind + Mood_n + Mood_pot + Mood_sub + Mood_opt + Negative_neg + Negative_pos + Negative_yes + Number_com + Number_dual + Number_none + Number_plur + Number_sing + Number_ptan # bg + Number_count # bg + NumType_card + NumType_dist + NumType_frac + NumType_gen + NumType_mult + NumType_none + NumType_ord + NumType_sets + Person_one + Person_two + Person_three + Person_none + Poss_yes + PronType_advPart + PronType_art + PronType_default + PronType_dem + PronType_ind + PronType_int + PronType_neg + PronType_prs + PronType_rcp + PronType_rel + PronType_tot + PronType_clit + PronType_exc # es, ca, it, fa + Reflex_yes + Tense_fut + Tense_imp + Tense_past + Tense_pres + VerbForm_fin + VerbForm_ger + VerbForm_inf + VerbForm_none + VerbForm_part + VerbForm_partFut + VerbForm_partPast + VerbForm_partPres + VerbForm_sup + VerbForm_trans + VerbForm_gdv # la + Voice_act + Voice_cau + Voice_pass + Voice_mid # gkc + Voice_int # hb + Abbr_yes # cz, fi, sl, U + AdpType_prep # cz, U + AdpType_post # U + AdpType_voc # cz + AdpType_comprep # cz + AdpType_circ # U + AdvType_man + AdvType_loc + AdvType_tim + AdvType_deg + AdvType_cau + AdvType_mod + AdvType_sta + AdvType_ex + AdvType_adadj + ConjType_oper # cz, U + ConjType_comp # cz, U + Connegative_yes # fi + Derivation_minen # fi + Derivation_sti # fi + Derivation_inen # fi + Derivation_lainen # fi + Derivation_ja # fi + Derivation_ton # fi + Derivation_vs # fi + Derivation_ttain # fi + Derivation_ttaa # fi + Echo_rdp # U + Echo_ech # U + Foreign_foreign # cz, fi, U + Foreign_fscript # cz, fi, U + Foreign_tscript # cz, U + Foreign_yes # sl + Gender_dat_masc # bq, U + Gender_dat_fem # bq, U + Gender_erg_masc # bq + Gender_erg_fem # bq + Gender_psor_masc # cz, sl, U + Gender_psor_fem # cz, sl, U + Gender_psor_neut # sl + Hyph_yes # cz, U + 
InfForm_one # fi + InfForm_two # fi + InfForm_three # fi + NameType_geo # U, cz + NameType_prs # U, cz + NameType_giv # U, cz + NameType_sur # U, cz + NameType_nat # U, cz + NameType_com # U, cz + NameType_pro # U, cz + NameType_oth # U, cz + NounType_com # U + NounType_prop # U + NounType_class # U + Number_abs_sing # bq, U + Number_abs_plur # bq, U + Number_dat_sing # bq, U + Number_dat_plur # bq, U + Number_erg_sing # bq, U + Number_erg_plur # bq, U + Number_psee_sing # U + Number_psee_plur # U + Number_psor_sing # cz, fi, sl, U + Number_psor_plur # cz, fi, sl, U + NumForm_digit # cz, sl, U + NumForm_roman # cz, sl, U + NumForm_word # cz, sl, U + NumValue_one # cz, U + NumValue_two # cz, U + NumValue_three # cz, U + PartForm_pres # fi + PartForm_past # fi + PartForm_agt # fi + PartForm_neg # fi + PartType_mod # U + PartType_emp # U + PartType_res # U + PartType_inf # U + PartType_vbp # U + Person_abs_one # bq, U + Person_abs_two # bq, U + Person_abs_three # bq, U + Person_dat_one # bq, U + Person_dat_two # bq, U + Person_dat_three # bq, U + Person_erg_one # bq, U + Person_erg_two # bq, U + Person_erg_three # bq, U + Person_psor_one # fi, U + Person_psor_two # fi, U + Person_psor_three # fi, U + Polite_inf # bq, U + Polite_pol # bq, U + Polite_abs_inf # bq, U + Polite_abs_pol # bq, U + Polite_erg_inf # bq, U + Polite_erg_pol # bq, U + Polite_dat_inf # bq, U + Polite_dat_pol # bq, U + Prefix_yes # U + PrepCase_npr # cz + PrepCase_pre # U + PunctSide_ini # U + PunctSide_fin # U + PunctType_peri # U + PunctType_qest # U + PunctType_excl # U + PunctType_quot # U + PunctType_brck # U + PunctType_comm # U + PunctType_colo # U + PunctType_semi # U + PunctType_dash # U + Style_arch # cz, fi, U + Style_rare # cz, fi, U + Style_poet # cz, U + Style_norm # cz, U + Style_coll # cz, U + Style_vrnc # cz, U + Style_sing # cz, U + Style_expr # cz, U + Style_derg # cz, U + Style_vulg # cz, U + Style_yes # fi, U + StyleVariant_styleShort # cz + StyleVariant_styleBound # cz, sl + VerbType_aux # U + VerbType_cop # U + VerbType_mod # U + VerbType_light # U + + PERSON + NORP + FACILITY + ORG + GPE + LOC + PRODUCT + EVENT + WORK_OF_ART + LANGUAGE + + DATE + TIME + PERCENT + MONEY + QUANTITY + ORDINAL + CARDINAL + + acomp + advcl + advmod + agent + amod + appos + attr + aux + auxpass + cc + ccomp + complm + conj + csubj + csubjpass + dep + det + dobj + expl + hmod + hyph + infmod + intj + iobj + mark + meta + neg + nmod + nn + npadvmod + nsubj + nsubjpass + num + number + oprd + parataxis + partmod + pcomp + pobj + poss + possessive + preconj + prep + prt + punct + quantmod + rcmod + root + xcomp diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx new file mode 100644 index 000000000..31b01db98 --- /dev/null +++ b/spacy/symbols.pyx @@ -0,0 +1,424 @@ +IDS = { + "": NIL, + "IS_ALPHA": IS_ALPHA, + "IS_ASCII": IS_ASCII, + "IS_DIGIT": IS_DIGIT, + "IS_LOWER": IS_LOWER, + "IS_PUNCT": IS_PUNCT, + "IS_SPACE": IS_SPACE, + "IS_TITLE": IS_TITLE, + "IS_UPPER": IS_UPPER, + "LIKE_URL": LIKE_URL, + "LIKE_NUM": LIKE_NUM, + "LIKE_EMAIL": LIKE_EMAIL, + "IS_STOP": IS_STOP, + "IS_OOV": IS_OOV, + + "FLAG14": FLAG14, + "FLAG15": FLAG15, + "FLAG16": FLAG16, + "FLAG17": FLAG17, + "FLAG18": FLAG18, + "FLAG19": FLAG19, + "FLAG20": FLAG20, + "FLAG21": FLAG21, + "FLAG22": FLAG22, + "FLAG23": FLAG23, + "FLAG24": FLAG24, + "FLAG25": FLAG25, + "FLAG26": FLAG26, + "FLAG27": FLAG27, + "FLAG28": FLAG28, + "FLAG29": FLAG29, + "FLAG30": FLAG30, + "FLAG31": FLAG31, + "FLAG32": FLAG32, + "FLAG33": FLAG33, + "FLAG34": FLAG34, + "FLAG35": FLAG35, + 
"FLAG36": FLAG36, + "FLAG37": FLAG37, + "FLAG38": FLAG38, + "FLAG39": FLAG39, + "FLAG40": FLAG40, + "FLAG41": FLAG41, + "FLAG42": FLAG42, + "FLAG43": FLAG43, + "FLAG44": FLAG44, + "FLAG45": FLAG45, + "FLAG46": FLAG46, + "FLAG47": FLAG47, + "FLAG48": FLAG48, + "FLAG49": FLAG49, + "FLAG50": FLAG50, + "FLAG51": FLAG51, + "FLAG52": FLAG52, + "FLAG53": FLAG53, + "FLAG54": FLAG54, + "FLAG55": FLAG55, + "FLAG56": FLAG56, + "FLAG57": FLAG57, + "FLAG58": FLAG58, + "FLAG59": FLAG59, + "FLAG60": FLAG60, + "FLAG61": FLAG61, + "FLAG62": FLAG62, + "FLAG63": FLAG63, + + "ID": ID, + "ORTH": ORTH, + "LOWER": LOWER, + "NORM": NORM, + "SHAPE": SHAPE, + "PREFIX": PREFIX, + "SUFFIX": SUFFIX, + + "LENGTH": LENGTH, + "CLUSTER": CLUSTER, + "LEMMA": LEMMA, + "POS": POS, + "TAG": TAG, + "DEP": DEP, + "ENT_IOB": ENT_IOB, + "ENT_TYPE": ENT_TYPE, + "HEAD": HEAD, + "SPACY": SPACY, + "PROB": PROB, + + "ADJ": ADJ, + "ADP": ADP, + "ADV": ADV, + "AUX": AUX, + "CONJ": CONJ, + "DET": DET, + "INTJ": INTJ, + "NOUN": NOUN, + "NUM": NUM, + "PART": PART, + "PRON": PRON, + "PROPN": PROPN, + "PUNCT": PUNCT, + "SCONJ": SCONJ, + "SYM": SYM, + "VERB": VERB, + "X": X, + "EOL": EOL, + "SPACE": SPACE, + + "Animacy_anim": Animacy_anim, + "Animacy_inam": Animacy_inam, + "Aspect_freq": Aspect_freq, + "Aspect_imp": Aspect_imp, + "Aspect_mod": Aspect_mod, + "Aspect_none": Aspect_none, + "Aspect_perf": Aspect_perf, + "Case_abe": Case_abe, + "Case_abl": Case_abl, + "Case_abs": Case_abs, + "Case_acc": Case_acc, + "Case_ade": Case_ade, + "Case_all": Case_all, + "Case_cau": Case_cau, + "Case_com": Case_com, + "Case_dat": Case_dat, + "Case_del": Case_del, + "Case_dis": Case_dis, + "Case_ela": Case_ela, + "Case_ess": Case_ess, + "Case_gen": Case_gen, + "Case_ill": Case_ill, + "Case_ine": Case_ine, + "Case_ins": Case_ins, + "Case_loc": Case_loc, + "Case_lat": Case_lat, + "Case_nom": Case_nom, + "Case_par": Case_par, + "Case_sub": Case_sub, + "Case_sup": Case_sup, + "Case_tem": Case_tem, + "Case_ter": Case_ter, + "Case_tra": Case_tra, + "Case_voc": Case_voc, + "Definite_two": Definite_two, + "Definite_def": Definite_def, + "Definite_red": Definite_red, + "Definite_ind": Definite_ind, + "Degree_cmp": Degree_cmp, + "Degree_comp": Degree_comp, + "Degree_none": Degree_none, + "Degree_pos": Degree_pos, + "Degree_sup": Degree_sup, + "Degree_abs": Degree_abs, + "Degree_com": Degree_com, + "Degree_dim ": Degree_dim, # du + "Gender_com": Gender_com, + "Gender_fem": Gender_fem, + "Gender_masc": Gender_masc, + "Gender_neut": Gender_neut, + "Mood_cnd": Mood_cnd, + "Mood_imp": Mood_imp, + "Mood_ind": Mood_ind, + "Mood_n": Mood_n, + "Mood_pot": Mood_pot, + "Mood_sub": Mood_sub, + "Mood_opt": Mood_opt, + "Negative_neg": Negative_neg, + "Negative_pos": Negative_pos, + "Negative_yes": Negative_yes, + "Number_com": Number_com, + "Number_dual": Number_dual, + "Number_none": Number_none, + "Number_plur": Number_plur, + "Number_sing": Number_sing, + "Number_ptan ": Number_ptan, # bg + "Number_count ": Number_count, # bg + "NumType_card": NumType_card, + "NumType_dist": NumType_dist, + "NumType_frac": NumType_frac, + "NumType_gen": NumType_gen, + "NumType_mult": NumType_mult, + "NumType_none": NumType_none, + "NumType_ord": NumType_ord, + "NumType_sets": NumType_sets, + "Person_one": Person_one, + "Person_two": Person_two, + "Person_three": Person_three, + "Person_none": Person_none, + "Poss_yes": Poss_yes, + "PronType_advPart": PronType_advPart, + "PronType_art": PronType_art, + "PronType_default": PronType_default, + "PronType_dem": PronType_dem, + "PronType_ind": 
PronType_ind, + "PronType_int": PronType_int, + "PronType_neg": PronType_neg, + "PronType_prs": PronType_prs, + "PronType_rcp": PronType_rcp, + "PronType_rel": PronType_rel, + "PronType_tot": PronType_tot, + "PronType_clit": PronType_clit, + "PronType_exc ": PronType_exc, # es, ca, it, fa, + "Reflex_yes": Reflex_yes, + "Tense_fut": Tense_fut, + "Tense_imp": Tense_imp, + "Tense_past": Tense_past, + "Tense_pres": Tense_pres, + "VerbForm_fin": VerbForm_fin, + "VerbForm_ger": VerbForm_ger, + "VerbForm_inf": VerbForm_inf, + "VerbForm_none": VerbForm_none, + "VerbForm_part": VerbForm_part, + "VerbForm_partFut": VerbForm_partFut, + "VerbForm_partPast": VerbForm_partPast, + "VerbForm_partPres": VerbForm_partPres, + "VerbForm_sup": VerbForm_sup, + "VerbForm_trans": VerbForm_trans, + "VerbForm_gdv ": VerbForm_gdv, # la, + "Voice_act": Voice_act, + "Voice_cau": Voice_cau, + "Voice_pass": Voice_pass, + "Voice_mid ": Voice_mid, # gkc, + "Voice_int ": Voice_int, # hb, + "Abbr_yes ": Abbr_yes, # cz, fi, sl, U, + "AdpType_prep ": AdpType_prep, # cz, U, + "AdpType_post ": AdpType_post, # U, + "AdpType_voc ": AdpType_voc, # cz, + "AdpType_comprep ": AdpType_comprep, # cz, + "AdpType_circ ": AdpType_circ, # U, + "AdvType_man": AdvType_man, + "AdvType_loc": AdvType_loc, + "AdvType_tim": AdvType_tim, + "AdvType_deg": AdvType_deg, + "AdvType_cau": AdvType_cau, + "AdvType_mod": AdvType_mod, + "AdvType_sta": AdvType_sta, + "AdvType_ex": AdvType_ex, + "AdvType_adadj": AdvType_adadj, + "ConjType_oper ": ConjType_oper, # cz, U, + "ConjType_comp ": ConjType_comp, # cz, U, + "Connegative_yes ": Connegative_yes, # fi, + "Derivation_minen ": Derivation_minen, # fi, + "Derivation_sti ": Derivation_sti, # fi, + "Derivation_inen ": Derivation_inen, # fi, + "Derivation_lainen ": Derivation_lainen, # fi, + "Derivation_ja ": Derivation_ja, # fi, + "Derivation_ton ": Derivation_ton, # fi, + "Derivation_vs ": Derivation_vs, # fi, + "Derivation_ttain ": Derivation_ttain, # fi, + "Derivation_ttaa ": Derivation_ttaa, # fi, + "Echo_rdp ": Echo_rdp, # U, + "Echo_ech ": Echo_ech, # U, + "Foreign_foreign ": Foreign_foreign, # cz, fi, U, + "Foreign_fscript ": Foreign_fscript, # cz, fi, U, + "Foreign_tscript ": Foreign_tscript, # cz, U, + "Foreign_yes ": Foreign_yes, # sl, + "Gender_dat_masc ": Gender_dat_masc, # bq, U, + "Gender_dat_fem ": Gender_dat_fem, # bq, U, + "Gender_erg_masc ": Gender_erg_masc, # bq, + "Gender_erg_fem ": Gender_erg_fem, # bq, + "Gender_psor_masc ": Gender_psor_masc, # cz, sl, U, + "Gender_psor_fem ": Gender_psor_fem, # cz, sl, U, + "Gender_psor_neut ": Gender_psor_neut, # sl, + "Hyph_yes ": Hyph_yes, # cz, U, + "InfForm_one ": InfForm_one, # fi, + "InfForm_two ": InfForm_two, # fi, + "InfForm_three ": InfForm_three, # fi, + "NameType_geo ": NameType_geo, # U, cz, + "NameType_prs ": NameType_prs, # U, cz, + "NameType_giv ": NameType_giv, # U, cz, + "NameType_sur ": NameType_sur, # U, cz, + "NameType_nat ": NameType_nat, # U, cz, + "NameType_com ": NameType_com, # U, cz, + "NameType_pro ": NameType_pro, # U, cz, + "NameType_oth ": NameType_oth, # U, cz, + "NounType_com ": NounType_com, # U, + "NounType_prop ": NounType_prop, # U, + "NounType_class ": NounType_class, # U, + "Number_abs_sing ": Number_abs_sing, # bq, U, + "Number_abs_plur ": Number_abs_plur, # bq, U, + "Number_dat_sing ": Number_dat_sing, # bq, U, + "Number_dat_plur ": Number_dat_plur, # bq, U, + "Number_erg_sing ": Number_erg_sing, # bq, U, + "Number_erg_plur ": Number_erg_plur, # bq, U, + "Number_psee_sing ": Number_psee_sing, # U, + 
"Number_psee_plur ": Number_psee_plur, # U, + "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U, + "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U, + "NumForm_digit ": NumForm_digit, # cz, sl, U, + "NumForm_roman ": NumForm_roman, # cz, sl, U, + "NumForm_word ": NumForm_word, # cz, sl, U, + "NumValue_one ": NumValue_one, # cz, U, + "NumValue_two ": NumValue_two, # cz, U, + "NumValue_three ": NumValue_three, # cz, U, + "PartForm_pres ": PartForm_pres, # fi, + "PartForm_past ": PartForm_past, # fi, + "PartForm_agt ": PartForm_agt, # fi, + "PartForm_neg ": PartForm_neg, # fi, + "PartType_mod ": PartType_mod, # U, + "PartType_emp ": PartType_emp, # U, + "PartType_res ": PartType_res, # U, + "PartType_inf ": PartType_inf, # U, + "PartType_vbp ": PartType_vbp, # U, + "Person_abs_one ": Person_abs_one, # bq, U, + "Person_abs_two ": Person_abs_two, # bq, U, + "Person_abs_three ": Person_abs_three, # bq, U, + "Person_dat_one ": Person_dat_one, # bq, U, + "Person_dat_two ": Person_dat_two, # bq, U, + "Person_dat_three ": Person_dat_three, # bq, U, + "Person_erg_one ": Person_erg_one, # bq, U, + "Person_erg_two ": Person_erg_two, # bq, U, + "Person_erg_three ": Person_erg_three, # bq, U, + "Person_psor_one ": Person_psor_one, # fi, U, + "Person_psor_two ": Person_psor_two, # fi, U, + "Person_psor_three ": Person_psor_three, # fi, U, + "Polite_inf ": Polite_inf, # bq, U, + "Polite_pol ": Polite_pol, # bq, U, + "Polite_abs_inf ": Polite_abs_inf, # bq, U, + "Polite_abs_pol ": Polite_abs_pol, # bq, U, + "Polite_erg_inf ": Polite_erg_inf, # bq, U, + "Polite_erg_pol ": Polite_erg_pol, # bq, U, + "Polite_dat_inf ": Polite_dat_inf, # bq, U, + "Polite_dat_pol ": Polite_dat_pol, # bq, U, + "Prefix_yes ": Prefix_yes, # U, + "PrepCase_npr ": PrepCase_npr, # cz, + "PrepCase_pre ": PrepCase_pre, # U, + "PunctSide_ini ": PunctSide_ini, # U, + "PunctSide_fin ": PunctSide_fin, # U, + "PunctType_peri ": PunctType_peri, # U, + "PunctType_qest ": PunctType_qest, # U, + "PunctType_excl ": PunctType_excl, # U, + "PunctType_quot ": PunctType_quot, # U, + "PunctType_brck ": PunctType_brck, # U, + "PunctType_comm ": PunctType_comm, # U, + "PunctType_colo ": PunctType_colo, # U, + "PunctType_semi ": PunctType_semi, # U, + "PunctType_dash ": PunctType_dash, # U, + "Style_arch ": Style_arch, # cz, fi, U, + "Style_rare ": Style_rare, # cz, fi, U, + "Style_poet ": Style_poet, # cz, U, + "Style_norm ": Style_norm, # cz, U, + "Style_coll ": Style_coll, # cz, U, + "Style_vrnc ": Style_vrnc, # cz, U, + "Style_sing ": Style_sing, # cz, U, + "Style_expr ": Style_expr, # cz, U, + "Style_derg ": Style_derg, # cz, U, + "Style_vulg ": Style_vulg, # cz, U, + "Style_yes ": Style_yes, # fi, U, + "StyleVariant_styleShort ": StyleVariant_styleShort, # cz, + "StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl, + "VerbType_aux ": VerbType_aux, # U, + "VerbType_cop ": VerbType_cop, # U, + "VerbType_mod ": VerbType_mod, # U, + "VerbType_light ": VerbType_light, # U, + + "PERSON": PERSON, + "NORP": NORP, + "FACILITY": FACILITY, + "ORG": ORG, + "GPE": GPE, + "LOC": LOC, + "PRODUCT": PRODUCT, + "EVENT": EVENT, + "WORK_OF_ART": WORK_OF_ART, + "LANGUAGE": LANGUAGE, + + "DATE": DATE, + "TIME": TIME, + "PERCENT": PERCENT, + "MONEY": MONEY, + "QUANTITY": QUANTITY, + "ORDINAL": ORDINAL, + "CARDINAL": CARDINAL, + + "acomp": acomp, + "advcl": advcl, + "advmod": advmod, + "agent": agent, + "amod": amod, + "appos": appos, + "attr": attr, + "aux": aux, + "auxpass": auxpass, + "cc": cc, + "ccomp": ccomp, + "complm": complm, + "conj": conj, + 
"csubj": csubj, + "csubjpass": csubjpass, + "dep": dep, + "det": det, + "dobj": dobj, + "expl": expl, + "hmod": hmod, + "hyph": hyph, + "infmod": infmod, + "intj": intj, + "iobj": iobj, + "mark": mark, + "meta": meta, + "neg": neg, + "nmod": nmod, + "nn": nn, + "npadvmod": npadvmod, + "nsubj": nsubj, + "nsubjpass": nsubjpass, + "num": num, + "number": number, + "oprd": oprd, + "parataxis": parataxis, + "partmod": partmod, + "pcomp": pcomp, + "pobj": pobj, + "poss": poss, + "possessive": possessive, + "preconj": preconj, + "prep": prep, + "prt": prt, + "punct": punct, + "quantmod": quantmod, + "rcmod": rcmod, + "root": root, + "xcomp": xcomp +} + +NAMES = [it[0] for it in sorted(IDS.items(), key=lambda it: it[1])] diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index cf61647b9..25932a0a4 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -83,7 +83,6 @@ cdef class Parser: model = Model(moves.n_moves, templates, model_dir) return cls(strings, moves, model) - def __call__(self, Doc tokens): cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) self.moves.initialize_state(stcls) @@ -93,6 +92,9 @@ cdef class Parser: self.parse(stcls, eg.c) tokens.set_parse(stcls._sent) + def __reduce__(self): + return (Parser, (self.moves.strings, self.moves, self.model), None, None) + cdef void predict(self, StateClass stcls, ExampleC* eg) nogil: memset(eg.scores, 0, eg.nr_class * sizeof(weight_t)) self.moves.set_valid(eg.is_valid, stcls) diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index 4cf9aae7e..38bc91605 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -37,6 +37,8 @@ cdef class TransitionSystem: cdef public int root_label cdef public freqs + cdef object _labels_by_action + cdef int initialize_state(self, StateClass state) except -1 cdef int finalize_state(self, StateClass state) nogil diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 86aef1fbc..5de3513e0 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -15,7 +15,8 @@ class OracleError(Exception): cdef class TransitionSystem: - def __init__(self, StringStore string_table, dict labels_by_action): + def __init__(self, StringStore string_table, dict labels_by_action, _freqs=None): + self._labels_by_action = labels_by_action self.mem = Pool() self.n_moves = sum(len(labels) for labels in labels_by_action.values()) self._is_valid = self.mem.alloc(self.n_moves, sizeof(bint)) @@ -30,7 +31,7 @@ cdef class TransitionSystem: i += 1 self.c = moves self.root_label = self.strings['ROOT'] - self.freqs = {} + self.freqs = {} if _freqs is None else _freqs for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB): self.freqs[attr] = defaultdict(int) self.freqs[attr][0] = 1 @@ -39,6 +40,11 @@ cdef class TransitionSystem: self.freqs[HEAD][i] = 1 self.freqs[HEAD][-i] = 1 + def __reduce__(self): + return (self.__class__, + (self.strings, self._labels_by_action, self.freqs), + None, None) + cdef int initialize_state(self, StateClass state) except -1: pass diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 756bb7ea4..69925ff89 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -148,6 +148,9 @@ cdef class Tagger: tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length + def __reduce__(self): + return (self.__class__, (self.vocab, self.model), None, None) + def tag_from_strings(self, Doc tokens, object tag_strs): cdef int i for i in range(tokens.length): diff 
--git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index eab6c044e..50b19d4c1 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -14,7 +14,6 @@ from ..typedefs cimport attr_t, flags_t from ..attrs cimport attr_id_t from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE -from ..parts_of_speech import UNIV_POS_NAMES from ..parts_of_speech cimport CONJ, PUNCT, NOUN from ..parts_of_speech cimport univ_pos_t from ..lexeme cimport Lexeme diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 25db3f47e..af80b5359 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -9,7 +9,7 @@ import numpy from ..lexeme cimport Lexeme -from ..parts_of_speech import UNIV_POS_NAMES +from .. import parts_of_speech from ..attrs cimport LEMMA from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER @@ -318,7 +318,7 @@ cdef class Token: property pos_: def __get__(self): - return _pos_id_to_string[self.c.pos] + return parts_of_speech.NAMES[self.c.pos] property tag_: def __get__(self): @@ -363,6 +363,3 @@ cdef class Token: property like_email: def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL) - - -_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()} diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 929c7b345..d850bf929 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -25,7 +25,6 @@ cdef struct _Cached: cdef class Vocab: - cpdef public lexeme_props_getter cdef Pool mem cpdef readonly StringStore strings cpdef readonly Morphology morphology @@ -33,7 +32,6 @@ cdef class Vocab: cdef public object _serializer cdef public object data_dir cdef public object get_lex_attr - cdef public object pos_tags cdef public object serializer_freqs cdef const LexemeC* get(self, Pool mem, unicode string) except NULL diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index d79da8a79..023d0bd89 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -10,6 +10,8 @@ from os import path import io import math import json +import tempfile +import copy_reg from .lexeme cimport EMPTY_LEXEME from .lexeme cimport Lexeme @@ -19,6 +21,9 @@ from .typedefs cimport attr_t from .cfile cimport CFile from .lemmatizer import Lemmatizer +from . import attrs +from . import symbols + from cymem.cymem cimport Address from . import util from .serialize.packer cimport Packer @@ -67,6 +72,14 @@ cdef class Vocab: self._by_hash = PreshMap() self._by_orth = PreshMap() self.strings = StringStore() + # Load strings in a special order, so that we have an onset number for + # the vocabulary. This way, when words are added in order, the orth ID + # is the frequency rank of the word, plus a certain offset. The structural + # strings are loaded first, because the vocab is open-class, and these + # symbols are closed class. 
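+        # symbols.NAMES is ordered by symbol value, so interning in this
+        # order keeps the string IDs aligned with the symbol IDs on every run.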
@@ -85,6 +98,20 @@ cdef class Vocab:
         """The current number of lexemes stored."""
         return self.length
 
+    def __reduce__(self):
+        # TODO: Dump vectors
+        tmp_dir = tempfile.mkdtemp()
+        lex_loc = path.join(tmp_dir, 'lexemes.bin')
+        str_loc = path.join(tmp_dir, 'strings.txt')
+        vec_loc = path.join(self.data_dir, 'vec.bin') if self.data_dir is not None else None
+
+        self.dump(lex_loc)
+        self.strings.dump(str_loc)
+
+        state = (str_loc, lex_loc, vec_loc, self.morphology, self.get_lex_attr,
+                 self.serializer_freqs, self.data_dir)
+        return (unpickle_vocab, state, None, None)
+
     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
         '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
         if necessary, using memory acquired from the given pool. If the pool
@@ -260,17 +287,17 @@ cdef class Vocab:
             i += 1
         fp.close()
 
-    def load_vectors(self, loc_or_file):
+    def load_vectors(self, file_):
         cdef LexemeC* lexeme
         cdef attr_t orth
         cdef int32_t vec_len = -1
-        for line_num, line in enumerate(loc_or_file):
+        for line_num, line in enumerate(file_):
             pieces = line.split()
             word_str = pieces.pop(0)
             if vec_len == -1:
                 vec_len = len(pieces)
             elif vec_len != len(pieces):
-                raise VectorReadError.mismatched_sizes(loc_or_file, line_num,
+                raise VectorReadError.mismatched_sizes(file_, line_num,
                                                        vec_len, len(pieces))
             orth = self.strings[word_str]
             lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
@@ -328,6 +355,25 @@ cdef class Vocab:
         return vec_len
 
 
+def unpickle_vocab(strings_loc, lex_loc, vec_loc, morphology, get_lex_attr,
+                   serializer_freqs, data_dir):
+    cdef Vocab vocab = Vocab()
+
+    vocab.get_lex_attr = get_lex_attr
+    vocab.morphology = morphology
+    vocab.strings = morphology.strings
+    vocab.data_dir = data_dir
+    vocab.serializer_freqs = serializer_freqs
+
+    vocab.load_lexemes(strings_loc, lex_loc)
+    if vec_loc is not None:
+        vocab.load_vectors_from_bin_loc(vec_loc)
+    return vocab
+
+
+copy_reg.constructor(unpickle_vocab)
+
+
 def write_binary_vectors(in_loc, out_loc):
     cdef CFile out_file = CFile(out_loc, 'wb')
     cdef Address mem
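
The unpickle_vocab/copy_reg pairing is the standard recipe for pickling
extension types whose reconstruction needs more than a constructor call:
__reduce__ points at a module-level reconstructor, and copy_reg.constructor()
registers that function as a valid unpickling callable. A rough sketch of the
shape (Python 2, matching the copy_reg import above; the Thing class is
invented):

    import pickle
    import copy_reg

    class Thing(object):
        def __init__(self):
            self.data = {}

        def __reduce__(self):
            # Delegate to a module-level function rather than the class, so
            # reconstruction can restore state the constructor doesn't accept.
            return (unpickle_thing, (self.data,), None, None)

    def unpickle_thing(data):
        thing = Thing()
        thing.data = data
        return thing

    copy_reg.constructor(unpickle_thing)

    thing = Thing()
    thing.data['x'] = 1
    assert pickle.loads(pickle.dumps(thing)).data == {'x': 1}
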
diff --git a/tests/morphology/test_pickle.py b/tests/morphology/test_pickle.py
new file mode 100644
index 000000000..f1b5bcd4c
--- /dev/null
+++ b/tests/morphology/test_pickle.py
@@ -0,0 +1,17 @@
+import pytest
+
+import pickle
+import StringIO
+
+
+from spacy.morphology import Morphology
+from spacy.lemmatizer import Lemmatizer
+from spacy.strings import StringStore
+
+
+def test_pickle():
+    morphology = Morphology(StringStore(), {}, Lemmatizer({}, {}, {}))
+
+    file_ = StringIO.StringIO()
+    pickle.dump(morphology, file_)
+
diff --git a/tests/parser/test_pickle.py b/tests/parser/test_pickle.py
new file mode 100644
index 000000000..b1b768650
--- /dev/null
+++ b/tests/parser/test_pickle.py
@@ -0,0 +1,16 @@
+import pytest
+
+import pickle
+import cloudpickle
+import StringIO
+
+
+@pytest.mark.models
+def test_pickle(EN):
+    file_ = StringIO.StringIO()
+    cloudpickle.dump(EN.parser, file_)
+
+    file_.seek(0)
+
+    loaded = pickle.load(file_)
+
diff --git a/tests/tagger/test_lemmatizer.py b/tests/tagger/test_lemmatizer.py
index ff10b6573..5dfdaabb1 100644
--- a/tests/tagger/test_lemmatizer.py
+++ b/tests/tagger/test_lemmatizer.py
@@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
+import StringIO
+import pickle
 
 from spacy.lemmatizer import Lemmatizer, read_index, read_exc
 from spacy.en import LOCAL_DATA_DIR
@@ -41,3 +43,12 @@ def test_smart_quotes(lemmatizer):
     do = lemmatizer.punct
     assert do('“') == set(['"'])
     assert do('”') == set(['"'])
+
+
+def test_pickle_lemmatizer(lemmatizer):
+    file_ = StringIO.StringIO()
+    pickle.dump(lemmatizer, file_)
+
+    file_.seek(0)
+
+    loaded = pickle.load(file_)
diff --git a/tests/test_pickle.py b/tests/test_pickle.py
new file mode 100644
index 000000000..02d908b0d
--- /dev/null
+++ b/tests/test_pickle.py
@@ -0,0 +1,15 @@
+import pytest
+import StringIO
+import cloudpickle
+import pickle
+
+
+@pytest.mark.models
+def test_pickle_english(EN):
+    file_ = StringIO.StringIO()
+    cloudpickle.dump(EN, file_)
+
+    file_.seek(0)
+
+    loaded = pickle.load(file_)
+
diff --git a/tests/vocab/test_intern.py b/tests/vocab/test_intern.py
index 6e007c645..256706c6f 100644
--- a/tests/vocab/test_intern.py
+++ b/tests/vocab/test_intern.py
@@ -1,5 +1,7 @@
 # -*- coding: utf8 -*-
 from __future__ import unicode_literals
+import pickle
+import StringIO
 
 from spacy.strings import StringStore
 
@@ -76,3 +78,18 @@ def test_massive_strings(sstore):
     s513 = '1' * 513
     orth = sstore[s513]
     assert sstore[orth] == s513
+
+
+def test_pickle_string_store(sstore):
+    hello_id = sstore[u'Hi']
+    string_file = StringIO.StringIO()
+    pickle.dump(sstore, string_file)
+
+    string_file.seek(0)
+
+    loaded = pickle.load(string_file)
+
+    assert loaded[hello_id] == u'Hi'
+
+
+
diff --git a/tests/vocab/test_vocab.py b/tests/vocab/test_vocab.py
index 7ad911626..76e8d27dd 100644
--- a/tests/vocab/test_vocab.py
+++ b/tests/vocab/test_vocab.py
@@ -1,5 +1,11 @@
 from __future__ import unicode_literals
 import pytest
+import StringIO
+import cloudpickle
+import pickle
+
+from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA
+from spacy.parts_of_speech import NOUN, VERB
 
 
 def test_neq(en_vocab):
@@ -25,3 +31,21 @@ def test_punct_neq(en_vocab):
 def test_shape_attr(en_vocab):
     example = en_vocab['example']
     assert example.orth != example.shape
+
+
+def test_symbols(en_vocab):
+    assert en_vocab.strings['IS_ALPHA'] == IS_ALPHA
+    assert en_vocab.strings['NOUN'] == NOUN
+    assert en_vocab.strings['VERB'] == VERB
+    assert en_vocab.strings['LEMMA'] == LEMMA
+    assert en_vocab.strings['ORTH'] == ORTH
+    assert en_vocab.strings['PROB'] == PROB
+
+
+def test_pickle_vocab(en_vocab):
+    file_ = StringIO.StringIO()
+    cloudpickle.dump(en_vocab, file_)
+
+    file_.seek(0)
+
+    loaded = pickle.load(file_)
diff --git a/tests/website/conftest.py b/tests/website/conftest.py
index ade1bae2a..35c38d845 100644
--- a/tests/website/conftest.py
+++ b/tests/website/conftest.py
@@ -1,11 +1,13 @@
 from __future__ import unicode_literals
 import pytest
+import os
 
 
 @pytest.fixture(scope='session')
 def nlp():
-    from spacy.en import English
-    return English()
+    from spacy.en import English, LOCAL_DATA_DIR
+    data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
+    return English(data_dir=data_dir)
 
 
 @pytest.fixture()
diff --git a/tests/website/test_home.py b/tests/website/test_home.py
index 4da61becf..3f7f7ea4c 100644
--- a/tests/website/test_home.py
+++ b/tests/website/test_home.py
@@ -1,6 +1,7 @@
 from __future__ import unicode_literals
 import pytest
 import spacy
+import os
 
 
 @pytest.fixture()
@@ -9,8 +10,9 @@ def token(doc):
 
 
 def test_load_resources_and_process_text():
-    from spacy.en import English
-    nlp = English()
+    from spacy.en import English, LOCAL_DATA_DIR
+    data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
+    nlp = English(data_dir=data_dir)
     doc = nlp('Hello, world. Here are two sentences.')
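
All of the new tests share one round-trip shape: dump with cloudpickle (which
can serialise objects, such as locally defined functions, that the stdlib
pickler rejects), rewind the buffer, then load with plain pickle, which proves
the output is an ordinary pickle stream. A small helper capturing that pattern
might look like this (Python 2, as in the tests; the helper name is invented):

    import StringIO
    import pickle
    import cloudpickle

    def pickle_roundtrip(obj):
        # Dump with cloudpickle, reload with stdlib pickle, mirroring the
        # dump/seek/load sequence used throughout the tests above.
        file_ = StringIO.StringIO()
        cloudpickle.dump(obj, file_)
        file_.seek(0)
        return pickle.load(file_)

Loading with the stdlib module is the important half of the check: it ensures
nothing in the stream depends on cloudpickle being importable at read time.
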