Merge branch 'attrs'

2025-10-26 21:51:24 +03:00 · 2015-10-13 14:03:41 +11:00 · 2015-10-13 14:03:41 +11:00 · c1fdc487bc
commit c1fdc487bc
parent 38109dd912 20fd36a0f7
33 changed files with 1682 additions and 755 deletions
--- a/bin/init_model.py
+++ b/bin/init_model.py
@ -168,6 +168,11 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
            probs[word] = oov_prob
    lexicon = []
    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
        # First encode the strings into the StringStore. This way, we can map
        # the orth IDs to frequency ranks
        orth = vocab.strings[word]
    # Now actually load the vocab
    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
        lexeme = vocab[word]
        lexeme.prob = prob
--- a/lang_data/en/morphs.json
+++ b/lang_data/en/morphs.json
@ -56,5 +56,4 @@
        "was":  {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"},
        "were": {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"}
    }
 }
--- a/lang_data/en/tag_map.json
+++ b/lang_data/en/tag_map.json
@ -22,7 +22,7 @@
 "JJS": {"pos": "adj", "degree": "sup"},
 "LS": {"pos": "punct", "numtype": "ord"},
 "MD": {"pos": "verb", "verbtype": "mod"},
-"NIL": {"pos": "no_tag"},
+"NIL": {"pos": ""},
 "NN": {"pos": "noun", "number": "sing"},
 "NNP": {"pos": "noun", "nountype": "prop", "number": "sing"},
 "NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"},
--- a/setup.py
+++ b/setup.py
@ -166,7 +166,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
             'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
             'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
             'spacy.cfile', 'spacy.matcher',
-             'spacy.syntax.ner']
+             'spacy.syntax.ner',
             'spacy.symbols']
 if __name__ == '__main__':
--- a/spacy/_ml.pxd
+++ b/spacy/_ml.pxd
@ -29,5 +29,6 @@ cdef class Model:
    cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1
    cdef object model_loc
    cdef object _templates
    cdef Extractor _extractor
    cdef LinearModel _model
--- a/spacy/_ml.pyx
+++ b/spacy/_ml.pyx
@ -3,6 +3,7 @@ from __future__ import unicode_literals
 from __future__ import division
 from os import path
 import tempfile
 import os
 import shutil
 import json
@ -52,6 +53,7 @@ cdef class Model:
    def __init__(self, n_classes, templates, model_loc=None):
        if model_loc is not None and path.isdir(model_loc):
            model_loc = path.join(model_loc, 'model')
        self._templates = templates
        self.n_classes = n_classes
        self._extractor = Extractor(templates)
        self.n_feats = self._extractor.n_templ
@ -60,6 +62,18 @@ cdef class Model:
        if self.model_loc and path.exists(self.model_loc):
            self._model.load(self.model_loc, freq_thresh=0)
    def __reduce__(self):
        """Support pickling by dumping the model to a temporary file.

        Returns the pickle reduction tuple ``(Model, args, None, None)`` where
        ``args`` is ``(n_classes, templates, model_loc)`` -- the same positional
        arguments ``__init__`` accepts. The temporary file is not removed here.
        """
        # mkstemp() returns an (open file descriptor, path) pair. Only the path
        # is needed, so close the descriptor immediately instead of leaking it.
        fd, model_loc = tempfile.mkstemp()
        os.close(fd)
        # TODO: This is a potentially buggy implementation. We're not really
        # given a good guarantee that all internal state is saved correctly here,
        # since there are learning parameters for e.g. the model averaging in
        # averaged perceptron, the gradient calculations in AdaGrad, etc
        # that aren't necessarily saved. So, if we're part way through training
        # the model, and then we pickle it, we won't recover the state correctly.
        self._model.dump(model_loc)
        # __init__ stores the templates on the `_templates` attribute.
        return (Model, (self.n_classes, self._templates, model_loc),
                None, None)
    def predict(self, Example eg):
        self.set_scores(eg.c.scores, eg.c.atoms)
        eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@ -1,5 +1,6 @@
 # Reserve 64 values for flag features
 cpdef enum attr_id_t:
    NULL_ATTR
    IS_ALPHA
    IS_ASCII
    IS_DIGIT
@ -14,8 +15,7 @@ cpdef enum attr_id_t:
    IS_STOP
    IS_OOV
-    FLAG13 = 13
+    FLAG14 = 14
    FLAG14
    FLAG15
    FLAG16
    FLAG17
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@ -0,0 +1,90 @@
 IDS = {
    "": NULL_ATTR,
    "IS_ALPHA": IS_ALPHA,
    "IS_ASCII": IS_ASCII,
    "IS_DIGIT": IS_DIGIT,
    "IS_LOWER": IS_LOWER,
    "IS_PUNCT": IS_PUNCT,
    "IS_SPACE": IS_SPACE,
    "IS_TITLE": IS_TITLE,
    "IS_UPPER": IS_UPPER,
    "LIKE_URL": LIKE_URL,
    "LIKE_NUM": LIKE_NUM,
    "LIKE_EMAIL": LIKE_EMAIL,
    "IS_STOP": IS_STOP,
    "IS_OOV": IS_OOV,
    "FLAG14": FLAG14,
    "FLAG15": FLAG15,
    "FLAG16": FLAG16,
    "FLAG17": FLAG17,
    "FLAG18": FLAG18,
    "FLAG19": FLAG19,
    "FLAG20": FLAG20,
    "FLAG21": FLAG21,
    "FLAG22": FLAG22,
    "FLAG23": FLAG23,
    "FLAG24": FLAG24,
    "FLAG25": FLAG25,
    "FLAG26": FLAG26,
    "FLAG27": FLAG27,
    "FLAG28": FLAG28,
    "FLAG29": FLAG29,
    "FLAG30": FLAG30,
    "FLAG31": FLAG31,
    "FLAG32": FLAG32,
    "FLAG33": FLAG33,
    "FLAG34": FLAG34,
    "FLAG35": FLAG35,
    "FLAG36": FLAG36,
    "FLAG37": FLAG37,
    "FLAG38": FLAG38,
    "FLAG39": FLAG39,
    "FLAG40": FLAG40,
    "FLAG41": FLAG41,
    "FLAG42": FLAG42,
    "FLAG43": FLAG43,
    "FLAG44": FLAG44,
    "FLAG45": FLAG45,
    "FLAG46": FLAG46,
    "FLAG47": FLAG47,
    "FLAG48": FLAG48,
    "FLAG49": FLAG49,
    "FLAG50": FLAG50,
    "FLAG51": FLAG51,
    "FLAG52": FLAG52,
    "FLAG53": FLAG53,
    "FLAG54": FLAG54,
    "FLAG55": FLAG55,
    "FLAG56": FLAG56,
    "FLAG57": FLAG57,
    "FLAG58": FLAG58,
    "FLAG59": FLAG59,
    "FLAG60": FLAG60,
    "FLAG61": FLAG61,
    "FLAG62": FLAG62,
    "FLAG63": FLAG63,
    "ID": ID,
    "ORTH": ORTH,
    "LOWER": LOWER,
    "NORM": NORM,
    "SHAPE": SHAPE,
    "PREFIX": PREFIX,
    "SUFFIX": SUFFIX,
    "LENGTH": LENGTH,
    "CLUSTER": CLUSTER,
    "LEMMA": LEMMA,
    "POS": POS,
    "TAG": TAG,
    "DEP": DEP,
    "ENT_IOB": ENT_IOB,
    "ENT_TYPE": ENT_TYPE,
    "HEAD": HEAD,
    "SPACY": SPACY,
    "PROB": PROB,
 }
 # Attribute names, ordered by ascending attribute ID.
 NAMES = [name for name, _ in sorted(IDS.items(), key=lambda pair: pair[1])]
--- a/spacy/language.py
+++ b/spacy/language.py
@ -207,6 +207,12 @@ class Language(object):
        self.entity = entity
        self.matcher = matcher
    def __reduce__(self):
        """Return the pickle reduction tuple for this instance.

        The reconstruction arguments are positional: ``None``, followed by the
        ``vocab``, ``tokenizer``, ``tagger``, ``parser``, ``entity`` and
        ``matcher`` attributes of this instance, followed by a final ``None``.
        """
        components = (self.vocab, self.tokenizer, self.tagger, self.parser,
                      self.entity, self.matcher)
        ctor_args = (None,) + components + (None,)
        return (self.__class__, ctor_args, None, None)
    def __call__(self, text, tag=True, parse=True, entity=True):
        """Apply the pipeline to some text.  The text can span multiple sentences,
        and can contain arbitrary whitespace.  Alignment into the original string
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@ -15,7 +15,7 @@ from libcpp.vector cimport vector
 from murmurhash.mrmr cimport hash64
 from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
-from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
+from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
 from .tokens.doc cimport get_token_attr
 from .tokens.doc cimport Doc
 from .vocab cimport Vocab
@ -168,13 +168,7 @@ cdef class Matcher:
    cdef Pool mem
    cdef vector[Pattern*] patterns
    cdef readonly Vocab vocab
-
+    cdef object _patterns
    def __init__(self, vocab, patterns):
        # Store the vocab, create a fresh Pool, then register every
        # entity_key -> (etype, attrs, specs) entry of `patterns` through add(),
        # visiting the entries in sorted order.
        # NOTE(review): a second __init__ with the same signature appears further
        # down in this listing and additionally sets self._patterns, which add()
        # writes to -- presumably this one is the superseded version; confirm.
        self.vocab = vocab
        self.mem = Pool()
        self.vocab = vocab
        for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
            self.add(entity_key, etype, attrs, specs)
    @classmethod
    def from_dir(cls, data_dir, Vocab vocab):
@ -186,10 +180,22 @@ cdef class Matcher:
        else:
            return cls(vocab, {})
    def __init__(self, vocab, patterns):
        """Create a matcher over ``vocab`` and register ``patterns``.

        ``patterns`` maps each entity key to an ``(etype, attrs, specs)`` triple;
        every entry is passed to ``add()`` in sorted order.
        """
        self.vocab = vocab
        self.mem = Pool()
        # Keep a plain dict copy of the patterns; __reduce__ hands it back to
        # the constructor, and add() records each entry in it as well.
        self._patterns = dict(patterns)
        for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
            self.add(entity_key, etype, attrs, specs)
    def __reduce__(self):
        """Return the pickle reduction tuple: class, (vocab, patterns), no state."""
        rebuild_args = (self.vocab, self._patterns)
        return (self.__class__, rebuild_args, None, None)
    property n_patterns:
        # Number of entries currently held in the `patterns` vector.
        def __get__(self):
            return self.patterns.size()
    def add(self, entity_key, etype, attrs, specs):
        self._patterns[entity_key] = (etype, dict(attrs), list(specs))
        if isinstance(entity_key, basestring):
            entity_key = self.vocab.strings[entity_key]
        if isinstance(etype, basestring):
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@ -7,6 +7,7 @@ from .strings cimport StringStore
 from .typedefs cimport attr_t
 from .parts_of_speech cimport univ_pos_t
 from . cimport symbols
 cdef struct RichTagC:
    uint64_t morph
@ -24,6 +25,7 @@ cdef class Morphology:
    cdef readonly Pool mem
    cdef readonly StringStore strings
    cdef public object lemmatizer
    cdef readonly object tag_map
    cdef public object n_tags
    cdef public object reverse_index
    cdef public object tag_names
@ -36,720 +38,252 @@ cdef class Morphology:
    cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
 cpdef enum univ_morph_t:
    NIL = 0
    Animacy_anim = symbols.Animacy_anim
    Animacy_inam
    Aspect_freq
    Aspect_imp
    Aspect_mod
    Aspect_none
    Aspect_perf
    Case_abe
    Case_abl
    Case_abs
    Case_acc
    Case_ade
    Case_all
    Case_cau
    Case_com
    Case_dat
    Case_del
    Case_dis
    Case_ela
    Case_ess
    Case_gen
    Case_ill
    Case_ine
    Case_ins
    Case_loc
    Case_lat
    Case_nom
    Case_par
    Case_sub
    Case_sup
    Case_tem
    Case_ter
    Case_tra
    Case_voc
    Definite_two
    Definite_def
    Definite_red
    Definite_ind
    Degree_cmp
    Degree_comp
    Degree_none
    Degree_pos
    Degree_sup
    Degree_abs
    Degree_com
    Degree_dim # du
    Gender_com
    Gender_fem
    Gender_masc
    Gender_neut
    Mood_cnd
    Mood_imp
    Mood_ind
    Mood_n
    Mood_pot
    Mood_sub
    Mood_opt
    Negative_neg
    Negative_pos
    Negative_yes
    Number_com
    Number_dual
    Number_none
    Number_plur
    Number_sing
    Number_ptan # bg
    Number_count # bg
    NumType_card
    NumType_dist
    NumType_frac
    NumType_gen
    NumType_mult
    NumType_none
    NumType_ord
    NumType_sets
    Person_one
    Person_two
    Person_three
    Person_none
    Poss_yes
    PronType_advPart
    PronType_art
    PronType_default
    PronType_dem
    PronType_ind
    PronType_int
    PronType_neg
    PronType_prs
    PronType_rcp
    PronType_rel
    PronType_tot
    PronType_clit
    PronType_exc # es, ca, it, fa
    Reflex_yes
    Tense_fut
    Tense_imp
    Tense_past
    Tense_pres
    VerbForm_fin
    VerbForm_ger
    VerbForm_inf
    VerbForm_none
    VerbForm_part
    VerbForm_partFut
    VerbForm_partPast
    VerbForm_partPres
    VerbForm_sup
    VerbForm_trans
    VerbForm_gdv # la
    Voice_act
    Voice_cau
    Voice_pass
    Voice_mid # gkc
    Voice_int # hb
    Abbr_yes # cz, fi, sl, U
    AdpType_prep # cz, U
    AdpType_post # U
    AdpType_voc # cz
    AdpType_comprep # cz
    AdpType_circ # U
    AdvType_man
    AdvType_loc
    AdvType_tim
    AdvType_deg
    AdvType_cau
    AdvType_mod
    AdvType_sta
    AdvType_ex
    AdvType_adadj
    ConjType_oper # cz, U
    ConjType_comp # cz, U
    Connegative_yes # fi
    Derivation_minen # fi
    Derivation_sti # fi
    Derivation_inen # fi
    Derivation_lainen # fi
    Derivation_ja # fi
    Derivation_ton # fi
    Derivation_vs # fi
    Derivation_ttain # fi
    Derivation_ttaa # fi
    Echo_rdp # U
    Echo_ech # U
    Foreign_foreign # cz, fi, U
    Foreign_fscript # cz, fi, U
    Foreign_tscript # cz, U
    Foreign_yes # sl
    Gender_dat_masc # bq, U
    Gender_dat_fem # bq, U
    Gender_erg_masc # bq
    Gender_erg_fem # bq
    Gender_psor_masc # cz, sl, U
    Gender_psor_fem # cz, sl, U
    Gender_psor_neut # sl
    Hyph_yes # cz, U
    InfForm_one # fi
    InfForm_two # fi
    InfForm_three # fi
    NameType_geo # U, cz
    NameType_prs # U, cz
    NameType_giv # U, cz
    NameType_sur # U, cz
    NameType_nat # U, cz
    NameType_com # U, cz
    NameType_pro # U, cz
    NameType_oth # U, cz
    NounType_com # U
    NounType_prop # U
    NounType_class # U
    Number_abs_sing # bq, U
    Number_abs_plur # bq, U
    Number_dat_sing # bq, U
    Number_dat_plur # bq, U
    Number_erg_sing # bq, U
    Number_erg_plur # bq, U
    Number_psee_sing # U
    Number_psee_plur # U
    Number_psor_sing # cz, fi, sl, U
    Number_psor_plur # cz, fi, sl, U
    NumForm_digit # cz, sl, U
    NumForm_roman # cz, sl, U
    NumForm_word # cz, sl, U
    NumValue_one # cz, U
    NumValue_two # cz, U
    NumValue_three # cz, U
    PartForm_pres # fi
    PartForm_past # fi
    PartForm_agt # fi
    PartForm_neg # fi
    PartType_mod # U
    PartType_emp # U
    PartType_res # U
    PartType_inf # U
    PartType_vbp # U
    Person_abs_one # bq, U
    Person_abs_two # bq, U
    Person_abs_three # bq, U
    Person_dat_one # bq, U
    Person_dat_two # bq, U
    Person_dat_three # bq, U
    Person_erg_one # bq, U
    Person_erg_two # bq, U
    Person_erg_three # bq, U
    Person_psor_one # fi, U
    Person_psor_two # fi, U
    Person_psor_three # fi, U
    Polite_inf # bq, U
    Polite_pol # bq, U
    Polite_abs_inf # bq, U
    Polite_abs_pol # bq, U
    Polite_erg_inf # bq, U
    Polite_erg_pol # bq, U
    Polite_dat_inf # bq, U
    Polite_dat_pol # bq, U
    Prefix_yes # U
    PrepCase_npr # cz
    PrepCase_pre # U
    PunctSide_ini # U
    PunctSide_fin # U
    PunctType_peri # U
    PunctType_qest # U
    PunctType_excl # U
    PunctType_quot # U
    PunctType_brck # U
    PunctType_comm # U
    PunctType_colo # U
    PunctType_semi # U
    PunctType_dash # U
    Style_arch # cz, fi, U
    Style_rare # cz, fi, U
    Style_poet # cz, U
    Style_norm # cz, U
    Style_coll # cz, U
    Style_vrnc # cz, U
    Style_sing # cz, U
    Style_expr # cz, U
    Style_derg # cz, U
    Style_vulg # cz, U
    Style_yes # fi, U
    StyleVariant_styleShort # cz
    StyleVariant_styleBound # cz, sl
    VerbType_aux # U
    VerbType_cop # U
    VerbType_mod # U
    VerbType_light # U
 #
 #cpdef enum Feature_t:
 #    Abbr
 #    AdpType
 #    AdvType
 #    ConjType
 #    Connegative
 #    Derivation
 #    Echo
 #    Foreign
 #    Gender_dat
 #    Gender_erg
 #    Gender_psor
 #    Hyph
 #    InfForm
 #    NameType
 #    NounType
 #    NumberAbs
 #    NumberDat
 #    NumberErg
 #    NumberPsee
 #    NumberPsor
 #    NumForm
 #    NumValue
 #    PartForm
 #    PartType
 #    Person_abs
 #    Person_dat
 #    Person_psor
 #    Polite
 #    Polite_abs
 #    Polite_dat
 #    Prefix
 #    PrepCase
 #    PunctSide
 #    PunctType
 #    Style
 #    Typo
 #    Variant
 #    VerbType
 #
 #
 #cpdef enum Animacy:
 #    Anim
 #    Inam
 #
 #
 #cpdef enum Aspect:
 #    Freq
 #    Imp
 #    Mod
 #    None_
 #    Perf
 #
 #
 #cpdef enum Case1:
 #    Nom
 #    Gen
 #    Acc
 #    Dat
 #    Voc
 #    Abl
 #    
 #cdef enum Case2:
 #    Abe
 #    Abs
 #    Ade
 #    All
 #    Cau
 #    Com
 #    Del
 #    Dis
 #
 #cdef enum Case3:
 #    Ela
 #    Ess
 #    Ill
 #    Ine
 #    Ins
 #    Loc
 #    Lat
 #    Par
 #
 #cdef enum Case4:
 #    Sub
 #    Sup
 #    Tem
 #    Ter
 #    Tra
 #
 #
 #cpdef enum Definite:
 #    Two
 #    Def
 #    Red
 #    Ind
 #
 #
 #cpdef enum Degree:
 #    Cmp
 #    Comp
 #    None_
 #    Pos
 #    Sup
 #    Abs
 #    Com
 #    Degree # du
 #
 #
 #cpdef enum Gender:
 #    Com
 #    Fem
 #    Masc
 #    Neut
 #
 #
 #cpdef enum Mood:
 #    Cnd
 #    Imp
 #    Ind
 #    N
 #    Pot
 #    Sub
 #    Opt
 #
 #
 #cpdef enum Negative:
 #    Neg
 #    Pos
 #    Yes
 #
 #
 #cpdef enum Number:
 #    Com
 #    Dual
 #    None_
 #    Plur
 #    Sing
 #    Ptan # bg
 #    Count # bg
 #
 #
 #cpdef enum NumType:
 #    Card
 #    Dist
 #    Frac
 #    Gen
 #    Mult
 #    None_
 #    Ord
 #    Sets
 #
 #
 #cpdef enum Person:
 #    One
 #    Two
 #    Three
 #    None_
 #
 #
 #cpdef enum Poss:
 #    Yes
 #
 #
 #cpdef enum PronType1:
 #    AdvPart
 #    Art
 #    Default
 #    Dem
 #    Ind
 #    Int
 #    Neg
 #
 #cpdef enum PronType2:
 #    Prs
 #    Rcp
 #    Rel
 #    Tot
 #    Clit
 #    Exc # es, ca, it, fa
 #    Clit # it
 #
 #
 #cpdef enum Reflex:
 #    Yes
 #
 #
 #cpdef enum Tense:
 #    Fut
 #    Imp
 #    Past
 #    Pres
 #
 #cpdef enum VerbForm1:
 #    Fin
 #    Ger
 #    Inf
 #    None_
 #    Part
 #    PartFut
 #    PartPast
 #
 #cpdef enum VerbForm2:
 #    PartPres
 #    Sup
 #    Trans
 #    Gdv # la
 #
 #
 #cpdef enum Voice:
 #    Act
 #    Cau
 #    Pass
 #    Mid # gkc
 #    Int # hb
 #
 #
 #cpdef enum Abbr:
 #    Yes # cz, fi, sl, U
 #
 #cpdef enum AdpType:
 #    Prep # cz, U
 #    Post # U
 #    Voc # cz
 #    Comprep # cz
 #    Circ # U
 #    Voc # U
 #
 #
 #cpdef enum AdvType1:
 #    # U
 #    Man
 #    Loc
 #    Tim
 #    Deg
 #    Cau
 #    Mod
 #    Sta
 #    Ex
 #
 #cpdef enum AdvType2:
 #    Adadj
 #
 #cpdef enum ConjType:
 #    Oper # cz, U
 #    Comp # cz, U
 #
 #cpdef enum Connegative:
 #    Yes # fi
 #
 #
 #cpdef enum Derivation1:
 #    Minen # fi
 #    Sti # fi
 #    Inen # fi
 #    Lainen # fi
 #    Ja # fi
 #    Ton # fi
 #    Vs # fi
 #    Ttain # fi
 #
 #cpdef enum Derivation2:
 #    Ttaa
 #
 #
 #cpdef enum Echo:
 #    Rdp # U
 #    Ech # U
 #
 #
 #cpdef enum Foreign:
 #    Foreign # cz, fi, U
 #    Fscript # cz, fi, U
 #    Tscript # cz, U
 #    Yes # sl
 #
 #
 #cpdef enum Gender_dat:
 #    Masc # bq, U
 #    Fem # bq, U
 #
 #
 #cpdef enum Gender_erg:
 #    Masc # bq
 #    Fem # bq
 #
 #
 #cpdef enum Gender_psor:
 #    Masc # cz, sl, U
 #    Fem # cz, sl, U
 #    Neut # sl
 #
 #
 #cpdef enum Hyph:
 #    Yes # cz, U
 #
 #
 #cpdef enum InfForm:
 #    One # fi
 #    Two # fi
 #    Three # fi
 #
 #
 #cpdef enum NameType:
 #    Geo # U, cz
 #    Prs # U, cz
 #    Giv # U, cz
 #    Sur # U, cz
 #    Nat # U, cz
 #    Com # U, cz
 #    Pro # U, cz
 #    Oth # U, cz
 #
 #
 #cpdef enum NounType:
 #    Com # U
 #    Prop # U
 #    Class # U
 #
 #cpdef enum Number_abs:
 #    Sing # bq, U
 #    Plur # bq, U
 #
 #cpdef enum Number_dat:
 #    Sing # bq, U
 #    Plur # bq, U
 #
 #cpdef enum Number_erg:
 #    Sing # bq, U
 #    Plur # bq, U
 #
 #cpdef enum Number_psee:
 #    Sing # U
 #    Plur # U
 #
 #
 #cpdef enum Number_psor:
 #    Sing # cz, fi, sl, U
 #    Plur # cz, fi, sl, U
 #
 #
 #cpdef enum NumForm:
 #    Digit # cz, sl, U
 #    Roman # cz, sl, U
 #    Word # cz, sl, U
 #
 #
 #cpdef enum NumValue:
 #    One # cz, U
 #    Two # cz, U
 #    Three # cz, U
 #
 #
 #cpdef enum PartForm:
 #    Pres # fi
 #    Past # fi
 #    Agt # fi
 #    Neg # fi
 #
 #
 #cpdef enum PartType:
 #    Mod # U
 #    Emp # U
 #    Res # U
 #    Inf # U
 #    Vbp # U
 #
 #cpdef enum Person_abs:
 #    One # bq, U
 #    Two # bq, U
 #    Three # bq, U
 #
 #
 #cpdef enum Person_dat:
 #    One # bq, U
 #    Two # bq, U
 #    Three # bq, U
 #
 #
 #cpdef enum Person_erg:
 #    One # bq, U
 #    Two # bq, U
 #    Three # bq, U
 #
 #
 #cpdef enum Person_psor:
 #    One # fi, U
 #    Two # fi, U
 #    Three # fi, U
 #
 #
 #cpdef enum Polite:
 #    Inf # bq, U
 #    Pol # bq, U
 #
 #
 #cpdef enum Polite_abs:
 #    Inf # bq, U
 #    Pol # bq, U
 #
 #
 #cpdef enum Polite_erg:
 #    Inf # bq, U
 #    Pol # bq, U
 #
 #
 #cpdef enum Polite_dat:
 #    Inf # bq, U
 #    Pol # bq, U
 #
 #
 #cpdef enum Prefix:
 #    Yes # U
 #
 #
 #cpdef enum PrepCase:
 #    Npr # cz
 #    Pre # U
 #
 #
 #cpdef enum PunctSide:
 #    Ini # U
 #    Fin # U
 #
 #cpdef enum PunctType1:
 #    Peri # U
 #    Qest # U
 #    Excl # U
 #    Quot # U
 #    Brck # U
 #    Comm # U
 #    Colo # U
 #    Semi # U
 #
 #cpdef enum PunctType2:
 #    Dash # U
 #
 #
 #cpdef enum Style1:
 #    Arch # cz, fi, U
 #    Rare # cz, fi, U
 #    Poet # cz, U
 #    Norm # cz, U
 #    Coll # cz, U
 #    Vrnc # cz, U
 #    Sing # cz, U
 #    Expr # cz, U
 #
 #
 #cpdef enum Style2:
 #    Derg # cz, U
 #    Vulg # cz, U
 #
 #
 #cpdef enum Typo:
 #    Yes # fi, U
 #
 #
 #cpdef enum Variant:
 #    Short # cz
 #    Bound # cz, sl
 #
 #
 #cpdef enum VerbType:
 #    Aux # U
 #    Cop # U
 #    Mod # U
 #    Light # U
 #
 cpdef enum Value_t:
    Animacy_Anim
    Animacy_Inam
    Aspect_Freq
    Aspect_Imp
    Aspect_Mod
    Aspect_None_
    Aspect_Perf
    Case_Abe
    Case_Abl
    Case_Abs
    Case_Acc
    Case_Ade
    Case_All
    Case_Cau
    Case_Com
    Case_Dat
    Case_Del
    Case_Dis
    Case_Ela
    Case_Ess
    Case_Gen
    Case_Ill
    Case_Ine
    Case_Ins
    Case_Loc
    Case_Lat
    Case_Nom
    Case_Par
    Case_Sub
    Case_Sup
    Case_Tem
    Case_Ter
    Case_Tra
    Case_Voc
    Definite_Two
    Definite_Def
    Definite_Red
    Definite_Ind
    Degree_Cmp
    Degree_Comp
    Degree_None
    Degree_Pos
    Degree_Sup
    Degree_Abs
    Degree_Com
    Degree_Dim # du
    Gender_Com
    Gender_Fem
    Gender_Masc
    Gender_Neut
    Mood_Cnd
    Mood_Imp
    Mood_Ind
    Mood_N
    Mood_Pot
    Mood_Sub
    Mood_Opt
    Negative_Neg
    Negative_Pos
    Negative_Yes
    Number_Com
    Number_Dual
    Number_None
    Number_Plur
    Number_Sing
    Number_Ptan # bg
    Number_Count # bg
    NumType_Card
    NumType_Dist
    NumType_Frac
    NumType_Gen
    NumType_Mult
    NumType_None
    NumType_Ord
    NumType_Sets
    Person_One
    Person_Two
    Person_Three
    Person_None
    Poss_Yes
    PronType_AdvPart
    PronType_Art
    PronType_Default
    PronType_Dem
    PronType_Ind
    PronType_Int
    PronType_Neg
    PronType_Prs
    PronType_Rcp
    PronType_Rel
    PronType_Tot
    PronType_Clit
    PronType_Exc # es, ca, it, fa
    Reflex_Yes
    Tense_Fut
    Tense_Imp
    Tense_Past
    Tense_Pres
    VerbForm_Fin
    VerbForm_Ger
    VerbForm_Inf
    VerbForm_None
    VerbForm_Part
    VerbForm_PartFut
    VerbForm_PartPast
    VerbForm_PartPres
    VerbForm_Sup
    VerbForm_Trans
    VerbForm_Gdv # la
    Voice_Act
    Voice_Cau
    Voice_Pass
    Voice_Mid # gkc
    Voice_Int # hb
    Abbr_Yes # cz, fi, sl, U
    AdpType_Prep # cz, U
    AdpType_Post # U
    AdpType_Voc # cz
    AdpType_Comprep # cz
    AdpType_Circ # U
    AdvType_Man
    AdvType_Loc
    AdvType_Tim
    AdvType_Deg
    AdvType_Cau
    AdvType_Mod
    AdvType_Sta
    AdvType_Ex
    AdvType_Adadj
    ConjType_Oper # cz, U
    ConjType_Comp # cz, U
    Connegative_Yes # fi
    Derivation_Minen # fi
    Derivation_Sti # fi
    Derivation_Inen # fi
    Derivation_Lainen # fi
    Derivation_Ja # fi
    Derivation_Ton # fi
    Derivation_Vs # fi
    Derivation_Ttain # fi
    Derivation_Ttaa # fi
    Echo_Rdp # U
    Echo_Ech # U
    Foreign_Foreign # cz, fi, U
    Foreign_Fscript # cz, fi, U
    Foreign_Tscript # cz, U
    Foreign_Yes # sl
    Gender_dat_Masc # bq, U
    Gender_dat_Fem # bq, U
    Gender_erg_Masc # bq
    Gender_erg_Fem # bq
    Gender_psor_Masc # cz, sl, U
    Gender_psor_Fem # cz, sl, U
    Gender_psor_Neut # sl
    Hyph_Yes # cz, U
    InfForm_One # fi
    InfForm_Two # fi
    InfForm_Three # fi
    NameType_Geo # U, cz
    NameType_Prs # U, cz
    NameType_Giv # U, cz
    NameType_Sur # U, cz
    NameType_Nat # U, cz
    NameType_Com # U, cz
    NameType_Pro # U, cz
    NameType_Oth # U, cz
    NounType_Com # U
    NounType_Prop # U
    NounType_Class # U
    Number_abs_Sing # bq, U
    Number_abs_Plur # bq, U
    Number_dat_Sing # bq, U
    Number_dat_Plur # bq, U
    Number_erg_Sing # bq, U
    Number_erg_Plur # bq, U
    Number_psee_Sing # U
    Number_psee_Plur # U
    Number_psor_Sing # cz, fi, sl, U
    Number_psor_Plur # cz, fi, sl, U
    NumForm_Digit # cz, sl, U
    NumForm_Roman # cz, sl, U
    NumForm_Word # cz, sl, U
    NumValue_One # cz, U
    NumValue_Two # cz, U
    NumValue_Three # cz, U
    PartForm_Pres # fi
    PartForm_Past # fi
    PartForm_Agt # fi
    PartForm_Neg # fi
    PartType_Mod # U
    PartType_Emp # U
    PartType_Res # U
    PartType_Inf # U
    PartType_Vbp # U
    Person_abs_One # bq, U
    Person_abs_Two # bq, U
    Person_abs_Three # bq, U
    Person_dat_One # bq, U
    Person_dat_Two # bq, U
    Person_dat_Three # bq, U
    Person_erg_One # bq, U
    Person_erg_Two # bq, U
    Person_erg_Three # bq, U
    Person_psor_One # fi, U
    Person_psor_Two # fi, U
    Person_psor_Three # fi, U
    Polite_Inf # bq, U
    Polite_Pol # bq, U
    Polite_abs_Inf # bq, U
    Polite_abs_Pol # bq, U
    Polite_erg_Inf # bq, U
    Polite_erg_Pol # bq, U
    Polite_dat_Inf # bq, U
    Polite_dat_Pol # bq, U
    Prefix_Yes # U
    PrepCase_Npr # cz
    PrepCase_Pre # U
    PunctSide_Ini # U
    PunctSide_Fin # U
    PunctType_Peri # U
    PunctType_Qest # U
    PunctType_Excl # U
    PunctType_Quot # U
    PunctType_Brck # U
    PunctType_Comm # U
    PunctType_Colo # U
    PunctType_Semi # U
    PunctType_Dash # U
    Style_Arch # cz, fi, U
    Style_Rare # cz, fi, U
    Style_Poet # cz, U
    Style_Norm # cz, U
    Style_Coll # cz, U
    Style_Vrnc # cz, U
    Style_Sing # cz, U
    Style_Expr # cz, U
    Style_Derg # cz, U
    Style_Vulg # cz, U
    Style_Yes # fi, U
    StyleVariant_StyleShort # cz
    StyleVariant_StyleBound # cz, sl
    VerbType_Aux # U
    VerbType_Cop # U
    VerbType_Mod # U
    VerbType_Light # U
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -6,7 +6,7 @@ try:
 except ImportError:
    import json
-from .parts_of_speech import UNIV_POS_NAMES
+from .parts_of_speech import IDS as POS_IDS
 from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
@ -14,6 +14,7 @@ cdef class Morphology:
    def __init__(self, StringStore string_store, tag_map, lemmatizer):
        self.mem = Pool()
        self.strings = string_store
        self.tag_map = tag_map
        self.lemmatizer = lemmatizer
        self.n_tags = len(tag_map) + 1
        self.tag_names = tuple(sorted(tag_map.keys()))
@ -24,10 +25,13 @@ cdef class Morphology:
            self.rich_tags[i].id = i
            self.rich_tags[i].name = self.strings[tag_str]
            self.rich_tags[i].morph = 0
-            self.rich_tags[i].pos = UNIV_POS_NAMES[props['pos'].upper()]
+            self.rich_tags[i].pos = POS_IDS[props['pos'].upper()]
            self.reverse_index[self.rich_tags[i].name] = i
        self._cache = PreshMapArray(self.n_tags)
    def __reduce__(self):
        """Return the pickle reduction tuple built from the constructor inputs.

        The arguments are the ``strings``, ``tag_map`` and ``lemmatizer``
        attributes, in the order ``__init__`` takes them.
        """
        init_args = (self.strings, self.tag_map, self.lemmatizer)
        return (Morphology, init_args, None, None)
    cdef int assign_tag(self, TokenC* token, tag) except -1:
        cdef int tag_id
        if isinstance(tag, basestring):
@ -89,3 +93,254 @@ cdef class Morphology:
        lemma_string = sorted(lemma_strings)[0]
        lemma = self.strings[lemma_string]
        return lemma
 IDS = {
    "Animacy_anim": Animacy_anim,
    "Animacy_inam": Animacy_inam,
    "Aspect_freq": Aspect_freq,
    "Aspect_imp": Aspect_imp,
    "Aspect_mod": Aspect_mod,
    "Aspect_none": Aspect_none,
    "Aspect_perf": Aspect_perf,
    "Case_abe": Case_abe,
    "Case_abl": Case_abl,
    "Case_abs": Case_abs,
    "Case_acc": Case_acc,
    "Case_ade": Case_ade,
    "Case_all": Case_all,
    "Case_cau": Case_cau,
    "Case_com": Case_com,
    "Case_dat": Case_dat,
    "Case_del": Case_del,
    "Case_dis": Case_dis,
    "Case_ela": Case_ela,
    "Case_ess": Case_ess,
    "Case_gen": Case_gen,
    "Case_ill": Case_ill,
    "Case_ine": Case_ine,
    "Case_ins": Case_ins,
    "Case_loc": Case_loc,
    "Case_lat": Case_lat,
    "Case_nom": Case_nom,
    "Case_par": Case_par,
    "Case_sub": Case_sub,
    "Case_sup": Case_sup,
    "Case_tem": Case_tem,
    "Case_ter": Case_ter,
    "Case_tra": Case_tra,
    "Case_voc": Case_voc,
    "Definite_two": Definite_two,
    "Definite_def": Definite_def,
    "Definite_red": Definite_red,
    "Definite_ind": Definite_ind,
    "Degree_cmp": Degree_cmp,
    "Degree_comp": Degree_comp,
    "Degree_none": Degree_none,
    "Degree_pos": Degree_pos,
    "Degree_sup": Degree_sup,
    "Degree_abs": Degree_abs,
    "Degree_com": Degree_com,
    "Degree_dim ": Degree_dim, # du
    "Gender_com": Gender_com,
    "Gender_fem": Gender_fem,
    "Gender_masc": Gender_masc,
    "Gender_neut": Gender_neut,
    "Mood_cnd": Mood_cnd,
    "Mood_imp": Mood_imp,
    "Mood_ind": Mood_ind,
    "Mood_n": Mood_n,
    "Mood_pot": Mood_pot,
    "Mood_sub": Mood_sub,
    "Mood_opt": Mood_opt,
    "Negative_neg": Negative_neg,
    "Negative_pos": Negative_pos,
    "Negative_yes": Negative_yes,
    "Number_com": Number_com,
    "Number_dual": Number_dual,
    "Number_none": Number_none,
    "Number_plur": Number_plur,
    "Number_sing": Number_sing,
    "Number_ptan ": Number_ptan, # bg
    "Number_count ": Number_count, # bg
    "NumType_card": NumType_card,
    "NumType_dist": NumType_dist,
    "NumType_frac": NumType_frac,
    "NumType_gen": NumType_gen,
    "NumType_mult": NumType_mult,
    "NumType_none": NumType_none,
    "NumType_ord": NumType_ord,
    "NumType_sets": NumType_sets,
    "Person_one": Person_one,
    "Person_two": Person_two,
    "Person_three": Person_three,
    "Person_none": Person_none,
    "Poss_yes": Poss_yes,
    "PronType_advPart": PronType_advPart,
    "PronType_art": PronType_art,
    "PronType_default": PronType_default,
    "PronType_dem": PronType_dem,
    "PronType_ind": PronType_ind,
    "PronType_int": PronType_int,
    "PronType_neg": PronType_neg,
    "PronType_prs": PronType_prs,
    "PronType_rcp": PronType_rcp,
    "PronType_rel": PronType_rel,
    "PronType_tot": PronType_tot,
    "PronType_clit": PronType_clit,
    "PronType_exc ": PronType_exc, # es, ca, it, fa,
    "Reflex_yes": Reflex_yes,
    "Tense_fut": Tense_fut,
    "Tense_imp": Tense_imp,
    "Tense_past": Tense_past,
    "Tense_pres": Tense_pres,
    "VerbForm_fin": VerbForm_fin,
    "VerbForm_ger": VerbForm_ger,
    "VerbForm_inf": VerbForm_inf,
    "VerbForm_none": VerbForm_none,
    "VerbForm_part": VerbForm_part,
    "VerbForm_partFut": VerbForm_partFut,
    "VerbForm_partPast": VerbForm_partPast,
    "VerbForm_partPres": VerbForm_partPres,
    "VerbForm_sup": VerbForm_sup,
    "VerbForm_trans": VerbForm_trans,
    "VerbForm_gdv ": VerbForm_gdv, # la,
    "Voice_act": Voice_act,
    "Voice_cau": Voice_cau,
    "Voice_pass": Voice_pass,
    "Voice_mid ": Voice_mid, # gkc,
    "Voice_int ": Voice_int, # hb,
    "Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
    "AdpType_prep ": AdpType_prep, # cz, U,
    "AdpType_post ": AdpType_post, # U,
    "AdpType_voc ": AdpType_voc, # cz,
    "AdpType_comprep ": AdpType_comprep, # cz,
    "AdpType_circ ": AdpType_circ, # U,
    "AdvType_man": AdvType_man,
    "AdvType_loc": AdvType_loc,
    "AdvType_tim": AdvType_tim,
    "AdvType_deg": AdvType_deg,
    "AdvType_cau": AdvType_cau,
    "AdvType_mod": AdvType_mod,
    "AdvType_sta": AdvType_sta,
    "AdvType_ex": AdvType_ex,
    "AdvType_adadj": AdvType_adadj,
    "ConjType_oper ": ConjType_oper, # cz, U,
    "ConjType_comp ": ConjType_comp, # cz, U,
    "Connegative_yes ": Connegative_yes, # fi,
    "Derivation_minen ": Derivation_minen, # fi,
    "Derivation_sti ": Derivation_sti, # fi,
    "Derivation_inen ": Derivation_inen, # fi,
    "Derivation_lainen ": Derivation_lainen, # fi,
    "Derivation_ja ": Derivation_ja, # fi,
    "Derivation_ton ": Derivation_ton, # fi,
    "Derivation_vs ": Derivation_vs, # fi,
    "Derivation_ttain ": Derivation_ttain, # fi,
    "Derivation_ttaa ": Derivation_ttaa, # fi,
    "Echo_rdp ": Echo_rdp, # U,
    "Echo_ech ": Echo_ech, # U,
    "Foreign_foreign ": Foreign_foreign, # cz, fi, U,
    "Foreign_fscript ": Foreign_fscript, # cz, fi, U,
    "Foreign_tscript ": Foreign_tscript, # cz, U,
    "Foreign_yes ": Foreign_yes, # sl,
    "Gender_dat_masc ": Gender_dat_masc, # bq, U,
    "Gender_dat_fem ": Gender_dat_fem, # bq, U,
    "Gender_erg_masc ": Gender_erg_masc, # bq,
    "Gender_erg_fem ": Gender_erg_fem, # bq,
    "Gender_psor_masc ": Gender_psor_masc, # cz, sl, U,
    "Gender_psor_fem ": Gender_psor_fem, # cz, sl, U,
    "Gender_psor_neut ": Gender_psor_neut, # sl,
    "Hyph_yes ": Hyph_yes, # cz, U,
    "InfForm_one ": InfForm_one, # fi,
    "InfForm_two ": InfForm_two, # fi,
    "InfForm_three ": InfForm_three, # fi,
    "NameType_geo ": NameType_geo, # U, cz,
    "NameType_prs ": NameType_prs, # U, cz,
    "NameType_giv ": NameType_giv, # U, cz,
    "NameType_sur ": NameType_sur, # U, cz,
    "NameType_nat ": NameType_nat, # U, cz,
    "NameType_com ": NameType_com, # U, cz,
    "NameType_pro ": NameType_pro, # U, cz,
    "NameType_oth ": NameType_oth, # U, cz,
    "NounType_com ": NounType_com, # U,
    "NounType_prop ": NounType_prop, # U,
    "NounType_class ": NounType_class, # U,
    "Number_abs_sing ": Number_abs_sing, # bq, U,
    "Number_abs_plur ": Number_abs_plur, # bq, U,
    "Number_dat_sing ": Number_dat_sing, # bq, U,
    "Number_dat_plur ": Number_dat_plur, # bq, U,
    "Number_erg_sing ": Number_erg_sing, # bq, U,
    "Number_erg_plur ": Number_erg_plur, # bq, U,
    "Number_psee_sing ": Number_psee_sing, # U,
    "Number_psee_plur ": Number_psee_plur, # U,
    "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
    "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
    "NumForm_digit ": NumForm_digit, # cz, sl, U,
    "NumForm_roman ": NumForm_roman, # cz, sl, U,
    "NumForm_word ": NumForm_word, # cz, sl, U,
    "NumValue_one ": NumValue_one, # cz, U,
    "NumValue_two ": NumValue_two, # cz, U,
    "NumValue_three ": NumValue_three, # cz, U,
    "PartForm_pres ": PartForm_pres, # fi,
    "PartForm_past ": PartForm_past, # fi,
    "PartForm_agt ": PartForm_agt, # fi,
    "PartForm_neg ": PartForm_neg, # fi,
    "PartType_mod ": PartType_mod, # U,
    "PartType_emp ": PartType_emp, # U,
    "PartType_res ": PartType_res, # U,
    "PartType_inf ": PartType_inf, # U,
    "PartType_vbp ": PartType_vbp, # U,
    "Person_abs_one ": Person_abs_one, # bq, U,
    "Person_abs_two ": Person_abs_two, # bq, U,
    "Person_abs_three ": Person_abs_three, # bq, U,
    "Person_dat_one ": Person_dat_one, # bq, U,
    "Person_dat_two ": Person_dat_two, # bq, U,
    "Person_dat_three ": Person_dat_three, # bq, U,
    "Person_erg_one ": Person_erg_one, # bq, U,
    "Person_erg_two ": Person_erg_two, # bq, U,
    "Person_erg_three ": Person_erg_three, # bq, U,
    "Person_psor_one ": Person_psor_one, # fi, U,
    "Person_psor_two ": Person_psor_two, # fi, U,
    "Person_psor_three ": Person_psor_three, # fi, U,
    "Polite_inf ": Polite_inf, # bq, U,
    "Polite_pol ": Polite_pol, # bq, U,
    "Polite_abs_inf ": Polite_abs_inf, # bq, U,
    "Polite_abs_pol ": Polite_abs_pol, # bq, U,
    "Polite_erg_inf ": Polite_erg_inf, # bq, U,
    "Polite_erg_pol ": Polite_erg_pol, # bq, U,
    "Polite_dat_inf ": Polite_dat_inf, # bq, U,
    "Polite_dat_pol ": Polite_dat_pol, # bq, U,
    "Prefix_yes ": Prefix_yes, # U,
    "PrepCase_npr ": PrepCase_npr, # cz,
    "PrepCase_pre ": PrepCase_pre, # U,
    "PunctSide_ini ": PunctSide_ini, # U,
    "PunctSide_fin ": PunctSide_fin, # U,
    "PunctType_peri ": PunctType_peri, # U,
    "PunctType_qest ": PunctType_qest, # U,
    "PunctType_excl ": PunctType_excl, # U,
    "PunctType_quot ": PunctType_quot, # U,
    "PunctType_brck ": PunctType_brck, # U,
    "PunctType_comm ": PunctType_comm, # U,
    "PunctType_colo ": PunctType_colo, # U,
    "PunctType_semi ": PunctType_semi, # U,
    "PunctType_dash ": PunctType_dash, # U,
    "Style_arch ": Style_arch, # cz, fi, U,
    "Style_rare ": Style_rare, # cz, fi, U,
    "Style_poet ": Style_poet, # cz, U,
    "Style_norm ": Style_norm, # cz, U,
    "Style_coll ": Style_coll, # cz, U,
    "Style_vrnc ": Style_vrnc, # cz, U,
    "Style_sing ": Style_sing, # cz, U,
    "Style_expr ": Style_expr, # cz, U,
    "Style_derg ": Style_derg, # cz, U,
    "Style_vulg ": Style_vulg, # cz, U,
    "Style_yes ": Style_yes, # fi, U,
    "StyleVariant_styleShort ": StyleVariant_styleShort, # cz,
    "StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl,
    "VerbType_aux ": VerbType_aux, # U,
    "VerbType_cop ": VerbType_cop, # U,
    "VerbType_mod ": VerbType_mod, # U,
    "VerbType_light ": VerbType_light, # U,
 }
 # Keys of IDS, listed in ascending order of the value each key maps to.
 NAMES = [name for name, _ in sorted(IDS.items(), key=lambda pair: pair[1])]
--- a/spacy/parts_of_speech.pxd
+++ b/spacy/parts_of_speech.pxd
@ -1,7 +1,8 @@
-# Google universal tag set
+from . cimport symbols
 cpdef enum univ_pos_t:
-    NO_TAG
+    NO_TAG = 0
-    ADJ
+    ADJ = symbols.ADJ
    ADP
    ADV
    AUX
@ -20,4 +21,3 @@ cpdef enum univ_pos_t:
    X
    EOL
    SPACE
    N_UNIV_TAGS
--- a/spacy/parts_of_speech.pyx
+++ b/spacy/parts_of_speech.pyx
@ -1,8 +1,8 @@
 from __future__ import unicode_literals
-UNIV_POS_NAMES = {
+IDS = {
-    "NO_TAG": NO_TAG,
+    "": NO_TAG,
    "ADJ": ADJ,
    "ADP": ADP,
    "ADV": ADV,
@ -23,3 +23,6 @@ UNIV_POS_NAMES = {
    "EOL": EOL,
    "SPACE": SPACE
 }
 # Keys of IDS, listed in ascending order of the ID each key maps to.
 # NOTE(review): this is a list, so NAMES[i] only yields the name for ID i when
 # the IDs are dense and start at 0. univ_pos_t now declares
 # ADJ = symbols.ADJ, so the POS IDs may no longer be positions in this list;
 # confirm that code indexing NAMES by a POS ID (e.g. Token.pos_) still
 # resolves to the right name.
 NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@ -69,12 +69,15 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except
 cdef class StringStore:
    '''Map strings to and from integer IDs.'''
-    def __init__(self):
+    def __init__(self, strings=None):
        self.mem = Pool()
        self._map = PreshMap()
        self._resize_at = 10000
        self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
        self.size = 1
        if strings is not None:
            for string in strings:
                _ = self[string]
    property size:
        def __get__(self):
@ -113,6 +116,14 @@ cdef class StringStore:
        for i in range(self.size):
            yield self[i]
    def __reduce__(self):
        '''Pickle support: describe the store as StringStore(strings).

        Returns the callable, a one-element argument tuple holding the list of
        strings, and three None placeholders for the optional pickle fields.
        '''
        # Slot 0 is represented by the empty string; real entries are read
        # from index 1 upwards so the list position matches the stored index.
        strings = [""]
        for i in range(1, self.size):
            # Take the address of the i-th Utf8Str entry and decode it to a
            # Python string.
            string = &self.c[i]
            py_string = _decode(string)
            strings.append(py_string)
        # __init__ re-adds the strings in list order when given this argument.
        return (StringStore, (strings,), None, None, None)
    cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL:
        # 0 means missing, but we don't bother offsetting the index.
        key = hash64(chars, length * sizeof(char), 0)
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@ -0,0 +1,421 @@
 # Shared table of symbolic integer IDs.
 # Members take consecutive values in declaration order (NIL comes first, so
 # it is 0); inserting, removing or reordering a member therefore renumbers
 # every member declared after it.
 # Each member is expected to have a matching entry in the IDS dict in
 # symbols.pyx, and parts_of_speech.pxd anchors its own enum on symbols.ADJ.
 # NOTE(review): the trailing "# xx" comments on some members look like
 # language / treebank codes -- confirm before relying on them.
 cpdef enum symbol_t:
    NIL
    IS_ALPHA
    IS_ASCII
    IS_DIGIT
    IS_LOWER
    IS_PUNCT
    IS_SPACE
    IS_TITLE
    IS_UPPER
    LIKE_URL
    LIKE_NUM
    LIKE_EMAIL
    IS_STOP
    IS_OOV
    FLAG14
    FLAG15
    FLAG16
    FLAG17
    FLAG18
    FLAG19
    FLAG20
    FLAG21
    FLAG22
    FLAG23
    FLAG24
    FLAG25
    FLAG26
    FLAG27
    FLAG28
    FLAG29
    FLAG30
    FLAG31
    FLAG32
    FLAG33
    FLAG34
    FLAG35
    FLAG36
    FLAG37
    FLAG38
    FLAG39
    FLAG40
    FLAG41
    FLAG42
    FLAG43
    FLAG44
    FLAG45
    FLAG46
    FLAG47
    FLAG48
    FLAG49
    FLAG50
    FLAG51
    FLAG52
    FLAG53
    FLAG54
    FLAG55
    FLAG56
    FLAG57
    FLAG58
    FLAG59
    FLAG60
    FLAG61
    FLAG62
    FLAG63
    ID
    ORTH
    LOWER
    NORM
    SHAPE
    PREFIX
    SUFFIX
    LENGTH
    CLUSTER
    LEMMA
    POS
    TAG
    DEP
    ENT_IOB
    ENT_TYPE
    HEAD
    SPACY
    PROB
    ADJ
    ADP
    ADV
    AUX
    CONJ
    DET
    INTJ
    NOUN
    NUM
    PART
    PRON
    PROPN
    PUNCT
    SCONJ
    SYM
    VERB
    X
    EOL
    SPACE
    Animacy_anim
    Animacy_inam
    Aspect_freq
    Aspect_imp
    Aspect_mod
    Aspect_none
    Aspect_perf
    Case_abe
    Case_abl
    Case_abs
    Case_acc
    Case_ade
    Case_all
    Case_cau
    Case_com
    Case_dat
    Case_del
    Case_dis
    Case_ela
    Case_ess
    Case_gen
    Case_ill
    Case_ine
    Case_ins
    Case_loc
    Case_lat
    Case_nom
    Case_par
    Case_sub
    Case_sup
    Case_tem
    Case_ter
    Case_tra
    Case_voc
    Definite_two
    Definite_def
    Definite_red
    Definite_ind
    Degree_cmp
    Degree_comp
    Degree_none
    Degree_pos
    Degree_sup
    Degree_abs
    Degree_com
    Degree_dim # du
    Gender_com
    Gender_fem
    Gender_masc
    Gender_neut
    Mood_cnd
    Mood_imp
    Mood_ind
    Mood_n
    Mood_pot
    Mood_sub
    Mood_opt
    Negative_neg
    Negative_pos
    Negative_yes
    Number_com
    Number_dual
    Number_none
    Number_plur
    Number_sing
    Number_ptan # bg
    Number_count # bg
    NumType_card
    NumType_dist
    NumType_frac
    NumType_gen
    NumType_mult
    NumType_none
    NumType_ord
    NumType_sets
    Person_one
    Person_two
    Person_three
    Person_none
    Poss_yes
    PronType_advPart
    PronType_art
    PronType_default
    PronType_dem
    PronType_ind
    PronType_int
    PronType_neg
    PronType_prs
    PronType_rcp
    PronType_rel
    PronType_tot
    PronType_clit
    PronType_exc # es, ca, it, fa
    Reflex_yes
    Tense_fut
    Tense_imp
    Tense_past
    Tense_pres
    VerbForm_fin
    VerbForm_ger
    VerbForm_inf
    VerbForm_none
    VerbForm_part
    VerbForm_partFut
    VerbForm_partPast
    VerbForm_partPres
    VerbForm_sup
    VerbForm_trans
    VerbForm_gdv # la
    Voice_act
    Voice_cau
    Voice_pass
    Voice_mid # gkc
    Voice_int # hb
    Abbr_yes # cz, fi, sl, U
    AdpType_prep # cz, U
    AdpType_post # U
    AdpType_voc # cz
    AdpType_comprep # cz
    AdpType_circ # U
    AdvType_man
    AdvType_loc
    AdvType_tim
    AdvType_deg
    AdvType_cau
    AdvType_mod
    AdvType_sta
    AdvType_ex
    AdvType_adadj
    ConjType_oper # cz, U
    ConjType_comp # cz, U
    Connegative_yes # fi
    Derivation_minen # fi
    Derivation_sti # fi
    Derivation_inen # fi
    Derivation_lainen # fi
    Derivation_ja # fi
    Derivation_ton # fi
    Derivation_vs # fi
    Derivation_ttain # fi
    Derivation_ttaa # fi
    Echo_rdp # U
    Echo_ech # U
    Foreign_foreign # cz, fi, U
    Foreign_fscript # cz, fi, U
    Foreign_tscript # cz, U
    Foreign_yes # sl
    Gender_dat_masc # bq, U
    Gender_dat_fem # bq, U
    Gender_erg_masc # bq
    Gender_erg_fem # bq
    Gender_psor_masc # cz, sl, U
    Gender_psor_fem # cz, sl, U
    Gender_psor_neut # sl
    Hyph_yes # cz, U
    InfForm_one # fi
    InfForm_two # fi
    InfForm_three # fi
    NameType_geo # U, cz
    NameType_prs # U, cz
    NameType_giv # U, cz
    NameType_sur # U, cz
    NameType_nat # U, cz
    NameType_com # U, cz
    NameType_pro # U, cz
    NameType_oth # U, cz
    NounType_com # U
    NounType_prop # U
    NounType_class # U
    Number_abs_sing # bq, U
    Number_abs_plur # bq, U
    Number_dat_sing # bq, U
    Number_dat_plur # bq, U
    Number_erg_sing # bq, U
    Number_erg_plur # bq, U
    Number_psee_sing # U
    Number_psee_plur # U
    Number_psor_sing # cz, fi, sl, U
    Number_psor_plur # cz, fi, sl, U
    NumForm_digit # cz, sl, U
    NumForm_roman # cz, sl, U
    NumForm_word # cz, sl, U
    NumValue_one # cz, U
    NumValue_two # cz, U
    NumValue_three # cz, U
    PartForm_pres # fi
    PartForm_past # fi
    PartForm_agt # fi
    PartForm_neg # fi
    PartType_mod # U
    PartType_emp # U
    PartType_res # U
    PartType_inf # U
    PartType_vbp # U
    Person_abs_one # bq, U
    Person_abs_two # bq, U
    Person_abs_three # bq, U
    Person_dat_one # bq, U
    Person_dat_two # bq, U
    Person_dat_three # bq, U
    Person_erg_one # bq, U
    Person_erg_two # bq, U
    Person_erg_three # bq, U
    Person_psor_one # fi, U
    Person_psor_two # fi, U
    Person_psor_three # fi, U
    Polite_inf # bq, U
    Polite_pol # bq, U
    Polite_abs_inf # bq, U
    Polite_abs_pol # bq, U
    Polite_erg_inf # bq, U
    Polite_erg_pol # bq, U
    Polite_dat_inf # bq, U
    Polite_dat_pol # bq, U
    Prefix_yes # U
    PrepCase_npr # cz
    PrepCase_pre # U
    PunctSide_ini # U
    PunctSide_fin # U
    PunctType_peri # U
    PunctType_qest # U
    PunctType_excl # U
    PunctType_quot # U
    PunctType_brck # U
    PunctType_comm # U
    PunctType_colo # U
    PunctType_semi # U
    PunctType_dash # U
    Style_arch # cz, fi, U
    Style_rare # cz, fi, U
    Style_poet # cz, U
    Style_norm # cz, U
    Style_coll # cz, U
    Style_vrnc # cz, U
    Style_sing # cz, U
    Style_expr # cz, U
    Style_derg # cz, U
    Style_vulg # cz, U
    Style_yes # fi, U
    StyleVariant_styleShort # cz
    StyleVariant_styleBound # cz, sl
    VerbType_aux # U
    VerbType_cop # U
    VerbType_mod # U
    VerbType_light # U
    PERSON
    NORP
    FACILITY
    ORG
    GPE
    LOC
    PRODUCT
    EVENT
    WORK_OF_ART
    LANGUAGE
    DATE
    TIME
    PERCENT
    MONEY
    QUANTITY
    ORDINAL
    CARDINAL
    acomp
    advcl
    advmod
    agent
    amod
    appos
    attr
    aux
    auxpass
    cc
    ccomp
    complm
    conj
    csubj
    csubjpass
    dep
    det
    dobj
    expl
    hmod
    hyph
    infmod
    intj
    iobj
    mark
    meta
    neg
    nmod
    nn
    npadvmod
    nsubj
    nsubjpass
    num
    number
    oprd
    parataxis
    partmod
    pcomp
    pobj
    poss
    possessive
    preconj
    prep
    prt
    punct
    quantmod
    rcmod
    root
    xcomp
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@ -0,0 +1,424 @@
 # Maps every symbol's string name to its integer ID from the symbol_t enum
 # declared in symbols.pxd.  The empty string is the name of NIL.
 # Keys must be the exact symbol names with no padding whitespace: they are
 # looked up as plain strings, and NAMES (built from these keys) is what the
 # vocabulary loads into its StringStore.
 IDS = {
    "": NIL,
    "IS_ALPHA": IS_ALPHA,
    "IS_ASCII": IS_ASCII,
    "IS_DIGIT": IS_DIGIT,
    "IS_LOWER": IS_LOWER,
    "IS_PUNCT": IS_PUNCT,
    "IS_SPACE": IS_SPACE,
    "IS_TITLE": IS_TITLE,
    "IS_UPPER": IS_UPPER,
    "LIKE_URL": LIKE_URL,
    "LIKE_NUM": LIKE_NUM,
    "LIKE_EMAIL": LIKE_EMAIL,
    "IS_STOP": IS_STOP,
    "IS_OOV": IS_OOV,
    "FLAG14": FLAG14,
    "FLAG15": FLAG15,
    "FLAG16": FLAG16,
    "FLAG17": FLAG17,
    "FLAG18": FLAG18,
    "FLAG19": FLAG19,
    "FLAG20": FLAG20,
    "FLAG21": FLAG21,
    "FLAG22": FLAG22,
    "FLAG23": FLAG23,
    "FLAG24": FLAG24,
    "FLAG25": FLAG25,
    "FLAG26": FLAG26,
    "FLAG27": FLAG27,
    "FLAG28": FLAG28,
    "FLAG29": FLAG29,
    "FLAG30": FLAG30,
    "FLAG31": FLAG31,
    "FLAG32": FLAG32,
    "FLAG33": FLAG33,
    "FLAG34": FLAG34,
    "FLAG35": FLAG35,
    "FLAG36": FLAG36,
    "FLAG37": FLAG37,
    "FLAG38": FLAG38,
    "FLAG39": FLAG39,
    "FLAG40": FLAG40,
    "FLAG41": FLAG41,
    "FLAG42": FLAG42,
    "FLAG43": FLAG43,
    "FLAG44": FLAG44,
    "FLAG45": FLAG45,
    "FLAG46": FLAG46,
    "FLAG47": FLAG47,
    "FLAG48": FLAG48,
    "FLAG49": FLAG49,
    "FLAG50": FLAG50,
    "FLAG51": FLAG51,
    "FLAG52": FLAG52,
    "FLAG53": FLAG53,
    "FLAG54": FLAG54,
    "FLAG55": FLAG55,
    "FLAG56": FLAG56,
    "FLAG57": FLAG57,
    "FLAG58": FLAG58,
    "FLAG59": FLAG59,
    "FLAG60": FLAG60,
    "FLAG61": FLAG61,
    "FLAG62": FLAG62,
    "FLAG63": FLAG63,
    "ID": ID,
    "ORTH": ORTH,
    "LOWER": LOWER,
    "NORM": NORM,
    "SHAPE": SHAPE,
    "PREFIX": PREFIX,
    "SUFFIX": SUFFIX,
    "LENGTH": LENGTH,
    "CLUSTER": CLUSTER,
    "LEMMA": LEMMA,
    "POS": POS,
    "TAG": TAG,
    "DEP": DEP,
    "ENT_IOB": ENT_IOB,
    "ENT_TYPE": ENT_TYPE,
    "HEAD": HEAD,
    "SPACY": SPACY,
    "PROB": PROB,
    "ADJ": ADJ,
    "ADP": ADP,
    "ADV": ADV,
    "AUX": AUX,
    "CONJ": CONJ,
    "DET": DET,
    "INTJ": INTJ,
    "NOUN": NOUN,
    "NUM": NUM,
    "PART": PART,
    "PRON": PRON,
    "PROPN": PROPN,
    "PUNCT": PUNCT,
    "SCONJ": SCONJ,
    "SYM": SYM,
    "VERB": VERB,
    "X": X,
    "EOL": EOL,
    "SPACE": SPACE,
    "Animacy_anim": Animacy_anim,
    "Animacy_inam": Animacy_inam,
    "Aspect_freq": Aspect_freq,
    "Aspect_imp": Aspect_imp,
    "Aspect_mod": Aspect_mod,
    "Aspect_none": Aspect_none,
    "Aspect_perf": Aspect_perf,
    "Case_abe": Case_abe,
    "Case_abl": Case_abl,
    "Case_abs": Case_abs,
    "Case_acc": Case_acc,
    "Case_ade": Case_ade,
    "Case_all": Case_all,
    "Case_cau": Case_cau,
    "Case_com": Case_com,
    "Case_dat": Case_dat,
    "Case_del": Case_del,
    "Case_dis": Case_dis,
    "Case_ela": Case_ela,
    "Case_ess": Case_ess,
    "Case_gen": Case_gen,
    "Case_ill": Case_ill,
    "Case_ine": Case_ine,
    "Case_ins": Case_ins,
    "Case_loc": Case_loc,
    "Case_lat": Case_lat,
    "Case_nom": Case_nom,
    "Case_par": Case_par,
    "Case_sub": Case_sub,
    "Case_sup": Case_sup,
    "Case_tem": Case_tem,
    "Case_ter": Case_ter,
    "Case_tra": Case_tra,
    "Case_voc": Case_voc,
    "Definite_two": Definite_two,
    "Definite_def": Definite_def,
    "Definite_red": Definite_red,
    "Definite_ind": Definite_ind,
    "Degree_cmp": Degree_cmp,
    "Degree_comp": Degree_comp,
    "Degree_none": Degree_none,
    "Degree_pos": Degree_pos,
    "Degree_sup": Degree_sup,
    "Degree_abs": Degree_abs,
    "Degree_com": Degree_com,
    "Degree_dim": Degree_dim, # du
    "Gender_com": Gender_com,
    "Gender_fem": Gender_fem,
    "Gender_masc": Gender_masc,
    "Gender_neut": Gender_neut,
    "Mood_cnd": Mood_cnd,
    "Mood_imp": Mood_imp,
    "Mood_ind": Mood_ind,
    "Mood_n": Mood_n,
    "Mood_pot": Mood_pot,
    "Mood_sub": Mood_sub,
    "Mood_opt": Mood_opt,
    "Negative_neg": Negative_neg,
    "Negative_pos": Negative_pos,
    "Negative_yes": Negative_yes,
    "Number_com": Number_com,
    "Number_dual": Number_dual,
    "Number_none": Number_none,
    "Number_plur": Number_plur,
    "Number_sing": Number_sing,
    "Number_ptan": Number_ptan, # bg
    "Number_count": Number_count, # bg
    "NumType_card": NumType_card,
    "NumType_dist": NumType_dist,
    "NumType_frac": NumType_frac,
    "NumType_gen": NumType_gen,
    "NumType_mult": NumType_mult,
    "NumType_none": NumType_none,
    "NumType_ord": NumType_ord,
    "NumType_sets": NumType_sets,
    "Person_one": Person_one,
    "Person_two": Person_two,
    "Person_three": Person_three,
    "Person_none": Person_none,
    "Poss_yes": Poss_yes,
    "PronType_advPart": PronType_advPart,
    "PronType_art": PronType_art,
    "PronType_default": PronType_default,
    "PronType_dem": PronType_dem,
    "PronType_ind": PronType_ind,
    "PronType_int": PronType_int,
    "PronType_neg": PronType_neg,
    "PronType_prs": PronType_prs,
    "PronType_rcp": PronType_rcp,
    "PronType_rel": PronType_rel,
    "PronType_tot": PronType_tot,
    "PronType_clit": PronType_clit,
    "PronType_exc": PronType_exc, # es, ca, it, fa
    "Reflex_yes": Reflex_yes,
    "Tense_fut": Tense_fut,
    "Tense_imp": Tense_imp,
    "Tense_past": Tense_past,
    "Tense_pres": Tense_pres,
    "VerbForm_fin": VerbForm_fin,
    "VerbForm_ger": VerbForm_ger,
    "VerbForm_inf": VerbForm_inf,
    "VerbForm_none": VerbForm_none,
    "VerbForm_part": VerbForm_part,
    "VerbForm_partFut": VerbForm_partFut,
    "VerbForm_partPast": VerbForm_partPast,
    "VerbForm_partPres": VerbForm_partPres,
    "VerbForm_sup": VerbForm_sup,
    "VerbForm_trans": VerbForm_trans,
    "VerbForm_gdv": VerbForm_gdv, # la
    "Voice_act": Voice_act,
    "Voice_cau": Voice_cau,
    "Voice_pass": Voice_pass,
    "Voice_mid": Voice_mid, # gkc
    "Voice_int": Voice_int, # hb
    "Abbr_yes": Abbr_yes, # cz, fi, sl, U
    "AdpType_prep": AdpType_prep, # cz, U
    "AdpType_post": AdpType_post, # U
    "AdpType_voc": AdpType_voc, # cz
    "AdpType_comprep": AdpType_comprep, # cz
    "AdpType_circ": AdpType_circ, # U
    "AdvType_man": AdvType_man,
    "AdvType_loc": AdvType_loc,
    "AdvType_tim": AdvType_tim,
    "AdvType_deg": AdvType_deg,
    "AdvType_cau": AdvType_cau,
    "AdvType_mod": AdvType_mod,
    "AdvType_sta": AdvType_sta,
    "AdvType_ex": AdvType_ex,
    "AdvType_adadj": AdvType_adadj,
    "ConjType_oper": ConjType_oper, # cz, U
    "ConjType_comp": ConjType_comp, # cz, U
    "Connegative_yes": Connegative_yes, # fi
    "Derivation_minen": Derivation_minen, # fi
    "Derivation_sti": Derivation_sti, # fi
    "Derivation_inen": Derivation_inen, # fi
    "Derivation_lainen": Derivation_lainen, # fi
    "Derivation_ja": Derivation_ja, # fi
    "Derivation_ton": Derivation_ton, # fi
    "Derivation_vs": Derivation_vs, # fi
    "Derivation_ttain": Derivation_ttain, # fi
    "Derivation_ttaa": Derivation_ttaa, # fi
    "Echo_rdp": Echo_rdp, # U
    "Echo_ech": Echo_ech, # U
    "Foreign_foreign": Foreign_foreign, # cz, fi, U
    "Foreign_fscript": Foreign_fscript, # cz, fi, U
    "Foreign_tscript": Foreign_tscript, # cz, U
    "Foreign_yes": Foreign_yes, # sl
    "Gender_dat_masc": Gender_dat_masc, # bq, U
    "Gender_dat_fem": Gender_dat_fem, # bq, U
    "Gender_erg_masc": Gender_erg_masc, # bq
    "Gender_erg_fem": Gender_erg_fem, # bq
    "Gender_psor_masc": Gender_psor_masc, # cz, sl, U
    "Gender_psor_fem": Gender_psor_fem, # cz, sl, U
    "Gender_psor_neut": Gender_psor_neut, # sl
    "Hyph_yes": Hyph_yes, # cz, U
    "InfForm_one": InfForm_one, # fi
    "InfForm_two": InfForm_two, # fi
    "InfForm_three": InfForm_three, # fi
    "NameType_geo": NameType_geo, # U, cz
    "NameType_prs": NameType_prs, # U, cz
    "NameType_giv": NameType_giv, # U, cz
    "NameType_sur": NameType_sur, # U, cz
    "NameType_nat": NameType_nat, # U, cz
    "NameType_com": NameType_com, # U, cz
    "NameType_pro": NameType_pro, # U, cz
    "NameType_oth": NameType_oth, # U, cz
    "NounType_com": NounType_com, # U
    "NounType_prop": NounType_prop, # U
    "NounType_class": NounType_class, # U
    "Number_abs_sing": Number_abs_sing, # bq, U
    "Number_abs_plur": Number_abs_plur, # bq, U
    "Number_dat_sing": Number_dat_sing, # bq, U
    "Number_dat_plur": Number_dat_plur, # bq, U
    "Number_erg_sing": Number_erg_sing, # bq, U
    "Number_erg_plur": Number_erg_plur, # bq, U
    "Number_psee_sing": Number_psee_sing, # U
    "Number_psee_plur": Number_psee_plur, # U
    "Number_psor_sing": Number_psor_sing, # cz, fi, sl, U
    "Number_psor_plur": Number_psor_plur, # cz, fi, sl, U
    "NumForm_digit": NumForm_digit, # cz, sl, U
    "NumForm_roman": NumForm_roman, # cz, sl, U
    "NumForm_word": NumForm_word, # cz, sl, U
    "NumValue_one": NumValue_one, # cz, U
    "NumValue_two": NumValue_two, # cz, U
    "NumValue_three": NumValue_three, # cz, U
    "PartForm_pres": PartForm_pres, # fi
    "PartForm_past": PartForm_past, # fi
    "PartForm_agt": PartForm_agt, # fi
    "PartForm_neg": PartForm_neg, # fi
    "PartType_mod": PartType_mod, # U
    "PartType_emp": PartType_emp, # U
    "PartType_res": PartType_res, # U
    "PartType_inf": PartType_inf, # U
    "PartType_vbp": PartType_vbp, # U
    "Person_abs_one": Person_abs_one, # bq, U
    "Person_abs_two": Person_abs_two, # bq, U
    "Person_abs_three": Person_abs_three, # bq, U
    "Person_dat_one": Person_dat_one, # bq, U
    "Person_dat_two": Person_dat_two, # bq, U
    "Person_dat_three": Person_dat_three, # bq, U
    "Person_erg_one": Person_erg_one, # bq, U
    "Person_erg_two": Person_erg_two, # bq, U
    "Person_erg_three": Person_erg_three, # bq, U
    "Person_psor_one": Person_psor_one, # fi, U
    "Person_psor_two": Person_psor_two, # fi, U
    "Person_psor_three": Person_psor_three, # fi, U
    "Polite_inf": Polite_inf, # bq, U
    "Polite_pol": Polite_pol, # bq, U
    "Polite_abs_inf": Polite_abs_inf, # bq, U
    "Polite_abs_pol": Polite_abs_pol, # bq, U
    "Polite_erg_inf": Polite_erg_inf, # bq, U
    "Polite_erg_pol": Polite_erg_pol, # bq, U
    "Polite_dat_inf": Polite_dat_inf, # bq, U
    "Polite_dat_pol": Polite_dat_pol, # bq, U
    "Prefix_yes": Prefix_yes, # U
    "PrepCase_npr": PrepCase_npr, # cz
    "PrepCase_pre": PrepCase_pre, # U
    "PunctSide_ini": PunctSide_ini, # U
    "PunctSide_fin": PunctSide_fin, # U
    "PunctType_peri": PunctType_peri, # U
    "PunctType_qest": PunctType_qest, # U
    "PunctType_excl": PunctType_excl, # U
    "PunctType_quot": PunctType_quot, # U
    "PunctType_brck": PunctType_brck, # U
    "PunctType_comm": PunctType_comm, # U
    "PunctType_colo": PunctType_colo, # U
    "PunctType_semi": PunctType_semi, # U
    "PunctType_dash": PunctType_dash, # U
    "Style_arch": Style_arch, # cz, fi, U
    "Style_rare": Style_rare, # cz, fi, U
    "Style_poet": Style_poet, # cz, U
    "Style_norm": Style_norm, # cz, U
    "Style_coll": Style_coll, # cz, U
    "Style_vrnc": Style_vrnc, # cz, U
    "Style_sing": Style_sing, # cz, U
    "Style_expr": Style_expr, # cz, U
    "Style_derg": Style_derg, # cz, U
    "Style_vulg": Style_vulg, # cz, U
    "Style_yes": Style_yes, # fi, U
    "StyleVariant_styleShort": StyleVariant_styleShort, # cz
    "StyleVariant_styleBound": StyleVariant_styleBound, # cz, sl
    "VerbType_aux": VerbType_aux, # U
    "VerbType_cop": VerbType_cop, # U
    "VerbType_mod": VerbType_mod, # U
    "VerbType_light": VerbType_light, # U
    "PERSON": PERSON,
    "NORP": NORP,
    "FACILITY": FACILITY,
    "ORG": ORG,
    "GPE": GPE,
    "LOC": LOC,
    "PRODUCT": PRODUCT,
    "EVENT": EVENT,
    "WORK_OF_ART": WORK_OF_ART,
    "LANGUAGE": LANGUAGE,
    "DATE": DATE,
    "TIME": TIME,
    "PERCENT": PERCENT,
    "MONEY": MONEY,
    "QUANTITY": QUANTITY,
    "ORDINAL": ORDINAL,
    "CARDINAL": CARDINAL,
    "acomp": acomp,
    "advcl": advcl,
    "advmod": advmod,
    "agent": agent,
    "amod": amod,
    "appos": appos,
    "attr": attr,
    "aux": aux,
    "auxpass": auxpass,
    "cc": cc,
    "ccomp": ccomp,
    "complm": complm,
    "conj": conj,
    "csubj": csubj,
    "csubjpass": csubjpass,
    "dep": dep,
    "det": det,
    "dobj": dobj,
    "expl": expl,
    "hmod": hmod,
    "hyph": hyph,
    "infmod": infmod,
    "intj": intj,
    "iobj": iobj,
    "mark": mark,
    "meta": meta,
    "neg": neg,
    "nmod": nmod,
    "nn": nn,
    "npadvmod": npadvmod,
    "nsubj": nsubj,
    "nsubjpass": nsubjpass,
    "num": num,
    "number": number,
    "oprd": oprd,
    "parataxis": parataxis,
    "partmod": partmod,
    "pcomp": pcomp,
    "pobj": pobj,
    "poss": poss,
    "possessive": possessive,
    "preconj": preconj,
    "prep": prep,
    "prt": prt,
    "punct": punct,
    "quantmod": quantmod,
    "rcmod": rcmod,
    "root": root,
    "xcomp": xcomp
 }
 # Symbol names taken from the keys of IDS, ordered by ascending ID value.
 NAMES = [name for name, _ in sorted(IDS.items(), key=lambda entry: entry[1])]
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@ -83,7 +83,6 @@ cdef class Parser:
        model = Model(moves.n_moves, templates, model_dir)
        return cls(strings, moves, model)
    def __call__(self, Doc tokens):
        cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
        self.moves.initialize_state(stcls)
@ -93,6 +92,9 @@ cdef class Parser:
        self.parse(stcls, eg.c)
        tokens.set_parse(stcls._sent)
    def __reduce__(self):
        '''Pickle support: describe this parser as Parser(strings, moves, model).

        The string table is taken from the transition system (self.moves).
        '''
        ctor_args = (self.moves.strings, self.moves, self.model)
        return (Parser, ctor_args, None, None)
    cdef void predict(self, StateClass stcls, ExampleC* eg) nogil:
        memset(eg.scores, 0, eg.nr_class * sizeof(weight_t))
        self.moves.set_valid(eg.is_valid, stcls)
--- a/spacy/syntax/transition_system.pxd
+++ b/spacy/syntax/transition_system.pxd
@ -37,6 +37,8 @@ cdef class TransitionSystem:
    cdef public int root_label
    cdef public freqs
    cdef object _labels_by_action
    cdef int initialize_state(self, StateClass state) except -1
    cdef int finalize_state(self, StateClass state) nogil
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@ -15,7 +15,8 @@ class OracleError(Exception):
 cdef class TransitionSystem:
-    def __init__(self, StringStore string_table, dict labels_by_action):
+    def __init__(self, StringStore string_table, dict labels_by_action, _freqs=None):
        self._labels_by_action = labels_by_action
        self.mem = Pool()
        self.n_moves = sum(len(labels) for labels in labels_by_action.values())
        self._is_valid = <bint*>self.mem.alloc(self.n_moves, sizeof(bint))
@ -30,7 +31,7 @@ cdef class TransitionSystem:
                i += 1
        self.c = moves
        self.root_label = self.strings['ROOT']
-        self.freqs = {}
+        self.freqs = {} if _freqs is None else _freqs
        for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB):
            self.freqs[attr] = defaultdict(int)
            self.freqs[attr][0] = 1
@ -39,6 +40,11 @@ cdef class TransitionSystem:
            self.freqs[HEAD][i] = 1
            self.freqs[HEAD][-i] = 1
    def __reduce__(self):
        '''Pickle support: rebuild via the instance's own class.

        The constructor arguments are the string table, the labels-by-action
        dict kept from __init__, and the current freqs table.
        '''
        # NOTE(review): the __init__ lines visible in this change assign
        # _freqs and then reset self.freqs[attr] for several attrs -- confirm
        # the counts passed here actually survive the round trip.
        rebuild_args = (self.strings, self._labels_by_action, self.freqs)
        return (self.__class__, rebuild_args, None, None)
    cdef int initialize_state(self, StateClass state) except -1:
        # No-op in this class: the given state is left untouched.
        # The `except -1` clause reserves -1 as the error return so Cython can
        # propagate an exception raised by an implementation of this hook.
        # Presumably subclasses override this to prepare a fresh parse state
        # -- TODO confirm against the concrete transition systems.
        pass
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@ -148,6 +148,9 @@ cdef class Tagger:
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length
    def __reduce__(self):
        '''Pickle support: rebuild as type(self)(vocab, model).'''
        ctor_args = (self.vocab, self.model)
        return (self.__class__, ctor_args, None, None)
    def tag_from_strings(self, Doc tokens, object tag_strs):
        cdef int i
        for i in range(tokens.length):
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -14,7 +14,6 @@ from ..typedefs cimport attr_t, flags_t
 from ..attrs cimport attr_id_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
 from ..parts_of_speech import UNIV_POS_NAMES
 from ..parts_of_speech cimport CONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -9,7 +9,7 @@ import numpy
 from ..lexeme cimport Lexeme
-from ..parts_of_speech import UNIV_POS_NAMES
+from .. import parts_of_speech
 from ..attrs cimport LEMMA
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
@ -318,7 +318,7 @@ cdef class Token:
    property pos_:
        def __get__(self):
-            return _pos_id_to_string[self.c.pos]
+            return parts_of_speech.NAMES[self.c.pos]
    property tag_:
        def __get__(self):
@ -363,6 +363,3 @@ cdef class Token:
    property like_email:
        def __get__(self):
            # Result of checking the LIKE_EMAIL flag on this token's lexeme.
            return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
 _pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@ -25,7 +25,6 @@ cdef struct _Cached:
 cdef class Vocab:
    cpdef public lexeme_props_getter
    cdef Pool mem
    cpdef readonly StringStore strings
    cpdef readonly Morphology morphology
@ -33,7 +32,6 @@ cdef class Vocab:
    cdef public object _serializer
    cdef public object data_dir
    cdef public object get_lex_attr
    cdef public object pos_tags
    cdef public object serializer_freqs
    cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -10,6 +10,8 @@ from os import path
 import io
 import math
 import json
 import tempfile
 import copy_reg
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport Lexeme
@ -19,6 +21,9 @@ from .typedefs cimport attr_t
 from .cfile cimport CFile
 from .lemmatizer import Lemmatizer
 from . import attrs
 from . import symbols
 from cymem.cymem cimport Address
 from . import util
 from .serialize.packer cimport Packer
@ -67,6 +72,14 @@ cdef class Vocab:
        self._by_hash = PreshMap()
        self._by_orth = PreshMap()
        self.strings = StringStore()
        # Load strings in a special order, so that we have an onset number for
        # the vocabulary. This way, when words are added in order, the orth ID
        # is the frequency rank of the word, plus a certain offset. The structural
        # strings are loaded first, because the vocab is open-class, and these
        # symbols are closed class.
        for name in symbols.NAMES + list(sorted(tag_map.keys())):
            if name:
                _ = self.strings[name]
        self.get_lex_attr = get_lex_attr
        self.morphology = Morphology(self.strings, tag_map, lemmatizer)
        self.serializer_freqs = serializer_freqs
@ -85,6 +98,20 @@ cdef class Vocab:
        """The current number of lexemes stored."""
        return self.length
    def __reduce__(self):
        """Pickle support: dump to a temporary directory, rebuild from paths.

        The lexemes and the string table are written into a freshly created
        temporary directory; the pickle itself only carries those file
        locations plus the morphology, lexical-attribute getters, serializer
        frequencies and data_dir. The temporary directory is not removed here.
        """
        # TODO: Dump vectors
        scratch_dir = tempfile.mkdtemp()
        lexemes_path = path.join(scratch_dir, 'lexemes.bin')
        strings_path = path.join(scratch_dir, 'strings.txt')
        # Vectors are not written out; only the location inside data_dir is
        # recorded, and only when a data_dir is known.
        if self.data_dir is None:
            vectors_path = None
        else:
            vectors_path = path.join(self.data_dir, 'vec.bin')
        self.dump(lexemes_path)
        self.strings.dump(strings_path)
        rebuild_args = (strings_path, lexemes_path, vectors_path,
                        self.morphology, self.get_lex_attr,
                        self.serializer_freqs, self.data_dir)
        return (unpickle_vocab, rebuild_args, None, None)
    cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
        '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
        if necessary, using memory acquired from the given pool.  If the pool
@ -260,17 +287,17 @@ cdef class Vocab:
            i += 1
        fp.close()
-    def load_vectors(self, loc_or_file):
+    def load_vectors(self, file_):
        cdef LexemeC* lexeme
        cdef attr_t orth
        cdef int32_t vec_len = -1
-        for line_num, line in enumerate(loc_or_file):
+        for line_num, line in enumerate(file_):
            pieces = line.split()
            word_str = pieces.pop(0)
            if vec_len == -1:
                vec_len = len(pieces)
            elif vec_len != len(pieces):
-                raise VectorReadError.mismatched_sizes(loc_or_file, line_num,
+                raise VectorReadError.mismatched_sizes(file_, line_num,
                                                        vec_len, len(pieces))
            orth = self.strings[word_str]
            lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
@ -328,6 +355,25 @@ cdef class Vocab:
        return vec_len
 def unpickle_vocab(strings_loc, lex_loc, vec_loc, morphology, get_lex_attr,
                    serializer_freqs, data_dir):
    """Rebuild a Vocab from the arguments emitted by Vocab.__reduce__.

    A default Vocab is created, its attributes are overwritten with the
    pickled ones (the string store is taken from `morphology`), and the
    lexemes are then loaded from `strings_loc` and `lex_loc`. Vectors are
    loaded from `vec_loc` only when it is not None.
    """
    cdef Vocab restored = Vocab()
    restored.get_lex_attr = get_lex_attr
    restored.morphology = morphology
    # Share the morphology's string store rather than the fresh default one.
    restored.strings = morphology.strings
    restored.data_dir = data_dir
    restored.serializer_freqs = serializer_freqs
    restored.load_lexemes(strings_loc, lex_loc)
    if vec_loc is None:
        return restored
    restored.load_vectors_from_bin_loc(vec_loc)
    return restored
 # Declare unpickle_vocab to the pickle machinery as a valid constructor.
 copy_reg.constructor(unpickle_vocab)
 def write_binary_vectors(in_loc, out_loc):
    cdef CFile out_file = CFile(out_loc, 'wb')
    cdef Address mem
--- a/tests/morphology/test_pickle.py
+++ b/tests/morphology/test_pickle.py
@ -0,0 +1,17 @@
 import pytest
 import pickle
 import StringIO
 from spacy.morphology import Morphology
 from spacy.lemmatizer import Lemmatizer
 from spacy.strings import StringStore
 def test_pickle():
    """Smoke test: a Morphology built from empty resources can be dumped
    with pickle without raising."""
    strings = StringStore()
    lemmatizer = Lemmatizer({}, {}, {})
    morphology = Morphology(strings, {}, lemmatizer)
    out_file = StringIO.StringIO()
    pickle.dump(morphology, out_file)
--- a/tests/parser/test_pickle.py
+++ b/tests/parser/test_pickle.py
@ -0,0 +1,16 @@
 import pytest
 import pickle
 import cloudpickle
 import StringIO
@pytest.mark.models
 def test_pickle(EN):
    file_ = StringIO.StringIO()
    cloudpickle.dump(EN.parser, file_)
    file_.seek(0)
    loaded = pickle.load(file_)
--- a/tests/tagger/test_lemmatizer.py
+++ b/tests/tagger/test_lemmatizer.py
@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 import StringIO
 import pickle
 from spacy.lemmatizer import Lemmatizer, read_index, read_exc
 from spacy.en import LOCAL_DATA_DIR
@ -41,3 +43,12 @@ def test_smart_quotes(lemmatizer):
    do = lemmatizer.punct
    assert do('“') == set(['"'])
    assert do('“') == set(['"'])
 def test_pickle_lemmatizer(lemmatizer):
    """Check that the lemmatizer fixture can be pickled to an in-memory file
    and unpickled again without raising."""
    stream = StringIO.StringIO()
    pickle.dump(lemmatizer, stream)
    # Rewind before reading back what was just written.
    stream.seek(0)
    restored = pickle.load(stream)
--- a/tests/test_pickle.py
+++ b/tests/test_pickle.py
@ -0,0 +1,15 @@
 import pytest
 import StringIO
 import cloudpickle
 import pickle
@pytest.mark.models
 def test_pickle_english(EN):
    file_ = StringIO.StringIO()
    cloudpickle.dump(EN, file_)
    file_.seek(0)
    loaded = pickle.load(file_)
--- a/tests/vocab/test_intern.py
+++ b/tests/vocab/test_intern.py
@ -1,5 +1,7 @@
 # -*- coding: utf8 -*-
 from __future__ import unicode_literals
 import pickle
 import StringIO
 from spacy.strings import StringStore
@ -76,3 +78,18 @@ def test_massive_strings(sstore):
    s513 = '1' * 513
    orth = sstore[s513]
    assert sstore[orth] == s513
 def test_pickle_string_store(sstore):
    """An ID handed out before pickling must map back to the same string in
    the unpickled store."""
    # Intern the string first so that its ID is part of the pickled state.
    hi_id = sstore[u'Hi']
    stream = StringIO.StringIO()
    pickle.dump(sstore, stream)
    stream.seek(0)
    restored = pickle.load(stream)
    assert restored[hi_id] == u'Hi'
--- a/tests/vocab/test_vocab.py
+++ b/tests/vocab/test_vocab.py
@ -1,5 +1,11 @@
 from __future__ import unicode_literals
 import pytest
 import StringIO
 import cloudpickle
 import pickle
 from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA
 from spacy.parts_of_speech import NOUN, VERB
 def test_neq(en_vocab):
@ -25,3 +31,21 @@ def test_punct_neq(en_vocab):
 def test_shape_attr(en_vocab):
    """The orth and shape attributes of the lexeme for 'example' must not be
    equal."""
    lexeme = en_vocab['example']
    orth, shape = lexeme.orth, lexeme.shape
    assert orth != shape
 def test_symbols(en_vocab):
    """Looking up the name of a built-in attribute or part-of-speech symbol
    in the vocab's string store must yield the matching constant."""
    expected = (
        ('IS_ALPHA', IS_ALPHA),
        ('NOUN', NOUN),
        ('VERB', VERB),
        ('LEMMA', LEMMA),
        ('ORTH', ORTH),
        ('PROB', PROB),
    )
    for name, symbol_id in expected:
        assert en_vocab.strings[name] == symbol_id
 def test_pickle_vocab(en_vocab):
    """Round-trip the vocab fixture: cloudpickle writes it to an in-memory
    file and pickle reads it back."""
    stream = StringIO.StringIO()
    cloudpickle.dump(en_vocab, stream)
    # Rewind so that loading starts from the first byte written.
    stream.seek(0)
    restored = pickle.load(stream)
--- a/tests/website/conftest.py
+++ b/tests/website/conftest.py
@ -1,11 +1,13 @@
 from __future__ import unicode_literals
 import pytest
 import os
@pytest.fixture(scope='session')
 def nlp():
-    from spacy.en import English
+    from spacy.en import English, LOCAL_DATA_DIR
-    return English()
+    data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
    return English(data_dir=data_dir)
@pytest.fixture()
--- a/tests/website/test_home.py
+++ b/tests/website/test_home.py
@ -1,6 +1,7 @@
 from __future__ import unicode_literals
 import pytest
 import spacy
 import os
@pytest.fixture()
@ -9,8 +10,9 @@ def token(doc):
 def test_load_resources_and_process_text():
-    from spacy.en import English
+    from spacy.en import English, LOCAL_DATA_DIR
-    nlp = English()
+    data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
    nlp = English(data_dir=data_dir)
    doc = nlp('Hello, world. Here are two sentences.')