Merge branch 'attrs'

This commit is contained in:
Matthew Honnibal 2015-10-13 05:03:25 +02:00
commit 41cbbdefe3
19 changed files with 1500 additions and 742 deletions

View File

@ -42,7 +42,10 @@ import spacy.de
import spacy.fi
import spacy.it
# Py2/Py3 compatibility shim: Python 3 has no builtin `unicode`,
# so alias it to `str` there. On Python 2 the name already exists
# and the `try` succeeds, leaving the builtin untouched.
try:
    unicode
except NameError:
    unicode = str
def setup_tokenizer(lang_data_dir, tok_dir):
@ -112,8 +115,12 @@ def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
total += freq
counts.smooth()
log_total = math.log(total)
if str(loc).endswith('gz'):
file_ = gzip.open(str(loc))
else:
file_ = loc.open()
probs = {}
for line in loc.open():
for line in file_:
freq, doc_freq, key = line.split('\t', 2)
doc_freq = int(doc_freq)
freq = int(freq)
@ -158,7 +165,7 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
clusters = _read_clusters(src_dir / 'clusters.txt')
probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
if not probs:
probs, oov_prob = _read_freqs(src_dir / 'freqs.txt')
probs, oov_prob = _read_freqs(src_dir / 'freqs.txt.gz')
if not probs:
oov_prob = -20
else:
@ -168,6 +175,11 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
probs[word] = oov_prob
lexicon = []
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
# First encode the strings into the StringStore. This way, we can map
# the orth IDs to frequency ranks
orth = vocab.strings[word]
# Now actually load the vocab
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
lexeme = vocab[word]
lexeme.prob = prob

View File

@ -56,5 +56,4 @@
"was": {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"},
"were": {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"}
}
}

View File

@ -22,7 +22,7 @@
"JJS": {"pos": "adj", "degree": "sup"},
"LS": {"pos": "punct", "numtype": "ord"},
"MD": {"pos": "verb", "verbtype": "mod"},
"NIL": {"pos": "no_tag"},
"NIL": {"pos": ""},
"NN": {"pos": "noun", "number": "sing"},
"NNP": {"pos": "noun", "nountype": "prop", "number": "sing"},
"NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"},

View File

@ -166,7 +166,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
'spacy.cfile', 'spacy.matcher',
'spacy.syntax.ner']
'spacy.syntax.ner',
'spacy.symbols']
if __name__ == '__main__':

View File

@ -1,5 +1,6 @@
# Reserve 64 values for flag features
cpdef enum attr_id_t:
NULL_ATTR
IS_ALPHA
IS_ASCII
IS_DIGIT
@ -14,8 +15,7 @@ cpdef enum attr_id_t:
IS_STOP
IS_OOV
FLAG13 = 13
FLAG14
FLAG14 = 14
FLAG15
FLAG16
FLAG17

View File

@ -0,0 +1,90 @@
# Maps attribute-name strings to their attr_id_t enum values (the enum is
# presumably provided by the matching attrs.pxd in this Cython module pair
# — confirm). The empty string maps to NULL_ATTR so that "no attribute"
# round-trips through NAMES[0].
# NOTE(review): there is no "FLAG13" entry here even though the .pxd hunk
# shows a FLAG13 symbol — confirm this table is meant to start at FLAG14
# and stays in sync with the enum.
IDS = {
    "": NULL_ATTR,
    "IS_ALPHA": IS_ALPHA,
    "IS_ASCII": IS_ASCII,
    "IS_DIGIT": IS_DIGIT,
    "IS_LOWER": IS_LOWER,
    "IS_PUNCT": IS_PUNCT,
    "IS_SPACE": IS_SPACE,
    "IS_TITLE": IS_TITLE,
    "IS_UPPER": IS_UPPER,
    "LIKE_URL": LIKE_URL,
    "LIKE_NUM": LIKE_NUM,
    "LIKE_EMAIL": LIKE_EMAIL,
    "IS_STOP": IS_STOP,
    "IS_OOV": IS_OOV,
    "FLAG14": FLAG14,
    "FLAG15": FLAG15,
    "FLAG16": FLAG16,
    "FLAG17": FLAG17,
    "FLAG18": FLAG18,
    "FLAG19": FLAG19,
    "FLAG20": FLAG20,
    "FLAG21": FLAG21,
    "FLAG22": FLAG22,
    "FLAG23": FLAG23,
    "FLAG24": FLAG24,
    "FLAG25": FLAG25,
    "FLAG26": FLAG26,
    "FLAG27": FLAG27,
    "FLAG28": FLAG28,
    "FLAG29": FLAG29,
    "FLAG30": FLAG30,
    "FLAG31": FLAG31,
    "FLAG32": FLAG32,
    "FLAG33": FLAG33,
    "FLAG34": FLAG34,
    "FLAG35": FLAG35,
    "FLAG36": FLAG36,
    "FLAG37": FLAG37,
    "FLAG38": FLAG38,
    "FLAG39": FLAG39,
    "FLAG40": FLAG40,
    "FLAG41": FLAG41,
    "FLAG42": FLAG42,
    "FLAG43": FLAG43,
    "FLAG44": FLAG44,
    "FLAG45": FLAG45,
    "FLAG46": FLAG46,
    "FLAG47": FLAG47,
    "FLAG48": FLAG48,
    "FLAG49": FLAG49,
    "FLAG50": FLAG50,
    "FLAG51": FLAG51,
    "FLAG52": FLAG52,
    "FLAG53": FLAG53,
    "FLAG54": FLAG54,
    "FLAG55": FLAG55,
    "FLAG56": FLAG56,
    "FLAG57": FLAG57,
    "FLAG58": FLAG58,
    "FLAG59": FLAG59,
    "FLAG60": FLAG60,
    "FLAG61": FLAG61,
    "FLAG62": FLAG62,
    "FLAG63": FLAG63,
    "ID": ID,
    "ORTH": ORTH,
    "LOWER": LOWER,
    "NORM": NORM,
    "SHAPE": SHAPE,
    "PREFIX": PREFIX,
    "SUFFIX": SUFFIX,
    "LENGTH": LENGTH,
    "CLUSTER": CLUSTER,
    "LEMMA": LEMMA,
    "POS": POS,
    "TAG": TAG,
    "DEP": DEP,
    "ENT_IOB": ENT_IOB,
    "ENT_TYPE": ENT_TYPE,
    "HEAD": HEAD,
    "SPACY": SPACY,
    "PROB": PROB,
}

# ATTR IDs, in order of the symbol
# (i.e. NAMES[attr_id] gives the string name for that attribute ID;
# this assumes the enum values are a dense 0..N-1 range — TODO confirm).
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]

View File

@ -15,7 +15,7 @@ from libcpp.vector cimport vector
from murmurhash.mrmr cimport hash64
from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
from .tokens.doc cimport get_token_attr
from .tokens.doc cimport Doc
from .vocab cimport Vocab

View File

@ -7,6 +7,7 @@ from .strings cimport StringStore
from .typedefs cimport attr_t
from .parts_of_speech cimport univ_pos_t
from . cimport symbols
cdef struct RichTagC:
uint64_t morph
@ -36,720 +37,252 @@ cdef class Morphology:
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
cpdef enum univ_morph_t:
NIL = 0
Animacy_anim = symbols.Animacy_anim
Animacy_inam
Aspect_freq
Aspect_imp
Aspect_mod
Aspect_none
Aspect_perf
Case_abe
Case_abl
Case_abs
Case_acc
Case_ade
Case_all
Case_cau
Case_com
Case_dat
Case_del
Case_dis
Case_ela
Case_ess
Case_gen
Case_ill
Case_ine
Case_ins
Case_loc
Case_lat
Case_nom
Case_par
Case_sub
Case_sup
Case_tem
Case_ter
Case_tra
Case_voc
Definite_two
Definite_def
Definite_red
Definite_ind
Degree_cmp
Degree_comp
Degree_none
Degree_pos
Degree_sup
Degree_abs
Degree_com
Degree_dim # du
Gender_com
Gender_fem
Gender_masc
Gender_neut
Mood_cnd
Mood_imp
Mood_ind
Mood_n
Mood_pot
Mood_sub
Mood_opt
Negative_neg
Negative_pos
Negative_yes
Number_com
Number_dual
Number_none
Number_plur
Number_sing
Number_ptan # bg
Number_count # bg
NumType_card
NumType_dist
NumType_frac
NumType_gen
NumType_mult
NumType_none
NumType_ord
NumType_sets
Person_one
Person_two
Person_three
Person_none
Poss_yes
PronType_advPart
PronType_art
PronType_default
PronType_dem
PronType_ind
PronType_int
PronType_neg
PronType_prs
PronType_rcp
PronType_rel
PronType_tot
PronType_clit
PronType_exc # es, ca, it, fa
Reflex_yes
Tense_fut
Tense_imp
Tense_past
Tense_pres
VerbForm_fin
VerbForm_ger
VerbForm_inf
VerbForm_none
VerbForm_part
VerbForm_partFut
VerbForm_partPast
VerbForm_partPres
VerbForm_sup
VerbForm_trans
VerbForm_gdv # la
Voice_act
Voice_cau
Voice_pass
Voice_mid # gkc
Voice_int # hb
Abbr_yes # cz, fi, sl, U
AdpType_prep # cz, U
AdpType_post # U
AdpType_voc # cz
AdpType_comprep # cz
AdpType_circ # U
AdvType_man
AdvType_loc
AdvType_tim
AdvType_deg
AdvType_cau
AdvType_mod
AdvType_sta
AdvType_ex
AdvType_adadj
ConjType_oper # cz, U
ConjType_comp # cz, U
Connegative_yes # fi
Derivation_minen # fi
Derivation_sti # fi
Derivation_inen # fi
Derivation_lainen # fi
Derivation_ja # fi
Derivation_ton # fi
Derivation_vs # fi
Derivation_ttain # fi
Derivation_ttaa # fi
Echo_rdp # U
Echo_ech # U
Foreign_foreign # cz, fi, U
Foreign_fscript # cz, fi, U
Foreign_tscript # cz, U
Foreign_yes # sl
Gender_dat_masc # bq, U
Gender_dat_fem # bq, U
Gender_erg_masc # bq
Gender_erg_fem # bq
Gender_psor_masc # cz, sl, U
Gender_psor_fem # cz, sl, U
Gender_psor_neut # sl
Hyph_yes # cz, U
InfForm_one # fi
InfForm_two # fi
InfForm_three # fi
NameType_geo # U, cz
NameType_prs # U, cz
NameType_giv # U, cz
NameType_sur # U, cz
NameType_nat # U, cz
NameType_com # U, cz
NameType_pro # U, cz
NameType_oth # U, cz
NounType_com # U
NounType_prop # U
NounType_class # U
Number_abs_sing # bq, U
Number_abs_plur # bq, U
Number_dat_sing # bq, U
Number_dat_plur # bq, U
Number_erg_sing # bq, U
Number_erg_plur # bq, U
Number_psee_sing # U
Number_psee_plur # U
Number_psor_sing # cz, fi, sl, U
Number_psor_plur # cz, fi, sl, U
NumForm_digit # cz, sl, U
NumForm_roman # cz, sl, U
NumForm_word # cz, sl, U
NumValue_one # cz, U
NumValue_two # cz, U
NumValue_three # cz, U
PartForm_pres # fi
PartForm_past # fi
PartForm_agt # fi
PartForm_neg # fi
PartType_mod # U
PartType_emp # U
PartType_res # U
PartType_inf # U
PartType_vbp # U
Person_abs_one # bq, U
Person_abs_two # bq, U
Person_abs_three # bq, U
Person_dat_one # bq, U
Person_dat_two # bq, U
Person_dat_three # bq, U
Person_erg_one # bq, U
Person_erg_two # bq, U
Person_erg_three # bq, U
Person_psor_one # fi, U
Person_psor_two # fi, U
Person_psor_three # fi, U
Polite_inf # bq, U
Polite_pol # bq, U
Polite_abs_inf # bq, U
Polite_abs_pol # bq, U
Polite_erg_inf # bq, U
Polite_erg_pol # bq, U
Polite_dat_inf # bq, U
Polite_dat_pol # bq, U
Prefix_yes # U
PrepCase_npr # cz
PrepCase_pre # U
PunctSide_ini # U
PunctSide_fin # U
PunctType_peri # U
PunctType_qest # U
PunctType_excl # U
PunctType_quot # U
PunctType_brck # U
PunctType_comm # U
PunctType_colo # U
PunctType_semi # U
PunctType_dash # U
Style_arch # cz, fi, U
Style_rare # cz, fi, U
Style_poet # cz, U
Style_norm # cz, U
Style_coll # cz, U
Style_vrnc # cz, U
Style_sing # cz, U
Style_expr # cz, U
Style_derg # cz, U
Style_vulg # cz, U
Style_yes # fi, U
StyleVariant_styleShort # cz
StyleVariant_styleBound # cz, sl
VerbType_aux # U
VerbType_cop # U
VerbType_mod # U
VerbType_light # U
#
#cpdef enum Feature_t:
# Abbr
# AdpType
# AdvType
# ConjType
# Connegative
# Derivation
# Echo
# Foreign
# Gender_dat
# Gender_erg
# Gender_psor
# Hyph
# InfForm
# NameType
# NounType
# NumberAbs
# NumberDat
# NumberErg
# NumberPsee
# NumberPsor
# NumForm
# NumValue
# PartForm
# PartType
# Person_abs
# Person_dat
# Person_psor
# Polite
# Polite_abs
# Polite_dat
# Prefix
# PrepCase
# PunctSide
# PunctType
# Style
# Typo
# Variant
# VerbType
#
#
#cpdef enum Animacy:
# Anim
# Inam
#
#
#cpdef enum Aspect:
# Freq
# Imp
# Mod
# None_
# Perf
#
#
#cpdef enum Case1:
# Nom
# Gen
# Acc
# Dat
# Voc
# Abl
#
#cdef enum Case2:
# Abe
# Abs
# Ade
# All
# Cau
# Com
# Del
# Dis
#
#cdef enum Case3:
# Ela
# Ess
# Ill
# Ine
# Ins
# Loc
# Lat
# Par
#
#cdef enum Case4:
# Sub
# Sup
# Tem
# Ter
# Tra
#
#
#cpdef enum Definite:
# Two
# Def
# Red
# Ind
#
#
#cpdef enum Degree:
# Cmp
# Comp
# None_
# Pos
# Sup
# Abs
# Com
# Degree # du
#
#
#cpdef enum Gender:
# Com
# Fem
# Masc
# Neut
#
#
#cpdef enum Mood:
# Cnd
# Imp
# Ind
# N
# Pot
# Sub
# Opt
#
#
#cpdef enum Negative:
# Neg
# Pos
# Yes
#
#
#cpdef enum Number:
# Com
# Dual
# None_
# Plur
# Sing
# Ptan # bg
# Count # bg
#
#
#cpdef enum NumType:
# Card
# Dist
# Frac
# Gen
# Mult
# None_
# Ord
# Sets
#
#
#cpdef enum Person:
# One
# Two
# Three
# None_
#
#
#cpdef enum Poss:
# Yes
#
#
#cpdef enum PronType1:
# AdvPart
# Art
# Default
# Dem
# Ind
# Int
# Neg
#
#cpdef enum PronType2:
# Prs
# Rcp
# Rel
# Tot
# Clit
# Exc # es, ca, it, fa
# Clit # it
#
#
#cpdef enum Reflex:
# Yes
#
#
#cpdef enum Tense:
# Fut
# Imp
# Past
# Pres
#
#cpdef enum VerbForm1:
# Fin
# Ger
# Inf
# None_
# Part
# PartFut
# PartPast
#
#cpdef enum VerbForm2:
# PartPres
# Sup
# Trans
# Gdv # la
#
#
#cpdef enum Voice:
# Act
# Cau
# Pass
# Mid # gkc
# Int # hb
#
#
#cpdef enum Abbr:
# Yes # cz, fi, sl, U
#
#cpdef enum AdpType:
# Prep # cz, U
# Post # U
# Voc # cz
# Comprep # cz
# Circ # U
# Voc # U
#
#
#cpdef enum AdvType1:
# # U
# Man
# Loc
# Tim
# Deg
# Cau
# Mod
# Sta
# Ex
#
#cpdef enum AdvType2:
# Adadj
#
#cpdef enum ConjType:
# Oper # cz, U
# Comp # cz, U
#
#cpdef enum Connegative:
# Yes # fi
#
#
#cpdef enum Derivation1:
# Minen # fi
# Sti # fi
# Inen # fi
# Lainen # fi
# Ja # fi
# Ton # fi
# Vs # fi
# Ttain # fi
#
#cpdef enum Derivation2:
# Ttaa
#
#
#cpdef enum Echo:
# Rdp # U
# Ech # U
#
#
#cpdef enum Foreign:
# Foreign # cz, fi, U
# Fscript # cz, fi, U
# Tscript # cz, U
# Yes # sl
#
#
#cpdef enum Gender_dat:
# Masc # bq, U
# Fem # bq, U
#
#
#cpdef enum Gender_erg:
# Masc # bq
# Fem # bq
#
#
#cpdef enum Gender_psor:
# Masc # cz, sl, U
# Fem # cz, sl, U
# Neut # sl
#
#
#cpdef enum Hyph:
# Yes # cz, U
#
#
#cpdef enum InfForm:
# One # fi
# Two # fi
# Three # fi
#
#
#cpdef enum NameType:
# Geo # U, cz
# Prs # U, cz
# Giv # U, cz
# Sur # U, cz
# Nat # U, cz
# Com # U, cz
# Pro # U, cz
# Oth # U, cz
#
#
#cpdef enum NounType:
# Com # U
# Prop # U
# Class # U
#
#cpdef enum Number_abs:
# Sing # bq, U
# Plur # bq, U
#
#cpdef enum Number_dat:
# Sing # bq, U
# Plur # bq, U
#
#cpdef enum Number_erg:
# Sing # bq, U
# Plur # bq, U
#
#cpdef enum Number_psee:
# Sing # U
# Plur # U
#
#
#cpdef enum Number_psor:
# Sing # cz, fi, sl, U
# Plur # cz, fi, sl, U
#
#
#cpdef enum NumForm:
# Digit # cz, sl, U
# Roman # cz, sl, U
# Word # cz, sl, U
#
#
#cpdef enum NumValue:
# One # cz, U
# Two # cz, U
# Three # cz, U
#
#
#cpdef enum PartForm:
# Pres # fi
# Past # fi
# Agt # fi
# Neg # fi
#
#
#cpdef enum PartType:
# Mod # U
# Emp # U
# Res # U
# Inf # U
# Vbp # U
#
#cpdef enum Person_abs:
# One # bq, U
# Two # bq, U
# Three # bq, U
#
#
#cpdef enum Person_dat:
# One # bq, U
# Two # bq, U
# Three # bq, U
#
#
#cpdef enum Person_erg:
# One # bq, U
# Two # bq, U
# Three # bq, U
#
#
#cpdef enum Person_psor:
# One # fi, U
# Two # fi, U
# Three # fi, U
#
#
#cpdef enum Polite:
# Inf # bq, U
# Pol # bq, U
#
#
#cpdef enum Polite_abs:
# Inf # bq, U
# Pol # bq, U
#
#
#cpdef enum Polite_erg:
# Inf # bq, U
# Pol # bq, U
#
#
#cpdef enum Polite_dat:
# Inf # bq, U
# Pol # bq, U
#
#
#cpdef enum Prefix:
# Yes # U
#
#
#cpdef enum PrepCase:
# Npr # cz
# Pre # U
#
#
#cpdef enum PunctSide:
# Ini # U
# Fin # U
#
#cpdef enum PunctType1:
# Peri # U
# Qest # U
# Excl # U
# Quot # U
# Brck # U
# Comm # U
# Colo # U
# Semi # U
#
#cpdef enum PunctType2:
# Dash # U
#
#
#cpdef enum Style1:
# Arch # cz, fi, U
# Rare # cz, fi, U
# Poet # cz, U
# Norm # cz, U
# Coll # cz, U
# Vrnc # cz, U
# Sing # cz, U
# Expr # cz, U
#
#
#cpdef enum Style2:
# Derg # cz, U
# Vulg # cz, U
#
#
#cpdef enum Typo:
# Yes # fi, U
#
#
#cpdef enum Variant:
# Short # cz
# Bound # cz, sl
#
#
#cpdef enum VerbType:
# Aux # U
# Cop # U
# Mod # U
# Light # U
#
cpdef enum Value_t:
Animacy_Anim
Animacy_Inam
Aspect_Freq
Aspect_Imp
Aspect_Mod
Aspect_None_
Aspect_Perf
Case_Abe
Case_Abl
Case_Abs
Case_Acc
Case_Ade
Case_All
Case_Cau
Case_Com
Case_Dat
Case_Del
Case_Dis
Case_Ela
Case_Ess
Case_Gen
Case_Ill
Case_Ine
Case_Ins
Case_Loc
Case_Lat
Case_Nom
Case_Par
Case_Sub
Case_Sup
Case_Tem
Case_Ter
Case_Tra
Case_Voc
Definite_Two
Definite_Def
Definite_Red
Definite_Ind
Degree_Cmp
Degree_Comp
Degree_None
Degree_Pos
Degree_Sup
Degree_Abs
Degree_Com
Degree_Dim # du
Gender_Com
Gender_Fem
Gender_Masc
Gender_Neut
Mood_Cnd
Mood_Imp
Mood_Ind
Mood_N
Mood_Pot
Mood_Sub
Mood_Opt
Negative_Neg
Negative_Pos
Negative_Yes
Number_Com
Number_Dual
Number_None
Number_Plur
Number_Sing
Number_Ptan # bg
Number_Count # bg
NumType_Card
NumType_Dist
NumType_Frac
NumType_Gen
NumType_Mult
NumType_None
NumType_Ord
NumType_Sets
Person_One
Person_Two
Person_Three
Person_None
Poss_Yes
PronType_AdvPart
PronType_Art
PronType_Default
PronType_Dem
PronType_Ind
PronType_Int
PronType_Neg
PronType_Prs
PronType_Rcp
PronType_Rel
PronType_Tot
PronType_Clit
PronType_Exc # es, ca, it, fa
Reflex_Yes
Tense_Fut
Tense_Imp
Tense_Past
Tense_Pres
VerbForm_Fin
VerbForm_Ger
VerbForm_Inf
VerbForm_None
VerbForm_Part
VerbForm_PartFut
VerbForm_PartPast
VerbForm_PartPres
VerbForm_Sup
VerbForm_Trans
VerbForm_Gdv # la
Voice_Act
Voice_Cau
Voice_Pass
Voice_Mid # gkc
Voice_Int # hb
Abbr_Yes # cz, fi, sl, U
AdpType_Prep # cz, U
AdpType_Post # U
AdpType_Voc # cz
AdpType_Comprep # cz
AdpType_Circ # U
AdvType_Man
AdvType_Loc
AdvType_Tim
AdvType_Deg
AdvType_Cau
AdvType_Mod
AdvType_Sta
AdvType_Ex
AdvType_Adadj
ConjType_Oper # cz, U
ConjType_Comp # cz, U
Connegative_Yes # fi
Derivation_Minen # fi
Derivation_Sti # fi
Derivation_Inen # fi
Derivation_Lainen # fi
Derivation_Ja # fi
Derivation_Ton # fi
Derivation_Vs # fi
Derivation_Ttain # fi
Derivation_Ttaa # fi
Echo_Rdp # U
Echo_Ech # U
Foreign_Foreign # cz, fi, U
Foreign_Fscript # cz, fi, U
Foreign_Tscript # cz, U
Foreign_Yes # sl
Gender_dat_Masc # bq, U
Gender_dat_Fem # bq, U
Gender_erg_Masc # bq
Gender_erg_Fem # bq
Gender_psor_Masc # cz, sl, U
Gender_psor_Fem # cz, sl, U
Gender_psor_Neut # sl
Hyph_Yes # cz, U
InfForm_One # fi
InfForm_Two # fi
InfForm_Three # fi
NameType_Geo # U, cz
NameType_Prs # U, cz
NameType_Giv # U, cz
NameType_Sur # U, cz
NameType_Nat # U, cz
NameType_Com # U, cz
NameType_Pro # U, cz
NameType_Oth # U, cz
NounType_Com # U
NounType_Prop # U
NounType_Class # U
Number_abs_Sing # bq, U
Number_abs_Plur # bq, U
Number_dat_Sing # bq, U
Number_dat_Plur # bq, U
Number_erg_Sing # bq, U
Number_erg_Plur # bq, U
Number_psee_Sing # U
Number_psee_Plur # U
Number_psor_Sing # cz, fi, sl, U
Number_psor_Plur # cz, fi, sl, U
NumForm_Digit # cz, sl, U
NumForm_Roman # cz, sl, U
NumForm_Word # cz, sl, U
NumValue_One # cz, U
NumValue_Two # cz, U
NumValue_Three # cz, U
PartForm_Pres # fi
PartForm_Past # fi
PartForm_Agt # fi
PartForm_Neg # fi
PartType_Mod # U
PartType_Emp # U
PartType_Res # U
PartType_Inf # U
PartType_Vbp # U
Person_abs_One # bq, U
Person_abs_Two # bq, U
Person_abs_Three # bq, U
Person_dat_One # bq, U
Person_dat_Two # bq, U
Person_dat_Three # bq, U
Person_erg_One # bq, U
Person_erg_Two # bq, U
Person_erg_Three # bq, U
Person_psor_One # fi, U
Person_psor_Two # fi, U
Person_psor_Three # fi, U
Polite_Inf # bq, U
Polite_Pol # bq, U
Polite_abs_Inf # bq, U
Polite_abs_Pol # bq, U
Polite_erg_Inf # bq, U
Polite_erg_Pol # bq, U
Polite_dat_Inf # bq, U
Polite_dat_Pol # bq, U
Prefix_Yes # U
PrepCase_Npr # cz
PrepCase_Pre # U
PunctSide_Ini # U
PunctSide_Fin # U
PunctType_Peri # U
PunctType_Qest # U
PunctType_Excl # U
PunctType_Quot # U
PunctType_Brck # U
PunctType_Comm # U
PunctType_Colo # U
PunctType_Semi # U
PunctType_Dash # U
Style_Arch # cz, fi, U
Style_Rare # cz, fi, U
Style_Poet # cz, U
Style_Norm # cz, U
Style_Coll # cz, U
Style_Vrnc # cz, U
Style_Sing # cz, U
Style_Expr # cz, U
Style_Derg # cz, U
Style_Vulg # cz, U
Style_Yes # fi, U
StyleVariant_StyleShort # cz
StyleVariant_StyleBound # cz, sl
VerbType_Aux # U
VerbType_Cop # U
VerbType_Mod # U
VerbType_Light # U

View File

@ -6,7 +6,7 @@ try:
except ImportError:
import json
from .parts_of_speech import UNIV_POS_NAMES
from .parts_of_speech import IDS as POS_IDS
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
@ -24,7 +24,7 @@ cdef class Morphology:
self.rich_tags[i].id = i
self.rich_tags[i].name = self.strings[tag_str]
self.rich_tags[i].morph = 0
self.rich_tags[i].pos = UNIV_POS_NAMES[props['pos'].upper()]
self.rich_tags[i].pos = POS_IDS[props['pos'].upper()]
self.reverse_index[self.rich_tags[i].name] = i
self._cache = PreshMapArray(self.n_tags)
@ -89,3 +89,254 @@ cdef class Morphology:
lemma_string = sorted(lemma_strings)[0]
lemma = self.strings[lemma_string]
return lemma
# Maps Universal Dependencies morphological feature strings (e.g. "Case_acc")
# to the corresponding `univ_morph_t` enum values (presumably declared in the
# matching morphology.pxd — confirm). The trailing "# xx" comments record the
# treebank/language codes each feature was taken from ("U" = universal).
#
# FIX(review): the keys were generated from the enum's commented source and
# many carried trailing whitespace (e.g. "Degree_dim ", "Abbr_yes "), so a
# lookup by the actual feature name raised KeyError. All keys are now
# normalised with no surrounding whitespace.
IDS = {
    "Animacy_anim": Animacy_anim,
    "Animacy_inam": Animacy_inam,
    "Aspect_freq": Aspect_freq,
    "Aspect_imp": Aspect_imp,
    "Aspect_mod": Aspect_mod,
    "Aspect_none": Aspect_none,
    "Aspect_perf": Aspect_perf,
    "Case_abe": Case_abe,
    "Case_abl": Case_abl,
    "Case_abs": Case_abs,
    "Case_acc": Case_acc,
    "Case_ade": Case_ade,
    "Case_all": Case_all,
    "Case_cau": Case_cau,
    "Case_com": Case_com,
    "Case_dat": Case_dat,
    "Case_del": Case_del,
    "Case_dis": Case_dis,
    "Case_ela": Case_ela,
    "Case_ess": Case_ess,
    "Case_gen": Case_gen,
    "Case_ill": Case_ill,
    "Case_ine": Case_ine,
    "Case_ins": Case_ins,
    "Case_loc": Case_loc,
    "Case_lat": Case_lat,
    "Case_nom": Case_nom,
    "Case_par": Case_par,
    "Case_sub": Case_sub,
    "Case_sup": Case_sup,
    "Case_tem": Case_tem,
    "Case_ter": Case_ter,
    "Case_tra": Case_tra,
    "Case_voc": Case_voc,
    "Definite_two": Definite_two,
    "Definite_def": Definite_def,
    "Definite_red": Definite_red,
    "Definite_ind": Definite_ind,
    "Degree_cmp": Degree_cmp,
    "Degree_comp": Degree_comp,
    "Degree_none": Degree_none,
    "Degree_pos": Degree_pos,
    "Degree_sup": Degree_sup,
    "Degree_abs": Degree_abs,
    "Degree_com": Degree_com,
    "Degree_dim": Degree_dim,  # du
    "Gender_com": Gender_com,
    "Gender_fem": Gender_fem,
    "Gender_masc": Gender_masc,
    "Gender_neut": Gender_neut,
    "Mood_cnd": Mood_cnd,
    "Mood_imp": Mood_imp,
    "Mood_ind": Mood_ind,
    "Mood_n": Mood_n,
    "Mood_pot": Mood_pot,
    "Mood_sub": Mood_sub,
    "Mood_opt": Mood_opt,
    "Negative_neg": Negative_neg,
    "Negative_pos": Negative_pos,
    "Negative_yes": Negative_yes,
    "Number_com": Number_com,
    "Number_dual": Number_dual,
    "Number_none": Number_none,
    "Number_plur": Number_plur,
    "Number_sing": Number_sing,
    "Number_ptan": Number_ptan,  # bg
    "Number_count": Number_count,  # bg
    "NumType_card": NumType_card,
    "NumType_dist": NumType_dist,
    "NumType_frac": NumType_frac,
    "NumType_gen": NumType_gen,
    "NumType_mult": NumType_mult,
    "NumType_none": NumType_none,
    "NumType_ord": NumType_ord,
    "NumType_sets": NumType_sets,
    "Person_one": Person_one,
    "Person_two": Person_two,
    "Person_three": Person_three,
    "Person_none": Person_none,
    "Poss_yes": Poss_yes,
    "PronType_advPart": PronType_advPart,
    "PronType_art": PronType_art,
    "PronType_default": PronType_default,
    "PronType_dem": PronType_dem,
    "PronType_ind": PronType_ind,
    "PronType_int": PronType_int,
    "PronType_neg": PronType_neg,
    "PronType_prs": PronType_prs,
    "PronType_rcp": PronType_rcp,
    "PronType_rel": PronType_rel,
    "PronType_tot": PronType_tot,
    "PronType_clit": PronType_clit,
    "PronType_exc": PronType_exc,  # es, ca, it, fa
    "Reflex_yes": Reflex_yes,
    "Tense_fut": Tense_fut,
    "Tense_imp": Tense_imp,
    "Tense_past": Tense_past,
    "Tense_pres": Tense_pres,
    "VerbForm_fin": VerbForm_fin,
    "VerbForm_ger": VerbForm_ger,
    "VerbForm_inf": VerbForm_inf,
    "VerbForm_none": VerbForm_none,
    "VerbForm_part": VerbForm_part,
    "VerbForm_partFut": VerbForm_partFut,
    "VerbForm_partPast": VerbForm_partPast,
    "VerbForm_partPres": VerbForm_partPres,
    "VerbForm_sup": VerbForm_sup,
    "VerbForm_trans": VerbForm_trans,
    "VerbForm_gdv": VerbForm_gdv,  # la
    "Voice_act": Voice_act,
    "Voice_cau": Voice_cau,
    "Voice_pass": Voice_pass,
    "Voice_mid": Voice_mid,  # gkc
    "Voice_int": Voice_int,  # hb
    "Abbr_yes": Abbr_yes,  # cz, fi, sl, U
    "AdpType_prep": AdpType_prep,  # cz, U
    "AdpType_post": AdpType_post,  # U
    "AdpType_voc": AdpType_voc,  # cz
    "AdpType_comprep": AdpType_comprep,  # cz
    "AdpType_circ": AdpType_circ,  # U
    "AdvType_man": AdvType_man,
    "AdvType_loc": AdvType_loc,
    "AdvType_tim": AdvType_tim,
    "AdvType_deg": AdvType_deg,
    "AdvType_cau": AdvType_cau,
    "AdvType_mod": AdvType_mod,
    "AdvType_sta": AdvType_sta,
    "AdvType_ex": AdvType_ex,
    "AdvType_adadj": AdvType_adadj,
    "ConjType_oper": ConjType_oper,  # cz, U
    "ConjType_comp": ConjType_comp,  # cz, U
    "Connegative_yes": Connegative_yes,  # fi
    "Derivation_minen": Derivation_minen,  # fi
    "Derivation_sti": Derivation_sti,  # fi
    "Derivation_inen": Derivation_inen,  # fi
    "Derivation_lainen": Derivation_lainen,  # fi
    "Derivation_ja": Derivation_ja,  # fi
    "Derivation_ton": Derivation_ton,  # fi
    "Derivation_vs": Derivation_vs,  # fi
    "Derivation_ttain": Derivation_ttain,  # fi
    "Derivation_ttaa": Derivation_ttaa,  # fi
    "Echo_rdp": Echo_rdp,  # U
    "Echo_ech": Echo_ech,  # U
    "Foreign_foreign": Foreign_foreign,  # cz, fi, U
    "Foreign_fscript": Foreign_fscript,  # cz, fi, U
    "Foreign_tscript": Foreign_tscript,  # cz, U
    "Foreign_yes": Foreign_yes,  # sl
    "Gender_dat_masc": Gender_dat_masc,  # bq, U
    "Gender_dat_fem": Gender_dat_fem,  # bq, U
    "Gender_erg_masc": Gender_erg_masc,  # bq
    "Gender_erg_fem": Gender_erg_fem,  # bq
    "Gender_psor_masc": Gender_psor_masc,  # cz, sl, U
    "Gender_psor_fem": Gender_psor_fem,  # cz, sl, U
    "Gender_psor_neut": Gender_psor_neut,  # sl
    "Hyph_yes": Hyph_yes,  # cz, U
    "InfForm_one": InfForm_one,  # fi
    "InfForm_two": InfForm_two,  # fi
    "InfForm_three": InfForm_three,  # fi
    "NameType_geo": NameType_geo,  # U, cz
    "NameType_prs": NameType_prs,  # U, cz
    "NameType_giv": NameType_giv,  # U, cz
    "NameType_sur": NameType_sur,  # U, cz
    "NameType_nat": NameType_nat,  # U, cz
    "NameType_com": NameType_com,  # U, cz
    "NameType_pro": NameType_pro,  # U, cz
    "NameType_oth": NameType_oth,  # U, cz
    "NounType_com": NounType_com,  # U
    "NounType_prop": NounType_prop,  # U
    "NounType_class": NounType_class,  # U
    "Number_abs_sing": Number_abs_sing,  # bq, U
    "Number_abs_plur": Number_abs_plur,  # bq, U
    "Number_dat_sing": Number_dat_sing,  # bq, U
    "Number_dat_plur": Number_dat_plur,  # bq, U
    "Number_erg_sing": Number_erg_sing,  # bq, U
    "Number_erg_plur": Number_erg_plur,  # bq, U
    "Number_psee_sing": Number_psee_sing,  # U
    "Number_psee_plur": Number_psee_plur,  # U
    "Number_psor_sing": Number_psor_sing,  # cz, fi, sl, U
    "Number_psor_plur": Number_psor_plur,  # cz, fi, sl, U
    "NumForm_digit": NumForm_digit,  # cz, sl, U
    "NumForm_roman": NumForm_roman,  # cz, sl, U
    "NumForm_word": NumForm_word,  # cz, sl, U
    "NumValue_one": NumValue_one,  # cz, U
    "NumValue_two": NumValue_two,  # cz, U
    "NumValue_three": NumValue_three,  # cz, U
    "PartForm_pres": PartForm_pres,  # fi
    "PartForm_past": PartForm_past,  # fi
    "PartForm_agt": PartForm_agt,  # fi
    "PartForm_neg": PartForm_neg,  # fi
    "PartType_mod": PartType_mod,  # U
    "PartType_emp": PartType_emp,  # U
    "PartType_res": PartType_res,  # U
    "PartType_inf": PartType_inf,  # U
    "PartType_vbp": PartType_vbp,  # U
    "Person_abs_one": Person_abs_one,  # bq, U
    "Person_abs_two": Person_abs_two,  # bq, U
    "Person_abs_three": Person_abs_three,  # bq, U
    "Person_dat_one": Person_dat_one,  # bq, U
    "Person_dat_two": Person_dat_two,  # bq, U
    "Person_dat_three": Person_dat_three,  # bq, U
    "Person_erg_one": Person_erg_one,  # bq, U
    "Person_erg_two": Person_erg_two,  # bq, U
    "Person_erg_three": Person_erg_three,  # bq, U
    "Person_psor_one": Person_psor_one,  # fi, U
    "Person_psor_two": Person_psor_two,  # fi, U
    "Person_psor_three": Person_psor_three,  # fi, U
    "Polite_inf": Polite_inf,  # bq, U
    "Polite_pol": Polite_pol,  # bq, U
    "Polite_abs_inf": Polite_abs_inf,  # bq, U
    "Polite_abs_pol": Polite_abs_pol,  # bq, U
    "Polite_erg_inf": Polite_erg_inf,  # bq, U
    "Polite_erg_pol": Polite_erg_pol,  # bq, U
    "Polite_dat_inf": Polite_dat_inf,  # bq, U
    "Polite_dat_pol": Polite_dat_pol,  # bq, U
    "Prefix_yes": Prefix_yes,  # U
    "PrepCase_npr": PrepCase_npr,  # cz
    "PrepCase_pre": PrepCase_pre,  # U
    "PunctSide_ini": PunctSide_ini,  # U
    "PunctSide_fin": PunctSide_fin,  # U
    "PunctType_peri": PunctType_peri,  # U
    "PunctType_qest": PunctType_qest,  # U
    "PunctType_excl": PunctType_excl,  # U
    "PunctType_quot": PunctType_quot,  # U
    "PunctType_brck": PunctType_brck,  # U
    "PunctType_comm": PunctType_comm,  # U
    "PunctType_colo": PunctType_colo,  # U
    "PunctType_semi": PunctType_semi,  # U
    "PunctType_dash": PunctType_dash,  # U
    "Style_arch": Style_arch,  # cz, fi, U
    "Style_rare": Style_rare,  # cz, fi, U
    "Style_poet": Style_poet,  # cz, U
    "Style_norm": Style_norm,  # cz, U
    "Style_coll": Style_coll,  # cz, U
    "Style_vrnc": Style_vrnc,  # cz, U
    "Style_sing": Style_sing,  # cz, U
    "Style_expr": Style_expr,  # cz, U
    "Style_derg": Style_derg,  # cz, U
    "Style_vulg": Style_vulg,  # cz, U
    "Style_yes": Style_yes,  # fi, U
    "StyleVariant_styleShort": StyleVariant_styleShort,  # cz
    "StyleVariant_styleBound": StyleVariant_styleBound,  # cz, sl
    "VerbType_aux": VerbType_aux,  # U
    "VerbType_cop": VerbType_cop,  # U
    "VerbType_mod": VerbType_mod,  # U
    "VerbType_light": VerbType_light,  # U
}

# Feature names indexed by enum value (assumes IDS values form a dense,
# unique range — TODO confirm against the enum declaration).
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]

View File

@ -1,7 +1,8 @@
# Google universal tag set
from . cimport symbols
cpdef enum univ_pos_t:
NO_TAG
ADJ
NO_TAG = 0
ADJ = symbols.ADJ
ADP
ADV
AUX
@ -20,4 +21,3 @@ cpdef enum univ_pos_t:
X
EOL
SPACE
N_UNIV_TAGS

View File

@ -1,8 +1,8 @@
from __future__ import unicode_literals
UNIV_POS_NAMES = {
"NO_TAG": NO_TAG,
IDS = {
"": NO_TAG,
"ADJ": ADJ,
"ADP": ADP,
"ADV": ADV,
@ -23,3 +23,6 @@ UNIV_POS_NAMES = {
"EOL": EOL,
"SPACE": SPACE
}
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]

421
spacy/symbols.pxd Normal file
View File

@ -0,0 +1,421 @@
cpdef enum symbol_t:
NIL
IS_ALPHA
IS_ASCII
IS_DIGIT
IS_LOWER
IS_PUNCT
IS_SPACE
IS_TITLE
IS_UPPER
LIKE_URL
LIKE_NUM
LIKE_EMAIL
IS_STOP
IS_OOV
FLAG14
FLAG15
FLAG16
FLAG17
FLAG18
FLAG19
FLAG20
FLAG21
FLAG22
FLAG23
FLAG24
FLAG25
FLAG26
FLAG27
FLAG28
FLAG29
FLAG30
FLAG31
FLAG32
FLAG33
FLAG34
FLAG35
FLAG36
FLAG37
FLAG38
FLAG39
FLAG40
FLAG41
FLAG42
FLAG43
FLAG44
FLAG45
FLAG46
FLAG47
FLAG48
FLAG49
FLAG50
FLAG51
FLAG52
FLAG53
FLAG54
FLAG55
FLAG56
FLAG57
FLAG58
FLAG59
FLAG60
FLAG61
FLAG62
FLAG63
ID
ORTH
LOWER
NORM
SHAPE
PREFIX
SUFFIX
LENGTH
CLUSTER
LEMMA
POS
TAG
DEP
ENT_IOB
ENT_TYPE
HEAD
SPACY
PROB
ADJ
ADP
ADV
AUX
CONJ
DET
INTJ
NOUN
NUM
PART
PRON
PROPN
PUNCT
SCONJ
SYM
VERB
X
EOL
SPACE
Animacy_anim
Animacy_inam
Aspect_freq
Aspect_imp
Aspect_mod
Aspect_none
Aspect_perf
Case_abe
Case_abl
Case_abs
Case_acc
Case_ade
Case_all
Case_cau
Case_com
Case_dat
Case_del
Case_dis
Case_ela
Case_ess
Case_gen
Case_ill
Case_ine
Case_ins
Case_loc
Case_lat
Case_nom
Case_par
Case_sub
Case_sup
Case_tem
Case_ter
Case_tra
Case_voc
Definite_two
Definite_def
Definite_red
Definite_ind
Degree_cmp
Degree_comp
Degree_none
Degree_pos
Degree_sup
Degree_abs
Degree_com
Degree_dim # du
Gender_com
Gender_fem
Gender_masc
Gender_neut
Mood_cnd
Mood_imp
Mood_ind
Mood_n
Mood_pot
Mood_sub
Mood_opt
Negative_neg
Negative_pos
Negative_yes
Number_com
Number_dual
Number_none
Number_plur
Number_sing
Number_ptan # bg
Number_count # bg
NumType_card
NumType_dist
NumType_frac
NumType_gen
NumType_mult
NumType_none
NumType_ord
NumType_sets
Person_one
Person_two
Person_three
Person_none
Poss_yes
PronType_advPart
PronType_art
PronType_default
PronType_dem
PronType_ind
PronType_int
PronType_neg
PronType_prs
PronType_rcp
PronType_rel
PronType_tot
PronType_clit
PronType_exc # es, ca, it, fa
Reflex_yes
Tense_fut
Tense_imp
Tense_past
Tense_pres
VerbForm_fin
VerbForm_ger
VerbForm_inf
VerbForm_none
VerbForm_part
VerbForm_partFut
VerbForm_partPast
VerbForm_partPres
VerbForm_sup
VerbForm_trans
VerbForm_gdv # la
Voice_act
Voice_cau
Voice_pass
Voice_mid # gkc
Voice_int # hb
Abbr_yes # cz, fi, sl, U
AdpType_prep # cz, U
AdpType_post # U
AdpType_voc # cz
AdpType_comprep # cz
AdpType_circ # U
AdvType_man
AdvType_loc
AdvType_tim
AdvType_deg
AdvType_cau
AdvType_mod
AdvType_sta
AdvType_ex
AdvType_adadj
ConjType_oper # cz, U
ConjType_comp # cz, U
Connegative_yes # fi
Derivation_minen # fi
Derivation_sti # fi
Derivation_inen # fi
Derivation_lainen # fi
Derivation_ja # fi
Derivation_ton # fi
Derivation_vs # fi
Derivation_ttain # fi
Derivation_ttaa # fi
Echo_rdp # U
Echo_ech # U
Foreign_foreign # cz, fi, U
Foreign_fscript # cz, fi, U
Foreign_tscript # cz, U
Foreign_yes # sl
Gender_dat_masc # bq, U
Gender_dat_fem # bq, U
Gender_erg_masc # bq
Gender_erg_fem # bq
Gender_psor_masc # cz, sl, U
Gender_psor_fem # cz, sl, U
Gender_psor_neut # sl
Hyph_yes # cz, U
InfForm_one # fi
InfForm_two # fi
InfForm_three # fi
NameType_geo # U, cz
NameType_prs # U, cz
NameType_giv # U, cz
NameType_sur # U, cz
NameType_nat # U, cz
NameType_com # U, cz
NameType_pro # U, cz
NameType_oth # U, cz
NounType_com # U
NounType_prop # U
NounType_class # U
Number_abs_sing # bq, U
Number_abs_plur # bq, U
Number_dat_sing # bq, U
Number_dat_plur # bq, U
Number_erg_sing # bq, U
Number_erg_plur # bq, U
Number_psee_sing # U
Number_psee_plur # U
Number_psor_sing # cz, fi, sl, U
Number_psor_plur # cz, fi, sl, U
NumForm_digit # cz, sl, U
NumForm_roman # cz, sl, U
NumForm_word # cz, sl, U
NumValue_one # cz, U
NumValue_two # cz, U
NumValue_three # cz, U
PartForm_pres # fi
PartForm_past # fi
PartForm_agt # fi
PartForm_neg # fi
PartType_mod # U
PartType_emp # U
PartType_res # U
PartType_inf # U
PartType_vbp # U
Person_abs_one # bq, U
Person_abs_two # bq, U
Person_abs_three # bq, U
Person_dat_one # bq, U
Person_dat_two # bq, U
Person_dat_three # bq, U
Person_erg_one # bq, U
Person_erg_two # bq, U
Person_erg_three # bq, U
Person_psor_one # fi, U
Person_psor_two # fi, U
Person_psor_three # fi, U
Polite_inf # bq, U
Polite_pol # bq, U
Polite_abs_inf # bq, U
Polite_abs_pol # bq, U
Polite_erg_inf # bq, U
Polite_erg_pol # bq, U
Polite_dat_inf # bq, U
Polite_dat_pol # bq, U
Prefix_yes # U
PrepCase_npr # cz
PrepCase_pre # U
PunctSide_ini # U
PunctSide_fin # U
PunctType_peri # U
PunctType_qest # U
PunctType_excl # U
PunctType_quot # U
PunctType_brck # U
PunctType_comm # U
PunctType_colo # U
PunctType_semi # U
PunctType_dash # U
Style_arch # cz, fi, U
Style_rare # cz, fi, U
Style_poet # cz, U
Style_norm # cz, U
Style_coll # cz, U
Style_vrnc # cz, U
Style_sing # cz, U
Style_expr # cz, U
Style_derg # cz, U
Style_vulg # cz, U
Style_yes # fi, U
StyleVariant_styleShort # cz
StyleVariant_styleBound # cz, sl
VerbType_aux # U
VerbType_cop # U
VerbType_mod # U
VerbType_light # U
PERSON
NORP
FACILITY
ORG
GPE
LOC
PRODUCT
EVENT
WORK_OF_ART
LANGUAGE
DATE
TIME
PERCENT
MONEY
QUANTITY
ORDINAL
CARDINAL
acomp
advcl
advmod
agent
amod
appos
attr
aux
auxpass
cc
ccomp
complm
conj
csubj
csubjpass
dep
det
dobj
expl
hmod
hyph
infmod
intj
iobj
mark
meta
neg
nmod
nn
npadvmod
nsubj
nsubjpass
num
number
oprd
parataxis
partmod
pcomp
pobj
poss
possessive
preconj
prep
prt
punct
quantmod
rcmod
root
xcomp

424
spacy/symbols.pyx Normal file
View File

@ -0,0 +1,424 @@
# Mapping from symbol name to its integer symbol ID, and the reverse list
# NAMES (index = symbol ID). NOTE: the original literal contained stray
# trailing whitespace inside many keys (e.g. "Abbr_yes ", "Degree_dim "),
# which made name-based lookups fail and leaked padded names into NAMES
# (and from there into the StringStore). All keys are normalized here;
# the per-entry language comments (cz, fi, bq, U, ...) are preserved.
IDS = {
    "": NIL,
    "IS_ALPHA": IS_ALPHA,
    "IS_ASCII": IS_ASCII,
    "IS_DIGIT": IS_DIGIT,
    "IS_LOWER": IS_LOWER,
    "IS_PUNCT": IS_PUNCT,
    "IS_SPACE": IS_SPACE,
    "IS_TITLE": IS_TITLE,
    "IS_UPPER": IS_UPPER,
    "LIKE_URL": LIKE_URL,
    "LIKE_NUM": LIKE_NUM,
    "LIKE_EMAIL": LIKE_EMAIL,
    "IS_STOP": IS_STOP,
    "IS_OOV": IS_OOV,
    "FLAG14": FLAG14,
    "FLAG15": FLAG15,
    "FLAG16": FLAG16,
    "FLAG17": FLAG17,
    "FLAG18": FLAG18,
    "FLAG19": FLAG19,
    "FLAG20": FLAG20,
    "FLAG21": FLAG21,
    "FLAG22": FLAG22,
    "FLAG23": FLAG23,
    "FLAG24": FLAG24,
    "FLAG25": FLAG25,
    "FLAG26": FLAG26,
    "FLAG27": FLAG27,
    "FLAG28": FLAG28,
    "FLAG29": FLAG29,
    "FLAG30": FLAG30,
    "FLAG31": FLAG31,
    "FLAG32": FLAG32,
    "FLAG33": FLAG33,
    "FLAG34": FLAG34,
    "FLAG35": FLAG35,
    "FLAG36": FLAG36,
    "FLAG37": FLAG37,
    "FLAG38": FLAG38,
    "FLAG39": FLAG39,
    "FLAG40": FLAG40,
    "FLAG41": FLAG41,
    "FLAG42": FLAG42,
    "FLAG43": FLAG43,
    "FLAG44": FLAG44,
    "FLAG45": FLAG45,
    "FLAG46": FLAG46,
    "FLAG47": FLAG47,
    "FLAG48": FLAG48,
    "FLAG49": FLAG49,
    "FLAG50": FLAG50,
    "FLAG51": FLAG51,
    "FLAG52": FLAG52,
    "FLAG53": FLAG53,
    "FLAG54": FLAG54,
    "FLAG55": FLAG55,
    "FLAG56": FLAG56,
    "FLAG57": FLAG57,
    "FLAG58": FLAG58,
    "FLAG59": FLAG59,
    "FLAG60": FLAG60,
    "FLAG61": FLAG61,
    "FLAG62": FLAG62,
    "FLAG63": FLAG63,
    "ID": ID,
    "ORTH": ORTH,
    "LOWER": LOWER,
    "NORM": NORM,
    "SHAPE": SHAPE,
    "PREFIX": PREFIX,
    "SUFFIX": SUFFIX,
    "LENGTH": LENGTH,
    "CLUSTER": CLUSTER,
    "LEMMA": LEMMA,
    "POS": POS,
    "TAG": TAG,
    "DEP": DEP,
    "ENT_IOB": ENT_IOB,
    "ENT_TYPE": ENT_TYPE,
    "HEAD": HEAD,
    "SPACY": SPACY,
    "PROB": PROB,
    "ADJ": ADJ,
    "ADP": ADP,
    "ADV": ADV,
    "AUX": AUX,
    "CONJ": CONJ,
    "DET": DET,
    "INTJ": INTJ,
    "NOUN": NOUN,
    "NUM": NUM,
    "PART": PART,
    "PRON": PRON,
    "PROPN": PROPN,
    "PUNCT": PUNCT,
    "SCONJ": SCONJ,
    "SYM": SYM,
    "VERB": VERB,
    "X": X,
    "EOL": EOL,
    "SPACE": SPACE,
    "Animacy_anim": Animacy_anim,
    "Animacy_inam": Animacy_inam,
    "Aspect_freq": Aspect_freq,
    "Aspect_imp": Aspect_imp,
    "Aspect_mod": Aspect_mod,
    "Aspect_none": Aspect_none,
    "Aspect_perf": Aspect_perf,
    "Case_abe": Case_abe,
    "Case_abl": Case_abl,
    "Case_abs": Case_abs,
    "Case_acc": Case_acc,
    "Case_ade": Case_ade,
    "Case_all": Case_all,
    "Case_cau": Case_cau,
    "Case_com": Case_com,
    "Case_dat": Case_dat,
    "Case_del": Case_del,
    "Case_dis": Case_dis,
    "Case_ela": Case_ela,
    "Case_ess": Case_ess,
    "Case_gen": Case_gen,
    "Case_ill": Case_ill,
    "Case_ine": Case_ine,
    "Case_ins": Case_ins,
    "Case_loc": Case_loc,
    "Case_lat": Case_lat,
    "Case_nom": Case_nom,
    "Case_par": Case_par,
    "Case_sub": Case_sub,
    "Case_sup": Case_sup,
    "Case_tem": Case_tem,
    "Case_ter": Case_ter,
    "Case_tra": Case_tra,
    "Case_voc": Case_voc,
    "Definite_two": Definite_two,
    "Definite_def": Definite_def,
    "Definite_red": Definite_red,
    "Definite_ind": Definite_ind,
    "Degree_cmp": Degree_cmp,
    "Degree_comp": Degree_comp,
    "Degree_none": Degree_none,
    "Degree_pos": Degree_pos,
    "Degree_sup": Degree_sup,
    "Degree_abs": Degree_abs,
    "Degree_com": Degree_com,
    "Degree_dim": Degree_dim,  # du
    "Gender_com": Gender_com,
    "Gender_fem": Gender_fem,
    "Gender_masc": Gender_masc,
    "Gender_neut": Gender_neut,
    "Mood_cnd": Mood_cnd,
    "Mood_imp": Mood_imp,
    "Mood_ind": Mood_ind,
    "Mood_n": Mood_n,
    "Mood_pot": Mood_pot,
    "Mood_sub": Mood_sub,
    "Mood_opt": Mood_opt,
    "Negative_neg": Negative_neg,
    "Negative_pos": Negative_pos,
    "Negative_yes": Negative_yes,
    "Number_com": Number_com,
    "Number_dual": Number_dual,
    "Number_none": Number_none,
    "Number_plur": Number_plur,
    "Number_sing": Number_sing,
    "Number_ptan": Number_ptan,  # bg
    "Number_count": Number_count,  # bg
    "NumType_card": NumType_card,
    "NumType_dist": NumType_dist,
    "NumType_frac": NumType_frac,
    "NumType_gen": NumType_gen,
    "NumType_mult": NumType_mult,
    "NumType_none": NumType_none,
    "NumType_ord": NumType_ord,
    "NumType_sets": NumType_sets,
    "Person_one": Person_one,
    "Person_two": Person_two,
    "Person_three": Person_three,
    "Person_none": Person_none,
    "Poss_yes": Poss_yes,
    "PronType_advPart": PronType_advPart,
    "PronType_art": PronType_art,
    "PronType_default": PronType_default,
    "PronType_dem": PronType_dem,
    "PronType_ind": PronType_ind,
    "PronType_int": PronType_int,
    "PronType_neg": PronType_neg,
    "PronType_prs": PronType_prs,
    "PronType_rcp": PronType_rcp,
    "PronType_rel": PronType_rel,
    "PronType_tot": PronType_tot,
    "PronType_clit": PronType_clit,
    "PronType_exc": PronType_exc,  # es, ca, it, fa
    "Reflex_yes": Reflex_yes,
    "Tense_fut": Tense_fut,
    "Tense_imp": Tense_imp,
    "Tense_past": Tense_past,
    "Tense_pres": Tense_pres,
    "VerbForm_fin": VerbForm_fin,
    "VerbForm_ger": VerbForm_ger,
    "VerbForm_inf": VerbForm_inf,
    "VerbForm_none": VerbForm_none,
    "VerbForm_part": VerbForm_part,
    "VerbForm_partFut": VerbForm_partFut,
    "VerbForm_partPast": VerbForm_partPast,
    "VerbForm_partPres": VerbForm_partPres,
    "VerbForm_sup": VerbForm_sup,
    "VerbForm_trans": VerbForm_trans,
    "VerbForm_gdv": VerbForm_gdv,  # la
    "Voice_act": Voice_act,
    "Voice_cau": Voice_cau,
    "Voice_pass": Voice_pass,
    "Voice_mid": Voice_mid,  # gkc
    "Voice_int": Voice_int,  # hb
    "Abbr_yes": Abbr_yes,  # cz, fi, sl, U
    "AdpType_prep": AdpType_prep,  # cz, U
    "AdpType_post": AdpType_post,  # U
    "AdpType_voc": AdpType_voc,  # cz
    "AdpType_comprep": AdpType_comprep,  # cz
    "AdpType_circ": AdpType_circ,  # U
    "AdvType_man": AdvType_man,
    "AdvType_loc": AdvType_loc,
    "AdvType_tim": AdvType_tim,
    "AdvType_deg": AdvType_deg,
    "AdvType_cau": AdvType_cau,
    "AdvType_mod": AdvType_mod,
    "AdvType_sta": AdvType_sta,
    "AdvType_ex": AdvType_ex,
    "AdvType_adadj": AdvType_adadj,
    "ConjType_oper": ConjType_oper,  # cz, U
    "ConjType_comp": ConjType_comp,  # cz, U
    "Connegative_yes": Connegative_yes,  # fi
    "Derivation_minen": Derivation_minen,  # fi
    "Derivation_sti": Derivation_sti,  # fi
    "Derivation_inen": Derivation_inen,  # fi
    "Derivation_lainen": Derivation_lainen,  # fi
    "Derivation_ja": Derivation_ja,  # fi
    "Derivation_ton": Derivation_ton,  # fi
    "Derivation_vs": Derivation_vs,  # fi
    "Derivation_ttain": Derivation_ttain,  # fi
    "Derivation_ttaa": Derivation_ttaa,  # fi
    "Echo_rdp": Echo_rdp,  # U
    "Echo_ech": Echo_ech,  # U
    "Foreign_foreign": Foreign_foreign,  # cz, fi, U
    "Foreign_fscript": Foreign_fscript,  # cz, fi, U
    "Foreign_tscript": Foreign_tscript,  # cz, U
    "Foreign_yes": Foreign_yes,  # sl
    "Gender_dat_masc": Gender_dat_masc,  # bq, U
    "Gender_dat_fem": Gender_dat_fem,  # bq, U
    "Gender_erg_masc": Gender_erg_masc,  # bq
    "Gender_erg_fem": Gender_erg_fem,  # bq
    "Gender_psor_masc": Gender_psor_masc,  # cz, sl, U
    "Gender_psor_fem": Gender_psor_fem,  # cz, sl, U
    "Gender_psor_neut": Gender_psor_neut,  # sl
    "Hyph_yes": Hyph_yes,  # cz, U
    "InfForm_one": InfForm_one,  # fi
    "InfForm_two": InfForm_two,  # fi
    "InfForm_three": InfForm_three,  # fi
    "NameType_geo": NameType_geo,  # U, cz
    "NameType_prs": NameType_prs,  # U, cz
    "NameType_giv": NameType_giv,  # U, cz
    "NameType_sur": NameType_sur,  # U, cz
    "NameType_nat": NameType_nat,  # U, cz
    "NameType_com": NameType_com,  # U, cz
    "NameType_pro": NameType_pro,  # U, cz
    "NameType_oth": NameType_oth,  # U, cz
    "NounType_com": NounType_com,  # U
    "NounType_prop": NounType_prop,  # U
    "NounType_class": NounType_class,  # U
    "Number_abs_sing": Number_abs_sing,  # bq, U
    "Number_abs_plur": Number_abs_plur,  # bq, U
    "Number_dat_sing": Number_dat_sing,  # bq, U
    "Number_dat_plur": Number_dat_plur,  # bq, U
    "Number_erg_sing": Number_erg_sing,  # bq, U
    "Number_erg_plur": Number_erg_plur,  # bq, U
    "Number_psee_sing": Number_psee_sing,  # U
    "Number_psee_plur": Number_psee_plur,  # U
    "Number_psor_sing": Number_psor_sing,  # cz, fi, sl, U
    "Number_psor_plur": Number_psor_plur,  # cz, fi, sl, U
    "NumForm_digit": NumForm_digit,  # cz, sl, U
    "NumForm_roman": NumForm_roman,  # cz, sl, U
    "NumForm_word": NumForm_word,  # cz, sl, U
    "NumValue_one": NumValue_one,  # cz, U
    "NumValue_two": NumValue_two,  # cz, U
    "NumValue_three": NumValue_three,  # cz, U
    "PartForm_pres": PartForm_pres,  # fi
    "PartForm_past": PartForm_past,  # fi
    "PartForm_agt": PartForm_agt,  # fi
    "PartForm_neg": PartForm_neg,  # fi
    "PartType_mod": PartType_mod,  # U
    "PartType_emp": PartType_emp,  # U
    "PartType_res": PartType_res,  # U
    "PartType_inf": PartType_inf,  # U
    "PartType_vbp": PartType_vbp,  # U
    "Person_abs_one": Person_abs_one,  # bq, U
    "Person_abs_two": Person_abs_two,  # bq, U
    "Person_abs_three": Person_abs_three,  # bq, U
    "Person_dat_one": Person_dat_one,  # bq, U
    "Person_dat_two": Person_dat_two,  # bq, U
    "Person_dat_three": Person_dat_three,  # bq, U
    "Person_erg_one": Person_erg_one,  # bq, U
    "Person_erg_two": Person_erg_two,  # bq, U
    "Person_erg_three": Person_erg_three,  # bq, U
    "Person_psor_one": Person_psor_one,  # fi, U
    "Person_psor_two": Person_psor_two,  # fi, U
    "Person_psor_three": Person_psor_three,  # fi, U
    "Polite_inf": Polite_inf,  # bq, U
    "Polite_pol": Polite_pol,  # bq, U
    "Polite_abs_inf": Polite_abs_inf,  # bq, U
    "Polite_abs_pol": Polite_abs_pol,  # bq, U
    "Polite_erg_inf": Polite_erg_inf,  # bq, U
    "Polite_erg_pol": Polite_erg_pol,  # bq, U
    "Polite_dat_inf": Polite_dat_inf,  # bq, U
    "Polite_dat_pol": Polite_dat_pol,  # bq, U
    "Prefix_yes": Prefix_yes,  # U
    "PrepCase_npr": PrepCase_npr,  # cz
    "PrepCase_pre": PrepCase_pre,  # U
    "PunctSide_ini": PunctSide_ini,  # U
    "PunctSide_fin": PunctSide_fin,  # U
    "PunctType_peri": PunctType_peri,  # U
    "PunctType_qest": PunctType_qest,  # U
    "PunctType_excl": PunctType_excl,  # U
    "PunctType_quot": PunctType_quot,  # U
    "PunctType_brck": PunctType_brck,  # U
    "PunctType_comm": PunctType_comm,  # U
    "PunctType_colo": PunctType_colo,  # U
    "PunctType_semi": PunctType_semi,  # U
    "PunctType_dash": PunctType_dash,  # U
    "Style_arch": Style_arch,  # cz, fi, U
    "Style_rare": Style_rare,  # cz, fi, U
    "Style_poet": Style_poet,  # cz, U
    "Style_norm": Style_norm,  # cz, U
    "Style_coll": Style_coll,  # cz, U
    "Style_vrnc": Style_vrnc,  # cz, U
    "Style_sing": Style_sing,  # cz, U
    "Style_expr": Style_expr,  # cz, U
    "Style_derg": Style_derg,  # cz, U
    "Style_vulg": Style_vulg,  # cz, U
    "Style_yes": Style_yes,  # fi, U
    "StyleVariant_styleShort": StyleVariant_styleShort,  # cz
    "StyleVariant_styleBound": StyleVariant_styleBound,  # cz, sl
    "VerbType_aux": VerbType_aux,  # U
    "VerbType_cop": VerbType_cop,  # U
    "VerbType_mod": VerbType_mod,  # U
    "VerbType_light": VerbType_light,  # U
    "PERSON": PERSON,
    "NORP": NORP,
    "FACILITY": FACILITY,
    "ORG": ORG,
    "GPE": GPE,
    "LOC": LOC,
    "PRODUCT": PRODUCT,
    "EVENT": EVENT,
    "WORK_OF_ART": WORK_OF_ART,
    "LANGUAGE": LANGUAGE,
    "DATE": DATE,
    "TIME": TIME,
    "PERCENT": PERCENT,
    "MONEY": MONEY,
    "QUANTITY": QUANTITY,
    "ORDINAL": ORDINAL,
    "CARDINAL": CARDINAL,
    "acomp": acomp,
    "advcl": advcl,
    "advmod": advmod,
    "agent": agent,
    "amod": amod,
    "appos": appos,
    "attr": attr,
    "aux": aux,
    "auxpass": auxpass,
    "cc": cc,
    "ccomp": ccomp,
    "complm": complm,
    "conj": conj,
    "csubj": csubj,
    "csubjpass": csubjpass,
    "dep": dep,
    "det": det,
    "dobj": dobj,
    "expl": expl,
    "hmod": hmod,
    "hyph": hyph,
    "infmod": infmod,
    "intj": intj,
    "iobj": iobj,
    "mark": mark,
    "meta": meta,
    "neg": neg,
    "nmod": nmod,
    "nn": nn,
    "npadvmod": npadvmod,
    "nsubj": nsubj,
    "nsubjpass": nsubjpass,
    "num": num,
    "number": number,
    "oprd": oprd,
    "parataxis": parataxis,
    "partmod": partmod,
    "pcomp": pcomp,
    "pobj": pobj,
    "poss": poss,
    "possessive": possessive,
    "preconj": preconj,
    "prep": prep,
    "prt": prt,
    "punct": punct,
    "quantmod": quantmod,
    "rcmod": rcmod,
    "root": root,
    "xcomp": xcomp
}

# Symbol names ordered by their integer ID, so NAMES[symbol_id] recovers
# the name. Relies on the IDs being unique per entry.
NAMES = [it[0] for it in sorted(IDS.items(), key=lambda it: it[1])]

View File

@ -14,7 +14,6 @@ from ..typedefs cimport attr_t, flags_t
from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..parts_of_speech import UNIV_POS_NAMES
from ..parts_of_speech cimport CONJ, PUNCT, NOUN
from ..parts_of_speech cimport univ_pos_t
from ..lexeme cimport Lexeme

View File

@ -9,7 +9,7 @@ import numpy
from ..lexeme cimport Lexeme
from ..parts_of_speech import UNIV_POS_NAMES
from .. import parts_of_speech
from ..attrs cimport LEMMA
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
@ -318,7 +318,7 @@ cdef class Token:
property pos_:
def __get__(self):
return _pos_id_to_string[self.c.pos]
return parts_of_speech.NAMES[self.c.pos]
property tag_:
def __get__(self):
@ -363,6 +363,3 @@ cdef class Token:
property like_email:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}

View File

@ -19,6 +19,9 @@ from .typedefs cimport attr_t
from .cfile cimport CFile
from .lemmatizer import Lemmatizer
from . import attrs
from . import symbols
from cymem.cymem cimport Address
from . import util
from .serialize.packer cimport Packer
@ -67,6 +70,14 @@ cdef class Vocab:
self._by_hash = PreshMap()
self._by_orth = PreshMap()
self.strings = StringStore()
# Load strings in a special order, so that we have an onset number for
# the vocabulary. This way, when words are added in order, the orth ID
# is the frequency rank of the word, plus a certain offset. The structural
# strings are loaded first, because the vocab is open-class, and these
# symbols are closed class.
for name in symbols.NAMES + list(sorted(tag_map.keys())):
if name:
_ = self.strings[name]
self.get_lex_attr = get_lex_attr
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
self.serializer_freqs = serializer_freqs

View File

@ -1,6 +1,9 @@
from __future__ import unicode_literals
import pytest
from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA
from spacy.parts_of_speech import NOUN, VERB
def test_neq(en_vocab):
addr = en_vocab['Hello']
@ -25,3 +28,13 @@ def test_punct_neq(en_vocab):
def test_shape_attr(en_vocab):
example = en_vocab['example']
assert example.orth != example.shape
def test_symbols(en_vocab):
assert en_vocab.strings['IS_ALPHA'] == IS_ALPHA
assert en_vocab.strings['NOUN'] == NOUN
assert en_vocab.strings['VERB'] == VERB
assert en_vocab.strings['LEMMA'] == LEMMA
assert en_vocab.strings['ORTH'] == ORTH
assert en_vocab.strings['PROB'] == PROB

View File

@ -1,11 +1,13 @@
from __future__ import unicode_literals
import pytest
import os
@pytest.fixture(scope='session')
def nlp():
from spacy.en import English
return English()
from spacy.en import English, LOCAL_DATA_DIR
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
return English(data_dir=data_dir)
@pytest.fixture()

View File

@ -1,6 +1,7 @@
from __future__ import unicode_literals
import pytest
import spacy
import os
@pytest.fixture()
@ -9,8 +10,9 @@ def token(doc):
def test_load_resources_and_process_text():
from spacy.en import English
nlp = English()
from spacy.en import English, LOCAL_DATA_DIR
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
nlp = English(data_dir=data_dir)
doc = nlp('Hello, world. Here are two sentences.')