mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
Merge branch 'attrs'
This commit is contained in:
commit
c1fdc487bc
|
@ -168,6 +168,11 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
|
|||
probs[word] = oov_prob
|
||||
|
||||
lexicon = []
|
||||
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
|
||||
# First encode the strings into the StringStore. This way, we can map
|
||||
# the orth IDs to frequency ranks
|
||||
orth = vocab.strings[word]
|
||||
# Now actually load the vocab
|
||||
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
|
||||
lexeme = vocab[word]
|
||||
lexeme.prob = prob
|
||||
|
|
|
@ -56,5 +56,4 @@
|
|||
"was": {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"},
|
||||
"were": {"L": "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -22,7 +22,7 @@
|
|||
"JJS": {"pos": "adj", "degree": "sup"},
|
||||
"LS": {"pos": "punct", "numtype": "ord"},
|
||||
"MD": {"pos": "verb", "verbtype": "mod"},
|
||||
"NIL": {"pos": "no_tag"},
|
||||
"NIL": {"pos": ""},
|
||||
"NN": {"pos": "noun", "number": "sing"},
|
||||
"NNP": {"pos": "noun", "nountype": "prop", "number": "sing"},
|
||||
"NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"},
|
||||
|
|
3
setup.py
3
setup.py
|
@ -166,7 +166,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
|
|||
'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
|
||||
'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
|
||||
'spacy.cfile', 'spacy.matcher',
|
||||
'spacy.syntax.ner']
|
||||
'spacy.syntax.ner',
|
||||
'spacy.symbols']
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -29,5 +29,6 @@ cdef class Model:
|
|||
cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1
|
||||
|
||||
cdef object model_loc
|
||||
cdef object _templates
|
||||
cdef Extractor _extractor
|
||||
cdef LinearModel _model
|
||||
|
|
|
@ -3,6 +3,7 @@ from __future__ import unicode_literals
|
|||
from __future__ import division
|
||||
|
||||
from os import path
|
||||
import tempfile
|
||||
import os
|
||||
import shutil
|
||||
import json
|
||||
|
@ -52,6 +53,7 @@ cdef class Model:
|
|||
def __init__(self, n_classes, templates, model_loc=None):
|
||||
if model_loc is not None and path.isdir(model_loc):
|
||||
model_loc = path.join(model_loc, 'model')
|
||||
self._templates = templates
|
||||
self.n_classes = n_classes
|
||||
self._extractor = Extractor(templates)
|
||||
self.n_feats = self._extractor.n_templ
|
||||
|
@ -60,6 +62,18 @@ cdef class Model:
|
|||
if self.model_loc and path.exists(self.model_loc):
|
||||
self._model.load(self.model_loc, freq_thresh=0)
|
||||
|
||||
def __reduce__(self):
|
||||
model_loc = tempfile.mkstemp()
|
||||
# TODO: This is a potentially buggy implementation. We're not really
|
||||
# given a good guarantee that all internal state is saved correctly here,
|
||||
# since there are learning parameters for e.g. the model averaging in
|
||||
# averaged perceptron, the gradient calculations in AdaGrad, etc
|
||||
# that aren't necessarily saved. So, if we're part way through training
|
||||
# the model, and then we pickle it, we won't recover the state correctly.
|
||||
self._model.dump(model_loc)
|
||||
return (Model, (self.n_classes, self.templates, model_loc),
|
||||
None, None)
|
||||
|
||||
def predict(self, Example eg):
|
||||
self.set_scores(eg.c.scores, eg.c.atoms)
|
||||
eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
# Reserve 64 values for flag features
|
||||
cpdef enum attr_id_t:
|
||||
NULL_ATTR
|
||||
IS_ALPHA
|
||||
IS_ASCII
|
||||
IS_DIGIT
|
||||
|
@ -14,8 +15,7 @@ cpdef enum attr_id_t:
|
|||
IS_STOP
|
||||
IS_OOV
|
||||
|
||||
FLAG13 = 13
|
||||
FLAG14
|
||||
FLAG14 = 14
|
||||
FLAG15
|
||||
FLAG16
|
||||
FLAG17
|
||||
|
|
|
@ -0,0 +1,90 @@
|
|||
IDS = {
|
||||
"": NULL_ATTR,
|
||||
"IS_ALPHA": IS_ALPHA,
|
||||
"IS_ASCII": IS_ASCII,
|
||||
"IS_DIGIT": IS_DIGIT,
|
||||
"IS_LOWER": IS_LOWER,
|
||||
"IS_PUNCT": IS_PUNCT,
|
||||
"IS_SPACE": IS_SPACE,
|
||||
"IS_TITLE": IS_TITLE,
|
||||
"IS_UPPER": IS_UPPER,
|
||||
"LIKE_URL": LIKE_URL,
|
||||
"LIKE_NUM": LIKE_NUM,
|
||||
"LIKE_EMAIL": LIKE_EMAIL,
|
||||
"IS_STOP": IS_STOP,
|
||||
"IS_OOV": IS_OOV,
|
||||
|
||||
"FLAG14": FLAG14,
|
||||
"FLAG15": FLAG15,
|
||||
"FLAG16": FLAG16,
|
||||
"FLAG17": FLAG17,
|
||||
"FLAG18": FLAG18,
|
||||
"FLAG19": FLAG19,
|
||||
"FLAG20": FLAG20,
|
||||
"FLAG21": FLAG21,
|
||||
"FLAG22": FLAG22,
|
||||
"FLAG23": FLAG23,
|
||||
"FLAG24": FLAG24,
|
||||
"FLAG25": FLAG25,
|
||||
"FLAG26": FLAG26,
|
||||
"FLAG27": FLAG27,
|
||||
"FLAG28": FLAG28,
|
||||
"FLAG29": FLAG29,
|
||||
"FLAG30": FLAG30,
|
||||
"FLAG31": FLAG31,
|
||||
"FLAG32": FLAG32,
|
||||
"FLAG33": FLAG33,
|
||||
"FLAG34": FLAG34,
|
||||
"FLAG35": FLAG35,
|
||||
"FLAG36": FLAG36,
|
||||
"FLAG37": FLAG37,
|
||||
"FLAG38": FLAG38,
|
||||
"FLAG39": FLAG39,
|
||||
"FLAG40": FLAG40,
|
||||
"FLAG41": FLAG41,
|
||||
"FLAG42": FLAG42,
|
||||
"FLAG43": FLAG43,
|
||||
"FLAG44": FLAG44,
|
||||
"FLAG45": FLAG45,
|
||||
"FLAG46": FLAG46,
|
||||
"FLAG47": FLAG47,
|
||||
"FLAG48": FLAG48,
|
||||
"FLAG49": FLAG49,
|
||||
"FLAG50": FLAG50,
|
||||
"FLAG51": FLAG51,
|
||||
"FLAG52": FLAG52,
|
||||
"FLAG53": FLAG53,
|
||||
"FLAG54": FLAG54,
|
||||
"FLAG55": FLAG55,
|
||||
"FLAG56": FLAG56,
|
||||
"FLAG57": FLAG57,
|
||||
"FLAG58": FLAG58,
|
||||
"FLAG59": FLAG59,
|
||||
"FLAG60": FLAG60,
|
||||
"FLAG61": FLAG61,
|
||||
"FLAG62": FLAG62,
|
||||
"FLAG63": FLAG63,
|
||||
|
||||
"ID": ID,
|
||||
"ORTH": ORTH,
|
||||
"LOWER": LOWER,
|
||||
"NORM": NORM,
|
||||
"SHAPE": SHAPE,
|
||||
"PREFIX": PREFIX,
|
||||
"SUFFIX": SUFFIX,
|
||||
|
||||
"LENGTH": LENGTH,
|
||||
"CLUSTER": CLUSTER,
|
||||
"LEMMA": LEMMA,
|
||||
"POS": POS,
|
||||
"TAG": TAG,
|
||||
"DEP": DEP,
|
||||
"ENT_IOB": ENT_IOB,
|
||||
"ENT_TYPE": ENT_TYPE,
|
||||
"HEAD": HEAD,
|
||||
"SPACY": SPACY,
|
||||
"PROB": PROB,
|
||||
}
|
||||
|
||||
# ATTR IDs, in order of the symbol
|
||||
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
|
|
@ -207,6 +207,12 @@ class Language(object):
|
|||
self.entity = entity
|
||||
self.matcher = matcher
|
||||
|
||||
def __reduce__(self):
|
||||
return (self.__class__,
|
||||
(None, self.vocab, self.tokenizer, self.tagger, self.parser,
|
||||
self.entity, self.matcher, None),
|
||||
None, None)
|
||||
|
||||
def __call__(self, text, tag=True, parse=True, entity=True):
|
||||
"""Apply the pipeline to some text. The text can span multiple sentences,
|
||||
and can contain arbtrary whitespace. Alignment into the original string
|
||||
|
|
|
@ -15,7 +15,7 @@ from libcpp.vector cimport vector
|
|||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
|
||||
from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
|
||||
from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
|
||||
from .tokens.doc cimport get_token_attr
|
||||
from .tokens.doc cimport Doc
|
||||
from .vocab cimport Vocab
|
||||
|
@ -168,13 +168,7 @@ cdef class Matcher:
|
|||
cdef Pool mem
|
||||
cdef vector[Pattern*] patterns
|
||||
cdef readonly Vocab vocab
|
||||
|
||||
def __init__(self, vocab, patterns):
|
||||
self.vocab = vocab
|
||||
self.mem = Pool()
|
||||
self.vocab = vocab
|
||||
for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
|
||||
self.add(entity_key, etype, attrs, specs)
|
||||
cdef object _patterns
|
||||
|
||||
@classmethod
|
||||
def from_dir(cls, data_dir, Vocab vocab):
|
||||
|
@ -186,10 +180,22 @@ cdef class Matcher:
|
|||
else:
|
||||
return cls(vocab, {})
|
||||
|
||||
def __init__(self, vocab, patterns):
|
||||
self.vocab = vocab
|
||||
self.mem = Pool()
|
||||
self.vocab = vocab
|
||||
self._patterns = dict(patterns)
|
||||
for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
|
||||
self.add(entity_key, etype, attrs, specs)
|
||||
|
||||
def __reduce__(self):
|
||||
return (self.__class__, (self.vocab, self._patterns), None, None)
|
||||
|
||||
property n_patterns:
|
||||
def __get__(self): return self.patterns.size()
|
||||
|
||||
def add(self, entity_key, etype, attrs, specs):
|
||||
self._patterns[entity_key] = (etype, dict(attrs), list(specs))
|
||||
if isinstance(entity_key, basestring):
|
||||
entity_key = self.vocab.strings[entity_key]
|
||||
if isinstance(etype, basestring):
|
||||
|
|
|
@ -7,6 +7,7 @@ from .strings cimport StringStore
|
|||
from .typedefs cimport attr_t
|
||||
from .parts_of_speech cimport univ_pos_t
|
||||
|
||||
from . cimport symbols
|
||||
|
||||
cdef struct RichTagC:
|
||||
uint64_t morph
|
||||
|
@ -24,6 +25,7 @@ cdef class Morphology:
|
|||
cdef readonly Pool mem
|
||||
cdef readonly StringStore strings
|
||||
cdef public object lemmatizer
|
||||
cdef readonly object tag_map
|
||||
cdef public object n_tags
|
||||
cdef public object reverse_index
|
||||
cdef public object tag_names
|
||||
|
@ -36,720 +38,252 @@ cdef class Morphology:
|
|||
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
|
||||
|
||||
|
||||
cpdef enum univ_morph_t:
|
||||
NIL = 0
|
||||
Animacy_anim = symbols.Animacy_anim
|
||||
Animacy_inam
|
||||
Aspect_freq
|
||||
Aspect_imp
|
||||
Aspect_mod
|
||||
Aspect_none
|
||||
Aspect_perf
|
||||
Case_abe
|
||||
Case_abl
|
||||
Case_abs
|
||||
Case_acc
|
||||
Case_ade
|
||||
Case_all
|
||||
Case_cau
|
||||
Case_com
|
||||
Case_dat
|
||||
Case_del
|
||||
Case_dis
|
||||
Case_ela
|
||||
Case_ess
|
||||
Case_gen
|
||||
Case_ill
|
||||
Case_ine
|
||||
Case_ins
|
||||
Case_loc
|
||||
Case_lat
|
||||
Case_nom
|
||||
Case_par
|
||||
Case_sub
|
||||
Case_sup
|
||||
Case_tem
|
||||
Case_ter
|
||||
Case_tra
|
||||
Case_voc
|
||||
Definite_two
|
||||
Definite_def
|
||||
Definite_red
|
||||
Definite_ind
|
||||
Degree_cmp
|
||||
Degree_comp
|
||||
Degree_none
|
||||
Degree_pos
|
||||
Degree_sup
|
||||
Degree_abs
|
||||
Degree_com
|
||||
Degree_dim # du
|
||||
Gender_com
|
||||
Gender_fem
|
||||
Gender_masc
|
||||
Gender_neut
|
||||
Mood_cnd
|
||||
Mood_imp
|
||||
Mood_ind
|
||||
Mood_n
|
||||
Mood_pot
|
||||
Mood_sub
|
||||
Mood_opt
|
||||
Negative_neg
|
||||
Negative_pos
|
||||
Negative_yes
|
||||
Number_com
|
||||
Number_dual
|
||||
Number_none
|
||||
Number_plur
|
||||
Number_sing
|
||||
Number_ptan # bg
|
||||
Number_count # bg
|
||||
NumType_card
|
||||
NumType_dist
|
||||
NumType_frac
|
||||
NumType_gen
|
||||
NumType_mult
|
||||
NumType_none
|
||||
NumType_ord
|
||||
NumType_sets
|
||||
Person_one
|
||||
Person_two
|
||||
Person_three
|
||||
Person_none
|
||||
Poss_yes
|
||||
PronType_advPart
|
||||
PronType_art
|
||||
PronType_default
|
||||
PronType_dem
|
||||
PronType_ind
|
||||
PronType_int
|
||||
PronType_neg
|
||||
PronType_prs
|
||||
PronType_rcp
|
||||
PronType_rel
|
||||
PronType_tot
|
||||
PronType_clit
|
||||
PronType_exc # es, ca, it, fa
|
||||
Reflex_yes
|
||||
Tense_fut
|
||||
Tense_imp
|
||||
Tense_past
|
||||
Tense_pres
|
||||
VerbForm_fin
|
||||
VerbForm_ger
|
||||
VerbForm_inf
|
||||
VerbForm_none
|
||||
VerbForm_part
|
||||
VerbForm_partFut
|
||||
VerbForm_partPast
|
||||
VerbForm_partPres
|
||||
VerbForm_sup
|
||||
VerbForm_trans
|
||||
VerbForm_gdv # la
|
||||
Voice_act
|
||||
Voice_cau
|
||||
Voice_pass
|
||||
Voice_mid # gkc
|
||||
Voice_int # hb
|
||||
Abbr_yes # cz, fi, sl, U
|
||||
AdpType_prep # cz, U
|
||||
AdpType_post # U
|
||||
AdpType_voc # cz
|
||||
AdpType_comprep # cz
|
||||
AdpType_circ # U
|
||||
AdvType_man
|
||||
AdvType_loc
|
||||
AdvType_tim
|
||||
AdvType_deg
|
||||
AdvType_cau
|
||||
AdvType_mod
|
||||
AdvType_sta
|
||||
AdvType_ex
|
||||
AdvType_adadj
|
||||
ConjType_oper # cz, U
|
||||
ConjType_comp # cz, U
|
||||
Connegative_yes # fi
|
||||
Derivation_minen # fi
|
||||
Derivation_sti # fi
|
||||
Derivation_inen # fi
|
||||
Derivation_lainen # fi
|
||||
Derivation_ja # fi
|
||||
Derivation_ton # fi
|
||||
Derivation_vs # fi
|
||||
Derivation_ttain # fi
|
||||
Derivation_ttaa # fi
|
||||
Echo_rdp # U
|
||||
Echo_ech # U
|
||||
Foreign_foreign # cz, fi, U
|
||||
Foreign_fscript # cz, fi, U
|
||||
Foreign_tscript # cz, U
|
||||
Foreign_yes # sl
|
||||
Gender_dat_masc # bq, U
|
||||
Gender_dat_fem # bq, U
|
||||
Gender_erg_masc # bq
|
||||
Gender_erg_fem # bq
|
||||
Gender_psor_masc # cz, sl, U
|
||||
Gender_psor_fem # cz, sl, U
|
||||
Gender_psor_neut # sl
|
||||
Hyph_yes # cz, U
|
||||
InfForm_one # fi
|
||||
InfForm_two # fi
|
||||
InfForm_three # fi
|
||||
NameType_geo # U, cz
|
||||
NameType_prs # U, cz
|
||||
NameType_giv # U, cz
|
||||
NameType_sur # U, cz
|
||||
NameType_nat # U, cz
|
||||
NameType_com # U, cz
|
||||
NameType_pro # U, cz
|
||||
NameType_oth # U, cz
|
||||
NounType_com # U
|
||||
NounType_prop # U
|
||||
NounType_class # U
|
||||
Number_abs_sing # bq, U
|
||||
Number_abs_plur # bq, U
|
||||
Number_dat_sing # bq, U
|
||||
Number_dat_plur # bq, U
|
||||
Number_erg_sing # bq, U
|
||||
Number_erg_plur # bq, U
|
||||
Number_psee_sing # U
|
||||
Number_psee_plur # U
|
||||
Number_psor_sing # cz, fi, sl, U
|
||||
Number_psor_plur # cz, fi, sl, U
|
||||
NumForm_digit # cz, sl, U
|
||||
NumForm_roman # cz, sl, U
|
||||
NumForm_word # cz, sl, U
|
||||
NumValue_one # cz, U
|
||||
NumValue_two # cz, U
|
||||
NumValue_three # cz, U
|
||||
PartForm_pres # fi
|
||||
PartForm_past # fi
|
||||
PartForm_agt # fi
|
||||
PartForm_neg # fi
|
||||
PartType_mod # U
|
||||
PartType_emp # U
|
||||
PartType_res # U
|
||||
PartType_inf # U
|
||||
PartType_vbp # U
|
||||
Person_abs_one # bq, U
|
||||
Person_abs_two # bq, U
|
||||
Person_abs_three # bq, U
|
||||
Person_dat_one # bq, U
|
||||
Person_dat_two # bq, U
|
||||
Person_dat_three # bq, U
|
||||
Person_erg_one # bq, U
|
||||
Person_erg_two # bq, U
|
||||
Person_erg_three # bq, U
|
||||
Person_psor_one # fi, U
|
||||
Person_psor_two # fi, U
|
||||
Person_psor_three # fi, U
|
||||
Polite_inf # bq, U
|
||||
Polite_pol # bq, U
|
||||
Polite_abs_inf # bq, U
|
||||
Polite_abs_pol # bq, U
|
||||
Polite_erg_inf # bq, U
|
||||
Polite_erg_pol # bq, U
|
||||
Polite_dat_inf # bq, U
|
||||
Polite_dat_pol # bq, U
|
||||
Prefix_yes # U
|
||||
PrepCase_npr # cz
|
||||
PrepCase_pre # U
|
||||
PunctSide_ini # U
|
||||
PunctSide_fin # U
|
||||
PunctType_peri # U
|
||||
PunctType_qest # U
|
||||
PunctType_excl # U
|
||||
PunctType_quot # U
|
||||
PunctType_brck # U
|
||||
PunctType_comm # U
|
||||
PunctType_colo # U
|
||||
PunctType_semi # U
|
||||
PunctType_dash # U
|
||||
Style_arch # cz, fi, U
|
||||
Style_rare # cz, fi, U
|
||||
Style_poet # cz, U
|
||||
Style_norm # cz, U
|
||||
Style_coll # cz, U
|
||||
Style_vrnc # cz, U
|
||||
Style_sing # cz, U
|
||||
Style_expr # cz, U
|
||||
Style_derg # cz, U
|
||||
Style_vulg # cz, U
|
||||
Style_yes # fi, U
|
||||
StyleVariant_styleShort # cz
|
||||
StyleVariant_styleBound # cz, sl
|
||||
VerbType_aux # U
|
||||
VerbType_cop # U
|
||||
VerbType_mod # U
|
||||
VerbType_light # U
|
||||
|
||||
#
|
||||
#cpdef enum Feature_t:
|
||||
# Abbr
|
||||
# AdpType
|
||||
# AdvType
|
||||
# ConjType
|
||||
# Connegative
|
||||
# Derivation
|
||||
# Echo
|
||||
# Foreign
|
||||
# Gender_dat
|
||||
# Gender_erg
|
||||
# Gender_psor
|
||||
# Hyph
|
||||
# InfForm
|
||||
# NameType
|
||||
# NounType
|
||||
# NumberAbs
|
||||
# NumberDat
|
||||
# NumberErg
|
||||
# NumberPsee
|
||||
# NumberPsor
|
||||
# NumForm
|
||||
# NumValue
|
||||
# PartForm
|
||||
# PartType
|
||||
# Person_abs
|
||||
# Person_dat
|
||||
# Person_psor
|
||||
# Polite
|
||||
# Polite_abs
|
||||
# Polite_dat
|
||||
# Prefix
|
||||
# PrepCase
|
||||
# PunctSide
|
||||
# PunctType
|
||||
# Style
|
||||
# Typo
|
||||
# Variant
|
||||
# VerbType
|
||||
#
|
||||
#
|
||||
#cpdef enum Animacy:
|
||||
# Anim
|
||||
# Inam
|
||||
#
|
||||
#
|
||||
#cpdef enum Aspect:
|
||||
# Freq
|
||||
# Imp
|
||||
# Mod
|
||||
# None_
|
||||
# Perf
|
||||
#
|
||||
#
|
||||
#cpdef enum Case1:
|
||||
# Nom
|
||||
# Gen
|
||||
# Acc
|
||||
# Dat
|
||||
# Voc
|
||||
# Abl
|
||||
#
|
||||
#cdef enum Case2:
|
||||
# Abe
|
||||
# Abs
|
||||
# Ade
|
||||
# All
|
||||
# Cau
|
||||
# Com
|
||||
# Del
|
||||
# Dis
|
||||
#
|
||||
#cdef enum Case3:
|
||||
# Ela
|
||||
# Ess
|
||||
# Ill
|
||||
# Ine
|
||||
# Ins
|
||||
# Loc
|
||||
# Lat
|
||||
# Par
|
||||
#
|
||||
#cdef enum Case4:
|
||||
# Sub
|
||||
# Sup
|
||||
# Tem
|
||||
# Ter
|
||||
# Tra
|
||||
#
|
||||
#
|
||||
#cpdef enum Definite:
|
||||
# Two
|
||||
# Def
|
||||
# Red
|
||||
# Ind
|
||||
#
|
||||
#
|
||||
#cpdef enum Degree:
|
||||
# Cmp
|
||||
# Comp
|
||||
# None_
|
||||
# Pos
|
||||
# Sup
|
||||
# Abs
|
||||
# Com
|
||||
# Degree # du
|
||||
#
|
||||
#
|
||||
#cpdef enum Gender:
|
||||
# Com
|
||||
# Fem
|
||||
# Masc
|
||||
# Neut
|
||||
#
|
||||
#
|
||||
#cpdef enum Mood:
|
||||
# Cnd
|
||||
# Imp
|
||||
# Ind
|
||||
# N
|
||||
# Pot
|
||||
# Sub
|
||||
# Opt
|
||||
#
|
||||
#
|
||||
#cpdef enum Negative:
|
||||
# Neg
|
||||
# Pos
|
||||
# Yes
|
||||
#
|
||||
#
|
||||
#cpdef enum Number:
|
||||
# Com
|
||||
# Dual
|
||||
# None_
|
||||
# Plur
|
||||
# Sing
|
||||
# Ptan # bg
|
||||
# Count # bg
|
||||
#
|
||||
#
|
||||
#cpdef enum NumType:
|
||||
# Card
|
||||
# Dist
|
||||
# Frac
|
||||
# Gen
|
||||
# Mult
|
||||
# None_
|
||||
# Ord
|
||||
# Sets
|
||||
#
|
||||
#
|
||||
#cpdef enum Person:
|
||||
# One
|
||||
# Two
|
||||
# Three
|
||||
# None_
|
||||
#
|
||||
#
|
||||
#cpdef enum Poss:
|
||||
# Yes
|
||||
#
|
||||
#
|
||||
#cpdef enum PronType1:
|
||||
# AdvPart
|
||||
# Art
|
||||
# Default
|
||||
# Dem
|
||||
# Ind
|
||||
# Int
|
||||
# Neg
|
||||
#
|
||||
#cpdef enum PronType2:
|
||||
# Prs
|
||||
# Rcp
|
||||
# Rel
|
||||
# Tot
|
||||
# Clit
|
||||
# Exc # es, ca, it, fa
|
||||
# Clit # it
|
||||
#
|
||||
#
|
||||
#cpdef enum Reflex:
|
||||
# Yes
|
||||
#
|
||||
#
|
||||
#cpdef enum Tense:
|
||||
# Fut
|
||||
# Imp
|
||||
# Past
|
||||
# Pres
|
||||
#
|
||||
#cpdef enum VerbForm1:
|
||||
# Fin
|
||||
# Ger
|
||||
# Inf
|
||||
# None_
|
||||
# Part
|
||||
# PartFut
|
||||
# PartPast
|
||||
#
|
||||
#cpdef enum VerbForm2:
|
||||
# PartPres
|
||||
# Sup
|
||||
# Trans
|
||||
# Gdv # la
|
||||
#
|
||||
#
|
||||
#cpdef enum Voice:
|
||||
# Act
|
||||
# Cau
|
||||
# Pass
|
||||
# Mid # gkc
|
||||
# Int # hb
|
||||
#
|
||||
#
|
||||
#cpdef enum Abbr:
|
||||
# Yes # cz, fi, sl, U
|
||||
#
|
||||
#cpdef enum AdpType:
|
||||
# Prep # cz, U
|
||||
# Post # U
|
||||
# Voc # cz
|
||||
# Comprep # cz
|
||||
# Circ # U
|
||||
# Voc # U
|
||||
#
|
||||
#
|
||||
#cpdef enum AdvType1:
|
||||
# # U
|
||||
# Man
|
||||
# Loc
|
||||
# Tim
|
||||
# Deg
|
||||
# Cau
|
||||
# Mod
|
||||
# Sta
|
||||
# Ex
|
||||
#
|
||||
#cpdef enum AdvType2:
|
||||
# Adadj
|
||||
#
|
||||
#cpdef enum ConjType:
|
||||
# Oper # cz, U
|
||||
# Comp # cz, U
|
||||
#
|
||||
#cpdef enum Connegative:
|
||||
# Yes # fi
|
||||
#
|
||||
#
|
||||
#cpdef enum Derivation1:
|
||||
# Minen # fi
|
||||
# Sti # fi
|
||||
# Inen # fi
|
||||
# Lainen # fi
|
||||
# Ja # fi
|
||||
# Ton # fi
|
||||
# Vs # fi
|
||||
# Ttain # fi
|
||||
#
|
||||
#cpdef enum Derivation2:
|
||||
# Ttaa
|
||||
#
|
||||
#
|
||||
#cpdef enum Echo:
|
||||
# Rdp # U
|
||||
# Ech # U
|
||||
#
|
||||
#
|
||||
#cpdef enum Foreign:
|
||||
# Foreign # cz, fi, U
|
||||
# Fscript # cz, fi, U
|
||||
# Tscript # cz, U
|
||||
# Yes # sl
|
||||
#
|
||||
#
|
||||
#cpdef enum Gender_dat:
|
||||
# Masc # bq, U
|
||||
# Fem # bq, U
|
||||
#
|
||||
#
|
||||
#cpdef enum Gender_erg:
|
||||
# Masc # bq
|
||||
# Fem # bq
|
||||
#
|
||||
#
|
||||
#cpdef enum Gender_psor:
|
||||
# Masc # cz, sl, U
|
||||
# Fem # cz, sl, U
|
||||
# Neut # sl
|
||||
#
|
||||
#
|
||||
#cpdef enum Hyph:
|
||||
# Yes # cz, U
|
||||
#
|
||||
#
|
||||
#cpdef enum InfForm:
|
||||
# One # fi
|
||||
# Two # fi
|
||||
# Three # fi
|
||||
#
|
||||
#
|
||||
#cpdef enum NameType:
|
||||
# Geo # U, cz
|
||||
# Prs # U, cz
|
||||
# Giv # U, cz
|
||||
# Sur # U, cz
|
||||
# Nat # U, cz
|
||||
# Com # U, cz
|
||||
# Pro # U, cz
|
||||
# Oth # U, cz
|
||||
#
|
||||
#
|
||||
#cpdef enum NounType:
|
||||
# Com # U
|
||||
# Prop # U
|
||||
# Class # U
|
||||
#
|
||||
#cpdef enum Number_abs:
|
||||
# Sing # bq, U
|
||||
# Plur # bq, U
|
||||
#
|
||||
#cpdef enum Number_dat:
|
||||
# Sing # bq, U
|
||||
# Plur # bq, U
|
||||
#
|
||||
#cpdef enum Number_erg:
|
||||
# Sing # bq, U
|
||||
# Plur # bq, U
|
||||
#
|
||||
#cpdef enum Number_psee:
|
||||
# Sing # U
|
||||
# Plur # U
|
||||
#
|
||||
#
|
||||
#cpdef enum Number_psor:
|
||||
# Sing # cz, fi, sl, U
|
||||
# Plur # cz, fi, sl, U
|
||||
#
|
||||
#
|
||||
#cpdef enum NumForm:
|
||||
# Digit # cz, sl, U
|
||||
# Roman # cz, sl, U
|
||||
# Word # cz, sl, U
|
||||
#
|
||||
#
|
||||
#cpdef enum NumValue:
|
||||
# One # cz, U
|
||||
# Two # cz, U
|
||||
# Three # cz, U
|
||||
#
|
||||
#
|
||||
#cpdef enum PartForm:
|
||||
# Pres # fi
|
||||
# Past # fi
|
||||
# Agt # fi
|
||||
# Neg # fi
|
||||
#
|
||||
#
|
||||
#cpdef enum PartType:
|
||||
# Mod # U
|
||||
# Emp # U
|
||||
# Res # U
|
||||
# Inf # U
|
||||
# Vbp # U
|
||||
#
|
||||
#cpdef enum Person_abs:
|
||||
# One # bq, U
|
||||
# Two # bq, U
|
||||
# Three # bq, U
|
||||
#
|
||||
#
|
||||
#cpdef enum Person_dat:
|
||||
# One # bq, U
|
||||
# Two # bq, U
|
||||
# Three # bq, U
|
||||
#
|
||||
#
|
||||
#cpdef enum Person_erg:
|
||||
# One # bq, U
|
||||
# Two # bq, U
|
||||
# Three # bq, U
|
||||
#
|
||||
#
|
||||
#cpdef enum Person_psor:
|
||||
# One # fi, U
|
||||
# Two # fi, U
|
||||
# Three # fi, U
|
||||
#
|
||||
#
|
||||
#cpdef enum Polite:
|
||||
# Inf # bq, U
|
||||
# Pol # bq, U
|
||||
#
|
||||
#
|
||||
#cpdef enum Polite_abs:
|
||||
# Inf # bq, U
|
||||
# Pol # bq, U
|
||||
#
|
||||
#
|
||||
#cpdef enum Polite_erg:
|
||||
# Inf # bq, U
|
||||
# Pol # bq, U
|
||||
#
|
||||
#
|
||||
#cpdef enum Polite_dat:
|
||||
# Inf # bq, U
|
||||
# Pol # bq, U
|
||||
#
|
||||
#
|
||||
#cpdef enum Prefix:
|
||||
# Yes # U
|
||||
#
|
||||
#
|
||||
#cpdef enum PrepCase:
|
||||
# Npr # cz
|
||||
# Pre # U
|
||||
#
|
||||
#
|
||||
#cpdef enum PunctSide:
|
||||
# Ini # U
|
||||
# Fin # U
|
||||
#
|
||||
#cpdef enum PunctType1:
|
||||
# Peri # U
|
||||
# Qest # U
|
||||
# Excl # U
|
||||
# Quot # U
|
||||
# Brck # U
|
||||
# Comm # U
|
||||
# Colo # U
|
||||
# Semi # U
|
||||
#
|
||||
#cpdef enum PunctType2:
|
||||
# Dash # U
|
||||
#
|
||||
#
|
||||
#cpdef enum Style1:
|
||||
# Arch # cz, fi, U
|
||||
# Rare # cz, fi, U
|
||||
# Poet # cz, U
|
||||
# Norm # cz, U
|
||||
# Coll # cz, U
|
||||
# Vrnc # cz, U
|
||||
# Sing # cz, U
|
||||
# Expr # cz, U
|
||||
#
|
||||
#
|
||||
#cpdef enum Style2:
|
||||
# Derg # cz, U
|
||||
# Vulg # cz, U
|
||||
#
|
||||
#
|
||||
#cpdef enum Typo:
|
||||
# Yes # fi, U
|
||||
#
|
||||
#
|
||||
#cpdef enum Variant:
|
||||
# Short # cz
|
||||
# Bound # cz, sl
|
||||
#
|
||||
#
|
||||
#cpdef enum VerbType:
|
||||
# Aux # U
|
||||
# Cop # U
|
||||
# Mod # U
|
||||
# Light # U
|
||||
#
|
||||
|
||||
cpdef enum Value_t:
|
||||
Animacy_Anim
|
||||
Animacy_Inam
|
||||
Aspect_Freq
|
||||
Aspect_Imp
|
||||
Aspect_Mod
|
||||
Aspect_None_
|
||||
Aspect_Perf
|
||||
Case_Abe
|
||||
Case_Abl
|
||||
Case_Abs
|
||||
Case_Acc
|
||||
Case_Ade
|
||||
Case_All
|
||||
Case_Cau
|
||||
Case_Com
|
||||
Case_Dat
|
||||
Case_Del
|
||||
Case_Dis
|
||||
Case_Ela
|
||||
Case_Ess
|
||||
Case_Gen
|
||||
Case_Ill
|
||||
Case_Ine
|
||||
Case_Ins
|
||||
Case_Loc
|
||||
Case_Lat
|
||||
Case_Nom
|
||||
Case_Par
|
||||
Case_Sub
|
||||
Case_Sup
|
||||
Case_Tem
|
||||
Case_Ter
|
||||
Case_Tra
|
||||
Case_Voc
|
||||
Definite_Two
|
||||
Definite_Def
|
||||
Definite_Red
|
||||
Definite_Ind
|
||||
Degree_Cmp
|
||||
Degree_Comp
|
||||
Degree_None
|
||||
Degree_Pos
|
||||
Degree_Sup
|
||||
Degree_Abs
|
||||
Degree_Com
|
||||
Degree_Dim # du
|
||||
Gender_Com
|
||||
Gender_Fem
|
||||
Gender_Masc
|
||||
Gender_Neut
|
||||
Mood_Cnd
|
||||
Mood_Imp
|
||||
Mood_Ind
|
||||
Mood_N
|
||||
Mood_Pot
|
||||
Mood_Sub
|
||||
Mood_Opt
|
||||
Negative_Neg
|
||||
Negative_Pos
|
||||
Negative_Yes
|
||||
Number_Com
|
||||
Number_Dual
|
||||
Number_None
|
||||
Number_Plur
|
||||
Number_Sing
|
||||
Number_Ptan # bg
|
||||
Number_Count # bg
|
||||
NumType_Card
|
||||
NumType_Dist
|
||||
NumType_Frac
|
||||
NumType_Gen
|
||||
NumType_Mult
|
||||
NumType_None
|
||||
NumType_Ord
|
||||
NumType_Sets
|
||||
Person_One
|
||||
Person_Two
|
||||
Person_Three
|
||||
Person_None
|
||||
Poss_Yes
|
||||
PronType_AdvPart
|
||||
PronType_Art
|
||||
PronType_Default
|
||||
PronType_Dem
|
||||
PronType_Ind
|
||||
PronType_Int
|
||||
PronType_Neg
|
||||
PronType_Prs
|
||||
PronType_Rcp
|
||||
PronType_Rel
|
||||
PronType_Tot
|
||||
PronType_Clit
|
||||
PronType_Exc # es, ca, it, fa
|
||||
Reflex_Yes
|
||||
Tense_Fut
|
||||
Tense_Imp
|
||||
Tense_Past
|
||||
Tense_Pres
|
||||
VerbForm_Fin
|
||||
VerbForm_Ger
|
||||
VerbForm_Inf
|
||||
VerbForm_None
|
||||
VerbForm_Part
|
||||
VerbForm_PartFut
|
||||
VerbForm_PartPast
|
||||
VerbForm_PartPres
|
||||
VerbForm_Sup
|
||||
VerbForm_Trans
|
||||
VerbForm_Gdv # la
|
||||
Voice_Act
|
||||
Voice_Cau
|
||||
Voice_Pass
|
||||
Voice_Mid # gkc
|
||||
Voice_Int # hb
|
||||
Abbr_Yes # cz, fi, sl, U
|
||||
AdpType_Prep # cz, U
|
||||
AdpType_Post # U
|
||||
AdpType_Voc # cz
|
||||
AdpType_Comprep # cz
|
||||
AdpType_Circ # U
|
||||
AdvType_Man
|
||||
AdvType_Loc
|
||||
AdvType_Tim
|
||||
AdvType_Deg
|
||||
AdvType_Cau
|
||||
AdvType_Mod
|
||||
AdvType_Sta
|
||||
AdvType_Ex
|
||||
AdvType_Adadj
|
||||
ConjType_Oper # cz, U
|
||||
ConjType_Comp # cz, U
|
||||
Connegative_Yes # fi
|
||||
Derivation_Minen # fi
|
||||
Derivation_Sti # fi
|
||||
Derivation_Inen # fi
|
||||
Derivation_Lainen # fi
|
||||
Derivation_Ja # fi
|
||||
Derivation_Ton # fi
|
||||
Derivation_Vs # fi
|
||||
Derivation_Ttain # fi
|
||||
Derivation_Ttaa # fi
|
||||
Echo_Rdp # U
|
||||
Echo_Ech # U
|
||||
Foreign_Foreign # cz, fi, U
|
||||
Foreign_Fscript # cz, fi, U
|
||||
Foreign_Tscript # cz, U
|
||||
Foreign_Yes # sl
|
||||
Gender_dat_Masc # bq, U
|
||||
Gender_dat_Fem # bq, U
|
||||
Gender_erg_Masc # bq
|
||||
Gender_erg_Fem # bq
|
||||
Gender_psor_Masc # cz, sl, U
|
||||
Gender_psor_Fem # cz, sl, U
|
||||
Gender_psor_Neut # sl
|
||||
Hyph_Yes # cz, U
|
||||
InfForm_One # fi
|
||||
InfForm_Two # fi
|
||||
InfForm_Three # fi
|
||||
NameType_Geo # U, cz
|
||||
NameType_Prs # U, cz
|
||||
NameType_Giv # U, cz
|
||||
NameType_Sur # U, cz
|
||||
NameType_Nat # U, cz
|
||||
NameType_Com # U, cz
|
||||
NameType_Pro # U, cz
|
||||
NameType_Oth # U, cz
|
||||
NounType_Com # U
|
||||
NounType_Prop # U
|
||||
NounType_Class # U
|
||||
Number_abs_Sing # bq, U
|
||||
Number_abs_Plur # bq, U
|
||||
Number_dat_Sing # bq, U
|
||||
Number_dat_Plur # bq, U
|
||||
Number_erg_Sing # bq, U
|
||||
Number_erg_Plur # bq, U
|
||||
Number_psee_Sing # U
|
||||
Number_psee_Plur # U
|
||||
Number_psor_Sing # cz, fi, sl, U
|
||||
Number_psor_Plur # cz, fi, sl, U
|
||||
NumForm_Digit # cz, sl, U
|
||||
NumForm_Roman # cz, sl, U
|
||||
NumForm_Word # cz, sl, U
|
||||
NumValue_One # cz, U
|
||||
NumValue_Two # cz, U
|
||||
NumValue_Three # cz, U
|
||||
PartForm_Pres # fi
|
||||
PartForm_Past # fi
|
||||
PartForm_Agt # fi
|
||||
PartForm_Neg # fi
|
||||
PartType_Mod # U
|
||||
PartType_Emp # U
|
||||
PartType_Res # U
|
||||
PartType_Inf # U
|
||||
PartType_Vbp # U
|
||||
Person_abs_One # bq, U
|
||||
Person_abs_Two # bq, U
|
||||
Person_abs_Three # bq, U
|
||||
Person_dat_One # bq, U
|
||||
Person_dat_Two # bq, U
|
||||
Person_dat_Three # bq, U
|
||||
Person_erg_One # bq, U
|
||||
Person_erg_Two # bq, U
|
||||
Person_erg_Three # bq, U
|
||||
Person_psor_One # fi, U
|
||||
Person_psor_Two # fi, U
|
||||
Person_psor_Three # fi, U
|
||||
Polite_Inf # bq, U
|
||||
Polite_Pol # bq, U
|
||||
Polite_abs_Inf # bq, U
|
||||
Polite_abs_Pol # bq, U
|
||||
Polite_erg_Inf # bq, U
|
||||
Polite_erg_Pol # bq, U
|
||||
Polite_dat_Inf # bq, U
|
||||
Polite_dat_Pol # bq, U
|
||||
Prefix_Yes # U
|
||||
PrepCase_Npr # cz
|
||||
PrepCase_Pre # U
|
||||
PunctSide_Ini # U
|
||||
PunctSide_Fin # U
|
||||
PunctType_Peri # U
|
||||
PunctType_Qest # U
|
||||
PunctType_Excl # U
|
||||
PunctType_Quot # U
|
||||
PunctType_Brck # U
|
||||
PunctType_Comm # U
|
||||
PunctType_Colo # U
|
||||
PunctType_Semi # U
|
||||
PunctType_Dash # U
|
||||
Style_Arch # cz, fi, U
|
||||
Style_Rare # cz, fi, U
|
||||
Style_Poet # cz, U
|
||||
Style_Norm # cz, U
|
||||
Style_Coll # cz, U
|
||||
Style_Vrnc # cz, U
|
||||
Style_Sing # cz, U
|
||||
Style_Expr # cz, U
|
||||
Style_Derg # cz, U
|
||||
Style_Vulg # cz, U
|
||||
Style_Yes # fi, U
|
||||
StyleVariant_StyleShort # cz
|
||||
StyleVariant_StyleBound # cz, sl
|
||||
VerbType_Aux # U
|
||||
VerbType_Cop # U
|
||||
VerbType_Mod # U
|
||||
VerbType_Light # U
|
||||
|
|
|
@ -6,7 +6,7 @@ try:
|
|||
except ImportError:
|
||||
import json
|
||||
|
||||
from .parts_of_speech import UNIV_POS_NAMES
|
||||
from .parts_of_speech import IDS as POS_IDS
|
||||
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
|
||||
|
||||
|
||||
|
@ -14,6 +14,7 @@ cdef class Morphology:
|
|||
def __init__(self, StringStore string_store, tag_map, lemmatizer):
|
||||
self.mem = Pool()
|
||||
self.strings = string_store
|
||||
self.tag_map = tag_map
|
||||
self.lemmatizer = lemmatizer
|
||||
self.n_tags = len(tag_map) + 1
|
||||
self.tag_names = tuple(sorted(tag_map.keys()))
|
||||
|
@ -24,10 +25,13 @@ cdef class Morphology:
|
|||
self.rich_tags[i].id = i
|
||||
self.rich_tags[i].name = self.strings[tag_str]
|
||||
self.rich_tags[i].morph = 0
|
||||
self.rich_tags[i].pos = UNIV_POS_NAMES[props['pos'].upper()]
|
||||
self.rich_tags[i].pos = POS_IDS[props['pos'].upper()]
|
||||
self.reverse_index[self.rich_tags[i].name] = i
|
||||
self._cache = PreshMapArray(self.n_tags)
|
||||
|
||||
def __reduce__(self):
|
||||
return (Morphology, (self.strings, self.tag_map, self.lemmatizer), None, None)
|
||||
|
||||
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
||||
cdef int tag_id
|
||||
if isinstance(tag, basestring):
|
||||
|
@ -89,3 +93,254 @@ cdef class Morphology:
|
|||
lemma_string = sorted(lemma_strings)[0]
|
||||
lemma = self.strings[lemma_string]
|
||||
return lemma
|
||||
|
||||
IDS = {
|
||||
"Animacy_anim": Animacy_anim,
|
||||
"Animacy_inam": Animacy_inam,
|
||||
"Aspect_freq": Aspect_freq,
|
||||
"Aspect_imp": Aspect_imp,
|
||||
"Aspect_mod": Aspect_mod,
|
||||
"Aspect_none": Aspect_none,
|
||||
"Aspect_perf": Aspect_perf,
|
||||
"Case_abe": Case_abe,
|
||||
"Case_abl": Case_abl,
|
||||
"Case_abs": Case_abs,
|
||||
"Case_acc": Case_acc,
|
||||
"Case_ade": Case_ade,
|
||||
"Case_all": Case_all,
|
||||
"Case_cau": Case_cau,
|
||||
"Case_com": Case_com,
|
||||
"Case_dat": Case_dat,
|
||||
"Case_del": Case_del,
|
||||
"Case_dis": Case_dis,
|
||||
"Case_ela": Case_ela,
|
||||
"Case_ess": Case_ess,
|
||||
"Case_gen": Case_gen,
|
||||
"Case_ill": Case_ill,
|
||||
"Case_ine": Case_ine,
|
||||
"Case_ins": Case_ins,
|
||||
"Case_loc": Case_loc,
|
||||
"Case_lat": Case_lat,
|
||||
"Case_nom": Case_nom,
|
||||
"Case_par": Case_par,
|
||||
"Case_sub": Case_sub,
|
||||
"Case_sup": Case_sup,
|
||||
"Case_tem": Case_tem,
|
||||
"Case_ter": Case_ter,
|
||||
"Case_tra": Case_tra,
|
||||
"Case_voc": Case_voc,
|
||||
"Definite_two": Definite_two,
|
||||
"Definite_def": Definite_def,
|
||||
"Definite_red": Definite_red,
|
||||
"Definite_ind": Definite_ind,
|
||||
"Degree_cmp": Degree_cmp,
|
||||
"Degree_comp": Degree_comp,
|
||||
"Degree_none": Degree_none,
|
||||
"Degree_pos": Degree_pos,
|
||||
"Degree_sup": Degree_sup,
|
||||
"Degree_abs": Degree_abs,
|
||||
"Degree_com": Degree_com,
|
||||
"Degree_dim ": Degree_dim, # du
|
||||
"Gender_com": Gender_com,
|
||||
"Gender_fem": Gender_fem,
|
||||
"Gender_masc": Gender_masc,
|
||||
"Gender_neut": Gender_neut,
|
||||
"Mood_cnd": Mood_cnd,
|
||||
"Mood_imp": Mood_imp,
|
||||
"Mood_ind": Mood_ind,
|
||||
"Mood_n": Mood_n,
|
||||
"Mood_pot": Mood_pot,
|
||||
"Mood_sub": Mood_sub,
|
||||
"Mood_opt": Mood_opt,
|
||||
"Negative_neg": Negative_neg,
|
||||
"Negative_pos": Negative_pos,
|
||||
"Negative_yes": Negative_yes,
|
||||
"Number_com": Number_com,
|
||||
"Number_dual": Number_dual,
|
||||
"Number_none": Number_none,
|
||||
"Number_plur": Number_plur,
|
||||
"Number_sing": Number_sing,
|
||||
"Number_ptan ": Number_ptan, # bg
|
||||
"Number_count ": Number_count, # bg
|
||||
"NumType_card": NumType_card,
|
||||
"NumType_dist": NumType_dist,
|
||||
"NumType_frac": NumType_frac,
|
||||
"NumType_gen": NumType_gen,
|
||||
"NumType_mult": NumType_mult,
|
||||
"NumType_none": NumType_none,
|
||||
"NumType_ord": NumType_ord,
|
||||
"NumType_sets": NumType_sets,
|
||||
"Person_one": Person_one,
|
||||
"Person_two": Person_two,
|
||||
"Person_three": Person_three,
|
||||
"Person_none": Person_none,
|
||||
"Poss_yes": Poss_yes,
|
||||
"PronType_advPart": PronType_advPart,
|
||||
"PronType_art": PronType_art,
|
||||
"PronType_default": PronType_default,
|
||||
"PronType_dem": PronType_dem,
|
||||
"PronType_ind": PronType_ind,
|
||||
"PronType_int": PronType_int,
|
||||
"PronType_neg": PronType_neg,
|
||||
"PronType_prs": PronType_prs,
|
||||
"PronType_rcp": PronType_rcp,
|
||||
"PronType_rel": PronType_rel,
|
||||
"PronType_tot": PronType_tot,
|
||||
"PronType_clit": PronType_clit,
|
||||
"PronType_exc ": PronType_exc, # es, ca, it, fa,
|
||||
"Reflex_yes": Reflex_yes,
|
||||
"Tense_fut": Tense_fut,
|
||||
"Tense_imp": Tense_imp,
|
||||
"Tense_past": Tense_past,
|
||||
"Tense_pres": Tense_pres,
|
||||
"VerbForm_fin": VerbForm_fin,
|
||||
"VerbForm_ger": VerbForm_ger,
|
||||
"VerbForm_inf": VerbForm_inf,
|
||||
"VerbForm_none": VerbForm_none,
|
||||
"VerbForm_part": VerbForm_part,
|
||||
"VerbForm_partFut": VerbForm_partFut,
|
||||
"VerbForm_partPast": VerbForm_partPast,
|
||||
"VerbForm_partPres": VerbForm_partPres,
|
||||
"VerbForm_sup": VerbForm_sup,
|
||||
"VerbForm_trans": VerbForm_trans,
|
||||
"VerbForm_gdv ": VerbForm_gdv, # la,
|
||||
"Voice_act": Voice_act,
|
||||
"Voice_cau": Voice_cau,
|
||||
"Voice_pass": Voice_pass,
|
||||
"Voice_mid ": Voice_mid, # gkc,
|
||||
"Voice_int ": Voice_int, # hb,
|
||||
"Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
|
||||
"AdpType_prep ": AdpType_prep, # cz, U,
|
||||
"AdpType_post ": AdpType_post, # U,
|
||||
"AdpType_voc ": AdpType_voc, # cz,
|
||||
"AdpType_comprep ": AdpType_comprep, # cz,
|
||||
"AdpType_circ ": AdpType_circ, # U,
|
||||
"AdvType_man": AdvType_man,
|
||||
"AdvType_loc": AdvType_loc,
|
||||
"AdvType_tim": AdvType_tim,
|
||||
"AdvType_deg": AdvType_deg,
|
||||
"AdvType_cau": AdvType_cau,
|
||||
"AdvType_mod": AdvType_mod,
|
||||
"AdvType_sta": AdvType_sta,
|
||||
"AdvType_ex": AdvType_ex,
|
||||
"AdvType_adadj": AdvType_adadj,
|
||||
"ConjType_oper ": ConjType_oper, # cz, U,
|
||||
"ConjType_comp ": ConjType_comp, # cz, U,
|
||||
"Connegative_yes ": Connegative_yes, # fi,
|
||||
"Derivation_minen ": Derivation_minen, # fi,
|
||||
"Derivation_sti ": Derivation_sti, # fi,
|
||||
"Derivation_inen ": Derivation_inen, # fi,
|
||||
"Derivation_lainen ": Derivation_lainen, # fi,
|
||||
"Derivation_ja ": Derivation_ja, # fi,
|
||||
"Derivation_ton ": Derivation_ton, # fi,
|
||||
"Derivation_vs ": Derivation_vs, # fi,
|
||||
"Derivation_ttain ": Derivation_ttain, # fi,
|
||||
"Derivation_ttaa ": Derivation_ttaa, # fi,
|
||||
"Echo_rdp ": Echo_rdp, # U,
|
||||
"Echo_ech ": Echo_ech, # U,
|
||||
"Foreign_foreign ": Foreign_foreign, # cz, fi, U,
|
||||
"Foreign_fscript ": Foreign_fscript, # cz, fi, U,
|
||||
"Foreign_tscript ": Foreign_tscript, # cz, U,
|
||||
"Foreign_yes ": Foreign_yes, # sl,
|
||||
"Gender_dat_masc ": Gender_dat_masc, # bq, U,
|
||||
"Gender_dat_fem ": Gender_dat_fem, # bq, U,
|
||||
"Gender_erg_masc ": Gender_erg_masc, # bq,
|
||||
"Gender_erg_fem ": Gender_erg_fem, # bq,
|
||||
"Gender_psor_masc ": Gender_psor_masc, # cz, sl, U,
|
||||
"Gender_psor_fem ": Gender_psor_fem, # cz, sl, U,
|
||||
"Gender_psor_neut ": Gender_psor_neut, # sl,
|
||||
"Hyph_yes ": Hyph_yes, # cz, U,
|
||||
"InfForm_one ": InfForm_one, # fi,
|
||||
"InfForm_two ": InfForm_two, # fi,
|
||||
"InfForm_three ": InfForm_three, # fi,
|
||||
"NameType_geo ": NameType_geo, # U, cz,
|
||||
"NameType_prs ": NameType_prs, # U, cz,
|
||||
"NameType_giv ": NameType_giv, # U, cz,
|
||||
"NameType_sur ": NameType_sur, # U, cz,
|
||||
"NameType_nat ": NameType_nat, # U, cz,
|
||||
"NameType_com ": NameType_com, # U, cz,
|
||||
"NameType_pro ": NameType_pro, # U, cz,
|
||||
"NameType_oth ": NameType_oth, # U, cz,
|
||||
"NounType_com ": NounType_com, # U,
|
||||
"NounType_prop ": NounType_prop, # U,
|
||||
"NounType_class ": NounType_class, # U,
|
||||
"Number_abs_sing ": Number_abs_sing, # bq, U,
|
||||
"Number_abs_plur ": Number_abs_plur, # bq, U,
|
||||
"Number_dat_sing ": Number_dat_sing, # bq, U,
|
||||
"Number_dat_plur ": Number_dat_plur, # bq, U,
|
||||
"Number_erg_sing ": Number_erg_sing, # bq, U,
|
||||
"Number_erg_plur ": Number_erg_plur, # bq, U,
|
||||
"Number_psee_sing ": Number_psee_sing, # U,
|
||||
"Number_psee_plur ": Number_psee_plur, # U,
|
||||
"Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
|
||||
"Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
|
||||
"NumForm_digit ": NumForm_digit, # cz, sl, U,
|
||||
"NumForm_roman ": NumForm_roman, # cz, sl, U,
|
||||
"NumForm_word ": NumForm_word, # cz, sl, U,
|
||||
"NumValue_one ": NumValue_one, # cz, U,
|
||||
"NumValue_two ": NumValue_two, # cz, U,
|
||||
"NumValue_three ": NumValue_three, # cz, U,
|
||||
"PartForm_pres ": PartForm_pres, # fi,
|
||||
"PartForm_past ": PartForm_past, # fi,
|
||||
"PartForm_agt ": PartForm_agt, # fi,
|
||||
"PartForm_neg ": PartForm_neg, # fi,
|
||||
"PartType_mod ": PartType_mod, # U,
|
||||
"PartType_emp ": PartType_emp, # U,
|
||||
"PartType_res ": PartType_res, # U,
|
||||
"PartType_inf ": PartType_inf, # U,
|
||||
"PartType_vbp ": PartType_vbp, # U,
|
||||
"Person_abs_one ": Person_abs_one, # bq, U,
|
||||
"Person_abs_two ": Person_abs_two, # bq, U,
|
||||
"Person_abs_three ": Person_abs_three, # bq, U,
|
||||
"Person_dat_one ": Person_dat_one, # bq, U,
|
||||
"Person_dat_two ": Person_dat_two, # bq, U,
|
||||
"Person_dat_three ": Person_dat_three, # bq, U,
|
||||
"Person_erg_one ": Person_erg_one, # bq, U,
|
||||
"Person_erg_two ": Person_erg_two, # bq, U,
|
||||
"Person_erg_three ": Person_erg_three, # bq, U,
|
||||
"Person_psor_one ": Person_psor_one, # fi, U,
|
||||
"Person_psor_two ": Person_psor_two, # fi, U,
|
||||
"Person_psor_three ": Person_psor_three, # fi, U,
|
||||
"Polite_inf ": Polite_inf, # bq, U,
|
||||
"Polite_pol ": Polite_pol, # bq, U,
|
||||
"Polite_abs_inf ": Polite_abs_inf, # bq, U,
|
||||
"Polite_abs_pol ": Polite_abs_pol, # bq, U,
|
||||
"Polite_erg_inf ": Polite_erg_inf, # bq, U,
|
||||
"Polite_erg_pol ": Polite_erg_pol, # bq, U,
|
||||
"Polite_dat_inf ": Polite_dat_inf, # bq, U,
|
||||
"Polite_dat_pol ": Polite_dat_pol, # bq, U,
|
||||
"Prefix_yes ": Prefix_yes, # U,
|
||||
"PrepCase_npr ": PrepCase_npr, # cz,
|
||||
"PrepCase_pre ": PrepCase_pre, # U,
|
||||
"PunctSide_ini ": PunctSide_ini, # U,
|
||||
"PunctSide_fin ": PunctSide_fin, # U,
|
||||
"PunctType_peri ": PunctType_peri, # U,
|
||||
"PunctType_qest ": PunctType_qest, # U,
|
||||
"PunctType_excl ": PunctType_excl, # U,
|
||||
"PunctType_quot ": PunctType_quot, # U,
|
||||
"PunctType_brck ": PunctType_brck, # U,
|
||||
"PunctType_comm ": PunctType_comm, # U,
|
||||
"PunctType_colo ": PunctType_colo, # U,
|
||||
"PunctType_semi ": PunctType_semi, # U,
|
||||
"PunctType_dash ": PunctType_dash, # U,
|
||||
"Style_arch ": Style_arch, # cz, fi, U,
|
||||
"Style_rare ": Style_rare, # cz, fi, U,
|
||||
"Style_poet ": Style_poet, # cz, U,
|
||||
"Style_norm ": Style_norm, # cz, U,
|
||||
"Style_coll ": Style_coll, # cz, U,
|
||||
"Style_vrnc ": Style_vrnc, # cz, U,
|
||||
"Style_sing ": Style_sing, # cz, U,
|
||||
"Style_expr ": Style_expr, # cz, U,
|
||||
"Style_derg ": Style_derg, # cz, U,
|
||||
"Style_vulg ": Style_vulg, # cz, U,
|
||||
"Style_yes ": Style_yes, # fi, U,
|
||||
"StyleVariant_styleShort ": StyleVariant_styleShort, # cz,
|
||||
"StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl,
|
||||
"VerbType_aux ": VerbType_aux, # U,
|
||||
"VerbType_cop ": VerbType_cop, # U,
|
||||
"VerbType_mod ": VerbType_mod, # U,
|
||||
"VerbType_light ": VerbType_light, # U,
|
||||
}
|
||||
|
||||
|
||||
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
# Google universal tag set
|
||||
from . cimport symbols
|
||||
|
||||
cpdef enum univ_pos_t:
|
||||
NO_TAG
|
||||
ADJ
|
||||
NO_TAG = 0
|
||||
ADJ = symbols.ADJ
|
||||
ADP
|
||||
ADV
|
||||
AUX
|
||||
|
@ -20,4 +21,3 @@ cpdef enum univ_pos_t:
|
|||
X
|
||||
EOL
|
||||
SPACE
|
||||
N_UNIV_TAGS
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
UNIV_POS_NAMES = {
|
||||
"NO_TAG": NO_TAG,
|
||||
IDS = {
|
||||
"": NO_TAG,
|
||||
"ADJ": ADJ,
|
||||
"ADP": ADP,
|
||||
"ADV": ADV,
|
||||
|
@ -23,3 +23,6 @@ UNIV_POS_NAMES = {
|
|||
"EOL": EOL,
|
||||
"SPACE": SPACE
|
||||
}
|
||||
|
||||
|
||||
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
|
||||
|
|
|
@ -69,12 +69,15 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except
|
|||
|
||||
cdef class StringStore:
|
||||
'''Map strings to and from integer IDs.'''
|
||||
def __init__(self):
|
||||
def __init__(self, strings=None):
|
||||
self.mem = Pool()
|
||||
self._map = PreshMap()
|
||||
self._resize_at = 10000
|
||||
self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
|
||||
self.size = 1
|
||||
if strings is not None:
|
||||
for string in strings:
|
||||
_ = self[string]
|
||||
|
||||
property size:
|
||||
def __get__(self):
|
||||
|
@ -113,6 +116,14 @@ cdef class StringStore:
|
|||
for i in range(self.size):
|
||||
yield self[i]
|
||||
|
||||
def __reduce__(self):
|
||||
strings = [""]
|
||||
for i in range(1, self.size):
|
||||
string = &self.c[i]
|
||||
py_string = _decode(string)
|
||||
strings.append(py_string)
|
||||
return (StringStore, (strings,), None, None, None)
|
||||
|
||||
cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL:
|
||||
# 0 means missing, but we don't bother offsetting the index.
|
||||
key = hash64(chars, length * sizeof(char), 0)
|
||||
|
|
421
spacy/symbols.pxd
Normal file
421
spacy/symbols.pxd
Normal file
|
@ -0,0 +1,421 @@
|
|||
cpdef enum symbol_t:
|
||||
NIL
|
||||
IS_ALPHA
|
||||
IS_ASCII
|
||||
IS_DIGIT
|
||||
IS_LOWER
|
||||
IS_PUNCT
|
||||
IS_SPACE
|
||||
IS_TITLE
|
||||
IS_UPPER
|
||||
LIKE_URL
|
||||
LIKE_NUM
|
||||
LIKE_EMAIL
|
||||
IS_STOP
|
||||
IS_OOV
|
||||
|
||||
FLAG14
|
||||
FLAG15
|
||||
FLAG16
|
||||
FLAG17
|
||||
FLAG18
|
||||
FLAG19
|
||||
FLAG20
|
||||
FLAG21
|
||||
FLAG22
|
||||
FLAG23
|
||||
FLAG24
|
||||
FLAG25
|
||||
FLAG26
|
||||
FLAG27
|
||||
FLAG28
|
||||
FLAG29
|
||||
FLAG30
|
||||
FLAG31
|
||||
FLAG32
|
||||
FLAG33
|
||||
FLAG34
|
||||
FLAG35
|
||||
FLAG36
|
||||
FLAG37
|
||||
FLAG38
|
||||
FLAG39
|
||||
FLAG40
|
||||
FLAG41
|
||||
FLAG42
|
||||
FLAG43
|
||||
FLAG44
|
||||
FLAG45
|
||||
FLAG46
|
||||
FLAG47
|
||||
FLAG48
|
||||
FLAG49
|
||||
FLAG50
|
||||
FLAG51
|
||||
FLAG52
|
||||
FLAG53
|
||||
FLAG54
|
||||
FLAG55
|
||||
FLAG56
|
||||
FLAG57
|
||||
FLAG58
|
||||
FLAG59
|
||||
FLAG60
|
||||
FLAG61
|
||||
FLAG62
|
||||
FLAG63
|
||||
|
||||
ID
|
||||
ORTH
|
||||
LOWER
|
||||
NORM
|
||||
SHAPE
|
||||
PREFIX
|
||||
SUFFIX
|
||||
|
||||
LENGTH
|
||||
CLUSTER
|
||||
LEMMA
|
||||
POS
|
||||
TAG
|
||||
DEP
|
||||
ENT_IOB
|
||||
ENT_TYPE
|
||||
HEAD
|
||||
SPACY
|
||||
PROB
|
||||
|
||||
ADJ
|
||||
ADP
|
||||
ADV
|
||||
AUX
|
||||
CONJ
|
||||
DET
|
||||
INTJ
|
||||
NOUN
|
||||
NUM
|
||||
PART
|
||||
PRON
|
||||
PROPN
|
||||
PUNCT
|
||||
SCONJ
|
||||
SYM
|
||||
VERB
|
||||
X
|
||||
EOL
|
||||
SPACE
|
||||
|
||||
Animacy_anim
|
||||
Animacy_inam
|
||||
Aspect_freq
|
||||
Aspect_imp
|
||||
Aspect_mod
|
||||
Aspect_none
|
||||
Aspect_perf
|
||||
Case_abe
|
||||
Case_abl
|
||||
Case_abs
|
||||
Case_acc
|
||||
Case_ade
|
||||
Case_all
|
||||
Case_cau
|
||||
Case_com
|
||||
Case_dat
|
||||
Case_del
|
||||
Case_dis
|
||||
Case_ela
|
||||
Case_ess
|
||||
Case_gen
|
||||
Case_ill
|
||||
Case_ine
|
||||
Case_ins
|
||||
Case_loc
|
||||
Case_lat
|
||||
Case_nom
|
||||
Case_par
|
||||
Case_sub
|
||||
Case_sup
|
||||
Case_tem
|
||||
Case_ter
|
||||
Case_tra
|
||||
Case_voc
|
||||
Definite_two
|
||||
Definite_def
|
||||
Definite_red
|
||||
Definite_ind
|
||||
Degree_cmp
|
||||
Degree_comp
|
||||
Degree_none
|
||||
Degree_pos
|
||||
Degree_sup
|
||||
Degree_abs
|
||||
Degree_com
|
||||
Degree_dim # du
|
||||
Gender_com
|
||||
Gender_fem
|
||||
Gender_masc
|
||||
Gender_neut
|
||||
Mood_cnd
|
||||
Mood_imp
|
||||
Mood_ind
|
||||
Mood_n
|
||||
Mood_pot
|
||||
Mood_sub
|
||||
Mood_opt
|
||||
Negative_neg
|
||||
Negative_pos
|
||||
Negative_yes
|
||||
Number_com
|
||||
Number_dual
|
||||
Number_none
|
||||
Number_plur
|
||||
Number_sing
|
||||
Number_ptan # bg
|
||||
Number_count # bg
|
||||
NumType_card
|
||||
NumType_dist
|
||||
NumType_frac
|
||||
NumType_gen
|
||||
NumType_mult
|
||||
NumType_none
|
||||
NumType_ord
|
||||
NumType_sets
|
||||
Person_one
|
||||
Person_two
|
||||
Person_three
|
||||
Person_none
|
||||
Poss_yes
|
||||
PronType_advPart
|
||||
PronType_art
|
||||
PronType_default
|
||||
PronType_dem
|
||||
PronType_ind
|
||||
PronType_int
|
||||
PronType_neg
|
||||
PronType_prs
|
||||
PronType_rcp
|
||||
PronType_rel
|
||||
PronType_tot
|
||||
PronType_clit
|
||||
PronType_exc # es, ca, it, fa
|
||||
Reflex_yes
|
||||
Tense_fut
|
||||
Tense_imp
|
||||
Tense_past
|
||||
Tense_pres
|
||||
VerbForm_fin
|
||||
VerbForm_ger
|
||||
VerbForm_inf
|
||||
VerbForm_none
|
||||
VerbForm_part
|
||||
VerbForm_partFut
|
||||
VerbForm_partPast
|
||||
VerbForm_partPres
|
||||
VerbForm_sup
|
||||
VerbForm_trans
|
||||
VerbForm_gdv # la
|
||||
Voice_act
|
||||
Voice_cau
|
||||
Voice_pass
|
||||
Voice_mid # gkc
|
||||
Voice_int # hb
|
||||
Abbr_yes # cz, fi, sl, U
|
||||
AdpType_prep # cz, U
|
||||
AdpType_post # U
|
||||
AdpType_voc # cz
|
||||
AdpType_comprep # cz
|
||||
AdpType_circ # U
|
||||
AdvType_man
|
||||
AdvType_loc
|
||||
AdvType_tim
|
||||
AdvType_deg
|
||||
AdvType_cau
|
||||
AdvType_mod
|
||||
AdvType_sta
|
||||
AdvType_ex
|
||||
AdvType_adadj
|
||||
ConjType_oper # cz, U
|
||||
ConjType_comp # cz, U
|
||||
Connegative_yes # fi
|
||||
Derivation_minen # fi
|
||||
Derivation_sti # fi
|
||||
Derivation_inen # fi
|
||||
Derivation_lainen # fi
|
||||
Derivation_ja # fi
|
||||
Derivation_ton # fi
|
||||
Derivation_vs # fi
|
||||
Derivation_ttain # fi
|
||||
Derivation_ttaa # fi
|
||||
Echo_rdp # U
|
||||
Echo_ech # U
|
||||
Foreign_foreign # cz, fi, U
|
||||
Foreign_fscript # cz, fi, U
|
||||
Foreign_tscript # cz, U
|
||||
Foreign_yes # sl
|
||||
Gender_dat_masc # bq, U
|
||||
Gender_dat_fem # bq, U
|
||||
Gender_erg_masc # bq
|
||||
Gender_erg_fem # bq
|
||||
Gender_psor_masc # cz, sl, U
|
||||
Gender_psor_fem # cz, sl, U
|
||||
Gender_psor_neut # sl
|
||||
Hyph_yes # cz, U
|
||||
InfForm_one # fi
|
||||
InfForm_two # fi
|
||||
InfForm_three # fi
|
||||
NameType_geo # U, cz
|
||||
NameType_prs # U, cz
|
||||
NameType_giv # U, cz
|
||||
NameType_sur # U, cz
|
||||
NameType_nat # U, cz
|
||||
NameType_com # U, cz
|
||||
NameType_pro # U, cz
|
||||
NameType_oth # U, cz
|
||||
NounType_com # U
|
||||
NounType_prop # U
|
||||
NounType_class # U
|
||||
Number_abs_sing # bq, U
|
||||
Number_abs_plur # bq, U
|
||||
Number_dat_sing # bq, U
|
||||
Number_dat_plur # bq, U
|
||||
Number_erg_sing # bq, U
|
||||
Number_erg_plur # bq, U
|
||||
Number_psee_sing # U
|
||||
Number_psee_plur # U
|
||||
Number_psor_sing # cz, fi, sl, U
|
||||
Number_psor_plur # cz, fi, sl, U
|
||||
NumForm_digit # cz, sl, U
|
||||
NumForm_roman # cz, sl, U
|
||||
NumForm_word # cz, sl, U
|
||||
NumValue_one # cz, U
|
||||
NumValue_two # cz, U
|
||||
NumValue_three # cz, U
|
||||
PartForm_pres # fi
|
||||
PartForm_past # fi
|
||||
PartForm_agt # fi
|
||||
PartForm_neg # fi
|
||||
PartType_mod # U
|
||||
PartType_emp # U
|
||||
PartType_res # U
|
||||
PartType_inf # U
|
||||
PartType_vbp # U
|
||||
Person_abs_one # bq, U
|
||||
Person_abs_two # bq, U
|
||||
Person_abs_three # bq, U
|
||||
Person_dat_one # bq, U
|
||||
Person_dat_two # bq, U
|
||||
Person_dat_three # bq, U
|
||||
Person_erg_one # bq, U
|
||||
Person_erg_two # bq, U
|
||||
Person_erg_three # bq, U
|
||||
Person_psor_one # fi, U
|
||||
Person_psor_two # fi, U
|
||||
Person_psor_three # fi, U
|
||||
Polite_inf # bq, U
|
||||
Polite_pol # bq, U
|
||||
Polite_abs_inf # bq, U
|
||||
Polite_abs_pol # bq, U
|
||||
Polite_erg_inf # bq, U
|
||||
Polite_erg_pol # bq, U
|
||||
Polite_dat_inf # bq, U
|
||||
Polite_dat_pol # bq, U
|
||||
Prefix_yes # U
|
||||
PrepCase_npr # cz
|
||||
PrepCase_pre # U
|
||||
PunctSide_ini # U
|
||||
PunctSide_fin # U
|
||||
PunctType_peri # U
|
||||
PunctType_qest # U
|
||||
PunctType_excl # U
|
||||
PunctType_quot # U
|
||||
PunctType_brck # U
|
||||
PunctType_comm # U
|
||||
PunctType_colo # U
|
||||
PunctType_semi # U
|
||||
PunctType_dash # U
|
||||
Style_arch # cz, fi, U
|
||||
Style_rare # cz, fi, U
|
||||
Style_poet # cz, U
|
||||
Style_norm # cz, U
|
||||
Style_coll # cz, U
|
||||
Style_vrnc # cz, U
|
||||
Style_sing # cz, U
|
||||
Style_expr # cz, U
|
||||
Style_derg # cz, U
|
||||
Style_vulg # cz, U
|
||||
Style_yes # fi, U
|
||||
StyleVariant_styleShort # cz
|
||||
StyleVariant_styleBound # cz, sl
|
||||
VerbType_aux # U
|
||||
VerbType_cop # U
|
||||
VerbType_mod # U
|
||||
VerbType_light # U
|
||||
|
||||
PERSON
|
||||
NORP
|
||||
FACILITY
|
||||
ORG
|
||||
GPE
|
||||
LOC
|
||||
PRODUCT
|
||||
EVENT
|
||||
WORK_OF_ART
|
||||
LANGUAGE
|
||||
|
||||
DATE
|
||||
TIME
|
||||
PERCENT
|
||||
MONEY
|
||||
QUANTITY
|
||||
ORDINAL
|
||||
CARDINAL
|
||||
|
||||
acomp
|
||||
advcl
|
||||
advmod
|
||||
agent
|
||||
amod
|
||||
appos
|
||||
attr
|
||||
aux
|
||||
auxpass
|
||||
cc
|
||||
ccomp
|
||||
complm
|
||||
conj
|
||||
csubj
|
||||
csubjpass
|
||||
dep
|
||||
det
|
||||
dobj
|
||||
expl
|
||||
hmod
|
||||
hyph
|
||||
infmod
|
||||
intj
|
||||
iobj
|
||||
mark
|
||||
meta
|
||||
neg
|
||||
nmod
|
||||
nn
|
||||
npadvmod
|
||||
nsubj
|
||||
nsubjpass
|
||||
num
|
||||
number
|
||||
oprd
|
||||
parataxis
|
||||
partmod
|
||||
pcomp
|
||||
pobj
|
||||
poss
|
||||
possessive
|
||||
preconj
|
||||
prep
|
||||
prt
|
||||
punct
|
||||
quantmod
|
||||
rcmod
|
||||
root
|
||||
xcomp
|
424
spacy/symbols.pyx
Normal file
424
spacy/symbols.pyx
Normal file
|
@ -0,0 +1,424 @@
|
|||
IDS = {
|
||||
"": NIL,
|
||||
"IS_ALPHA": IS_ALPHA,
|
||||
"IS_ASCII": IS_ASCII,
|
||||
"IS_DIGIT": IS_DIGIT,
|
||||
"IS_LOWER": IS_LOWER,
|
||||
"IS_PUNCT": IS_PUNCT,
|
||||
"IS_SPACE": IS_SPACE,
|
||||
"IS_TITLE": IS_TITLE,
|
||||
"IS_UPPER": IS_UPPER,
|
||||
"LIKE_URL": LIKE_URL,
|
||||
"LIKE_NUM": LIKE_NUM,
|
||||
"LIKE_EMAIL": LIKE_EMAIL,
|
||||
"IS_STOP": IS_STOP,
|
||||
"IS_OOV": IS_OOV,
|
||||
|
||||
"FLAG14": FLAG14,
|
||||
"FLAG15": FLAG15,
|
||||
"FLAG16": FLAG16,
|
||||
"FLAG17": FLAG17,
|
||||
"FLAG18": FLAG18,
|
||||
"FLAG19": FLAG19,
|
||||
"FLAG20": FLAG20,
|
||||
"FLAG21": FLAG21,
|
||||
"FLAG22": FLAG22,
|
||||
"FLAG23": FLAG23,
|
||||
"FLAG24": FLAG24,
|
||||
"FLAG25": FLAG25,
|
||||
"FLAG26": FLAG26,
|
||||
"FLAG27": FLAG27,
|
||||
"FLAG28": FLAG28,
|
||||
"FLAG29": FLAG29,
|
||||
"FLAG30": FLAG30,
|
||||
"FLAG31": FLAG31,
|
||||
"FLAG32": FLAG32,
|
||||
"FLAG33": FLAG33,
|
||||
"FLAG34": FLAG34,
|
||||
"FLAG35": FLAG35,
|
||||
"FLAG36": FLAG36,
|
||||
"FLAG37": FLAG37,
|
||||
"FLAG38": FLAG38,
|
||||
"FLAG39": FLAG39,
|
||||
"FLAG40": FLAG40,
|
||||
"FLAG41": FLAG41,
|
||||
"FLAG42": FLAG42,
|
||||
"FLAG43": FLAG43,
|
||||
"FLAG44": FLAG44,
|
||||
"FLAG45": FLAG45,
|
||||
"FLAG46": FLAG46,
|
||||
"FLAG47": FLAG47,
|
||||
"FLAG48": FLAG48,
|
||||
"FLAG49": FLAG49,
|
||||
"FLAG50": FLAG50,
|
||||
"FLAG51": FLAG51,
|
||||
"FLAG52": FLAG52,
|
||||
"FLAG53": FLAG53,
|
||||
"FLAG54": FLAG54,
|
||||
"FLAG55": FLAG55,
|
||||
"FLAG56": FLAG56,
|
||||
"FLAG57": FLAG57,
|
||||
"FLAG58": FLAG58,
|
||||
"FLAG59": FLAG59,
|
||||
"FLAG60": FLAG60,
|
||||
"FLAG61": FLAG61,
|
||||
"FLAG62": FLAG62,
|
||||
"FLAG63": FLAG63,
|
||||
|
||||
"ID": ID,
|
||||
"ORTH": ORTH,
|
||||
"LOWER": LOWER,
|
||||
"NORM": NORM,
|
||||
"SHAPE": SHAPE,
|
||||
"PREFIX": PREFIX,
|
||||
"SUFFIX": SUFFIX,
|
||||
|
||||
"LENGTH": LENGTH,
|
||||
"CLUSTER": CLUSTER,
|
||||
"LEMMA": LEMMA,
|
||||
"POS": POS,
|
||||
"TAG": TAG,
|
||||
"DEP": DEP,
|
||||
"ENT_IOB": ENT_IOB,
|
||||
"ENT_TYPE": ENT_TYPE,
|
||||
"HEAD": HEAD,
|
||||
"SPACY": SPACY,
|
||||
"PROB": PROB,
|
||||
|
||||
"ADJ": ADJ,
|
||||
"ADP": ADP,
|
||||
"ADV": ADV,
|
||||
"AUX": AUX,
|
||||
"CONJ": CONJ,
|
||||
"DET": DET,
|
||||
"INTJ": INTJ,
|
||||
"NOUN": NOUN,
|
||||
"NUM": NUM,
|
||||
"PART": PART,
|
||||
"PRON": PRON,
|
||||
"PROPN": PROPN,
|
||||
"PUNCT": PUNCT,
|
||||
"SCONJ": SCONJ,
|
||||
"SYM": SYM,
|
||||
"VERB": VERB,
|
||||
"X": X,
|
||||
"EOL": EOL,
|
||||
"SPACE": SPACE,
|
||||
|
||||
"Animacy_anim": Animacy_anim,
|
||||
"Animacy_inam": Animacy_inam,
|
||||
"Aspect_freq": Aspect_freq,
|
||||
"Aspect_imp": Aspect_imp,
|
||||
"Aspect_mod": Aspect_mod,
|
||||
"Aspect_none": Aspect_none,
|
||||
"Aspect_perf": Aspect_perf,
|
||||
"Case_abe": Case_abe,
|
||||
"Case_abl": Case_abl,
|
||||
"Case_abs": Case_abs,
|
||||
"Case_acc": Case_acc,
|
||||
"Case_ade": Case_ade,
|
||||
"Case_all": Case_all,
|
||||
"Case_cau": Case_cau,
|
||||
"Case_com": Case_com,
|
||||
"Case_dat": Case_dat,
|
||||
"Case_del": Case_del,
|
||||
"Case_dis": Case_dis,
|
||||
"Case_ela": Case_ela,
|
||||
"Case_ess": Case_ess,
|
||||
"Case_gen": Case_gen,
|
||||
"Case_ill": Case_ill,
|
||||
"Case_ine": Case_ine,
|
||||
"Case_ins": Case_ins,
|
||||
"Case_loc": Case_loc,
|
||||
"Case_lat": Case_lat,
|
||||
"Case_nom": Case_nom,
|
||||
"Case_par": Case_par,
|
||||
"Case_sub": Case_sub,
|
||||
"Case_sup": Case_sup,
|
||||
"Case_tem": Case_tem,
|
||||
"Case_ter": Case_ter,
|
||||
"Case_tra": Case_tra,
|
||||
"Case_voc": Case_voc,
|
||||
"Definite_two": Definite_two,
|
||||
"Definite_def": Definite_def,
|
||||
"Definite_red": Definite_red,
|
||||
"Definite_ind": Definite_ind,
|
||||
"Degree_cmp": Degree_cmp,
|
||||
"Degree_comp": Degree_comp,
|
||||
"Degree_none": Degree_none,
|
||||
"Degree_pos": Degree_pos,
|
||||
"Degree_sup": Degree_sup,
|
||||
"Degree_abs": Degree_abs,
|
||||
"Degree_com": Degree_com,
|
||||
"Degree_dim ": Degree_dim, # du
|
||||
"Gender_com": Gender_com,
|
||||
"Gender_fem": Gender_fem,
|
||||
"Gender_masc": Gender_masc,
|
||||
"Gender_neut": Gender_neut,
|
||||
"Mood_cnd": Mood_cnd,
|
||||
"Mood_imp": Mood_imp,
|
||||
"Mood_ind": Mood_ind,
|
||||
"Mood_n": Mood_n,
|
||||
"Mood_pot": Mood_pot,
|
||||
"Mood_sub": Mood_sub,
|
||||
"Mood_opt": Mood_opt,
|
||||
"Negative_neg": Negative_neg,
|
||||
"Negative_pos": Negative_pos,
|
||||
"Negative_yes": Negative_yes,
|
||||
"Number_com": Number_com,
|
||||
"Number_dual": Number_dual,
|
||||
"Number_none": Number_none,
|
||||
"Number_plur": Number_plur,
|
||||
"Number_sing": Number_sing,
|
||||
"Number_ptan ": Number_ptan, # bg
|
||||
"Number_count ": Number_count, # bg
|
||||
"NumType_card": NumType_card,
|
||||
"NumType_dist": NumType_dist,
|
||||
"NumType_frac": NumType_frac,
|
||||
"NumType_gen": NumType_gen,
|
||||
"NumType_mult": NumType_mult,
|
||||
"NumType_none": NumType_none,
|
||||
"NumType_ord": NumType_ord,
|
||||
"NumType_sets": NumType_sets,
|
||||
"Person_one": Person_one,
|
||||
"Person_two": Person_two,
|
||||
"Person_three": Person_three,
|
||||
"Person_none": Person_none,
|
||||
"Poss_yes": Poss_yes,
|
||||
"PronType_advPart": PronType_advPart,
|
||||
"PronType_art": PronType_art,
|
||||
"PronType_default": PronType_default,
|
||||
"PronType_dem": PronType_dem,
|
||||
"PronType_ind": PronType_ind,
|
||||
"PronType_int": PronType_int,
|
||||
"PronType_neg": PronType_neg,
|
||||
"PronType_prs": PronType_prs,
|
||||
"PronType_rcp": PronType_rcp,
|
||||
"PronType_rel": PronType_rel,
|
||||
"PronType_tot": PronType_tot,
|
||||
"PronType_clit": PronType_clit,
|
||||
"PronType_exc ": PronType_exc, # es, ca, it, fa,
|
||||
"Reflex_yes": Reflex_yes,
|
||||
"Tense_fut": Tense_fut,
|
||||
"Tense_imp": Tense_imp,
|
||||
"Tense_past": Tense_past,
|
||||
"Tense_pres": Tense_pres,
|
||||
"VerbForm_fin": VerbForm_fin,
|
||||
"VerbForm_ger": VerbForm_ger,
|
||||
"VerbForm_inf": VerbForm_inf,
|
||||
"VerbForm_none": VerbForm_none,
|
||||
"VerbForm_part": VerbForm_part,
|
||||
"VerbForm_partFut": VerbForm_partFut,
|
||||
"VerbForm_partPast": VerbForm_partPast,
|
||||
"VerbForm_partPres": VerbForm_partPres,
|
||||
"VerbForm_sup": VerbForm_sup,
|
||||
"VerbForm_trans": VerbForm_trans,
|
||||
"VerbForm_gdv ": VerbForm_gdv, # la,
|
||||
"Voice_act": Voice_act,
|
||||
"Voice_cau": Voice_cau,
|
||||
"Voice_pass": Voice_pass,
|
||||
"Voice_mid ": Voice_mid, # gkc,
|
||||
"Voice_int ": Voice_int, # hb,
|
||||
"Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
|
||||
"AdpType_prep ": AdpType_prep, # cz, U,
|
||||
"AdpType_post ": AdpType_post, # U,
|
||||
"AdpType_voc ": AdpType_voc, # cz,
|
||||
"AdpType_comprep ": AdpType_comprep, # cz,
|
||||
"AdpType_circ ": AdpType_circ, # U,
|
||||
"AdvType_man": AdvType_man,
|
||||
"AdvType_loc": AdvType_loc,
|
||||
"AdvType_tim": AdvType_tim,
|
||||
"AdvType_deg": AdvType_deg,
|
||||
"AdvType_cau": AdvType_cau,
|
||||
"AdvType_mod": AdvType_mod,
|
||||
"AdvType_sta": AdvType_sta,
|
||||
"AdvType_ex": AdvType_ex,
|
||||
"AdvType_adadj": AdvType_adadj,
|
||||
"ConjType_oper ": ConjType_oper, # cz, U,
|
||||
"ConjType_comp ": ConjType_comp, # cz, U,
|
||||
"Connegative_yes ": Connegative_yes, # fi,
|
||||
"Derivation_minen ": Derivation_minen, # fi,
|
||||
"Derivation_sti ": Derivation_sti, # fi,
|
||||
"Derivation_inen ": Derivation_inen, # fi,
|
||||
"Derivation_lainen ": Derivation_lainen, # fi,
|
||||
"Derivation_ja ": Derivation_ja, # fi,
|
||||
"Derivation_ton ": Derivation_ton, # fi,
|
||||
"Derivation_vs ": Derivation_vs, # fi,
|
||||
"Derivation_ttain ": Derivation_ttain, # fi,
|
||||
"Derivation_ttaa ": Derivation_ttaa, # fi,
|
||||
"Echo_rdp ": Echo_rdp, # U,
|
||||
"Echo_ech ": Echo_ech, # U,
|
||||
"Foreign_foreign ": Foreign_foreign, # cz, fi, U,
|
||||
"Foreign_fscript ": Foreign_fscript, # cz, fi, U,
|
||||
"Foreign_tscript ": Foreign_tscript, # cz, U,
|
||||
"Foreign_yes ": Foreign_yes, # sl,
|
||||
"Gender_dat_masc ": Gender_dat_masc, # bq, U,
|
||||
"Gender_dat_fem ": Gender_dat_fem, # bq, U,
|
||||
"Gender_erg_masc ": Gender_erg_masc, # bq,
|
||||
"Gender_erg_fem ": Gender_erg_fem, # bq,
|
||||
"Gender_psor_masc ": Gender_psor_masc, # cz, sl, U,
|
||||
"Gender_psor_fem ": Gender_psor_fem, # cz, sl, U,
|
||||
"Gender_psor_neut ": Gender_psor_neut, # sl,
|
||||
"Hyph_yes ": Hyph_yes, # cz, U,
|
||||
"InfForm_one ": InfForm_one, # fi,
|
||||
"InfForm_two ": InfForm_two, # fi,
|
||||
"InfForm_three ": InfForm_three, # fi,
|
||||
"NameType_geo ": NameType_geo, # U, cz,
|
||||
"NameType_prs ": NameType_prs, # U, cz,
|
||||
"NameType_giv ": NameType_giv, # U, cz,
|
||||
"NameType_sur ": NameType_sur, # U, cz,
|
||||
"NameType_nat ": NameType_nat, # U, cz,
|
||||
"NameType_com ": NameType_com, # U, cz,
|
||||
"NameType_pro ": NameType_pro, # U, cz,
|
||||
"NameType_oth ": NameType_oth, # U, cz,
|
||||
"NounType_com ": NounType_com, # U,
|
||||
"NounType_prop ": NounType_prop, # U,
|
||||
"NounType_class ": NounType_class, # U,
|
||||
"Number_abs_sing ": Number_abs_sing, # bq, U,
|
||||
"Number_abs_plur ": Number_abs_plur, # bq, U,
|
||||
"Number_dat_sing ": Number_dat_sing, # bq, U,
|
||||
"Number_dat_plur ": Number_dat_plur, # bq, U,
|
||||
"Number_erg_sing ": Number_erg_sing, # bq, U,
|
||||
"Number_erg_plur ": Number_erg_plur, # bq, U,
|
||||
"Number_psee_sing ": Number_psee_sing, # U,
|
||||
"Number_psee_plur ": Number_psee_plur, # U,
|
||||
"Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
|
||||
"Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
|
||||
"NumForm_digit ": NumForm_digit, # cz, sl, U,
|
||||
"NumForm_roman ": NumForm_roman, # cz, sl, U,
|
||||
"NumForm_word ": NumForm_word, # cz, sl, U,
|
||||
"NumValue_one ": NumValue_one, # cz, U,
|
||||
"NumValue_two ": NumValue_two, # cz, U,
|
||||
"NumValue_three ": NumValue_three, # cz, U,
|
||||
"PartForm_pres ": PartForm_pres, # fi,
|
||||
"PartForm_past ": PartForm_past, # fi,
|
||||
"PartForm_agt ": PartForm_agt, # fi,
|
||||
"PartForm_neg ": PartForm_neg, # fi,
|
||||
"PartType_mod ": PartType_mod, # U,
|
||||
"PartType_emp ": PartType_emp, # U,
|
||||
"PartType_res ": PartType_res, # U,
|
||||
"PartType_inf ": PartType_inf, # U,
|
||||
"PartType_vbp ": PartType_vbp, # U,
|
||||
"Person_abs_one ": Person_abs_one, # bq, U,
|
||||
"Person_abs_two ": Person_abs_two, # bq, U,
|
||||
"Person_abs_three ": Person_abs_three, # bq, U,
|
||||
"Person_dat_one ": Person_dat_one, # bq, U,
|
||||
"Person_dat_two ": Person_dat_two, # bq, U,
|
||||
"Person_dat_three ": Person_dat_three, # bq, U,
|
||||
"Person_erg_one ": Person_erg_one, # bq, U,
|
||||
"Person_erg_two ": Person_erg_two, # bq, U,
|
||||
"Person_erg_three ": Person_erg_three, # bq, U,
|
||||
"Person_psor_one ": Person_psor_one, # fi, U,
|
||||
"Person_psor_two ": Person_psor_two, # fi, U,
|
||||
"Person_psor_three ": Person_psor_three, # fi, U,
|
||||
"Polite_inf ": Polite_inf, # bq, U,
|
||||
"Polite_pol ": Polite_pol, # bq, U,
|
||||
"Polite_abs_inf ": Polite_abs_inf, # bq, U,
|
||||
"Polite_abs_pol ": Polite_abs_pol, # bq, U,
|
||||
"Polite_erg_inf ": Polite_erg_inf, # bq, U,
|
||||
"Polite_erg_pol ": Polite_erg_pol, # bq, U,
|
||||
"Polite_dat_inf ": Polite_dat_inf, # bq, U,
|
||||
"Polite_dat_pol ": Polite_dat_pol, # bq, U,
|
||||
"Prefix_yes ": Prefix_yes, # U,
|
||||
"PrepCase_npr ": PrepCase_npr, # cz,
|
||||
"PrepCase_pre ": PrepCase_pre, # U,
|
||||
"PunctSide_ini ": PunctSide_ini, # U,
|
||||
"PunctSide_fin ": PunctSide_fin, # U,
|
||||
"PunctType_peri ": PunctType_peri, # U,
|
||||
"PunctType_qest ": PunctType_qest, # U,
|
||||
"PunctType_excl ": PunctType_excl, # U,
|
||||
"PunctType_quot ": PunctType_quot, # U,
|
||||
"PunctType_brck ": PunctType_brck, # U,
|
||||
"PunctType_comm ": PunctType_comm, # U,
|
||||
"PunctType_colo ": PunctType_colo, # U,
|
||||
"PunctType_semi ": PunctType_semi, # U,
|
||||
"PunctType_dash ": PunctType_dash, # U,
|
||||
"Style_arch ": Style_arch, # cz, fi, U,
|
||||
"Style_rare ": Style_rare, # cz, fi, U,
|
||||
"Style_poet ": Style_poet, # cz, U,
|
||||
"Style_norm ": Style_norm, # cz, U,
|
||||
"Style_coll ": Style_coll, # cz, U,
|
||||
"Style_vrnc ": Style_vrnc, # cz, U,
|
||||
"Style_sing ": Style_sing, # cz, U,
|
||||
"Style_expr ": Style_expr, # cz, U,
|
||||
"Style_derg ": Style_derg, # cz, U,
|
||||
"Style_vulg ": Style_vulg, # cz, U,
|
||||
"Style_yes ": Style_yes, # fi, U,
|
||||
"StyleVariant_styleShort ": StyleVariant_styleShort, # cz,
|
||||
"StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl,
|
||||
"VerbType_aux ": VerbType_aux, # U,
|
||||
"VerbType_cop ": VerbType_cop, # U,
|
||||
"VerbType_mod ": VerbType_mod, # U,
|
||||
"VerbType_light ": VerbType_light, # U,
|
||||
|
||||
"PERSON": PERSON,
|
||||
"NORP": NORP,
|
||||
"FACILITY": FACILITY,
|
||||
"ORG": ORG,
|
||||
"GPE": GPE,
|
||||
"LOC": LOC,
|
||||
"PRODUCT": PRODUCT,
|
||||
"EVENT": EVENT,
|
||||
"WORK_OF_ART": WORK_OF_ART,
|
||||
"LANGUAGE": LANGUAGE,
|
||||
|
||||
"DATE": DATE,
|
||||
"TIME": TIME,
|
||||
"PERCENT": PERCENT,
|
||||
"MONEY": MONEY,
|
||||
"QUANTITY": QUANTITY,
|
||||
"ORDINAL": ORDINAL,
|
||||
"CARDINAL": CARDINAL,
|
||||
|
||||
"acomp": acomp,
|
||||
"advcl": advcl,
|
||||
"advmod": advmod,
|
||||
"agent": agent,
|
||||
"amod": amod,
|
||||
"appos": appos,
|
||||
"attr": attr,
|
||||
"aux": aux,
|
||||
"auxpass": auxpass,
|
||||
"cc": cc,
|
||||
"ccomp": ccomp,
|
||||
"complm": complm,
|
||||
"conj": conj,
|
||||
"csubj": csubj,
|
||||
"csubjpass": csubjpass,
|
||||
"dep": dep,
|
||||
"det": det,
|
||||
"dobj": dobj,
|
||||
"expl": expl,
|
||||
"hmod": hmod,
|
||||
"hyph": hyph,
|
||||
"infmod": infmod,
|
||||
"intj": intj,
|
||||
"iobj": iobj,
|
||||
"mark": mark,
|
||||
"meta": meta,
|
||||
"neg": neg,
|
||||
"nmod": nmod,
|
||||
"nn": nn,
|
||||
"npadvmod": npadvmod,
|
||||
"nsubj": nsubj,
|
||||
"nsubjpass": nsubjpass,
|
||||
"num": num,
|
||||
"number": number,
|
||||
"oprd": oprd,
|
||||
"parataxis": parataxis,
|
||||
"partmod": partmod,
|
||||
"pcomp": pcomp,
|
||||
"pobj": pobj,
|
||||
"poss": poss,
|
||||
"possessive": possessive,
|
||||
"preconj": preconj,
|
||||
"prep": prep,
|
||||
"prt": prt,
|
||||
"punct": punct,
|
||||
"quantmod": quantmod,
|
||||
"rcmod": rcmod,
|
||||
"root": root,
|
||||
"xcomp": xcomp
|
||||
}
|
||||
|
||||
NAMES = [it[0] for it in sorted(IDS.items(), key=lambda it: it[1])]
|
|
@ -83,7 +83,6 @@ cdef class Parser:
|
|||
model = Model(moves.n_moves, templates, model_dir)
|
||||
return cls(strings, moves, model)
|
||||
|
||||
|
||||
def __call__(self, Doc tokens):
|
||||
cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
|
||||
self.moves.initialize_state(stcls)
|
||||
|
@ -93,6 +92,9 @@ cdef class Parser:
|
|||
self.parse(stcls, eg.c)
|
||||
tokens.set_parse(stcls._sent)
|
||||
|
||||
def __reduce__(self):
|
||||
return (Parser, (self.moves.strings, self.moves, self.model), None, None)
|
||||
|
||||
cdef void predict(self, StateClass stcls, ExampleC* eg) nogil:
|
||||
memset(eg.scores, 0, eg.nr_class * sizeof(weight_t))
|
||||
self.moves.set_valid(eg.is_valid, stcls)
|
||||
|
|
|
@ -37,6 +37,8 @@ cdef class TransitionSystem:
|
|||
cdef public int root_label
|
||||
cdef public freqs
|
||||
|
||||
cdef object _labels_by_action
|
||||
|
||||
cdef int initialize_state(self, StateClass state) except -1
|
||||
cdef int finalize_state(self, StateClass state) nogil
|
||||
|
||||
|
|
|
@ -15,7 +15,8 @@ class OracleError(Exception):
|
|||
|
||||
|
||||
cdef class TransitionSystem:
|
||||
def __init__(self, StringStore string_table, dict labels_by_action):
|
||||
def __init__(self, StringStore string_table, dict labels_by_action, _freqs=None):
|
||||
self._labels_by_action = labels_by_action
|
||||
self.mem = Pool()
|
||||
self.n_moves = sum(len(labels) for labels in labels_by_action.values())
|
||||
self._is_valid = <bint*>self.mem.alloc(self.n_moves, sizeof(bint))
|
||||
|
@ -30,7 +31,7 @@ cdef class TransitionSystem:
|
|||
i += 1
|
||||
self.c = moves
|
||||
self.root_label = self.strings['ROOT']
|
||||
self.freqs = {}
|
||||
self.freqs = {} if _freqs is None else _freqs
|
||||
for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB):
|
||||
self.freqs[attr] = defaultdict(int)
|
||||
self.freqs[attr][0] = 1
|
||||
|
@ -39,6 +40,11 @@ cdef class TransitionSystem:
|
|||
self.freqs[HEAD][i] = 1
|
||||
self.freqs[HEAD][-i] = 1
|
||||
|
||||
def __reduce__(self):
|
||||
return (self.__class__,
|
||||
(self.strings, self._labels_by_action, self.freqs),
|
||||
None, None)
|
||||
|
||||
cdef int initialize_state(self, StateClass state) except -1:
|
||||
pass
|
||||
|
||||
|
|
|
@ -148,6 +148,9 @@ cdef class Tagger:
|
|||
tokens.is_tagged = True
|
||||
tokens._py_tokens = [None] * tokens.length
|
||||
|
||||
def __reduce__(self):
|
||||
return (self.__class__, (self.vocab, self.model), None, None)
|
||||
|
||||
def tag_from_strings(self, Doc tokens, object tag_strs):
|
||||
cdef int i
|
||||
for i in range(tokens.length):
|
||||
|
|
|
@ -14,7 +14,6 @@ from ..typedefs cimport attr_t, flags_t
|
|||
from ..attrs cimport attr_id_t
|
||||
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||
from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
|
||||
from ..parts_of_speech import UNIV_POS_NAMES
|
||||
from ..parts_of_speech cimport CONJ, PUNCT, NOUN
|
||||
from ..parts_of_speech cimport univ_pos_t
|
||||
from ..lexeme cimport Lexeme
|
||||
|
|
|
@ -9,7 +9,7 @@ import numpy
|
|||
|
||||
|
||||
from ..lexeme cimport Lexeme
|
||||
from ..parts_of_speech import UNIV_POS_NAMES
|
||||
from .. import parts_of_speech
|
||||
|
||||
from ..attrs cimport LEMMA
|
||||
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||
|
@ -318,7 +318,7 @@ cdef class Token:
|
|||
|
||||
property pos_:
|
||||
def __get__(self):
|
||||
return _pos_id_to_string[self.c.pos]
|
||||
return parts_of_speech.NAMES[self.c.pos]
|
||||
|
||||
property tag_:
|
||||
def __get__(self):
|
||||
|
@ -363,6 +363,3 @@ cdef class Token:
|
|||
|
||||
property like_email:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
|
||||
|
||||
|
||||
_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}
|
||||
|
|
|
@ -25,7 +25,6 @@ cdef struct _Cached:
|
|||
|
||||
|
||||
cdef class Vocab:
|
||||
cpdef public lexeme_props_getter
|
||||
cdef Pool mem
|
||||
cpdef readonly StringStore strings
|
||||
cpdef readonly Morphology morphology
|
||||
|
@ -33,7 +32,6 @@ cdef class Vocab:
|
|||
cdef public object _serializer
|
||||
cdef public object data_dir
|
||||
cdef public object get_lex_attr
|
||||
cdef public object pos_tags
|
||||
cdef public object serializer_freqs
|
||||
|
||||
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
|
||||
|
|
|
@ -10,6 +10,8 @@ from os import path
|
|||
import io
|
||||
import math
|
||||
import json
|
||||
import tempfile
|
||||
import copy_reg
|
||||
|
||||
from .lexeme cimport EMPTY_LEXEME
|
||||
from .lexeme cimport Lexeme
|
||||
|
@ -19,6 +21,9 @@ from .typedefs cimport attr_t
|
|||
from .cfile cimport CFile
|
||||
from .lemmatizer import Lemmatizer
|
||||
|
||||
from . import attrs
|
||||
from . import symbols
|
||||
|
||||
from cymem.cymem cimport Address
|
||||
from . import util
|
||||
from .serialize.packer cimport Packer
|
||||
|
@ -67,6 +72,14 @@ cdef class Vocab:
|
|||
self._by_hash = PreshMap()
|
||||
self._by_orth = PreshMap()
|
||||
self.strings = StringStore()
|
||||
# Load strings in a special order, so that we have an onset number for
|
||||
# the vocabulary. This way, when words are added in order, the orth ID
|
||||
# is the frequency rank of the word, plus a certain offset. The structural
|
||||
# strings are loaded first, because the vocab is open-class, and these
|
||||
# symbols are closed class.
|
||||
for name in symbols.NAMES + list(sorted(tag_map.keys())):
|
||||
if name:
|
||||
_ = self.strings[name]
|
||||
self.get_lex_attr = get_lex_attr
|
||||
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
|
||||
self.serializer_freqs = serializer_freqs
|
||||
|
@ -85,6 +98,20 @@ cdef class Vocab:
|
|||
"""The current number of lexemes stored."""
|
||||
return self.length
|
||||
|
||||
def __reduce__(self):
|
||||
# TODO: Dump vectors
|
||||
tmp_dir = tempfile.mkdtemp()
|
||||
lex_loc = path.join(tmp_dir, 'lexemes.bin')
|
||||
str_loc = path.join(tmp_dir, 'strings.txt')
|
||||
vec_loc = path.join(self.data_dir, 'vec.bin') if self.data_dir is not None else None
|
||||
|
||||
self.dump(lex_loc)
|
||||
self.strings.dump(str_loc)
|
||||
|
||||
state = (str_loc, lex_loc, vec_loc, self.morphology, self.get_lex_attr,
|
||||
self.serializer_freqs, self.data_dir)
|
||||
return (unpickle_vocab, state, None, None)
|
||||
|
||||
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
|
||||
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
||||
if necessary, using memory acquired from the given pool. If the pool
|
||||
|
@ -260,17 +287,17 @@ cdef class Vocab:
|
|||
i += 1
|
||||
fp.close()
|
||||
|
||||
def load_vectors(self, loc_or_file):
|
||||
def load_vectors(self, file_):
|
||||
cdef LexemeC* lexeme
|
||||
cdef attr_t orth
|
||||
cdef int32_t vec_len = -1
|
||||
for line_num, line in enumerate(loc_or_file):
|
||||
for line_num, line in enumerate(file_):
|
||||
pieces = line.split()
|
||||
word_str = pieces.pop(0)
|
||||
if vec_len == -1:
|
||||
vec_len = len(pieces)
|
||||
elif vec_len != len(pieces):
|
||||
raise VectorReadError.mismatched_sizes(loc_or_file, line_num,
|
||||
raise VectorReadError.mismatched_sizes(file_, line_num,
|
||||
vec_len, len(pieces))
|
||||
orth = self.strings[word_str]
|
||||
lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
|
||||
|
@ -328,6 +355,25 @@ cdef class Vocab:
|
|||
return vec_len
|
||||
|
||||
|
||||
def unpickle_vocab(strings_loc, lex_loc, vec_loc, morphology, get_lex_attr,
|
||||
serializer_freqs, data_dir):
|
||||
cdef Vocab vocab = Vocab()
|
||||
|
||||
vocab.get_lex_attr = get_lex_attr
|
||||
vocab.morphology = morphology
|
||||
vocab.strings = morphology.strings
|
||||
vocab.data_dir = data_dir
|
||||
vocab.serializer_freqs = serializer_freqs
|
||||
|
||||
vocab.load_lexemes(strings_loc, lex_loc)
|
||||
if vec_loc is not None:
|
||||
vocab.load_vectors_from_bin_loc(vec_loc)
|
||||
return vocab
|
||||
|
||||
|
||||
copy_reg.constructor(unpickle_vocab)
|
||||
|
||||
|
||||
def write_binary_vectors(in_loc, out_loc):
|
||||
cdef CFile out_file = CFile(out_loc, 'wb')
|
||||
cdef Address mem
|
||||
|
|
17
tests/morphology/test_pickle.py
Normal file
17
tests/morphology/test_pickle.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
import pytest
|
||||
|
||||
import pickle
|
||||
import StringIO
|
||||
|
||||
|
||||
from spacy.morphology import Morphology
|
||||
from spacy.lemmatizer import Lemmatizer
|
||||
from spacy.strings import StringStore
|
||||
|
||||
|
||||
def test_pickle():
|
||||
morphology = Morphology(StringStore(), {}, Lemmatizer({}, {}, {}))
|
||||
|
||||
file_ = StringIO.StringIO()
|
||||
pickle.dump(morphology, file_)
|
||||
|
16
tests/parser/test_pickle.py
Normal file
16
tests/parser/test_pickle.py
Normal file
|
@ -0,0 +1,16 @@
|
|||
import pytest
|
||||
|
||||
import pickle
|
||||
import cloudpickle
|
||||
import StringIO
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_pickle(EN):
|
||||
file_ = StringIO.StringIO()
|
||||
cloudpickle.dump(EN.parser, file_)
|
||||
|
||||
file_.seek(0)
|
||||
|
||||
loaded = pickle.load(file_)
|
||||
|
|
@ -1,5 +1,7 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
import StringIO
|
||||
import pickle
|
||||
|
||||
from spacy.lemmatizer import Lemmatizer, read_index, read_exc
|
||||
from spacy.en import LOCAL_DATA_DIR
|
||||
|
@ -41,3 +43,12 @@ def test_smart_quotes(lemmatizer):
|
|||
do = lemmatizer.punct
|
||||
assert do('“') == set(['"'])
|
||||
assert do('“') == set(['"'])
|
||||
|
||||
|
||||
def test_pickle_lemmatizer(lemmatizer):
|
||||
file_ = StringIO.StringIO()
|
||||
pickle.dump(lemmatizer, file_)
|
||||
|
||||
file_.seek(0)
|
||||
|
||||
loaded = pickle.load(file_)
|
||||
|
|
15
tests/test_pickle.py
Normal file
15
tests/test_pickle.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
import pytest
|
||||
import StringIO
|
||||
import cloudpickle
|
||||
import pickle
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_pickle_english(EN):
|
||||
file_ = StringIO.StringIO()
|
||||
cloudpickle.dump(EN, file_)
|
||||
|
||||
file_.seek(0)
|
||||
|
||||
loaded = pickle.load(file_)
|
||||
|
|
@ -1,5 +1,7 @@
|
|||
# -*- coding: utf8 -*-
|
||||
from __future__ import unicode_literals
|
||||
import pickle
|
||||
import StringIO
|
||||
|
||||
from spacy.strings import StringStore
|
||||
|
||||
|
@ -76,3 +78,18 @@ def test_massive_strings(sstore):
|
|||
s513 = '1' * 513
|
||||
orth = sstore[s513]
|
||||
assert sstore[orth] == s513
|
||||
|
||||
|
||||
def test_pickle_string_store(sstore):
|
||||
hello_id = sstore[u'Hi']
|
||||
string_file = StringIO.StringIO()
|
||||
pickle.dump(sstore, string_file)
|
||||
|
||||
string_file.seek(0)
|
||||
|
||||
loaded = pickle.load(string_file)
|
||||
|
||||
assert loaded[hello_id] == u'Hi'
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,5 +1,11 @@
|
|||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
import StringIO
|
||||
import cloudpickle
|
||||
import pickle
|
||||
|
||||
from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA
|
||||
from spacy.parts_of_speech import NOUN, VERB
|
||||
|
||||
|
||||
def test_neq(en_vocab):
|
||||
|
@ -25,3 +31,21 @@ def test_punct_neq(en_vocab):
|
|||
def test_shape_attr(en_vocab):
|
||||
example = en_vocab['example']
|
||||
assert example.orth != example.shape
|
||||
|
||||
|
||||
def test_symbols(en_vocab):
|
||||
assert en_vocab.strings['IS_ALPHA'] == IS_ALPHA
|
||||
assert en_vocab.strings['NOUN'] == NOUN
|
||||
assert en_vocab.strings['VERB'] == VERB
|
||||
assert en_vocab.strings['LEMMA'] == LEMMA
|
||||
assert en_vocab.strings['ORTH'] == ORTH
|
||||
assert en_vocab.strings['PROB'] == PROB
|
||||
|
||||
|
||||
def test_pickle_vocab(en_vocab):
|
||||
file_ = StringIO.StringIO()
|
||||
cloudpickle.dump(en_vocab, file_)
|
||||
|
||||
file_.seek(0)
|
||||
|
||||
loaded = pickle.load(file_)
|
||||
|
|
|
@ -1,11 +1,13 @@
|
|||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
import os
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
def nlp():
|
||||
from spacy.en import English
|
||||
return English()
|
||||
from spacy.en import English, LOCAL_DATA_DIR
|
||||
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
|
||||
return English(data_dir=data_dir)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
import spacy
|
||||
import os
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
|
@ -9,8 +10,9 @@ def token(doc):
|
|||
|
||||
|
||||
def test_load_resources_and_process_text():
|
||||
from spacy.en import English
|
||||
nlp = English()
|
||||
from spacy.en import English, LOCAL_DATA_DIR
|
||||
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
|
||||
nlp = English(data_dir=data_dir)
|
||||
doc = nlp('Hello, world. Here are two sentences.')
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user