Add support for Universal Dependencies v2.0

This commit is contained in:
Roman Inflianskas 2017-02-26 22:27:11 +01:00
parent 8dff040032
commit 66e1109b53
14 changed files with 155 additions and 74 deletions

View File

@ -93,7 +93,7 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
'''Normalize a dictionary of attributes, converting them to ints.
Arguments:
stringy_attrs (dict):
Dictionary keyed by attribute string names. Values can be ints or strings.
@ -125,7 +125,9 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
'Number', 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
'Reflex', 'Negative', 'Mood', 'Aspect', 'Case']
'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
'Polarity', # U20
]
for key in morph_keys:
if key in stringy_attrs:
stringy_attrs.pop(key)

View File

@ -41,7 +41,7 @@ TAG_MAP = {
"PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"},
"PTKA": {POS: PART},
"PTKANT": {POS: PART, "PartType": "res"},
"PTKNEG": {POS: PART, "Negative": "yes"},
"PTKNEG": {POS: PART, "Polarity": "Neg"},
"PTKVZ": {POS: PART, "PartType": "vbp"},
"PTKZU": {POS: PART, "PartType": "inf"},
"PWAT": {POS: DET, "PronType": "int"},

View File

@ -16,7 +16,7 @@ TAG_MAP = {
"$": {POS: SYM, "Other": {"SymType": "currency"}},
"#": {POS: SYM, "Other": {"SymType": "numbersign"}},
"AFX": {POS: ADJ, "Hyph": "yes"},
"CC": {POS: CONJ, "ConjType": "coor"},
"CC": {POS: CCONJ, "ConjType": "coor"},
"CD": {POS: NUM, "NumType": "card"},
"DT": {POS: DET},
"EX": {POS: ADV, "AdvType": "ex"},

View File

@ -19,6 +19,7 @@ TAG_MAP = {
"AUX": {POS: AUX},
"X": {POS: X},
"CONJ": {POS: CONJ},
"CCONJ": {POS: CCONJ}, # U20
"ADJ": {POS: ADJ},
"VERB": {POS: VERB},
"PART": {POS: PART}

View File

@ -37,7 +37,7 @@ cdef class Morphology:
cdef int assign_tag(self, TokenC* token, tag) except -1
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
@ -80,6 +80,7 @@ cpdef enum univ_morph_t:
Definite_two
Definite_def
Definite_red
Definite_cons # U20
Definite_ind
Degree_cmp
Degree_comp
@ -103,6 +104,8 @@ cpdef enum univ_morph_t:
Negative_neg
Negative_pos
Negative_yes
Polarity_neg # U20
Polarity_pos # U20
Number_com
Number_dual
Number_none
@ -151,6 +154,7 @@ cpdef enum univ_morph_t:
VerbForm_partPres
VerbForm_sup
VerbForm_trans
VerbForm_conv # U20
VerbForm_gdv # la
Voice_act
Voice_cau

View File

@ -192,6 +192,7 @@ IDS = {
"Definite_two": Definite_two,
"Definite_def": Definite_def,
"Definite_red": Definite_red,
"Definite_cons": Definite_cons, # U20
"Definite_ind": Definite_ind,
"Degree_cmp": Degree_cmp,
"Degree_comp": Degree_comp,
@ -215,6 +216,8 @@ IDS = {
"Negative_neg": Negative_neg,
"Negative_pos": Negative_pos,
"Negative_yes": Negative_yes,
"Polarity_neg": Polarity_neg, # U20
"Polarity_pos": Polarity_pos, # U20
"Number_com": Number_com,
"Number_dual": Number_dual,
"Number_none": Number_none,
@ -263,6 +266,7 @@ IDS = {
"VerbForm_partPres": VerbForm_partPres,
"VerbForm_sup": VerbForm_sup,
"VerbForm_trans": VerbForm_trans,
"VerbForm_conv": VerbForm_conv, # U20
"VerbForm_gdv ": VerbForm_gdv, # la,
"Voice_act": Voice_act,
"Voice_cau": Voice_cau,

View File

@ -7,6 +7,7 @@ cpdef enum univ_pos_t:
ADV
AUX
CONJ
CCONJ # U20
DET
INTJ
NOUN

View File

@ -7,7 +7,8 @@ IDS = {
"ADP": ADP,
"ADV": ADV,
"AUX": AUX,
"CONJ": CONJ,
"CONJ": CONJ, # U20
"CCONJ": CCONJ,
"DET": DET,
"INTJ": INTJ,
"NOUN": NOUN,

View File

@ -13,7 +13,7 @@ cpdef enum symbol_t:
LIKE_EMAIL
IS_STOP
IS_OOV
FLAG14 = 14
FLAG15
FLAG16
@ -90,6 +90,7 @@ cpdef enum symbol_t:
ADV
AUX
CONJ
CCONJ # U20
DET
INTJ
NOUN
@ -107,11 +108,14 @@ cpdef enum symbol_t:
Animacy_anim
Animacy_inam
Animacy_hum # U20
Aspect_freq
Aspect_imp
Aspect_mod
Aspect_none
Aspect_perf
Aspect_iter # U20
Aspect_hab # U20
Case_abe
Case_abl
Case_abs
@ -120,10 +124,12 @@ cpdef enum symbol_t:
Case_all
Case_cau
Case_com
Case_cmp # U20
Case_dat
Case_del
Case_dis
Case_ela
Case_equ # U20
Case_ess
Case_gen
Case_ill
@ -142,7 +148,9 @@ cpdef enum symbol_t:
Definite_two
Definite_def
Definite_red
Definite_cons # U20
Definite_ind
Definite_spec # U20
Degree_cmp
Degree_comp
Degree_none
@ -151,6 +159,8 @@ cpdef enum symbol_t:
Degree_abs
Degree_com
Degree_dim # du
Degree_equ # U20
Evident_nfh # U20
Gender_com
Gender_fem
Gender_masc
@ -162,16 +172,21 @@ cpdef enum symbol_t:
Mood_pot
Mood_sub
Mood_opt
Mood_prp # U20
Mood_adm # U20
Negative_neg
Negative_pos
Negative_yes
Polarity_neg # U20
Polarity_pos # U20
Number_com
Number_dual
Number_none
Number_plur
Number_sing
Number_ptan # bg
Number_count # bg
Number_count # bg, U20
Number_tri # U20
NumType_card
NumType_dist
NumType_frac
@ -197,7 +212,8 @@ cpdef enum symbol_t:
PronType_rel
PronType_tot
PronType_clit
PronType_exc # es, ca, it, fa
PronType_exc # es, ca, it, fa, U20
PronType_emp # U20
Reflex_yes
Tense_fut
Tense_imp
@ -213,12 +229,17 @@ cpdef enum symbol_t:
VerbForm_partPres
VerbForm_sup
VerbForm_trans
VerbForm_conv # U20
VerbForm_gdv # la
VerbForm_vnoun # U20
Voice_act
Voice_cau
Voice_pass
Voice_mid # gkc
Voice_mid # gkc, U20
Voice_int # hb
Voice_antip # U20
Voice_dir # U20
Voice_inv # U20
Abbr_yes # cz, fi, sl, U
AdpType_prep # cz, U
AdpType_post # U
@ -284,6 +305,10 @@ cpdef enum symbol_t:
Number_psee_plur # U
Number_psor_sing # cz, fi, sl, U
Number_psor_plur # cz, fi, sl, U
Number_pauc # U20
Number_grpa # U20
Number_grpl # U20
Number_inv # U20
NumForm_digit # cz, sl, U
NumForm_roman # cz, sl, U
NumForm_word # cz, sl, U
@ -311,6 +336,8 @@ cpdef enum symbol_t:
Person_psor_one # fi, U
Person_psor_two # fi, U
Person_psor_three # fi, U
Person_zero # U20
Person_four # U20
Polite_inf # bq, U
Polite_pol # bq, U
Polite_abs_inf # bq, U
@ -319,6 +346,10 @@ cpdef enum symbol_t:
Polite_erg_pol # bq, U
Polite_dat_inf # bq, U
Polite_dat_pol # bq, U
Polite_infm # U20
Polite_form # U20
Polite_form_elev # U20
Polite_form_humb # U20
Prefix_yes # U
PrepCase_npr # cz
PrepCase_pre # U
@ -383,6 +414,7 @@ cpdef enum symbol_t:
ccomp
complm
conj
cop # U20
csubj
csubjpass
dep
@ -405,6 +437,8 @@ cpdef enum symbol_t:
num
number
oprd
obj # U20
obl # U20
parataxis
partmod
pcomp

View File

@ -91,6 +91,7 @@ IDS = {
"ADV": ADV,
"AUX": AUX,
"CONJ": CONJ,
"CCONJ": CCONJ, # U20
"DET": DET,
"INTJ": INTJ,
"NOUN": NOUN,
@ -108,11 +109,14 @@ IDS = {
"Animacy_anim": Animacy_anim,
"Animacy_inam": Animacy_inam,
"Animacy_hum": Animacy_hum, # U20
"Aspect_freq": Aspect_freq,
"Aspect_imp": Aspect_imp,
"Aspect_mod": Aspect_mod,
"Aspect_none": Aspect_none,
"Aspect_perf": Aspect_perf,
"Aspect_iter": Aspect_iter, # U20
"Aspect_hab": Aspect_hab, # U20
"Case_abe": Case_abe,
"Case_abl": Case_abl,
"Case_abs": Case_abs,
@ -121,10 +125,12 @@ IDS = {
"Case_all": Case_all,
"Case_cau": Case_cau,
"Case_com": Case_com,
"Case_cmp": Case_cmp, # U20
"Case_dat": Case_dat,
"Case_del": Case_del,
"Case_dis": Case_dis,
"Case_ela": Case_ela,
"Case_equ": Case_equ, # U20
"Case_ess": Case_ess,
"Case_gen": Case_gen,
"Case_ill": Case_ill,
@ -143,7 +149,9 @@ IDS = {
"Definite_two": Definite_two,
"Definite_def": Definite_def,
"Definite_red": Definite_red,
"Definite_cons": Definite_cons, # U20
"Definite_ind": Definite_ind,
"Definite_spec": Definite_spec, # U20
"Degree_cmp": Degree_cmp,
"Degree_comp": Degree_comp,
"Degree_none": Degree_none,
@ -152,6 +160,8 @@ IDS = {
"Degree_abs": Degree_abs,
"Degree_com": Degree_com,
"Degree_dim ": Degree_dim, # du
"Degree_equ": Degree_equ, # U20
"Evident_nfh": Evident_nfh, # U20
"Gender_com": Gender_com,
"Gender_fem": Gender_fem,
"Gender_masc": Gender_masc,
@ -163,16 +173,21 @@ IDS = {
"Mood_pot": Mood_pot,
"Mood_sub": Mood_sub,
"Mood_opt": Mood_opt,
"Mood_prp": Mood_prp, # U20
"Mood_adm": Mood_adm, # U20
"Negative_neg": Negative_neg,
"Negative_pos": Negative_pos,
"Negative_yes": Negative_yes,
"Polarity_neg": Polarity_neg, # U20
"Polarity_pos": Polarity_pos, # U20
"Number_com": Number_com,
"Number_dual": Number_dual,
"Number_none": Number_none,
"Number_plur": Number_plur,
"Number_sing": Number_sing,
"Number_ptan ": Number_ptan, # bg
"Number_count ": Number_count, # bg
"Number_count ": Number_count, # bg, U20
"Number_tri": Number_tri, # U20
"NumType_card": NumType_card,
"NumType_dist": NumType_dist,
"NumType_frac": NumType_frac,
@ -198,7 +213,8 @@ IDS = {
"PronType_rel": PronType_rel,
"PronType_tot": PronType_tot,
"PronType_clit": PronType_clit,
"PronType_exc ": PronType_exc, # es, ca, it, fa,
"PronType_exc": PronType_exc, # es, ca, it, fa, U20
"PronType_emp": PronType_emp, # U20
"Reflex_yes": Reflex_yes,
"Tense_fut": Tense_fut,
"Tense_imp": Tense_imp,
@ -214,12 +230,17 @@ IDS = {
"VerbForm_partPres": VerbForm_partPres,
"VerbForm_sup": VerbForm_sup,
"VerbForm_trans": VerbForm_trans,
"VerbForm_conv": VerbForm_conv, # U20
"VerbForm_gdv ": VerbForm_gdv, # la,
"VerbForm_vnoun": VerbForm_vnoun, # U20
"Voice_act": Voice_act,
"Voice_cau": Voice_cau,
"Voice_pass": Voice_pass,
"Voice_mid ": Voice_mid, # gkc,
"Voice_mid ": Voice_mid, # gkc, U20
"Voice_int ": Voice_int, # hb,
"Voice_antip": Voice_antip, # U20
"Voice_dir": Voice_dir, # U20
"Voice_inv": Voice_inv, # U20
"Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
"AdpType_prep ": AdpType_prep, # cz, U,
"AdpType_post ": AdpType_post, # U,
@ -285,6 +306,10 @@ IDS = {
"Number_psee_plur ": Number_psee_plur, # U,
"Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
"Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
"Number_pauc": Number_pauc, # U20
"Number_grpa": Number_grpa, # U20
"Number_grpl": Number_grpl, # U20
"Number_inv": Number_inv, # U20
"NumForm_digit ": NumForm_digit, # cz, sl, U,
"NumForm_roman ": NumForm_roman, # cz, sl, U,
"NumForm_word ": NumForm_word, # cz, sl, U,
@ -312,6 +337,8 @@ IDS = {
"Person_psor_one ": Person_psor_one, # fi, U,
"Person_psor_two ": Person_psor_two, # fi, U,
"Person_psor_three ": Person_psor_three, # fi, U,
"Person_zero ": Person_zero, # U20
"Person_four ": Person_four, # U20
"Polite_inf ": Polite_inf, # bq, U,
"Polite_pol ": Polite_pol, # bq, U,
"Polite_abs_inf ": Polite_abs_inf, # bq, U,
@ -320,6 +347,10 @@ IDS = {
"Polite_erg_pol ": Polite_erg_pol, # bq, U,
"Polite_dat_inf ": Polite_dat_inf, # bq, U,
"Polite_dat_pol ": Polite_dat_pol, # bq, U,
"Polite_infm ": Polite_infm, # U20
"Polite_form ": Polite_form, # U20
"Polite_form_elev ": Polite_form_elev, # U20
"Polite_form_humb ": Polite_form_humb, # U20
"Prefix_yes ": Prefix_yes, # U,
"PrepCase_npr ": PrepCase_npr, # cz,
"PrepCase_pre ": PrepCase_pre, # U,
@ -384,6 +415,7 @@ IDS = {
"ccomp": ccomp,
"complm": complm,
"conj": conj,
"cop": cop, # U20
"csubj": csubj,
"csubjpass": csubjpass,
"dep": dep,
@ -406,6 +438,8 @@ IDS = {
"num": num,
"number": number,
"oprd": oprd,
"obj": obj, # U20
"obl": obl, # U20
"parataxis": parataxis,
"partmod": partmod,
"pcomp": pcomp,

View File

@ -8,7 +8,7 @@ from spacy.attrs import DEP, HEAD
def ancestors(tokenid, heads):
# returns all words going from the word up the path to the root
# the path to root cannot be longer than the number of words in the sentence
# this function ends after at most len(heads) steps
# this function ends after at most len(heads) steps
# because it would otherwise loop indefinitely on cycles
head = tokenid
cnt = 0
@ -180,7 +180,7 @@ class PseudoProjectivity:
next_queue = []
for qtoken in queue:
for child in qtoken.children:
if child.is_space: continue
if child.is_space: continue
if child == token: continue
if child.dep_ == headlabel:
return child

View File

@ -13,13 +13,13 @@ from thinc.linalg cimport VecVec
from .typedefs cimport attr_t
from .tokens.doc cimport Doc
from .attrs cimport TAG
from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON
from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
from .gold cimport GoldParse
from .attrs cimport *
cpdef enum:
P2_orth
P2_cluster
@ -71,7 +71,7 @@ cpdef enum:
cdef class TaggerModel(AveragedPerceptron):
cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *:
_fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])
_fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])
_fill_from_token(&eg.atoms[W_orth], &tokens[i])
@ -191,7 +191,7 @@ cdef class Tagger:
nr_class=self.vocab.morphology.n_tags,
nr_feat=self.model.nr_feat)
for i in range(tokens.length):
if tokens.c[i].pos == 0:
if tokens.c[i].pos == 0:
self.model.set_featuresC(&eg.c, tokens.c, i)
self.model.set_scoresC(eg.c.scores,
eg.c.features, eg.c.nr_feat)
@ -217,7 +217,7 @@ cdef class Tagger:
for doc in stream:
self(doc)
yield doc
def update(self, Doc tokens, GoldParse gold):
"""Update the statistical model, with tags supplied for the given document.
@ -251,7 +251,7 @@ cdef class Tagger:
self.model.updateC(&eg.c)
self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess)
correct += eg.cost == 0
self.freqs[TAG][tokens.c[i].tag] += 1
eg.fill_scores(0, eg.c.nr_class)

View File

@ -16,7 +16,7 @@ from ..typedefs cimport attr_t, flags_t
from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..parts_of_speech cimport CONJ, PUNCT, NOUN
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
from ..parts_of_speech cimport univ_pos_t
from ..lexeme cimport Lexeme
from .span cimport Span
@ -59,13 +59,13 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
cdef class Doc:
"""
A sequence of `Token` objects. Access sentences and named entities,
export annotations to numpy arrays, losslessly serialize to compressed
A sequence of `Token` objects. Access sentences and named entities,
export annotations to numpy arrays, losslessly serialize to compressed
binary strings.
Aside: Internals
The `Doc` object holds an array of `TokenC` structs.
The Python-level `Token` and `Span` objects are views of this
The `Doc` object holds an array of `TokenC` structs.
The Python-level `Token` and `Span` objects are views of this
array, i.e. they don't own the data themselves.
Code: Construction 1
@ -80,13 +80,13 @@ cdef class Doc:
Create a Doc object.
Aside: Implementation
This method of constructing a `Doc` object is usually only used
for deserialization. Standard usage is to construct the document via
This method of constructing a `Doc` object is usually only used
for deserialization. Standard usage is to construct the document via
a call to the language object.
Arguments:
vocab:
A Vocabulary object, which must match any models you want to
A Vocabulary object, which must match any models you want to
use (e.g. tokenizer, parser, entity recognizer).
words:
@ -156,19 +156,19 @@ cdef class Doc:
if self.length == 0:
self.is_tagged = True
self.is_parsed = True
def __getitem__(self, object i):
'''
doc[i]
Get the Token object at position i, where i is an integer.
Negative indexing is supported, and follows the usual Python
Get the Token object at position i, where i is an integer.
Negative indexing is supported, and follows the usual Python
semantics, i.e. doc[-2] is doc[len(doc) - 2].
doc[start : end]
Get a `Span` object, starting at position `start`
and ending at position `end`, where `start` and
`end` are token indices. For instance,
`doc[2:5]` produces a span consisting of
tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`)
`doc[2:5]` produces a span consisting of
tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`)
are not supported, as `Span` objects must be contiguous (cannot have gaps).
You can use negative indices and open-ended ranges, which have their
normal Python semantics.
@ -188,11 +188,11 @@ cdef class Doc:
def __iter__(self):
'''
for token in doc
Iterate over `Token` objects, from which the annotations can
be easily accessed. This is the main way of accessing Token
objects, which are the main way annotations are accessed from
Python. If faster-than-Python speeds are required, you can
instead access the annotations as a numpy array, or access the
Iterate over `Token` objects, from which the annotations can
be easily accessed. This is the main way of accessing Token
objects, which are the main way annotations are accessed from
Python. If faster-than-Python speeds are required, you can
instead access the annotations as a numpy array, or access the
underlying C data directly from Cython.
'''
cdef int i
@ -251,13 +251,13 @@ cdef class Doc:
def __get__(self):
if 'has_vector' in self.user_hooks:
return self.user_hooks['has_vector'](self)
return any(token.has_vector for token in self)
property vector:
'''
A real-valued meaning representation. Defaults to an average of the token vectors.
Type: numpy.ndarray[ndim=1, dtype='float32']
'''
def __get__(self):
@ -285,14 +285,14 @@ cdef class Doc:
norm += value * value
self._vector_norm = sqrt(norm) if norm != 0 else 0
return self._vector_norm
def __set__(self, value):
self._vector_norm = value
self._vector_norm = value
@property
def string(self):
return self.text
property text:
'''A unicode representation of the document text.'''
def __get__(self):
@ -306,7 +306,7 @@ cdef class Doc:
property ents:
'''
Yields named-entity `Span` objects, if the entity recognizer
has been applied to the document. Iterate over the span to get
has been applied to the document. Iterate over the span to get
individual Token objects, or access the label:
Example:
@ -352,7 +352,7 @@ cdef class Doc:
cdef int i
for i in range(self.length):
self.c[i].ent_type = 0
# At this point we don't know whether the NER has run over the
# At this point we don't know whether the NER has run over the
# Doc. If the ent_iob is missing, leave it missing.
if self.c[i].ent_iob != 0:
self.c[i].ent_iob = 2 # Means O. Non-O are set from ents.
@ -384,9 +384,9 @@ cdef class Doc:
property noun_chunks:
'''
Yields base noun-phrase `Span` objects, if the document
has been syntactically parsed. A base noun phrase, or
'NP chunk', is a noun phrase that does not permit other NPs to
be nested within it so no NP-level coordination, no prepositional
has been syntactically parsed. A base noun phrase, or
'NP chunk', is a noun phrase that does not permit other NPs to
be nested within it so no NP-level coordination, no prepositional
phrases, and no relative clauses. For example:
'''
def __get__(self):
@ -422,7 +422,7 @@ cdef class Doc:
def __get__(self):
if 'sents' in self.user_hooks:
return self.user_hooks['sents'](self)
if not self.is_parsed:
raise ValueError(
"sentence boundary detection requires the dependency parse, which "
@ -465,8 +465,8 @@ cdef class Doc:
@cython.boundscheck(False)
cpdef np.ndarray to_array(self, object py_attr_ids):
"""
Given a list of M attribute IDs, export the tokens to a numpy
`ndarray` of shape (N, M), where `N` is the length
Given a list of M attribute IDs, export the tokens to a numpy
`ndarray` of shape (N, M), where `N` is the length
of the document. The values will be 32-bit integers.
Example:
@ -474,7 +474,7 @@ cdef class Doc:
doc = nlp(text)
# All strings mapped to integers, for easy export to numpy
np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA])
Arguments:
attr_ids (list[int]): A list of attribute ID ints.
@ -520,7 +520,7 @@ cdef class Doc:
cdef int i
cdef attr_t attr
cdef size_t count
if counts is None:
counts = PreshCounter()
output_dict = True
@ -570,7 +570,7 @@ cdef class Doc:
cdef TokenC* tokens = self.c
cdef int length = len(array)
cdef attr_t[:] values
for col, attr_id in enumerate(attrs):
for col, attr_id in enumerate(attrs):
values = array[:, col]
if attr_id == HEAD:
for i in range(length):
@ -612,11 +612,11 @@ cdef class Doc:
'''Deserialize, loading from bytes.'''
self.vocab.serializer.unpack_into(data[4:], self)
return self
@staticmethod
def read_bytes(file_):
'''
A static method, used to read serialized #[code Doc] objects from
A static method, used to read serialized #[code Doc] objects from
a file. For example:
Example:
@ -673,7 +673,7 @@ cdef class Doc:
"Expected either 3 arguments (deprecated), or 0 (use keyword arguments). "
"Arguments supplied:\n%s\n"
"Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
cdef int start = token_by_start(self.c, self.length, start_idx)
if start == -1:
return None
@ -784,7 +784,7 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
if child.l_edge < head.l_edge:
head.l_edge = child.l_edge
head.l_kids += 1
# Set right edges --- same as above, but iterate in reverse
for i in range(length-1, -1, -1):
child = &tokens[i]
@ -798,4 +798,4 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
for i in range(length):
if tokens[i].head == 0 and tokens[i].dep != 0:
tokens[tokens[i].l_edge].sent_start = True

View File

@ -20,7 +20,7 @@ from .. import parts_of_speech
from ..attrs cimport LEMMA
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport POS, LEMMA, TAG, DEP
from ..parts_of_speech cimport CONJ, PUNCT
from ..parts_of_speech cimport CCONJ, PUNCT
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_BRACKET
@ -84,7 +84,7 @@ cdef class Token:
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
'''Check the value of a boolean flag.
Arguments:
flag_id (int): The ID of the flag attribute.
Returns:
@ -225,7 +225,7 @@ cdef class Token:
property vector:
'''
A real-valued meaning representation.
Type: numpy.ndarray[ndim=1, dtype='float32']
'''
def __get__(self):
@ -343,7 +343,7 @@ cdef class Token:
'''
def __get__(self):
cdef const TokenC* head_ptr = self.c
# guard against infinite loop, no token can have
# guard against infinite loop, no token can have
# more ancestors than tokens in the tree
cdef int i = 0
while head_ptr.head != 0 and i < self.doc.length:
@ -370,7 +370,7 @@ cdef class Token:
property head:
'''The syntactic parent, or "governor", of this token.
Returns: Token
'''
def __get__(self):
@ -390,7 +390,7 @@ cdef class Token:
# is the new head a descendant of the old head
cdef bint is_desc = old_head.is_ancestor_of(new_head)
cdef int new_edge
cdef Token anc, child
@ -420,7 +420,7 @@ cdef class Token:
if anc.c.l_edge <= new_edge:
break
anc.c.l_edge = new_edge
elif self.c.head < 0: # right dependent
old_head.c.r_kids -= 1
# do the same thing as for l_edge
@ -435,7 +435,7 @@ cdef class Token:
if child.c.r_edge > new_edge:
new_edge = child.c.r_edge
old_head.c.r_edge = new_edge
for anc in old_head.ancestors:
if anc.c.r_edge >= new_edge:
break
@ -598,19 +598,19 @@ cdef class Token:
property is_punct:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
property is_space:
property is_space:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
property is_bracket:
property is_bracket:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
property is_quote:
property is_quote:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
property is_left_punct:
property is_left_punct:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
property is_right_punct:
property is_right_punct:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
property like_url: