mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 04:08:09 +03:00
* Fix context vector, importing all features
This commit is contained in:
parent
07a23768de
commit
50309e6e49
|
@ -5,24 +5,43 @@ from .lexeme cimport Lexeme
|
||||||
|
|
||||||
|
|
||||||
cdef class Token:
    # Record of per-token feature attributes. Every attribute is an atom_t
    # declared `readonly`, i.e. readable but not assignable from Python.
    #
    # The context code uses instances in two roles: _number_token() stores a
    # context slot index in each attribute, fill_token() stores a feature
    # value in each attribute, and _flatten_token() pairs one Token of each
    # role (`ids` and `vals`).

    # Values copied from the Lexeme fields of the same name by fill_token()
    # (norm may fall back to the shape there).
    cdef readonly atom_t sic
    cdef readonly atom_t cluster
    cdef readonly atom_t norm
    cdef readonly atom_t shape
    cdef readonly atom_t asciied
    cdef readonly atom_t prefix
    cdef readonly atom_t suffix
    cdef readonly atom_t length

    # Type attributes; fill_token() takes postype from the Lexeme and sets
    # the other two to 0.
    cdef readonly atom_t postype
    cdef readonly atom_t nertype
    cdef readonly atom_t sensetype

    # Attributes that fill_token() derives from single bits of lex.flags.
    cdef readonly atom_t is_alpha
    cdef readonly atom_t is_ascii
    cdef readonly atom_t is_digit
    cdef readonly atom_t is_lower
    cdef readonly atom_t is_punct
    cdef readonly atom_t is_space
    cdef readonly atom_t is_title
    cdef readonly atom_t is_upper
    cdef readonly atom_t like_url
    cdef readonly atom_t like_number
    cdef readonly atom_t oft_lower
    cdef readonly atom_t oft_title
    cdef readonly atom_t oft_upper

    # Further lex.flags bits (IN_MALES ... IN_NAMES in fill_token()).
    cdef readonly atom_t in_males
    cdef readonly atom_t in_females
    cdef readonly atom_t in_surnames
    cdef readonly atom_t in_places
    cdef readonly atom_t in_games
    cdef readonly atom_t in_celebs
    cdef readonly atom_t in_names

    # Tags: fill_token() receives pos and ner as arguments and sets sense to 0.
    cdef readonly atom_t pos
    cdef readonly atom_t sense
    cdef readonly atom_t ner
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -13,67 +13,126 @@ cdef class Slots:
|
||||||
|
|
||||||
cdef void _number_token(Token t, int* n_fields):
    """Give every feature attribute of `t` its own consecutive slot index.

    Numbering starts at the value stored in n_fields[0]. On return,
    n_fields[0] holds the first index that was not handed out, so a
    subsequent call continues from there.

    t: Token whose attributes receive the slot indices.
    n_fields: in/out counter of slots assigned so far.
    """
    cdef int i = n_fields[0]
    t.sic = i; i += 1
    t.cluster = i; i += 1
    t.norm = i; i += 1
    t.shape = i; i += 1
    # asciied needs a slot of its own: _flatten_token() writes
    # context[ids.asciied], so leaving it unnumbered would make that write
    # land on whatever index the attribute happened to hold.
    t.asciied = i; i += 1
    t.prefix = i; i += 1
    t.suffix = i; i += 1
    t.length = i; i += 1

    t.postype = i; i += 1
    t.nertype = i; i += 1
    t.sensetype = i; i += 1

    t.is_alpha = i; i += 1
    t.is_ascii = i; i += 1
    t.is_digit = i; i += 1
    t.is_lower = i; i += 1
    t.is_punct = i; i += 1
    t.is_space = i; i += 1
    t.is_title = i; i += 1
    t.is_upper = i; i += 1

    t.like_number = i; i += 1
    t.like_url = i; i += 1

    t.oft_lower = i; i += 1
    t.oft_title = i; i += 1
    t.oft_upper = i; i += 1

    t.in_males = i; i += 1
    t.in_females = i; i += 1
    t.in_surnames = i; i += 1
    t.in_places = i; i += 1
    t.in_games = i; i += 1
    t.in_celebs = i; i += 1
    t.in_names = i; i += 1

    t.pos = i; i += 1
    t.sense = i; i += 1
    t.ner = i; i += 1

    # Publish the next free index back to the caller.
    n_fields[0] = i
|
||||||
|
|
||||||
|
|
||||||
cdef int fill_token(Token t, Lexeme* lex, atom_t pos, atom_t ner):
    """Populate the feature values of `t` from `lex` and the supplied tags.

    t: Token that receives the values.
    lex: Lexeme supplying the lexical attributes and the flag bits.
    pos, ner: tag values stored unchanged in t.pos and t.ner.

    There is no explicit return statement in this function.
    """
    t.sic = lex.sic
    t.cluster = lex.cluster
    # The norm is only used when prob is non-zero and at least -10; in every
    # other case the shape stands in for it.
    if lex.prob != 0 and lex.prob >= -10:
        t.norm = lex.norm
    else:
        t.norm = lex.shape
    t.shape = lex.shape
    t.asciied = lex.asciied
    t.prefix = lex.prefix
    t.suffix = lex.suffix
    t.length = lex.length

    t.postype = lex.postype
    t.nertype = 0
    t.sensetype = 0

    # Each flag attribute receives the masked bit itself (0 or 1 << FLAG),
    # not a value normalised to 0/1.
    # NOTE(review): is_ascii is numbered by _number_token() and copied by
    # _flatten_token(), but nothing here assigns it -- presumably it should
    # come from an IS_ASCII bit of lex.flags; confirm that flag exists.
    t.is_alpha = lex.flags & (1 << IS_ALPHA)
    t.is_digit = lex.flags & (1 << IS_DIGIT)
    t.is_lower = lex.flags & (1 << IS_LOWER)
    t.is_punct = lex.flags & (1 << IS_PUNCT)
    t.is_space = lex.flags & (1 << IS_SPACE)
    t.is_title = lex.flags & (1 << IS_TITLE)
    t.is_upper = lex.flags & (1 << IS_UPPER)
    t.like_url = lex.flags & (1 << LIKE_URL)
    t.like_number = lex.flags & (1 << LIKE_NUMBER)
    t.oft_lower = lex.flags & (1 << OFT_LOWER)
    t.oft_title = lex.flags & (1 << OFT_TITLE)
    t.oft_upper = lex.flags & (1 << OFT_UPPER)

    t.in_males = lex.flags & (1 << IN_MALES)
    t.in_females = lex.flags & (1 << IN_FEMALES)
    t.in_surnames = lex.flags & (1 << IN_SURNAMES)
    t.in_places = lex.flags & (1 << IN_PLACES)
    t.in_games = lex.flags & (1 << IN_GAMES)
    t.in_celebs = lex.flags & (1 << IN_CELEBS)
    t.in_names = lex.flags & (1 << IN_NAMES)

    t.pos = pos
    t.sense = 0
    t.ner = ner
|
||||||
|
|
||||||
|
|
||||||
cdef int _flatten_token(atom_t* context, Token ids, Token vals) except -1:
    """Copy every feature value of `vals` into `context`.

    For each attribute, the value held by `vals` is written to the position
    of `context` given by the same attribute of `ids`.

    context: destination array; it must be large enough for every index
        stored in `ids` (no bounds check is made here).
    ids: Token whose attributes hold slot indices.
    vals: Token whose attributes hold the values to copy.

    Declared `except -1`; there is no explicit return statement.
    """
    context[ids.sic] = vals.sic
    context[ids.cluster] = vals.cluster
    context[ids.norm] = vals.norm
    context[ids.shape] = vals.shape
    context[ids.asciied] = vals.asciied
    context[ids.prefix] = vals.prefix
    context[ids.suffix] = vals.suffix
    context[ids.length] = vals.length

    context[ids.postype] = vals.postype
    context[ids.nertype] = vals.nertype
    context[ids.sensetype] = vals.sensetype

    context[ids.is_alpha] = vals.is_alpha
    context[ids.is_ascii] = vals.is_ascii
    context[ids.is_digit] = vals.is_digit
    context[ids.is_lower] = vals.is_lower
    context[ids.is_punct] = vals.is_punct
    # is_space has a slot from _number_token() and a value from fill_token(),
    # so it has to be copied as well; otherwise its slot is never written.
    context[ids.is_space] = vals.is_space
    context[ids.is_title] = vals.is_title
    context[ids.is_upper] = vals.is_upper
    context[ids.like_url] = vals.like_url
    context[ids.like_number] = vals.like_number
    context[ids.oft_lower] = vals.oft_lower
    context[ids.oft_title] = vals.oft_title
    context[ids.oft_upper] = vals.oft_upper

    context[ids.in_males] = vals.in_males
    context[ids.in_females] = vals.in_females
    context[ids.in_surnames] = vals.in_surnames
    context[ids.in_places] = vals.in_places
    context[ids.in_games] = vals.in_games
    context[ids.in_celebs] = vals.in_celebs
    context[ids.in_names] = vals.in_names

    context[ids.pos] = vals.pos
    context[ids.sense] = vals.sense
    context[ids.ner] = vals.ner
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -51,8 +51,6 @@ cdef struct Lexeme:
|
||||||
tag_t supersense
|
tag_t supersense
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef Lexeme EMPTY_LEXEME
|
cdef Lexeme EMPTY_LEXEME
|
||||||
|
|
||||||
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
|
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
|
||||||
|
|
|
@ -9,33 +9,33 @@ cpdef Token N2 = FIELD_IDS.N2
|
||||||
|
|
||||||
|
|
||||||
# Feature templates: each entry is a tuple of one or more Token attributes
# read from the P2/P1/N0/N1/N2 Token objects. The order of the entries is kept
# exactly as it was.
TEMPLATES = (
    # Attributes of N0, then tags and word forms of the surrounding tokens.
    (N0.sic,),
    (N0.norm,),
    (N0.suffix,),
    (N0.prefix,),
    (P1.pos,),
    (P2.pos,),
    (P1.pos, P2.pos),
    (P1.pos, N0.norm),
    (P1.norm,),
    (P1.suffix,),
    (P2.norm,),
    (N1.norm,),
    (N1.suffix,),
    (N2.norm,),

    # Shape, cluster and oft_* attributes.
    (N0.shape,),
    (N0.cluster,),
    (N1.cluster,),
    (N2.cluster,),
    (P1.cluster,),
    (P2.cluster,),
    (N0.oft_upper,),
    (N0.oft_title,),

    (N0.postype,),

    # like_url / like_number attributes of the neighbouring tokens.
    (P1.like_url,),
    (N1.like_number,),
    (N1.like_url,),
)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user