mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
* Tmp. Refactoring, introducing a Lexeme PyObject.
This commit is contained in:
parent
ce2edd6312
commit
46da3d74d2
|
@ -3,9 +3,9 @@ from ..typedefs cimport FLAG8, FLAG9
|
|||
from ..typedefs cimport ID as _ID
|
||||
from ..typedefs cimport SIC as _SIC
|
||||
from ..typedefs cimport SHAPE as _SHAPE
|
||||
from ..typedefs cimport DENSE as _DENSE
|
||||
from ..typedefs cimport NORM1 as _NORM1
|
||||
from ..typedefs cimport NORM2 as _NORM2
|
||||
from ..typedefs cimport CLUSTER as _CLUSTER
|
||||
from ..typedefs cimport SHAPE as _SHAPE
|
||||
from ..typedefs cimport PREFIX as _PREFIX
|
||||
from ..typedefs cimport SUFFIX as _SUFFIX
|
||||
from ..typedefs cimport LEMMA as _LEMMA
|
||||
|
@ -28,7 +28,8 @@ cpdef enum:
|
|||
ID = _ID
|
||||
SIC = _SIC
|
||||
SHAPE = _SHAPE
|
||||
DENSE = _DENSE
|
||||
NORM1 = _NORM1
|
||||
NORM2 = _NORM2
|
||||
PREFIX = _PREFIX
|
||||
SUFFIX = _SUFFIX
|
||||
CLUSTER = _CLUSTER
|
||||
|
|
|
@ -77,7 +77,6 @@ cpdef enum:
|
|||
P2_suffix
|
||||
P2_pos
|
||||
P2_lemma
|
||||
P2_pos_type
|
||||
|
||||
P1_sic
|
||||
P1_cluster
|
||||
|
@ -86,7 +85,6 @@ cpdef enum:
|
|||
P1_suffix
|
||||
P1_pos
|
||||
P1_lemma
|
||||
P1_pos_type
|
||||
|
||||
W_sic
|
||||
W_cluster
|
||||
|
@ -95,7 +93,6 @@ cpdef enum:
|
|||
W_suffix
|
||||
W_pos
|
||||
W_lemma
|
||||
W_pos_type
|
||||
|
||||
N1_sic
|
||||
N1_cluster
|
||||
|
@ -104,7 +101,6 @@ cpdef enum:
|
|||
N1_suffix
|
||||
N1_pos
|
||||
N1_lemma
|
||||
N1_pos_type
|
||||
|
||||
N2_sic
|
||||
N2_cluster
|
||||
|
@ -113,7 +109,6 @@ cpdef enum:
|
|||
N2_suffix
|
||||
N2_pos
|
||||
N2_lemma
|
||||
N2_pos_type
|
||||
|
||||
N_CONTEXT_FIELDS
|
||||
|
||||
|
@ -196,11 +191,6 @@ POS_TEMPLATES = (
|
|||
(N2_cluster,),
|
||||
(P1_cluster,),
|
||||
(P2_cluster,),
|
||||
|
||||
(W_pos_type,),
|
||||
(N1_pos_type,),
|
||||
(N1_pos_type,),
|
||||
(P1_pos, W_pos_type, N1_pos_type),
|
||||
)
|
||||
|
||||
|
||||
|
@ -339,4 +329,3 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
|
|||
context[4] = t.lex.suffix
|
||||
context[5] = t.pos
|
||||
context[6] = t.lemma
|
||||
context[7] = t.lex.pos_type
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
|
||||
from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE
|
||||
from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||
from .structs cimport LexemeC
|
||||
from .strings cimport StringStore
|
||||
|
||||
|
@ -11,6 +11,35 @@ cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore store,
|
|||
dict props) except *
|
||||
|
||||
|
||||
cdef class Lexeme:
    # Python-level counterpart of a LexemeC struct. Instances are filled in
    # by Lexeme_cinit, which copies each field out of a `const LexemeC*`.

    # Raw pointer taken from LexemeC.vec. Declared without `readonly`, so it
    # is only reachable from Cython code.
    cdef const float* vec

    # Scalar fields copied straight from the struct.
    cdef readonly flags_t flags
    cdef readonly attr_t id, length

    # Unicode forms; Lexeme_cinit obtains each one by indexing the
    # StringStore with the matching `*_id` value below.
    cdef readonly unicode sic, norm1, norm2, shape, prefix, suffix

    # The integer values stored on the LexemeC for the same six attributes.
    cdef readonly attr_t sic_id, norm1_id, norm2_id
    cdef readonly attr_t shape_id, prefix_id, suffix_id

    cdef readonly attr_t cluster
    cdef readonly float prob, sentiment
|
||||
|
||||
|
||||
cdef Lexeme Lexeme_cinit(const LexemeC* c, StringStore strings)
|
||||
|
||||
|
||||
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
    # Return True if bit number `flag_id` is set in `lexeme.flags`.
    #
    # The mask is built from a `flags_t`-typed one instead of the bare
    # literal `1`, so the shift happens at the width of the flags field
    # rather than at plain C `int` width. The result is compared against
    # zero explicitly so that a set high bit is not lost when the masked
    # value is narrowed to `bint`.
    # NOTE(review): this matters only if flags_t is wider than int and
    # flag ids can reach that range -- confirm against typedefs.
    cdef flags_t one = 1
    return (lexeme.flags & (one << flag_id)) != 0
|
||||
|
||||
|
@ -22,8 +51,10 @@ cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
|||
return lex.id
|
||||
elif feat_name == SIC:
|
||||
return lex.sic
|
||||
elif feat_name == DENSE:
|
||||
return lex.dense
|
||||
elif feat_name == NORM1:
|
||||
return lex.norm1
|
||||
elif feat_name == NORM2:
|
||||
return lex.norm2
|
||||
elif feat_name == SHAPE:
|
||||
return lex.shape
|
||||
elif feat_name == PREFIX:
|
||||
|
@ -34,7 +65,5 @@ cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
|||
return lex.length
|
||||
elif feat_name == CLUSTER:
|
||||
return lex.cluster
|
||||
elif feat_name == POS_TYPE:
|
||||
return lex.pos_type
|
||||
else:
|
||||
return 0
|
||||
|
|
|
@ -18,7 +18,6 @@ cdef LexemeC init(id_t i, unicode string, hash_t hashed,
|
|||
lex.sic = string_store[string]
|
||||
|
||||
lex.cluster = props.get('cluster', 0)
|
||||
lex.pos_type = props.get('pos_type', 0)
|
||||
lex.prob = props.get('prob', 0)
|
||||
|
||||
lex.prefix = string_store[string[:1]]
|
||||
|
@ -29,4 +28,36 @@ cdef LexemeC init(id_t i, unicode string, hash_t hashed,
|
|||
return lex
|
||||
|
||||
|
||||
cdef class Lexeme:
    """Python object holding a copy of the fields of a LexemeC struct."""

    def __init__(self):
        """Do nothing.

        Lexeme_cinit creates instances via ``Lexeme.__new__`` and assigns
        the fields itself, so no initialisation happens here.
        """
|
||||
|
||||
|
||||
cdef Lexeme Lexeme_cinit(const LexemeC* c, StringStore strings):
    # Build a Lexeme from the C struct `c`.
    #
    # Scalar fields are copied as-is. For each of the six string-valued
    # attributes the integer stored on the struct is kept as `<name>_id`,
    # and the unicode form is fetched by indexing `strings` with it.
    #
    # `__new__` is used directly, so Lexeme.__init__ is not invoked.
    cdef Lexeme obj = Lexeme.__new__(Lexeme)

    # The pointer itself is stored; the data it points to is not copied.
    obj.vec = c.vec

    obj.flags = c.flags
    obj.id = c.id
    obj.length = c.length

    # Each id sits next to its string form. The StringStore lookups stay in
    # the order sic, norm1, norm2, shape, prefix, suffix.
    obj.sic_id = c.sic
    obj.sic = strings[c.sic]

    obj.norm1_id = c.norm1
    obj.norm1 = strings[c.norm1]

    obj.norm2_id = c.norm2
    obj.norm2 = strings[c.norm2]

    obj.shape_id = c.shape
    obj.shape = strings[c.shape]

    obj.prefix_id = c.prefix
    obj.prefix = strings[c.prefix]

    obj.suffix_id = c.suffix
    obj.suffix = strings[c.suffix]

    obj.cluster = c.cluster
    obj.prob = c.prob
    obj.sentiment = c.sentiment
    return obj
|
||||
|
|
|
@ -9,15 +9,16 @@ cdef struct LexemeC:
|
|||
flags_t flags
|
||||
|
||||
attr_t id
|
||||
attr_t length
|
||||
|
||||
attr_t sic
|
||||
attr_t dense
|
||||
attr_t norm1
|
||||
attr_t norm2
|
||||
attr_t shape
|
||||
attr_t prefix
|
||||
attr_t suffix
|
||||
|
||||
attr_t length
|
||||
attr_t cluster
|
||||
attr_t pos_type
|
||||
|
||||
float prob
|
||||
float sentiment
|
||||
|
|
|
@ -7,7 +7,7 @@ from preshed.counter cimport PreshCounter
|
|||
from .vocab cimport EMPTY_LEXEME
|
||||
from .typedefs cimport attr_id_t, attr_t
|
||||
from .typedefs cimport LEMMA
|
||||
from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE
|
||||
from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||
from .typedefs cimport POS, LEMMA
|
||||
|
||||
cimport cython
|
||||
|
@ -39,8 +39,10 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
|||
return lex.id
|
||||
elif feat_name == SIC:
|
||||
return lex.sic
|
||||
elif feat_name == DENSE:
|
||||
return lex.dense
|
||||
elif feat_name == NORM1:
|
||||
return lex.norm1
|
||||
elif feat_name == NORM2:
|
||||
return lex.norm2
|
||||
elif feat_name == SHAPE:
|
||||
return lex.shape
|
||||
elif feat_name == PREFIX:
|
||||
|
@ -51,8 +53,6 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
|||
return lex.length
|
||||
elif feat_name == CLUSTER:
|
||||
return lex.cluster
|
||||
elif feat_name == POS_TYPE:
|
||||
return lex.pos_type
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
@ -175,26 +175,7 @@ cdef class Tokens:
|
|||
|
||||
|
||||
cdef Token cinit_token(const TokenC* c_tok):
|
||||
cdef const LexemeC* lex = c_tok.lex
|
||||
cdef Token py_tok = Token.__new__(Token)
|
||||
|
||||
cyarr = cvarray(shape=(300,), itemsize=sizeof(float), format="i")
|
||||
py_tok.vec = cyarr
|
||||
|
||||
py_tok.flags = lex.flags
|
||||
py_tok.id = lex.id
|
||||
py_tok.sic = lex.sic
|
||||
py_tok.dense = lex.dense
|
||||
py_tok.shape = lex.shape
|
||||
py_tok.prefix = lex.prefix
|
||||
py_tok.suffix = lex.suffix
|
||||
py_tok.length = lex.length
|
||||
py_tok.cluster = lex.cluster
|
||||
py_tok.pos_type = lex.pos_type
|
||||
|
||||
py_tok.prob = lex.prob
|
||||
py_tok.sentiment = lex.sentiment
|
||||
|
||||
py_tok.morph = c_tok.morph
|
||||
py_tok.pos = c_tok.pos
|
||||
py_tok.fine_pos = c_tok.fine_pos
|
||||
|
|
|
@ -90,14 +90,14 @@ cpdef enum attr_id_t:
|
|||
|
||||
ID
|
||||
SIC
|
||||
DENSE
|
||||
NORM1
|
||||
NORM2
|
||||
SHAPE
|
||||
PREFIX
|
||||
SUFFIX
|
||||
|
||||
LENGTH
|
||||
CLUSTER
|
||||
POS_TYPE
|
||||
LEMMA
|
||||
POS
|
||||
|
||||
|
|
|
@ -6,6 +6,7 @@ import codecs
|
|||
|
||||
from .lexeme cimport EMPTY_LEXEME
|
||||
from .lexeme cimport init as lexeme_init
|
||||
from .lexeme cimport Lexeme_cinit
|
||||
from .strings cimport slice_unicode
|
||||
from .strings cimport hash_string
|
||||
from .orth cimport word_shape
|
||||
|
@ -28,7 +29,6 @@ cdef LexemeC init_lexeme(id_t i, unicode string, hash_t hashed,
|
|||
lex.sic = string_store[string]
|
||||
|
||||
lex.cluster = props.get('cluster', 0)
|
||||
lex.pos_type = props.get('pos_type', 0)
|
||||
lex.prob = props.get('prob', 0)
|
||||
|
||||
lex.prefix = string_store[string[:1]]
|
||||
|
@ -90,12 +90,6 @@ cdef class Vocab:
|
|||
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
||||
unseen unicode string is given, a new LexemeC is created and stored.
|
||||
|
||||
This function relies on Cython's struct-to-dict conversion. Python clients
|
||||
receive a dict keyed by strings (byte or unicode, depending on Python 2/3),
|
||||
with int values. Cython clients can instead receive a LexemeC struct value.
|
||||
More efficient Cython access is provided by Lexicon.get, which returns
|
||||
a LexemeC*.
|
||||
|
||||
Args:
|
||||
id_or_string (int or unicode): The integer ID of a word, or its unicode
|
||||
string. If an int >= Lexicon.size, IndexError is raised.
|
||||
|
@ -103,19 +97,19 @@ cdef class Vocab:
|
|||
is raised.
|
||||
|
||||
Returns:
|
||||
lexeme (dict): A LexemeC struct instance, which Cython translates into
|
||||
a dict if the operator is called from Python.
|
||||
lexeme (Lexeme): An instance of the Lexeme Python class, with data
|
||||
copied on instantiation.
|
||||
'''
|
||||
cdef UniStr string
|
||||
cdef const LexemeC* lexeme
|
||||
if type(id_or_string) == int:
|
||||
if id_or_string >= self.lexemes.size():
|
||||
raise IndexError
|
||||
return {}
|
||||
#return self.lexemes.at(id_or_string)[0]
|
||||
cdef UniStr string
|
||||
slice_unicode(&string, id_or_string, 0, len(id_or_string))
|
||||
cdef const LexemeC* lexeme = self.get(self.mem, &string)
|
||||
return {}
|
||||
#return lexeme[0]
|
||||
lexeme = self.lexemes.at(id_or_string)
|
||||
else:
|
||||
slice_unicode(&string, id_or_string, 0, len(id_or_string))
|
||||
lexeme = self.get(self.mem, &string)
|
||||
return Lexeme_cinit(lexeme, self.strings)
|
||||
|
||||
def __setitem__(self, unicode uni_string, dict props):
|
||||
cdef UniStr s
|
||||
|
|
Loading…
Reference in New Issue
Block a user