* Tmp. Refactoring, introducing a Lexeme PyObject.

This commit is contained in:
Matthew Honnibal 2015-01-12 11:23:44 +11:00
parent ce2edd6312
commit 46da3d74d2
8 changed files with 91 additions and 65 deletions

View File

@ -3,9 +3,9 @@ from ..typedefs cimport FLAG8, FLAG9
from ..typedefs cimport ID as _ID
from ..typedefs cimport SIC as _SIC
from ..typedefs cimport SHAPE as _SHAPE
from ..typedefs cimport DENSE as _DENSE
from ..typedefs cimport NORM1 as _NORM1
from ..typedefs cimport NORM2 as _NORM2
from ..typedefs cimport CLUSTER as _CLUSTER
from ..typedefs cimport SHAPE as _SHAPE
from ..typedefs cimport PREFIX as _PREFIX
from ..typedefs cimport SUFFIX as _SUFFIX
from ..typedefs cimport LEMMA as _LEMMA
@ -28,7 +28,8 @@ cpdef enum:
ID = _ID
SIC = _SIC
SHAPE = _SHAPE
DENSE = _DENSE
NORM1 = _NORM1
NORM2 = _NORM2
PREFIX = _PREFIX
SUFFIX = _SUFFIX
CLUSTER = _CLUSTER

View File

@ -77,7 +77,6 @@ cpdef enum:
P2_suffix
P2_pos
P2_lemma
P2_pos_type
P1_sic
P1_cluster
@ -86,7 +85,6 @@ cpdef enum:
P1_suffix
P1_pos
P1_lemma
P1_pos_type
W_sic
W_cluster
@ -95,7 +93,6 @@ cpdef enum:
W_suffix
W_pos
W_lemma
W_pos_type
N1_sic
N1_cluster
@ -104,7 +101,6 @@ cpdef enum:
N1_suffix
N1_pos
N1_lemma
N1_pos_type
N2_sic
N2_cluster
@ -113,7 +109,6 @@ cpdef enum:
N2_suffix
N2_pos
N2_lemma
N2_pos_type
N_CONTEXT_FIELDS
@ -196,11 +191,6 @@ POS_TEMPLATES = (
(N2_cluster,),
(P1_cluster,),
(P2_cluster,),
(W_pos_type,),
(N1_pos_type,),
(N1_pos_type,),
(P1_pos, W_pos_type, N1_pos_type),
)
@ -339,4 +329,3 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
context[4] = t.lex.suffix
context[5] = t.pos
context[6] = t.lemma
context[7] = t.lex.pos_type

View File

@ -1,5 +1,5 @@
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE
from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from .structs cimport LexemeC
from .strings cimport StringStore
@ -11,6 +11,35 @@ cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore store,
dict props) except *
cdef class Lexeme:
    # Attribute declarations for the Python-level Lexeme extension type.
    # Lexeme_cinit assigns every field below from a LexemeC struct plus a
    # StringStore.
    #
    # C-level pointer taken from LexemeC.vec; declared without `readonly`.
    cdef const float* vec
    # Values copied directly from the LexemeC struct.
    cdef readonly flags_t flags
    cdef readonly attr_t id
    cdef readonly attr_t length
    # Unicode forms, obtained by looking up the matching LexemeC attribute
    # value in the StringStore.
    cdef readonly unicode sic
    cdef readonly unicode norm1
    cdef readonly unicode norm2
    cdef readonly unicode shape
    cdef readonly unicode prefix
    cdef readonly unicode suffix
    # The raw attr_t values behind the unicode fields above
    # (e.g. sic_id is LexemeC.sic, and sic is strings[LexemeC.sic]).
    cdef readonly attr_t sic_id
    cdef readonly attr_t norm1_id
    cdef readonly attr_t norm2_id
    cdef readonly attr_t shape_id
    cdef readonly attr_t prefix_id
    cdef readonly attr_t suffix_id
    # Remaining values copied directly from the LexemeC struct.
    cdef readonly attr_t cluster
    cdef readonly float prob
    cdef readonly float sentiment
cdef Lexeme Lexeme_cinit(const LexemeC* c, StringStore strings)
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
    # Return True when bit number `flag_id` is set in `lexeme.flags`.
    #
    # The mask is built by shifting a flags_t-typed one rather than the bare
    # literal `1`. A bare literal is a C int, and shifting it by a count that
    # reaches the width of int is undefined behaviour, so flags stored in the
    # upper bits of a wider flags_t could not be tested reliably.
    # NOTE(review): assumes flags_t is at least as wide as the largest
    # flag_id in use -- confirm against typedefs.
    cdef flags_t one = 1
    # Compare against zero explicitly so the result does not depend on how a
    # wide integer would be narrowed to bint.
    return (lexeme.flags & (one << flag_id)) != 0
@ -22,8 +51,10 @@ cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
return lex.id
elif feat_name == SIC:
return lex.sic
elif feat_name == DENSE:
return lex.dense
elif feat_name == NORM1:
return lex.norm1
elif feat_name == NORM2:
return lex.norm2
elif feat_name == SHAPE:
return lex.shape
elif feat_name == PREFIX:
@ -34,7 +65,5 @@ cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
return lex.length
elif feat_name == CLUSTER:
return lex.cluster
elif feat_name == POS_TYPE:
return lex.pos_type
else:
return 0

View File

@ -18,7 +18,6 @@ cdef LexemeC init(id_t i, unicode string, hash_t hashed,
lex.sic = string_store[string]
lex.cluster = props.get('cluster', 0)
lex.pos_type = props.get('pos_type', 0)
lex.prob = props.get('prob', 0)
lex.prefix = string_store[string[:1]]
@ -29,4 +28,36 @@ cdef LexemeC init(id_t i, unicode string, hash_t hashed,
return lex
cdef class Lexeme:
    # Lexeme_cinit creates instances with Lexeme.__new__ and assigns the
    # attributes itself, so the constructor has nothing to set up.
    def __init__(self):
        pass
cdef Lexeme Lexeme_cinit(const LexemeC* c, StringStore strings):
    '''Build a Python Lexeme from the C struct c.

    Args:
        c (const LexemeC*): The struct whose fields are copied.
        strings (StringStore): Indexed with the struct's sic, norm1, norm2,
            shape, prefix and suffix values to obtain the unicode attributes.

    Returns:
        lexeme (Lexeme): A new instance, created with Lexeme.__new__ so that
            __init__ is bypassed, with every declared attribute assigned.
    '''
    cdef Lexeme lexeme_obj = Lexeme.__new__(Lexeme)

    # Fields taken directly from the struct.
    lexeme_obj.vec = c.vec
    lexeme_obj.flags = c.flags
    lexeme_obj.id = c.id
    lexeme_obj.length = c.length
    lexeme_obj.cluster = c.cluster
    lexeme_obj.prob = c.prob
    lexeme_obj.sentiment = c.sentiment

    # String-valued attributes: keep the raw id and its looked-up unicode
    # form side by side.
    lexeme_obj.sic_id = c.sic
    lexeme_obj.sic = strings[c.sic]
    lexeme_obj.norm1_id = c.norm1
    lexeme_obj.norm1 = strings[c.norm1]
    lexeme_obj.norm2_id = c.norm2
    lexeme_obj.norm2 = strings[c.norm2]
    lexeme_obj.shape_id = c.shape
    lexeme_obj.shape = strings[c.shape]
    lexeme_obj.prefix_id = c.prefix
    lexeme_obj.prefix = strings[c.prefix]
    lexeme_obj.suffix_id = c.suffix
    lexeme_obj.suffix = strings[c.suffix]
    return lexeme_obj

View File

@ -9,15 +9,16 @@ cdef struct LexemeC:
flags_t flags
attr_t id
attr_t length
attr_t sic
attr_t dense
attr_t norm1
attr_t norm2
attr_t shape
attr_t prefix
attr_t suffix
attr_t length
attr_t cluster
attr_t pos_type
float prob
float sentiment

View File

@ -7,7 +7,7 @@ from preshed.counter cimport PreshCounter
from .vocab cimport EMPTY_LEXEME
from .typedefs cimport attr_id_t, attr_t
from .typedefs cimport LEMMA
from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE
from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from .typedefs cimport POS, LEMMA
cimport cython
@ -39,8 +39,10 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
return lex.id
elif feat_name == SIC:
return lex.sic
elif feat_name == DENSE:
return lex.dense
elif feat_name == NORM1:
return lex.norm1
elif feat_name == NORM2:
return lex.norm2
elif feat_name == SHAPE:
return lex.shape
elif feat_name == PREFIX:
@ -51,8 +53,6 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
return lex.length
elif feat_name == CLUSTER:
return lex.cluster
elif feat_name == POS_TYPE:
return lex.pos_type
else:
return 0
@ -175,26 +175,7 @@ cdef class Tokens:
cdef Token cinit_token(const TokenC* c_tok):
cdef const LexemeC* lex = c_tok.lex
cdef Token py_tok = Token.__new__(Token)
cyarr = cvarray(shape=(300,), itemsize=sizeof(float), format="i")
py_tok.vec = cyarr
py_tok.flags = lex.flags
py_tok.id = lex.id
py_tok.sic = lex.sic
py_tok.dense = lex.dense
py_tok.shape = lex.shape
py_tok.prefix = lex.prefix
py_tok.suffix = lex.suffix
py_tok.length = lex.length
py_tok.cluster = lex.cluster
py_tok.pos_type = lex.pos_type
py_tok.prob = lex.prob
py_tok.sentiment = lex.sentiment
py_tok.morph = c_tok.morph
py_tok.pos = c_tok.pos
py_tok.fine_pos = c_tok.fine_pos

View File

@ -90,14 +90,14 @@ cpdef enum attr_id_t:
ID
SIC
DENSE
NORM1
NORM2
SHAPE
PREFIX
SUFFIX
LENGTH
CLUSTER
POS_TYPE
LEMMA
POS

View File

@ -6,6 +6,7 @@ import codecs
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport init as lexeme_init
from .lexeme cimport Lexeme_cinit
from .strings cimport slice_unicode
from .strings cimport hash_string
from .orth cimport word_shape
@ -28,7 +29,6 @@ cdef LexemeC init_lexeme(id_t i, unicode string, hash_t hashed,
lex.sic = string_store[string]
lex.cluster = props.get('cluster', 0)
lex.pos_type = props.get('pos_type', 0)
lex.prob = props.get('prob', 0)
lex.prefix = string_store[string[:1]]
@ -90,12 +90,6 @@ cdef class Vocab:
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
unseen unicode string is given, a new LexemeC is created and stored.
This function relies on Cython's struct-to-dict conversion. Python clients
receive a dict keyed by strings (byte or unicode, depending on Python 2/3),
with int values. Cython clients can instead receive a LexemeC struct value.
More efficient Cython access is provided by Lexicon.get, which returns
a LexemeC*.
Args:
id_or_string (int or unicode): The integer ID of a word, or its unicode
string. If an int >= Lexicon.size, IndexError is raised.
@ -103,19 +97,19 @@ cdef class Vocab:
is raised.
Returns:
lexeme (dict): A LexemeC struct instance, which Cython translates into
a dict if the operator is called from Python.
lexeme (Lexeme): An instance of the Lexeme Python class, with data
copied on instantiation.
'''
cdef UniStr string
cdef const LexemeC* lexeme
if type(id_or_string) == int:
if id_or_string >= self.lexemes.size():
raise IndexError
return {}
#return self.lexemes.at(id_or_string)[0]
cdef UniStr string
slice_unicode(&string, id_or_string, 0, len(id_or_string))
cdef const LexemeC* lexeme = self.get(self.mem, &string)
return {}
#return lexeme[0]
lexeme = self.lexemes.at(id_or_string)
else:
slice_unicode(&string, id_or_string, 0, len(id_or_string))
lexeme = self.get(self.mem, &string)
return Lexeme_cinit(lexeme, self.strings)
def __setitem__(self, unicode uni_string, dict props):
cdef UniStr s