* Tmp. Refactoring, introducing a Lexeme PyObject.

This commit is contained in:
Matthew Honnibal 2015-01-12 11:23:44 +11:00
parent ce2edd6312
commit 46da3d74d2
8 changed files with 91 additions and 65 deletions

View File

@ -3,9 +3,9 @@ from ..typedefs cimport FLAG8, FLAG9
from ..typedefs cimport ID as _ID from ..typedefs cimport ID as _ID
from ..typedefs cimport SIC as _SIC from ..typedefs cimport SIC as _SIC
from ..typedefs cimport SHAPE as _SHAPE from ..typedefs cimport SHAPE as _SHAPE
from ..typedefs cimport DENSE as _DENSE from ..typedefs cimport NORM1 as _NORM1
from ..typedefs cimport NORM2 as _NORM2
from ..typedefs cimport CLUSTER as _CLUSTER from ..typedefs cimport CLUSTER as _CLUSTER
from ..typedefs cimport SHAPE as _SHAPE
from ..typedefs cimport PREFIX as _PREFIX from ..typedefs cimport PREFIX as _PREFIX
from ..typedefs cimport SUFFIX as _SUFFIX from ..typedefs cimport SUFFIX as _SUFFIX
from ..typedefs cimport LEMMA as _LEMMA from ..typedefs cimport LEMMA as _LEMMA
@ -28,7 +28,8 @@ cpdef enum:
ID = _ID ID = _ID
SIC = _SIC SIC = _SIC
SHAPE = _SHAPE SHAPE = _SHAPE
DENSE = _DENSE NORM1 = _NORM1
NORM2 = _NORM2
PREFIX = _PREFIX PREFIX = _PREFIX
SUFFIX = _SUFFIX SUFFIX = _SUFFIX
CLUSTER = _CLUSTER CLUSTER = _CLUSTER

View File

@ -77,7 +77,6 @@ cpdef enum:
P2_suffix P2_suffix
P2_pos P2_pos
P2_lemma P2_lemma
P2_pos_type
P1_sic P1_sic
P1_cluster P1_cluster
@ -86,7 +85,6 @@ cpdef enum:
P1_suffix P1_suffix
P1_pos P1_pos
P1_lemma P1_lemma
P1_pos_type
W_sic W_sic
W_cluster W_cluster
@ -95,7 +93,6 @@ cpdef enum:
W_suffix W_suffix
W_pos W_pos
W_lemma W_lemma
W_pos_type
N1_sic N1_sic
N1_cluster N1_cluster
@ -104,7 +101,6 @@ cpdef enum:
N1_suffix N1_suffix
N1_pos N1_pos
N1_lemma N1_lemma
N1_pos_type
N2_sic N2_sic
N2_cluster N2_cluster
@ -113,7 +109,6 @@ cpdef enum:
N2_suffix N2_suffix
N2_pos N2_pos
N2_lemma N2_lemma
N2_pos_type
N_CONTEXT_FIELDS N_CONTEXT_FIELDS
@ -196,11 +191,6 @@ POS_TEMPLATES = (
(N2_cluster,), (N2_cluster,),
(P1_cluster,), (P1_cluster,),
(P2_cluster,), (P2_cluster,),
(W_pos_type,),
(N1_pos_type,),
(N1_pos_type,),
(P1_pos, W_pos_type, N1_pos_type),
) )
@ -339,4 +329,3 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
context[4] = t.lex.suffix context[4] = t.lex.suffix
context[5] = t.pos context[5] = t.pos
context[6] = t.lemma context[6] = t.lemma
context[7] = t.lex.pos_type

View File

@ -1,5 +1,5 @@
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from .structs cimport LexemeC from .structs cimport LexemeC
from .strings cimport StringStore from .strings cimport StringStore
@ -11,6 +11,35 @@ cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore store,
dict props) except * dict props) except *
cdef class Lexeme:
    # Python-visible snapshot of a lexeme's attributes.
    #
    # Every attribute except `vec` is declared `readonly`, so Python code can
    # read it but not assign to it. Declaration order is kept as-is.

    # Raw float pointer; no visibility modifier, so it is reachable from
    # Cython code only.
    cdef const float* vec

    # Flag bits and integer attributes.
    cdef readonly flags_t flags
    cdef readonly attr_t id, length

    # Attributes exposed as unicode strings.
    cdef readonly unicode sic, norm1, norm2
    cdef readonly unicode shape, prefix, suffix

    # Integer counterparts of the string attributes above, plus the cluster.
    cdef readonly attr_t sic_id, norm1_id, norm2_id
    cdef readonly attr_t shape_id, prefix_id, suffix_id
    cdef readonly attr_t cluster

    # Floating-point attributes.
    cdef readonly float prob, sentiment
cdef Lexeme Lexeme_cinit(const LexemeC* c, StringStore strings)
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
    # Report whether bit `flag_id` is set in `lexeme.flags`.
    #
    # The mask is built from a flags_t-typed 1 so that the shift is carried
    # out at the width of flags_t. A bare literal `1` is a C int, and shifting
    # it by a flag id at or beyond the width of int does not produce the
    # intended mask.
    # NOTE(review): assumes flags_t may be wider than C int -- confirm against
    # the typedef.
    cdef flags_t one = 1
    # Return an explicit boolean rather than the raw masked value, so the
    # result does not depend on how a wide integer is narrowed to bint.
    if lexeme.flags & (one << flag_id):
        return True
    else:
        return False
@ -22,8 +51,10 @@ cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
return lex.id return lex.id
elif feat_name == SIC: elif feat_name == SIC:
return lex.sic return lex.sic
elif feat_name == DENSE: elif feat_name == NORM1:
return lex.dense return lex.norm1
elif feat_name == NORM2:
return lex.norm2
elif feat_name == SHAPE: elif feat_name == SHAPE:
return lex.shape return lex.shape
elif feat_name == PREFIX: elif feat_name == PREFIX:
@ -34,7 +65,5 @@ cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
return lex.length return lex.length
elif feat_name == CLUSTER: elif feat_name == CLUSTER:
return lex.cluster return lex.cluster
elif feat_name == POS_TYPE:
return lex.pos_type
else: else:
return 0 return 0

View File

@ -18,7 +18,6 @@ cdef LexemeC init(id_t i, unicode string, hash_t hashed,
lex.sic = string_store[string] lex.sic = string_store[string]
lex.cluster = props.get('cluster', 0) lex.cluster = props.get('cluster', 0)
lex.pos_type = props.get('pos_type', 0)
lex.prob = props.get('prob', 0) lex.prob = props.get('prob', 0)
lex.prefix = string_store[string[:1]] lex.prefix = string_store[string[:1]]
@ -29,4 +28,36 @@ cdef LexemeC init(id_t i, unicode string, hash_t hashed,
return lex return lex
cdef class Lexeme:
    """Python-level lexeme object."""

    def __init__(self):
        # Intentionally a no-op: the constructor takes no arguments and sets
        # no attributes.
        pass
cdef Lexeme Lexeme_cinit(const LexemeC* c, StringStore strings):
    """Build a Lexeme object by copying the fields of a LexemeC struct.

    Args:
        c: Pointer to the source struct; it is only read here.
        strings: Store indexed with the struct's integer attribute values to
            obtain the values assigned to the string attributes.

    Returns:
        A new Lexeme, allocated with ``Lexeme.__new__``.
    """
    cdef Lexeme lexeme = Lexeme.__new__(Lexeme)

    # The vector pointer itself is copied, not the data it points to.
    lexeme.vec = c.vec

    # Fields copied over unchanged.
    lexeme.flags = c.flags
    lexeme.id = c.id
    lexeme.length = c.length
    lexeme.cluster = c.cluster
    lexeme.prob = c.prob
    lexeme.sentiment = c.sentiment

    # Integer attribute values, kept alongside their looked-up forms below.
    lexeme.sic_id = c.sic
    lexeme.norm1_id = c.norm1
    lexeme.norm2_id = c.norm2
    lexeme.shape_id = c.shape
    lexeme.prefix_id = c.prefix
    lexeme.suffix_id = c.suffix

    # The same integer values resolved through the string store.
    lexeme.sic = strings[c.sic]
    lexeme.norm1 = strings[c.norm1]
    lexeme.norm2 = strings[c.norm2]
    lexeme.shape = strings[c.shape]
    lexeme.prefix = strings[c.prefix]
    lexeme.suffix = strings[c.suffix]

    return lexeme

View File

@ -9,15 +9,16 @@ cdef struct LexemeC:
flags_t flags flags_t flags
attr_t id attr_t id
attr_t length
attr_t sic attr_t sic
attr_t dense attr_t norm1
attr_t norm2
attr_t shape attr_t shape
attr_t prefix attr_t prefix
attr_t suffix attr_t suffix
attr_t length
attr_t cluster attr_t cluster
attr_t pos_type
float prob float prob
float sentiment float sentiment

View File

@ -7,7 +7,7 @@ from preshed.counter cimport PreshCounter
from .vocab cimport EMPTY_LEXEME from .vocab cimport EMPTY_LEXEME
from .typedefs cimport attr_id_t, attr_t from .typedefs cimport attr_id_t, attr_t
from .typedefs cimport LEMMA from .typedefs cimport LEMMA
from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from .typedefs cimport POS, LEMMA from .typedefs cimport POS, LEMMA
cimport cython cimport cython
@ -39,8 +39,10 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
return lex.id return lex.id
elif feat_name == SIC: elif feat_name == SIC:
return lex.sic return lex.sic
elif feat_name == DENSE: elif feat_name == NORM1:
return lex.dense return lex.norm1
elif feat_name == NORM2:
return lex.norm2
elif feat_name == SHAPE: elif feat_name == SHAPE:
return lex.shape return lex.shape
elif feat_name == PREFIX: elif feat_name == PREFIX:
@ -51,8 +53,6 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
return lex.length return lex.length
elif feat_name == CLUSTER: elif feat_name == CLUSTER:
return lex.cluster return lex.cluster
elif feat_name == POS_TYPE:
return lex.pos_type
else: else:
return 0 return 0
@ -175,26 +175,7 @@ cdef class Tokens:
cdef Token cinit_token(const TokenC* c_tok): cdef Token cinit_token(const TokenC* c_tok):
cdef const LexemeC* lex = c_tok.lex
cdef Token py_tok = Token.__new__(Token) cdef Token py_tok = Token.__new__(Token)
cyarr = cvarray(shape=(300,), itemsize=sizeof(float), format="i")
py_tok.vec = cyarr
py_tok.flags = lex.flags
py_tok.id = lex.id
py_tok.sic = lex.sic
py_tok.dense = lex.dense
py_tok.shape = lex.shape
py_tok.prefix = lex.prefix
py_tok.suffix = lex.suffix
py_tok.length = lex.length
py_tok.cluster = lex.cluster
py_tok.pos_type = lex.pos_type
py_tok.prob = lex.prob
py_tok.sentiment = lex.sentiment
py_tok.morph = c_tok.morph py_tok.morph = c_tok.morph
py_tok.pos = c_tok.pos py_tok.pos = c_tok.pos
py_tok.fine_pos = c_tok.fine_pos py_tok.fine_pos = c_tok.fine_pos

View File

@ -90,14 +90,14 @@ cpdef enum attr_id_t:
ID ID
SIC SIC
DENSE NORM1
NORM2
SHAPE SHAPE
PREFIX PREFIX
SUFFIX SUFFIX
LENGTH LENGTH
CLUSTER CLUSTER
POS_TYPE
LEMMA LEMMA
POS POS

View File

@ -6,6 +6,7 @@ import codecs
from .lexeme cimport EMPTY_LEXEME from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport init as lexeme_init from .lexeme cimport init as lexeme_init
from .lexeme cimport Lexeme_cinit
from .strings cimport slice_unicode from .strings cimport slice_unicode
from .strings cimport hash_string from .strings cimport hash_string
from .orth cimport word_shape from .orth cimport word_shape
@ -28,7 +29,6 @@ cdef LexemeC init_lexeme(id_t i, unicode string, hash_t hashed,
lex.sic = string_store[string] lex.sic = string_store[string]
lex.cluster = props.get('cluster', 0) lex.cluster = props.get('cluster', 0)
lex.pos_type = props.get('pos_type', 0)
lex.prob = props.get('prob', 0) lex.prob = props.get('prob', 0)
lex.prefix = string_store[string[:1]] lex.prefix = string_store[string[:1]]
@ -90,12 +90,6 @@ cdef class Vocab:
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously '''Retrieve a lexeme, given an int ID or a unicode string. If a previously
unseen unicode string is given, a new LexemeC is created and stored. unseen unicode string is given, a new LexemeC is created and stored.
This function relies on Cython's struct-to-dict conversion. Python clients
receive a dict keyed by strings (byte or unicode, depending on Python 2/3),
with int values. Cython clients can instead receive a LexemeC struct value.
More efficient Cython access is provided by Lexicon.get, which returns
a LexemeC*.
Args: Args:
id_or_string (int or unicode): The integer ID of a word, or its unicode id_or_string (int or unicode): The integer ID of a word, or its unicode
string. If an int >= Lexicon.size, IndexError is raised. string. If an int >= Lexicon.size, IndexError is raised.
@ -103,19 +97,19 @@ cdef class Vocab:
is raised. is raised.
Returns: Returns:
lexeme (dict): A LexemeC struct instance, which Cython translates into lexeme (Lexeme): An instance of the Lexeme Python class, with data
a dict if the operator is called from Python. copied on instantiation.
''' '''
cdef UniStr string
cdef const LexemeC* lexeme
if type(id_or_string) == int: if type(id_or_string) == int:
if id_or_string >= self.lexemes.size(): if id_or_string >= self.lexemes.size():
raise IndexError raise IndexError
return {} lexeme = self.lexemes.at(id_or_string)
#return self.lexemes.at(id_or_string)[0] else:
cdef UniStr string slice_unicode(&string, id_or_string, 0, len(id_or_string))
slice_unicode(&string, id_or_string, 0, len(id_or_string)) lexeme = self.get(self.mem, &string)
cdef const LexemeC* lexeme = self.get(self.mem, &string) return Lexeme_cinit(lexeme, self.strings)
return {}
#return lexeme[0]
def __setitem__(self, unicode uni_string, dict props): def __setitem__(self, unicode uni_string, dict props):
cdef UniStr s cdef UniStr s