Adjust lexeme sizing for attr_t being 64 bit

This commit is contained in:
Matthew Honnibal 2017-05-28 12:51:09 +02:00
parent a5606c3eda
commit f51e6a6c16
3 changed files with 16 additions and 16 deletions

View File

@ -27,7 +27,7 @@ cdef class Lexeme:
cdef inline SerializedLexemeC c_to_bytes(const LexemeC* lex) nogil: cdef inline SerializedLexemeC c_to_bytes(const LexemeC* lex) nogil:
cdef SerializedLexemeC lex_data cdef SerializedLexemeC lex_data
buff = <const unsigned char*>&lex.flags buff = <const unsigned char*>&lex.flags
end = <const unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm) end = <const unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
for i in range(sizeof(lex_data.data)): for i in range(sizeof(lex_data.data)):
lex_data.data[i] = buff[i] lex_data.data[i] = buff[i]
return lex_data return lex_data

View File

@ -35,11 +35,11 @@ cdef class Lexeme:
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
tag). tag).
""" """
def __init__(self, Vocab vocab, int orth): def __init__(self, Vocab vocab, attr_t orth):
"""Create a Lexeme object. """Create a Lexeme object.
vocab (Vocab): The parent vocabulary vocab (Vocab): The parent vocabulary
orth (int): The orth id of the lexeme. orth (uint64): The orth id of the lexeme.
Returns (Lexeme): The newly constructed object. Returns (Lexeme): The newly constructed object.
""" """
self.vocab = vocab self.vocab = vocab
@ -51,7 +51,7 @@ cdef class Lexeme:
if isinstance(other, Lexeme): if isinstance(other, Lexeme):
a = self.orth a = self.orth
b = other.orth b = other.orth
elif isinstance(other, int): elif isinstance(other, long):
a = self.orth a = self.orth
b = other b = other
elif isinstance(other, str): elif isinstance(other, str):
@ -109,7 +109,7 @@ cdef class Lexeme:
def to_bytes(self): def to_bytes(self):
lex_data = Lexeme.c_to_bytes(self.c) lex_data = Lexeme.c_to_bytes(self.c)
start = <const char*>&self.c.flags start = <const char*>&self.c.flags
end = <const char*>&self.c.l2_norm + sizeof(self.c.l2_norm) end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment)
assert (end-start) == sizeof(lex_data.data), (end-start, sizeof(lex_data.data)) assert (end-start) == sizeof(lex_data.data), (end-start, sizeof(lex_data.data))
byte_string = b'\0' * sizeof(lex_data.data) byte_string = b'\0' * sizeof(lex_data.data)
byte_chars = <char*>byte_string byte_chars = <char*>byte_string
@ -192,31 +192,31 @@ cdef class Lexeme:
property lower: property lower:
def __get__(self): return self.c.lower def __get__(self): return self.c.lower
def __set__(self, int x): self.c.lower = x def __set__(self, attr_t x): self.c.lower = x
property norm: property norm:
def __get__(self): return self.c.norm def __get__(self): return self.c.norm
def __set__(self, int x): self.c.norm = x def __set__(self, attr_t x): self.c.norm = x
property shape: property shape:
def __get__(self): return self.c.shape def __get__(self): return self.c.shape
def __set__(self, int x): self.c.shape = x def __set__(self, attr_t x): self.c.shape = x
property prefix: property prefix:
def __get__(self): return self.c.prefix def __get__(self): return self.c.prefix
def __set__(self, int x): self.c.prefix = x def __set__(self, attr_t x): self.c.prefix = x
property suffix: property suffix:
def __get__(self): return self.c.suffix def __get__(self): return self.c.suffix
def __set__(self, int x): self.c.suffix = x def __set__(self, attr_t x): self.c.suffix = x
property cluster: property cluster:
def __get__(self): return self.c.cluster def __get__(self): return self.c.cluster
def __set__(self, int x): self.c.cluster = x def __set__(self, attr_t x): self.c.cluster = x
property lang: property lang:
def __get__(self): return self.c.lang def __get__(self): return self.c.lang
def __set__(self, int x): self.c.lang = x def __set__(self, attr_t x): self.c.lang = x
property prob: property prob:
def __get__(self): return self.c.prob def __get__(self): return self.c.prob
@ -252,7 +252,7 @@ cdef class Lexeme:
property is_oov: property is_oov:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV) def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_OOV, x) def __set__(self, attr_t x): Lexeme.c_set_flag(self.c, IS_OOV, x)
property is_stop: property is_stop:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP) def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP)

View File

@ -27,7 +27,7 @@ cdef struct LexemeC:
cdef struct SerializedLexemeC: cdef struct SerializedLexemeC:
unsigned char[4*13 + 8] data unsigned char[8 + 8*10 + 4 + 4] data
# sizeof(flags_t) # flags # sizeof(flags_t) # flags
# + sizeof(attr_t) # lang # + sizeof(attr_t) # lang
# + sizeof(attr_t) # id # + sizeof(attr_t) # id
@ -58,10 +58,10 @@ cdef struct TokenC:
bint spacy bint spacy
int tag int tag
int idx int idx
int lemma attr_t lemma
int sense int sense
int head int head
int dep attr_t dep
bint sent_start bint sent_start
uint32_t l_kids uint32_t l_kids