* Moving back to lexeme structs

This commit is contained in:
Matthew Honnibal 2014-09-10 20:41:47 +02:00
parent b488224c09
commit e567713429
3 changed files with 31 additions and 25 deletions

View File

@ -105,7 +105,6 @@ cdef class Language:
for i, substring in enumerate(substrings): for i, substring in enumerate(substrings):
lexemes.append(self.lexicon.lookup(substring)) lexemes.append(self.lexicon.lookup(substring))
self.cache[string] = lexemes self.cache[string] = lexemes
cdef Lexeme lexeme cdef Lexeme lexeme
for lexeme in lexemes: for lexeme in lexemes:
tokens.append(lexeme) tokens.append(lexeme)
@ -178,9 +177,11 @@ cdef class Lexicon:
Returns: Returns:
lexeme (Lexeme): A reference to a lexical type. lexeme (Lexeme): A reference to a lexical type.
""" """
cdef Lexeme lexeme
assert len(string) != 0 assert len(string) != 0
if string in self._dict: if string in self._dict:
return self._dict[string] lexeme = self._dict[string]
return lexeme
cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self._string_features, cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self._string_features,
self._flag_features) self._flag_features)

View File

@ -1,20 +1,11 @@
from .typedefs cimport hash_t, utf8_t, flag_t, id_t from .typedefs cimport hash_t, utf8_t, flag_t, id_t
from spacy.lexeme cimport LexemeC
DEF MAX_FLAG = 64 DEF MAX_FLAG = 64
cdef class Lexeme: cdef class Lexeme:
# NB: the readonly keyword refers to _Python_ access. The attributes are cdef LexemeC* _c
# writeable from Cython.
cpdef readonly size_t length
cpdef readonly double prob
cpdef readonly size_t cluster
cpdef readonly unicode string
cpdef readonly list views
cdef readonly flag_t flags
cpdef bint check_flag(self, size_t flag_id) except * cpdef bint check_flag(self, size_t flag_id) except *
cpdef unicode string_view(self, size_t view_id) cpdef unicode string_view(self, size_t view_id)

View File

@ -1,9 +1,12 @@
# cython: profile=True # cython: profile=True
# cython: embedsignature=True # cython: embedsignature=True
from libc.stdlib cimport calloc, free, realloc from libc.stdlib cimport calloc, free, realloc
from spacy.lexeme cimport lexeme_free, lexeme_init
from spacy.lexeme cimport lexeme_check_flag, lexeme_string_view
cdef class Lexeme: cdef class Lexeme:
"""A lexical type --- a word, punctuation symbol, whitespace sequence, etc """A lexical type --- a word, punctuation symbol, whitespace sequence, etc
keyed by a case-sensitive unicode string. All tokens with the same string, keyed by a case-sensitive unicode string. All tokens with the same string,
@ -48,23 +51,34 @@ cdef class Lexeme:
""" """
def __cinit__(self, unicode string, double prob, int cluster, dict case_stats,
              dict tag_stats, list string_features, list flag_features):
    """Compute the lexeme's features and store them in a C-level struct.

    Args:
        string: The text the lexeme is keyed by.
        prob, cluster, case_stats, tag_stats: Passed through to the
            feature callables; prob and cluster are also passed to
            lexeme_init.
        string_features: Callables invoked as
            f(string, prob, cluster, case_stats, tag_stats). Their results
            are collected, in order, as the lexeme's string views.
        flag_features: Callables invoked as
            f(string, prob, case_stats, tag_stats) (no cluster argument).
            The index of every callable that returns a true value is
            recorded as a set flag.

    The collected views and flag indices are handed to lexeme_init, and the
    pointer it returns is kept in self._c.
    """
    # Declared with a C-level type so Cython type-checks what each
    # string feature returns.
    cdef unicode view
    views = []
    for string_feature in string_features:
        view = string_feature(string, prob, cluster, case_stats, tag_stats)
        views.append(view)
    # Collect the indices of the flag features that fired. No further test
    # on `1 << i` is needed: it is non-zero for every index.
    flags = set()
    for i, flag_feature in enumerate(flag_features):
        if flag_feature(string, prob, case_stats, tag_stats):
            flags.add(i)
    self._c = lexeme_init(string, prob, cluster, views, flags)
def __dealloc__(self):
    """Release the C struct owned by this lexeme, if one was created."""
    # __dealloc__ also runs when __cinit__ raised before assigning _c
    # (e.g. a feature callable threw); the pointer is then still NULL and
    # must not be handed to lexeme_free.
    if self._c != NULL:
        lexeme_free(self._c)
        self._c = NULL
property string:
    """The lexeme's text, decoded as UTF-8 from the bytes held in the C struct."""
    def __get__(self):
        cdef bytes raw = self._c.string
        return raw.decode('utf8')
property prob:
    """Read-only view of the `prob` field of the underlying C struct."""
    def __get__(self):
        return self._c.prob
property cluster:
    """Read-only view of the `cluster` field of the underlying C struct."""
    def __get__(self):
        return self._c.cluster
property length:
    """Read-only view of the `length` field of the underlying C struct."""
    def __get__(self):
        return self._c.length
cpdef bint check_flag(self, size_t flag_id) except *: cpdef bint check_flag(self, size_t flag_id) except *:
"""Lexemes may store language-specific boolean features in a bit-field, """Lexemes may store language-specific boolean features in a bit-field,
@ -80,7 +94,7 @@ cdef class Lexeme:
>>> lexeme.check_flag(EN.OFT_UPPER) >>> lexeme.check_flag(EN.OFT_UPPER)
True True
""" """
return self.flags & (1 << flag_id) return lexeme_check_flag(self._c, flag_id)
cpdef unicode string_view(self, size_t view_id): cpdef unicode string_view(self, size_t view_id):
"""Lexemes may store language-specific string-view features, obtained """Lexemes may store language-specific string-view features, obtained
@ -100,4 +114,4 @@ cdef class Lexeme:
>>> lexeme.string_view(EN.NON_SPARSE) >>> lexeme.string_view(EN.NON_SPARSE)
u'Xxxx' u'Xxxx'
""" """
return self.views[view_id] return lexeme_string_view(self._c, view_id)