mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 09:56:28 +03:00
* Moving back to lexeme structs
This commit is contained in:
parent
b488224c09
commit
e567713429
|
@ -105,7 +105,6 @@ cdef class Language:
|
||||||
for i, substring in enumerate(substrings):
|
for i, substring in enumerate(substrings):
|
||||||
lexemes.append(self.lexicon.lookup(substring))
|
lexemes.append(self.lexicon.lookup(substring))
|
||||||
self.cache[string] = lexemes
|
self.cache[string] = lexemes
|
||||||
|
|
||||||
cdef Lexeme lexeme
|
cdef Lexeme lexeme
|
||||||
for lexeme in lexemes:
|
for lexeme in lexemes:
|
||||||
tokens.append(lexeme)
|
tokens.append(lexeme)
|
||||||
|
@ -178,9 +177,11 @@ cdef class Lexicon:
|
||||||
Returns:
|
Returns:
|
||||||
lexeme (Lexeme): A reference to a lexical type.
|
lexeme (Lexeme): A reference to a lexical type.
|
||||||
"""
|
"""
|
||||||
|
cdef Lexeme lexeme
|
||||||
assert len(string) != 0
|
assert len(string) != 0
|
||||||
if string in self._dict:
|
if string in self._dict:
|
||||||
return self._dict[string]
|
lexeme = self._dict[string]
|
||||||
|
return lexeme
|
||||||
|
|
||||||
cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self._string_features,
|
cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self._string_features,
|
||||||
self._flag_features)
|
self._flag_features)
|
||||||
|
|
|
@ -1,20 +1,11 @@
|
||||||
from .typedefs cimport hash_t, utf8_t, flag_t, id_t
|
from .typedefs cimport hash_t, utf8_t, flag_t, id_t
|
||||||
|
from spacy.lexeme cimport LexemeC
|
||||||
|
|
||||||
DEF MAX_FLAG = 64
|
DEF MAX_FLAG = 64
|
||||||
|
|
||||||
|
|
||||||
cdef class Lexeme:
|
cdef class Lexeme:
|
||||||
# NB: the readonly keyword refers to _Python_ access. The attributes are
|
cdef LexemeC* _c
|
||||||
# writeable from Cython.
|
|
||||||
cpdef readonly size_t length
|
|
||||||
cpdef readonly double prob
|
|
||||||
cpdef readonly size_t cluster
|
|
||||||
|
|
||||||
cpdef readonly unicode string
|
|
||||||
cpdef readonly list views
|
|
||||||
|
|
||||||
cdef readonly flag_t flags
|
|
||||||
|
|
||||||
cpdef bint check_flag(self, size_t flag_id) except *
|
cpdef bint check_flag(self, size_t flag_id) except *
|
||||||
cpdef unicode string_view(self, size_t view_id)
|
cpdef unicode string_view(self, size_t view_id)
|
||||||
|
|
|
@ -1,9 +1,12 @@
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
# cython: embedsignature=True
|
# cython: embedsignature=True
|
||||||
|
|
||||||
|
|
||||||
from libc.stdlib cimport calloc, free, realloc
|
from libc.stdlib cimport calloc, free, realloc
|
||||||
|
|
||||||
|
from spacy.lexeme cimport lexeme_free, lexeme_init
|
||||||
|
from spacy.lexeme cimport lexeme_check_flag, lexeme_string_view
|
||||||
|
|
||||||
|
|
||||||
cdef class Lexeme:
|
cdef class Lexeme:
|
||||||
"""A lexical type --- a word, punctuation symbol, whitespace sequence, etc
|
"""A lexical type --- a word, punctuation symbol, whitespace sequence, etc
|
||||||
keyed by a case-sensitive unicode string. All tokens with the same string,
|
keyed by a case-sensitive unicode string. All tokens with the same string,
|
||||||
|
@ -48,23 +51,34 @@ cdef class Lexeme:
|
||||||
"""
|
"""
|
||||||
def __cinit__(self, unicode string, double prob, int cluster, dict case_stats,
|
def __cinit__(self, unicode string, double prob, int cluster, dict case_stats,
|
||||||
dict tag_stats, list string_features, list flag_features):
|
dict tag_stats, list string_features, list flag_features):
|
||||||
self.prob = prob
|
views = []
|
||||||
self.cluster = cluster
|
|
||||||
self.length = len(string)
|
|
||||||
self.string = string
|
|
||||||
|
|
||||||
self.views = []
|
|
||||||
cdef unicode view
|
cdef unicode view
|
||||||
for string_feature in string_features:
|
for string_feature in string_features:
|
||||||
view = string_feature(string, prob, cluster, case_stats, tag_stats)
|
view = string_feature(string, prob, cluster, case_stats, tag_stats)
|
||||||
self.views.append(view)
|
views.append(view)
|
||||||
|
|
||||||
|
flags = set()
|
||||||
for i, flag_feature in enumerate(flag_features):
|
for i, flag_feature in enumerate(flag_features):
|
||||||
if flag_feature(string, prob, case_stats, tag_stats):
|
if flag_feature(string, prob, case_stats, tag_stats):
|
||||||
self.flags |= (1 << i)
|
if (1 << i):
|
||||||
|
flags.add(i)
|
||||||
|
self._c = lexeme_init(string, prob, cluster, views, flags)
|
||||||
|
|
||||||
def __dealloc__(self):
|
def __dealloc__(self):
|
||||||
pass
|
lexeme_free(self._c)
|
||||||
|
|
||||||
|
property string:
|
||||||
|
def __get__(self):
|
||||||
|
cdef bytes utf8_string = self._c.string
|
||||||
|
cdef unicode string = utf8_string.decode('utf8')
|
||||||
|
return string
|
||||||
|
|
||||||
|
property prob:
|
||||||
|
def __get__(self): return self._c.prob
|
||||||
|
property cluster:
|
||||||
|
def __get__(self): return self._c.cluster
|
||||||
|
property length:
|
||||||
|
def __get__(self): return self._c.length
|
||||||
|
|
||||||
cpdef bint check_flag(self, size_t flag_id) except *:
|
cpdef bint check_flag(self, size_t flag_id) except *:
|
||||||
"""Lexemes may store language-specific boolean features in a bit-field,
|
"""Lexemes may store language-specific boolean features in a bit-field,
|
||||||
|
@ -80,7 +94,7 @@ cdef class Lexeme:
|
||||||
>>> lexeme.check_flag(EN.OFT_UPPER)
|
>>> lexeme.check_flag(EN.OFT_UPPER)
|
||||||
True
|
True
|
||||||
"""
|
"""
|
||||||
return self.flags & (1 << flag_id)
|
return lexeme_check_flag(self._c, flag_id)
|
||||||
|
|
||||||
cpdef unicode string_view(self, size_t view_id):
|
cpdef unicode string_view(self, size_t view_id):
|
||||||
"""Lexemes may store language-specific string-view features, obtained
|
"""Lexemes may store language-specific string-view features, obtained
|
||||||
|
@ -100,4 +114,4 @@ cdef class Lexeme:
|
||||||
>>> lexeme.string_view(EN.NON_SPARSE)
|
>>> lexeme.string_view(EN.NON_SPARSE)
|
||||||
u'Xxxx'
|
u'Xxxx'
|
||||||
"""
|
"""
|
||||||
return self.views[view_id]
|
return lexeme_string_view(self._c, view_id)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user