Mirror of https://github.com/explosion/spaCy.git
Synced 2025-01-12 18:26:30 +03:00
* Move special tokenization into its own lookup table, away from the cache.
commit 9298e36b36
parent 985bc68327
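Before this change the hand-written special-case tokenizations were written into the same `cache` dict that memoizes the generic splitter's output, which is also why the general cache write was left commented out. The diff gives the special cases their own `specials` dict that is consulted first, renames the per-piece hash to `piece_hash` so it no longer clobbers the whole-string `hashed`, and re-enables the cache write. A minimal pure-Python sketch of the resulting lookup order follows; the `Tokenizer` class and its method names are illustrative only, not spaCy's API.

class Tokenizer:
    def __init__(self, special_rules):
        # Hand-written tokenizations, e.g. {"don't": ["do", "n't"]}.
        # Kept in their own table so that clearing or repopulating the
        # cache can never clobber a rule.
        self.specials = dict(special_rules)
        # Memoized results of the generic splitting algorithm.
        self.cache = {}

    def tokenize_chunk(self, chunk):
        # 1. Special cases win outright.
        if chunk in self.specials:
            return self.specials[chunk]
        # 2. Then previously computed results.
        if chunk in self.cache:
            return self.cache[chunk]
        # 3. Otherwise run the generic splitter and memoize the result in
        #    the cache, never in specials, which holds only the rules.
        pieces = self._split(chunk)
        self.cache[chunk] = pieces
        return pieces

    def _split(self, chunk):
        # Stand-in for the real affix-splitting loop.
        return [chunk]


tokenizer = Tokenizer({"don't": ["do", "n't"]})
assert tokenizer.tokenize_chunk("don't") == ["do", "n't"]
assert tokenizer.tokenize_chunk("cats") == ["cats"]
assert "cats" in tokenizer.cache and "cats" not in tokenizer.specials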
|
@@ -237,6 +237,7 @@ cdef class English(Language):
     v_shape = View_WordShape
     def __cinit__(self, name, user_string_features, user_flag_features):
         self.cache = {}
+        self.specials = {}
         lang_data = util.read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats,

@@ -72,6 +72,7 @@ cdef class Lexicon:
 cdef class Language:
     cdef unicode name
     cdef dict cache
+    cdef dict specials
     cpdef readonly Lexicon lexicon
     cpdef readonly object tokens_class

@@ -41,6 +41,7 @@ cdef class Language:
         string_features = []
         self.name = name
         self.cache = {}
+        self.specials = {}
         lang_data = read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,

@@ -88,6 +89,7 @@ cdef class Language:
         cdef size_t i = 0
         cdef Py_UNICODE* characters = string
         cdef Py_UNICODE c
+        assert Py_UNICODE_ISSPACE(' ') == 1
         for i in range(length):
             c = characters[i]
             if Py_UNICODE_ISSPACE(c) == 1:

@@ -103,6 +105,11 @@ cdef class Language:
         cdef list lexemes
         cdef size_t lex_addr
         cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
+        if hashed in self.specials:
+            for lex_addr in self.specials[hashed]:
+                tokens.push_back(<LexemeC*>lex_addr)
+            return 0
+
         if hashed in self.cache:
             for lex_addr in self.cache[hashed]:
                 tokens.push_back(<LexemeC*>lex_addr)

@@ -113,16 +120,16 @@ cdef class Language:
         cdef size_t split = 0
         while start < length:
             split = self._split_one(&characters[start], length - start)
-            hashed = hash64(&characters[start], split * sizeof(Py_UNICODE), 0)
-            if hashed in self.cache:
-                lexemes.extend(self.cache[hashed])
+            piece_hash = hash64(&characters[start], split * sizeof(Py_UNICODE), 0)
+            if piece_hash in self.specials:
+                lexemes.extend(self.specials[piece_hash])
             else:
                 lexeme = <LexemeC*>self.lexicon.get(&characters[start], split)
                 lexemes.append(<size_t>lexeme)
             start += split
         for lex_addr in lexemes:
             tokens.push_back(<LexemeC*>lex_addr)
-        #self.cache[hashed] = lexemes
+        self.cache[hashed] = lexemes

     cdef int _split_one(self, Py_UNICODE* characters, size_t length):
         return length

@@ -146,7 +153,7 @@ cdef class Language:
         lexemes = []
         for substring in substrings:
             lexemes.append(self.lexicon.get(<Py_UNICODE*>substring, len(substring)))
-        self.cache[hashed] = lexemes
+        self.specials[hashed] = lexemes


 cdef class Lexicon: