Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-13 10:46:29 +03:00
* Replace main lexicon dict with dense_hash_map. May be unsuitable if strings need recovery.
parent 8b20e9ad97
commit 4817277d66
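The change replaces the lexicon's Python dict, keyed by unicode strings, with Google's dense_hash_map keyed by a 64-bit hash of the raw Py_UNICODE buffer, so the hot path never builds a Python string object. The trade-off named in the message is that the hash is one-way: once the keys are hashes, the original strings cannot be recovered from the table. A minimal sketch of the extern declaration such code relies on, assuming the stock sparsehash headers (the repository's actual wrapper .pxd may differ):

    # distutils: language = c++
    # Sketch only: google::dense_hash_map as the diff assumes it behaves.
    # Probing a missing key default-constructs the value, so a size_t
    # payload comes back as 0 -- which is why get() can treat a NULL
    # pointer result as "not stored yet".
    cdef extern from "sparsehash/dense_hash_map" namespace "google":
        cdef cppclass dense_hash_map[K, V]:
            V& operator[](K& key)
            void set_empty_key(K& key)   # must be called before any insert

set_empty_key reserves one key value to mark empty buckets, which is why __cinit__ below calls self._dict.set_empty_key(0); a real key of 0 can never be stored afterwards.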
@@ -35,4 +35,4 @@ cdef class EnglishTokens(Tokens):
 
 
 cdef class English(Language):
-    cdef int _split_one(self, unicode word)
+    cdef int _split_one(self, Py_UNICODE* characters, size_t length)
@@ -233,8 +233,8 @@ cdef class English(Language):
         self._load_special_tokenization(rules)
         self.tokens_class = EnglishTokens
 
-    cdef int _split_one(self, unicode word):
-        cdef size_t length = len(word)
+    cdef int _split_one(self, Py_UNICODE* characters, size_t length):
+        cdef unicode word = characters[:length]
         cdef int i = 0
         if word.startswith("'s") or word.startswith("'S"):
             return 2
@@ -60,9 +60,9 @@ cdef class Lexicon:
     cpdef readonly size_t size
 
     cpdef Lexeme lookup(self, unicode string)
-    cdef size_t get(self, unicode string)
+    cdef size_t get(self, Py_UNICODE* characters, size_t length)
 
-    cdef dict _dict
+    cdef dense_hash_map[uint64_t, size_t] _dict
 
     cdef list _string_features
     cdef list _flag_features
@@ -79,4 +79,4 @@ cdef class Language:
     cpdef Lexeme lookup(self, unicode text)
 
     cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1
-    cdef int _split_one(self, unicode word)
+    cdef int _split_one(self, Py_UNICODE* characters, size_t length)
@@ -103,7 +103,6 @@ cdef class Language:
     cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1:
         cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
         cdef LexList* node = <LexList*>self.cache[hashed]
-        cdef size_t i = 0
         if node is not NULL:
             while node != NULL:
                 tokens.push_back(node.lex)
@@ -115,17 +114,17 @@ cdef class Language:
         cdef size_t start = 0
         cdef size_t split = 0
         while start < length:
-            split = start + self._split_one(characters[start:length])
-            node.lex = <LexemeC*>self.lexicon.get(characters[start:split])
+            split = self._split_one(&characters[start], length - start)
+            node.lex = <LexemeC*>self.lexicon.get(&characters[start], split)
             tokens.push_back(node.lex)
-            if split == length:
+            start += split
+            if start >= length:
                 break
-            hashed = hash64(&characters[split], (length - split) * sizeof(Py_UNICODE), 0)
+            hashed = hash64(&characters[start], (length - start) * sizeof(Py_UNICODE), 0)
             node.tail = <LexList*>self.cache[hashed]
             if node.tail == NULL:
                 node.tail = <LexList*>calloc(1, sizeof(LexList))
                 self.cache[hashed] = <size_t>node.tail
-                start = split
                 node = node.tail
             else:
                 node = node.tail
@@ -134,8 +133,8 @@ cdef class Language:
                 node = node.tail
             break
 
-    cdef int _split_one(self, unicode word):
-        return len(word)
+    cdef int _split_one(self, Py_UNICODE* characters, size_t length):
+        return length
 
     def _load_special_tokenization(self, token_rules):
         '''Load special-case tokenization rules.
@@ -156,10 +155,10 @@ cdef class Language:
            node = <LexList*>calloc(1, sizeof(LexList))
            self.cache[hashed] = <size_t>node
            for substring in substrings[:-1]:
-                node.lex = <LexemeC*>self.lexicon.get(substring)
+                node.lex = <LexemeC*>self.lexicon.get(<Py_UNICODE*>substring, len(substring))
                node.tail = <LexList*>calloc(1, sizeof(LexList))
                node = node.tail
-            node.lex = <LexemeC*>self.lexicon.get(substrings[-1])
+            node.lex = <LexemeC*>self.lexicon.get(<Py_UNICODE*>substrings[-1], len(substrings[-1]))
 
 
 cdef class Lexicon:
@@ -167,7 +166,7 @@ cdef class Lexicon:
                  string_features, flag_features):
         self._flag_features = flag_features
         self._string_features = string_features
-        self._dict = {}
+        self._dict.set_empty_key(0)
         self.size = 0
         cdef Lexeme word
         for string in words:
@@ -185,12 +184,13 @@ cdef class Lexicon:
             self._dict[string] = <size_t>lexeme
             self.size += 1
 
-    cdef size_t get(self, unicode string):
-        cdef LexemeC* lexeme
-        assert len(string) != 0
-        if string in self._dict:
-            return self._dict[string]
+    cdef size_t get(self, Py_UNICODE* characters, size_t length):
+        cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
+        cdef LexemeC* lexeme = <LexemeC*>self._dict[hashed]
+        if lexeme != NULL:
+            return <size_t>lexeme
 
+        cdef unicode string = characters[:length]
         views = [string_view(string, 0.0, 0, {}, {})
                  for string_view in self._string_features]
         flags = set()
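With this rewrite the lexicon only materialises a unicode object on a cache miss: the common case hashes the raw buffer, probes the map, and returns the stored LexemeC pointer as a size_t. The probe pattern in isolation, as a hedged sketch (hash64 is assumed to be the 64-bit hash function the module already imports; probe is an illustrative name, not part of the diff):

    cdef size_t probe(dense_hash_map[uint64_t, size_t]& table,
                      Py_UNICODE* characters, size_t length):
        # Key on the hash of the raw buffer; no Python object is created.
        cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
        # A miss yields a zero-initialised value, i.e. a NULL pointer.
        return table[hashed]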
@@ -199,7 +199,7 @@ cdef class Lexicon:
                 flags.add(i)
 
         lexeme = lexeme_init(string, 0, 0, views, flags)
-        self._dict[string] = <size_t>lexeme
+        self._dict[hashed] = <size_t>lexeme
         self.size += 1
         return <size_t>lexeme
 
@@ -212,5 +212,5 @@ cdef class Lexicon:
         Returns:
             lexeme (Lexeme): A reference to a lexical type.
         """
-        cdef size_t lexeme = self.get(string)
+        cdef size_t lexeme = self.get(<Py_UNICODE*>string, len(string))
         return Lexeme(lexeme)
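For callers nothing changes: lookup() keeps its unicode signature and bridges to the C-level API by borrowing the string's internal buffer via the <Py_UNICODE*> cast. A hypothetical round trip (constructor arguments abbreviated; names as in the diff):

    lexicon = Lexicon(words, string_features, flag_features)
    lex = lexicon.lookup(u"hello")    # hashes the buffer, inserts on miss
    lex2 = lexicon.lookup(u"hello")   # same hash, returns the cached LexemeC

Because the map's keys are now hashes rather than strings, the table itself can no longer be walked to enumerate the vocabulary, which is the "may be unsuitable if strings need recovery" caveat in the commit message.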