Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 18:26:30 +03:00)
* Changed cache to use a linked-list data structure, to take out Python list code. Taking 6-7 mins for gigaword.
This commit is contained in:
parent 51e2006a65
commit a4863686ec
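The change swaps the cached LexemeC** arrays (NULL-terminated blocks sized with calloc) for chains of LexList nodes, so a cache entry can be walked and extended at the C level without any Python list code. As orientation before the diff, here is a pure-Python sketch of that linked-list shape; the names are illustrative stand-ins, not the spaCy API.

class LexList(object):
    """Stand-in for the cdef struct added below: one lexeme plus the rest of the chain."""
    def __init__(self, lex=None, tail=None):
        self.lex = lex      # stands in for the LexemeC* payload
        self.tail = tail    # next node, or None at the end of the chain


def walk(node):
    # The cache-hit path: replay the chain in order, building no lists.
    while node is not None:
        yield node.lex
        node = node.tail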
@@ -2,6 +2,7 @@ from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
 from spacy.word cimport Lexeme
 from spacy.tokens cimport Tokens
+from spacy.lexeme cimport LexemeC
 
 from libcpp.utility cimport pair
 from libcpp.vector cimport vector
@@ -51,6 +52,10 @@ cdef extern from "sparsehash/dense_hash_map" namespace "google":
         D& operator[](K&) nogil
 
 
+cdef struct LexList:
+    LexemeC* lex
+    LexList* tail
+
 cdef class Lexicon:
     cpdef readonly size_t size
 
@@ -102,27 +102,37 @@ cdef class Language:
 
     cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1:
         cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
+        cdef LexList* node = <LexList*>self.cache[hashed]
         cdef size_t i = 0
-        cdef LexemeC** lexemes = <LexemeC**>self.cache[hashed]
-        if lexemes is not NULL:
-            while lexemes[i] != NULL:
-                tokens.push_back(lexemes[i])
-                i += 1
+        if node is not NULL:
+            while node != NULL:
+                tokens.push_back(node.lex)
+                node = node.tail
             return 0
 
-        cdef unicode string = characters[:length]
-        cdef list substrings = self._split(string)
-        lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
-        cdef unicode substring
-        for i, substring in enumerate(substrings):
-            lexemes[i] = <LexemeC*>self.lexicon.get(substring)
-            tokens.push_back(lexemes[i])
-            lexemes[i + 1] = NULL
-        if self.cache_size < 10000000:
-            self.cache[hashed] = <size_t>lexemes
-            self.cache_size += 1
-        else:
-            free(lexemes)
+        node = <LexList*>calloc(1, sizeof(LexList))
+        self.cache[hashed] = <size_t>node
+        cdef size_t start = 0
+        cdef size_t split = 0
+        while start < length:
+            split = start + self._split_one(characters[start:length])
+            node.lex = <LexemeC*>self.lexicon.get(characters[start:split])
+            tokens.push_back(node.lex)
+            if split == length:
+                break
+            hashed = hash64(&characters[split], (length - split) * sizeof(Py_UNICODE), 0)
+            node.tail = <LexList*>self.cache[hashed]
+            if node.tail == NULL:
+                node.tail = <LexList*>calloc(1, sizeof(LexList))
+                self.cache[hashed] = <size_t>node.tail
+                start = split
+                node = node.tail
+            else:
+                node = node.tail
+                while node != NULL:
+                    tokens.push_back(node.lex)
+                    node = node.tail
+                break
 
     cdef list _split(self, unicode string):
         """Find how to split a contiguous span of non-space characters into substrings.
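On a cache miss, the rewritten _tokenize builds the chain one split at a time: before each further split it re-hashes the remaining suffix of the chunk, and if that suffix is already cached it splices the existing chain onto the node it just filled and replays it; otherwise it registers a fresh tail node under the suffix hash and keeps going. (The old self.cache_size cap bookkeeping is dropped in this version.) Below is a pure-Python sketch of that suffix-sharing logic, reusing the LexList/walk stand-ins above, with a plain dict keyed by strings in place of the hash64-keyed dense_hash_map; split_one and get_lexeme are hypothetical stand-ins for self._split_one and self.lexicon.get.

def tokenize(string, cache, split_one, get_lexeme):
    # Pure-Python sketch of the new miss path: build the chain node by node,
    # registering every remaining suffix of the chunk in the cache as we go.
    tokens = []
    node = cache.get(string)
    if node is not None:                # full-string cache hit: just replay it
        for lex in walk(node):
            tokens.append(lex)
        return tokens
    node = LexList()
    cache[string] = node
    start = 0
    while start < len(string):
        split = start + split_one(string[start:])
        node.lex = get_lexeme(string[start:split])
        tokens.append(node.lex)
        if split == len(string):
            break
        suffix = string[split:]
        tail = cache.get(suffix)
        if tail is None:                # unseen suffix: extend the chain
            node.tail = LexList()
            cache[suffix] = node.tail
            node = node.tail
            start = split
        else:                           # known suffix: splice it in and replay
            node.tail = tail
            for lex in walk(tail):
                tokens.append(lex)
            break
    return tokens

With a split_one that peels off one character at a time, tokenizing "abc" leaves chains cached under "abc", "bc", and "c", so any later chunk ending in "bc" reuses that tail instead of being re-split.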
@@ -161,16 +171,17 @@ cdef class Language:
         token_rules (list): A list of (chunk, tokens) pairs, where chunk is
             a string and tokens is a list of strings.
         '''
-        cdef LexemeC** lexemes
+        cdef LexList* node
        cdef uint64_t hashed
        for string, substrings in token_rules:
-            lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
-            for i, substring in enumerate(substrings):
-                lexemes[i] = <LexemeC*>self.lexicon.get(substring)
-            lexemes[i + 1] = NULL
            hashed = hash64(<Py_UNICODE*>string, len(string) * sizeof(Py_UNICODE), 0)
-            self.cache[hashed] = <size_t>lexemes
-            self.cache_size += 1
+            node = <LexList*>calloc(1, sizeof(LexList))
+            self.cache[hashed] = <size_t>node
+            for substring in substrings[:-1]:
+                node.lex = <LexemeC*>self.lexicon.get(substring)
+                node.tail = <LexList*>calloc(1, sizeof(LexList))
+                node = node.tail
+            node.lex = <LexemeC*>self.lexicon.get(substrings[-1])
 
 
 cdef class Lexicon:
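load_tokenization now pre-seeds the same cache with the special-case rules: each (chunk, substrings) pair becomes a chain with one node per substring, stored under the hash of the whole chunk. A pure-Python sketch in the same style; the example rule at the end is hypothetical, not taken from the diff.

def load_rule(cache, chunk, substrings, get_lexeme):
    # One LexList node per substring, linked in order and stored under the chunk.
    head = LexList()
    node = head
    for substring in substrings[:-1]:
        node.lex = get_lexeme(substring)
        node.tail = LexList()
        node = node.tail
    node.lex = get_lexeme(substrings[-1])
    cache[chunk] = head
    return head

# Hypothetical rule: the chunk "don't" split into ["do", "n't"].
# load_rule(cache, "don't", ["do", "n't"], get_lexeme)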