* Changed cache to use a linked-list data structure, to take the Python list code out of the hot path. Takes 6-7 minutes for gigaword.

This commit is contained in:
Matthew Honnibal 2014-09-12 03:30:50 +02:00
parent 51e2006a65
commit a4863686ec
2 changed files with 42 additions and 26 deletions

View File

@@ -2,6 +2,7 @@ from libc.stdint cimport uint32_t
from libc.stdint cimport uint64_t from libc.stdint cimport uint64_t
from spacy.word cimport Lexeme from spacy.word cimport Lexeme
from spacy.tokens cimport Tokens from spacy.tokens cimport Tokens
from spacy.lexeme cimport LexemeC
from libcpp.utility cimport pair from libcpp.utility cimport pair
from libcpp.vector cimport vector from libcpp.vector cimport vector
@@ -51,6 +52,10 @@ cdef extern from "sparsehash/dense_hash_map" namespace "google":
D& operator[](K&) nogil D& operator[](K&) nogil
cdef struct LexList:
LexemeC* lex
LexList* tail
cdef class Lexicon: cdef class Lexicon:
cpdef readonly size_t size cpdef readonly size_t size

View File

@@ -102,27 +102,37 @@ cdef class Language:
cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1: cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1:
cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0) cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
cdef LexList* node = <LexList*>self.cache[hashed]
cdef size_t i = 0 cdef size_t i = 0
cdef LexemeC** lexemes = <LexemeC**>self.cache[hashed] if node is not NULL:
if lexemes is not NULL: while node != NULL:
while lexemes[i] != NULL: tokens.push_back(node.lex)
tokens.push_back(lexemes[i]) node = node.tail
i += 1
return 0 return 0
cdef unicode string = characters[:length] node = <LexList*>calloc(1, sizeof(LexList))
cdef list substrings = self._split(string) self.cache[hashed] = <size_t>node
lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*)) cdef size_t start = 0
cdef unicode substring cdef size_t split = 0
for i, substring in enumerate(substrings): while start < length:
lexemes[i] = <LexemeC*>self.lexicon.get(substring) split = start + self._split_one(characters[start:length])
tokens.push_back(lexemes[i]) node.lex = <LexemeC*>self.lexicon.get(characters[start:split])
lexemes[i + 1] = NULL tokens.push_back(node.lex)
if self.cache_size < 10000000: if split == length:
self.cache[hashed] = <size_t>lexemes break
self.cache_size += 1 hashed = hash64(&characters[split], (length - split) * sizeof(Py_UNICODE), 0)
else: node.tail = <LexList*>self.cache[hashed]
free(lexemes) if node.tail == NULL:
node.tail = <LexList*>calloc(1, sizeof(LexList))
self.cache[hashed] = <size_t>node.tail
start = split
node = node.tail
else:
node = node.tail
while node != NULL:
tokens.push_back(node.lex)
node = node.tail
break
cdef list _split(self, unicode string): cdef list _split(self, unicode string):
"""Find how to split a contiguous span of non-space characters into substrings. """Find how to split a contiguous span of non-space characters into substrings.
@@ -161,16 +171,17 @@ cdef class Language:
token_rules (list): A list of (chunk, tokens) pairs, where chunk is token_rules (list): A list of (chunk, tokens) pairs, where chunk is
a string and tokens is a list of strings. a string and tokens is a list of strings.
''' '''
cdef LexemeC** lexemes cdef LexList* node
cdef uint64_t hashed cdef uint64_t hashed
for string, substrings in token_rules: for string, substrings in token_rules:
lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
for i, substring in enumerate(substrings):
lexemes[i] = <LexemeC*>self.lexicon.get(substring)
lexemes[i + 1] = NULL
hashed = hash64(<Py_UNICODE*>string, len(string) * sizeof(Py_UNICODE), 0) hashed = hash64(<Py_UNICODE*>string, len(string) * sizeof(Py_UNICODE), 0)
self.cache[hashed] = <size_t>lexemes node = <LexList*>calloc(1, sizeof(LexList))
self.cache_size += 1 self.cache[hashed] = <size_t>node
for substring in substrings[:-1]:
node.lex = <LexemeC*>self.lexicon.get(substring)
node.tail = <LexList*>calloc(1, sizeof(LexList))
node = node.tail
node.lex = <LexemeC*>self.lexicon.get(substrings[-1])
cdef class Lexicon: cdef class Lexicon: