* Replace main lexicon dict with dense_hash_map. May be unsuitable if the original strings need to be recovered, since the map is keyed by 64-bit hashes rather than by the strings themselves.

This commit is contained in:
Matthew Honnibal 2014-09-12 04:29:09 +02:00
parent 8b20e9ad97
commit 4817277d66
4 changed files with 24 additions and 24 deletions

View File

@ -35,4 +35,4 @@ cdef class EnglishTokens(Tokens):
cdef class English(Language): cdef class English(Language):
cdef int _split_one(self, unicode word) cdef int _split_one(self, Py_UNICODE* characters, size_t length)

View File

@ -233,8 +233,8 @@ cdef class English(Language):
self._load_special_tokenization(rules) self._load_special_tokenization(rules)
self.tokens_class = EnglishTokens self.tokens_class = EnglishTokens
cdef int _split_one(self, unicode word): cdef int _split_one(self, Py_UNICODE* characters, size_t length):
cdef size_t length = len(word) cdef unicode word = characters[:length]
cdef int i = 0 cdef int i = 0
if word.startswith("'s") or word.startswith("'S"): if word.startswith("'s") or word.startswith("'S"):
return 2 return 2

View File

@ -60,9 +60,9 @@ cdef class Lexicon:
cpdef readonly size_t size cpdef readonly size_t size
cpdef Lexeme lookup(self, unicode string) cpdef Lexeme lookup(self, unicode string)
cdef size_t get(self, unicode string) cdef size_t get(self, Py_UNICODE* characters, size_t length)
cdef dict _dict cdef dense_hash_map[uint64_t, size_t] _dict
cdef list _string_features cdef list _string_features
cdef list _flag_features cdef list _flag_features
@ -79,4 +79,4 @@ cdef class Language:
cpdef Lexeme lookup(self, unicode text) cpdef Lexeme lookup(self, unicode text)
cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1 cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1
cdef int _split_one(self, unicode word) cdef int _split_one(self, Py_UNICODE* characters, size_t length)

View File

@ -103,7 +103,6 @@ cdef class Language:
cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1: cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1:
cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0) cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
cdef LexList* node = <LexList*>self.cache[hashed] cdef LexList* node = <LexList*>self.cache[hashed]
cdef size_t i = 0
if node is not NULL: if node is not NULL:
while node != NULL: while node != NULL:
tokens.push_back(node.lex) tokens.push_back(node.lex)
@ -115,17 +114,17 @@ cdef class Language:
cdef size_t start = 0 cdef size_t start = 0
cdef size_t split = 0 cdef size_t split = 0
while start < length: while start < length:
split = start + self._split_one(characters[start:length]) split = self._split_one(&characters[start], length - start)
node.lex = <LexemeC*>self.lexicon.get(characters[start:split]) node.lex = <LexemeC*>self.lexicon.get(&characters[start], split)
tokens.push_back(node.lex) tokens.push_back(node.lex)
if split == length: start += split
if start >= length:
break break
hashed = hash64(&characters[split], (length - split) * sizeof(Py_UNICODE), 0) hashed = hash64(&characters[start], (length - start) * sizeof(Py_UNICODE), 0)
node.tail = <LexList*>self.cache[hashed] node.tail = <LexList*>self.cache[hashed]
if node.tail == NULL: if node.tail == NULL:
node.tail = <LexList*>calloc(1, sizeof(LexList)) node.tail = <LexList*>calloc(1, sizeof(LexList))
self.cache[hashed] = <size_t>node.tail self.cache[hashed] = <size_t>node.tail
start = split
node = node.tail node = node.tail
else: else:
node = node.tail node = node.tail
@ -134,8 +133,8 @@ cdef class Language:
node = node.tail node = node.tail
break break
cdef int _split_one(self, unicode word): cdef int _split_one(self, Py_UNICODE* characters, size_t length):
return len(word) return length
def _load_special_tokenization(self, token_rules): def _load_special_tokenization(self, token_rules):
'''Load special-case tokenization rules. '''Load special-case tokenization rules.
@ -156,10 +155,10 @@ cdef class Language:
node = <LexList*>calloc(1, sizeof(LexList)) node = <LexList*>calloc(1, sizeof(LexList))
self.cache[hashed] = <size_t>node self.cache[hashed] = <size_t>node
for substring in substrings[:-1]: for substring in substrings[:-1]:
node.lex = <LexemeC*>self.lexicon.get(substring) node.lex = <LexemeC*>self.lexicon.get(<Py_UNICODE*>substring, len(substring))
node.tail = <LexList*>calloc(1, sizeof(LexList)) node.tail = <LexList*>calloc(1, sizeof(LexList))
node = node.tail node = node.tail
node.lex = <LexemeC*>self.lexicon.get(substrings[-1]) node.lex = <LexemeC*>self.lexicon.get(<Py_UNICODE*>substrings[-1], len(substrings[-1]))
cdef class Lexicon: cdef class Lexicon:
@ -167,7 +166,7 @@ cdef class Lexicon:
string_features, flag_features): string_features, flag_features):
self._flag_features = flag_features self._flag_features = flag_features
self._string_features = string_features self._string_features = string_features
self._dict = {} self._dict.set_empty_key(0)
self.size = 0 self.size = 0
cdef Lexeme word cdef Lexeme word
for string in words: for string in words:
@ -185,12 +184,13 @@ cdef class Lexicon:
self._dict[string] = <size_t>lexeme self._dict[string] = <size_t>lexeme
self.size += 1 self.size += 1
cdef size_t get(self, unicode string): cdef size_t get(self, Py_UNICODE* characters, size_t length):
cdef LexemeC* lexeme cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
assert len(string) != 0 cdef LexemeC* lexeme = <LexemeC*>self._dict[hashed]
if string in self._dict: if lexeme != NULL:
return self._dict[string] return <size_t>lexeme
cdef unicode string = characters[:length]
views = [string_view(string, 0.0, 0, {}, {}) views = [string_view(string, 0.0, 0, {}, {})
for string_view in self._string_features] for string_view in self._string_features]
flags = set() flags = set()
@ -199,7 +199,7 @@ cdef class Lexicon:
flags.add(i) flags.add(i)
lexeme = lexeme_init(string, 0, 0, views, flags) lexeme = lexeme_init(string, 0, 0, views, flags)
self._dict[string] = <size_t>lexeme self._dict[hashed] = <size_t>lexeme
self.size += 1 self.size += 1
return <size_t>lexeme return <size_t>lexeme
@ -212,5 +212,5 @@ cdef class Lexicon:
Returns: Returns:
lexeme (Lexeme): A reference to a lexical type. lexeme (Lexeme): A reference to a lexical type.
""" """
cdef size_t lexeme = self.get(string) cdef size_t lexeme = self.get(<Py_UNICODE*>string, len(string))
return Lexeme(lexeme) return Lexeme(lexeme)