From 4817277d66fe8ce22710aa99f412c8ef8b2a0ac7 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 12 Sep 2014 04:29:09 +0200
Subject: [PATCH] * Replace main lexicon dict with dense_hash_map. May be
 unsuitable, if strings need recovery.

---
 spacy/en.pxd   |  2 +-
 spacy/en.pyx   |  4 ++--
 spacy/lang.pxd |  6 +++---
 spacy/lang.pyx | 36 ++++++++++++++++++------------------
 4 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/spacy/en.pxd b/spacy/en.pxd
index caf17227a..0af541847 100644
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@@ -35,4 +35,4 @@ cdef class EnglishTokens(Tokens):
 
 
 cdef class English(Language):
-    cdef int _split_one(self, unicode word)
+    cdef int _split_one(self, Py_UNICODE* characters, size_t length)
diff --git a/spacy/en.pyx b/spacy/en.pyx
index dc8465fc9..ff9499db6 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -233,8 +233,8 @@ cdef class English(Language):
         self._load_special_tokenization(rules)
         self.tokens_class = EnglishTokens
 
-    cdef int _split_one(self, unicode word):
-        cdef size_t length = len(word)
+    cdef int _split_one(self, Py_UNICODE* characters, size_t length):
+        cdef unicode word = characters[:length]
         cdef int i = 0
         if word.startswith("'s") or word.startswith("'S"):
             return 2
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 6e5bda97a..c32cb0c41 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -60,9 +60,9 @@ cdef class Lexicon:
     cpdef readonly size_t size
 
     cpdef Lexeme lookup(self, unicode string)
-    cdef size_t get(self, unicode string)
+    cdef size_t get(self, Py_UNICODE* characters, size_t length)
 
-    cdef dict _dict
+    cdef dense_hash_map[uint64_t, size_t] _dict
 
     cdef list _string_features
     cdef list _flag_features
@@ -79,4 +79,4 @@ cdef class Language:
     cpdef Lexeme lookup(self, unicode text)
 
     cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1
-    cdef int _split_one(self, unicode word)
+    cdef int _split_one(self, Py_UNICODE* characters, size_t length)
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index d7707e37b..93bc9d682 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -103,7 +103,6 @@ cdef class Language:
     cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1:
         cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
         cdef LexList* node = <LexList*>self.cache[hashed]
-        cdef size_t i = 0
         if node is not NULL:
             while node != NULL:
                 tokens.push_back(node.lex)
@@ -115,17 +114,17 @@ cdef class Language:
         cdef size_t start = 0
         cdef size_t split = 0
         while start < length:
-            split = start + self._split_one(characters[start:length])
-            node.lex = self.lexicon.get(characters[start:split])
+            split = self._split_one(&characters[start], length - start)
+            node.lex = self.lexicon.get(&characters[start], split)
             tokens.push_back(node.lex)
-            if split == length:
+            start += split
+            if start >= length:
                 break
-            hashed = hash64(&characters[split], (length - split) * sizeof(Py_UNICODE), 0)
+            hashed = hash64(&characters[start], (length - start) * sizeof(Py_UNICODE), 0)
             node.tail = <LexList*>self.cache[hashed]
            if node.tail == NULL:
                 node.tail = <LexList*>calloc(1, sizeof(LexList))
                 self.cache[hashed] = <size_t>node.tail
-            start = split
             node = node.tail
         else:
             node = node.tail
@@ -134,8 +133,8 @@ cdef class Language:
                 node = node.tail
             break
 
-    cdef int _split_one(self, unicode word):
-        return len(word)
+    cdef int _split_one(self, Py_UNICODE* characters, size_t length):
+        return length
 
     def _load_special_tokenization(self, token_rules):
         '''Load special-case tokenization rules.
@@ -156,10 +155,10 @@ cdef class Language:
         node = <LexList*>calloc(1, sizeof(LexList))
         self.cache[hashed] = <size_t>node
         for substring in substrings[:-1]:
-            node.lex = self.lexicon.get(substring)
+            node.lex = self.lexicon.get(substring, len(substring))
             node.tail = <LexList*>calloc(1, sizeof(LexList))
             node = node.tail
-        node.lex = self.lexicon.get(substrings[-1])
+        node.lex = self.lexicon.get(substrings[-1], len(substrings[-1]))
 
 
 cdef class Lexicon:
@@ -167,7 +166,7 @@
                  string_features, flag_features):
         self._flag_features = flag_features
         self._string_features = string_features
-        self._dict = {}
+        self._dict.set_empty_key(0)
         self.size = 0
         cdef Lexeme word
         for string in words:
@@ -185,12 +184,13 @@
             self._dict[string] = lexeme
             self.size += 1
 
-    cdef size_t get(self, unicode string):
-        cdef LexemeC* lexeme
-        assert len(string) != 0
-        if string in self._dict:
-            return self._dict[string]
+    cdef size_t get(self, Py_UNICODE* characters, size_t length):
+        cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
+        cdef LexemeC* lexeme = <LexemeC*>self._dict[hashed]
+        if lexeme != NULL:
+            return <size_t>lexeme
+        cdef unicode string = characters[:length]
 
         views = [string_view(string, 0.0, 0, {}, {})
                  for string_view in self._string_features]
         flags = set()
@@ -199,7 +199,7 @@
                 flags.add(i)
 
         lexeme = lexeme_init(string, 0, 0, views, flags)
-        self._dict[string] = <size_t>lexeme
+        self._dict[hashed] = <size_t>lexeme
         self.size += 1
         return <size_t>lexeme
@@ -212,5 +212,5 @@
         Returns:
             lexeme (Lexeme): A reference to a lexical type.
         """
-        cdef size_t lexeme = self.get(string)
+        cdef size_t lexeme = self.get(string, len(string))
         return Lexeme(lexeme)
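
The pattern this patch introduces is compact enough to see in isolation: key the table on a
64-bit hash of the raw Py_UNICODE buffer, reserve 0 as dense_hash_map's empty key, and pass
struct pointers through the size_t value slot. The sketch below is a minimal, self-contained
illustration of that pattern, not spaCy's code. Entry, HashedLexicon, and hash_chars are
hypothetical stand-ins (the patch's hash64 is a MurmurHash wrapper and its values are LexemeC
structs); only the dense_hash_map calls, set_empty_key and operator[], are the real sparsehash
API. It assumes a C++ build and a sparsehash header on the include path.

    # distutils: language = c++
    # hash_lexicon_sketch.pyx -- illustrative only; all names are hypothetical.
    from libc.stdint cimport uint64_t
    from libc.stdlib cimport calloc
    from libc.string cimport memcpy

    cdef extern from "sparsehash/dense_hash_map" namespace "google":
        cdef cppclass dense_hash_map[K, V]:
            V& operator[](K key)          # default-inserts V() == 0 on a miss
            void set_empty_key(K key)     # must be called before first insert

    cdef uint64_t hash_chars(Py_UNICODE* chars, size_t length):
        # FNV-1a over the raw character buffer; a stand-in for the patch's hash64.
        cdef unsigned char* data = <unsigned char*>chars
        cdef uint64_t h = <uint64_t>0xcbf29ce484222325
        cdef size_t i
        for i in range(length * sizeof(Py_UNICODE)):
            h = (h ^ data[i]) * <uint64_t>0x100000001b3
        return h

    cdef struct Entry:
        Py_UNICODE* chars   # owned copy: the hash key alone cannot recover it
        size_t length

    cdef class HashedLexicon:
        cdef dense_hash_map[uint64_t, size_t] _dict
        cdef readonly size_t size

        def __cinit__(self):
            # 0 is reserved as the empty key, so a stored value of 0 (NULL)
            # can double as "not present" below. This assumes the hash
            # function never yields 0 for a real string.
            self._dict.set_empty_key(0)
            self.size = 0

        cdef Entry* get(self, Py_UNICODE* characters, size_t length):
            cdef uint64_t hashed = hash_chars(characters, length)
            # operator[] returns 0 for unseen keys; cast the size_t back
            # to a pointer, as the patch does with LexemeC*.
            cdef Entry* entry = <Entry*>self._dict[hashed]
            if entry != NULL:
                return entry
            entry = <Entry*>calloc(1, sizeof(Entry))
            entry.chars = <Py_UNICODE*>calloc(length, sizeof(Py_UNICODE))
            memcpy(entry.chars, characters, length * sizeof(Py_UNICODE))
            entry.length = length
            self._dict[hashed] = <size_t>entry   # pointer stored as size_t
            self.size += 1
            return entry

        cpdef unicode lookup(self, unicode string):
            # Cython coerces the unicode object to a borrowed Py_UNICODE*.
            cdef Entry* entry = self.get(string, len(string))
            return entry.chars[:entry.length]

The sketch makes the commit message's caveat concrete: dense_hash_map never stores the
characters, only their hash, so anything that needs the string back must carry its own copy
in the value struct (here Entry.chars; in the patch, whatever lexeme_init copies into
LexemeC). Two strings with the same 64-bit hash would also silently share one entry.
Deallocation and error handling are omitted for brevity.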