From 126a8453a549837b44d2bfe99466f6f7814b79c8 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 12 Sep 2014 23:50:37 +0200
Subject: [PATCH] * Fix performance issues by implementing a better cache. Add own String struct to help

---
 spacy/en.pyx   |   4 +-
 spacy/lang.pxd |  14 ++++--
 spacy/lang.pyx | 132 +++++++++++++++++++++++++++++++------------------
 3 files changed, 95 insertions(+), 55 deletions(-)

diff --git a/spacy/en.pyx b/spacy/en.pyx
index 9d9f4f596..eb2486711 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -236,8 +236,8 @@ cdef class English(Language):
     fl_is_digit = Flag_IsDigit
     v_shape = View_WordShape
     def __cinit__(self, name, user_string_features, user_flag_features):
-        self.cache = {}
-        self.specials = {}
+        self.cache.set_empty_key(0)
+        self.specials.set_empty_key(0)
         lang_data = util.read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats,
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 094b3d580..5d9cab2d7 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -57,11 +57,17 @@ cdef extern from "sparsehash/dense_hash_map" namespace "google":
         D& operator[](K&) nogil
 
 
+cdef struct String:
+    Py_UNICODE* chars
+    size_t n
+    uint64_t key
+
+
 cdef class Lexicon:
     cpdef readonly size_t size
 
     cpdef Lexeme lookup(self, unicode string)
-    cdef size_t get(self, Py_UNICODE* characters, size_t length)
+    cdef size_t get(self, String* s)
 
     cdef dense_hash_map[uint64_t, size_t] _dict
 
@@ -71,13 +77,13 @@ cdef class Lexicon:
 
 cdef class Language:
     cdef unicode name
-    cdef dict cache
-    cdef dict specials
+    cdef dense_hash_map[uint64_t, size_t] cache
+    cdef dense_hash_map[uint64_t, size_t] specials
     cpdef readonly Lexicon lexicon
     cpdef readonly object tokens_class
 
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
-    cdef _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length)
+    cdef _tokenize(self, Tokens tokens, String* string)
     cdef int _split_one(self, Py_UNICODE* characters, size_t length)
 
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 9e57c63af..ae2d6472e 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -40,8 +40,8 @@ cdef class Language:
         if string_features is None:
             string_features = []
         self.name = name
-        self.cache = {}
-        self.specials = {}
+        self.cache.set_empty_key(0)
+        self.specials.set_empty_key(0)
         lang_data = read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
@@ -49,6 +49,12 @@
         self._load_special_tokenization(rules)
         self.tokens_class = Tokens
 
+    def __dealloc__(self):
+        cdef uint64_t hashed
+        cdef size_t lex_addr
+        for (hashed, lex_addr) in self.specials:
+            free(<LexemeC**>lex_addr)
+
     property nr_types:
         def __get__(self):
             """Return the number of lexical types in the vocabulary"""
@@ -87,49 +93,48 @@
         cdef size_t start = 0
         cdef size_t i = 0
-        cdef Py_UNICODE* characters = string
+        cdef Py_UNICODE* chars = string
         cdef Py_UNICODE c
-        assert Py_UNICODE_ISSPACE(' ') == 1
+        cdef String span
         for i in range(length):
-            c = characters[i]
+            c = chars[i]
             if Py_UNICODE_ISSPACE(c) == 1:
                 if start < i:
-                    self._tokenize(tokens, &characters[start], i - start)
+                    string_from_slice(&span, chars, start, i)
+                    self._tokenize(tokens, &span)
                 start = i + 1
             i += 1
         if start < i:
-            self._tokenize(tokens, &characters[start], i - start)
+            string_from_slice(&span, chars, start, i)
+            self._tokenize(tokens, &span)
         return tokens
 
-    cdef _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length):
-        cdef list lexemes
-        cdef size_t lex_addr
-        cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
-        if hashed in self.specials:
-            for lex_addr in self.specials[hashed]:
-                tokens.push_back(lex_addr)
+    cdef _tokenize(self, Tokens tokens, String* string):
+        cdef LexemeC** lexemes = <LexemeC**>self.specials[string.key]
+        if lexemes == NULL:
+            lexemes = <LexemeC**>self.cache[string.key]
+        if lexemes != NULL:
+            _extend_tokens(tokens, lexemes)
             return 0
+        cdef uint64_t hashed = string.key
 
-        if hashed in self.cache:
-            for lex_addr in self.cache[hashed]:
-                tokens.push_back(lex_addr)
-            return 0
-
-        lexemes = []
+        cdef size_t first_token = tokens.length
         cdef size_t start = 0
-        cdef size_t split = 0
+        cdef size_t length = string.n
+        cdef String prefix
         while start < length:
-            split = self._split_one(&characters[start], length - start)
-            piece_hash = hash64(&characters[start], split * sizeof(Py_UNICODE), 0)
-            if piece_hash in self.specials:
-                lexemes.extend(self.specials[piece_hash])
+            split = self._split_one(string.chars, string.n)
+            string_slice_prefix(string, &prefix, split)
+            lexemes = <LexemeC**>self.specials[prefix.key]
+            if lexemes != NULL:
+                _extend_tokens(tokens, lexemes)
             else:
-                lexeme = self.lexicon.get(&characters[start], split)
-                lexemes.append(lexeme)
-            start += split
-        for lex_addr in lexemes:
-            tokens.push_back(lex_addr)
-        self.cache[hashed] = lexemes
+                tokens.push_back(self.lexicon.get(&prefix))
+            start += prefix.n
+        lexemes = <LexemeC**>calloc(tokens.length - first_token + 1, sizeof(LexemeC*))
+        for i, j in enumerate(range(first_token, tokens.length)):
+            lexemes[i] = tokens.lexemes[j]
+        self.cache[hashed] = <size_t>lexemes
 
     cdef int _split_one(self, Py_UNICODE* characters, size_t length):
         return length
@@ -146,15 +151,25 @@
         token_rules (list): A list of (chunk, tokens) pairs, where chunk is a
            string and tokens is a list of strings.
        '''
-        cdef list lexemes
+        cdef LexemeC** lexemes
         cdef uint64_t hashed
-        for string, substrings in token_rules:
-            hashed = hash64(string, len(string) * sizeof(Py_UNICODE), 0)
-            lexemes = []
-            for substring in substrings:
-                lexemes.append(self.lexicon.get(substring, len(substring)))
-            self.specials[hashed] = lexemes
-
+        cdef String string
+        for uni_string, substrings in token_rules:
+            lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
+            for i, substring in enumerate(substrings):
+                string_from_unicode(&string, substring)
+                lexemes[i] = <LexemeC*>self.lexicon.get(&string)
+            lexemes[i + 1] = NULL
+            string_from_unicode(&string, uni_string)
+            self.specials[string.key] = <size_t>lexemes
+
+
+cdef _extend_tokens(Tokens tokens, LexemeC** lexemes):
+    cdef size_t i = 0
+    while lexemes[i] != NULL:
+        tokens.push_back(lexemes[i])
+        i += 1
+
 
 cdef class Lexicon:
     def __cinit__(self, words, probs, clusters, case_stats, tag_stats,
@@ -179,26 +194,25 @@ cdef class Lexicon:
             self._dict[string] = lexeme
             self.size += 1
 
-    cdef size_t get(self, Py_UNICODE* characters, size_t length):
-        cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
-        cdef LexemeC* lexeme = <LexemeC*>self._dict[hashed]
+    cdef size_t get(self, String* string):
+        cdef LexemeC* lexeme = <LexemeC*>self._dict[string.key]
         if lexeme != NULL:
             return <size_t>lexeme
 
-        cdef unicode string = characters[:length]
-        views = [string_view(string, 0.0, 0, {}, {})
+        cdef unicode uni_string = string.chars[:string.n]
+        views = [string_view(uni_string, 0.0, 0, {}, {})
                  for string_view in self._string_features]
         flags = set()
         for i, flag_feature in enumerate(self._flag_features):
-            if flag_feature(string, 0.0, {}, {}):
+            if flag_feature(uni_string, 0.0, {}, {}):
                 flags.add(i)
-        lexeme = lexeme_init(string, 0, 0, views, flags)
-        self._dict[hashed] = <size_t>lexeme
+        lexeme = lexeme_init(uni_string, 0, 0, views, flags)
+        self._dict[string.key] = <size_t>lexeme
         self.size += 1
         return <size_t>lexeme
 
-    cpdef Lexeme lookup(self, unicode string):
+    cpdef Lexeme lookup(self, unicode uni_string):
        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
 
        Args
@@ -207,5 +221,25 @@
        Returns:
            lexeme (Lexeme): A reference to a lexical type.
        """
-        cdef size_t lexeme = self.get(string, len(string))
+        cdef String string
+        string_from_unicode(&string, uni_string)
+        cdef size_t lexeme = self.get(&string)
         return Lexeme(lexeme)
+
+
+cdef string_from_unicode(String* s, unicode uni):
+    string_from_slice(s, uni, 0, len(uni))
+
+
+cdef string_from_slice(String* s, Py_UNICODE* chars, size_t start, size_t end):
+    s.chars = &chars[start]
+    s.n = end - start
+    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
+
+
+cdef string_slice_prefix(String* s, String* prefix, size_t n):
+    assert s.n >= n
+    string_from_slice(prefix, s.chars, 0, n)
+    s.chars += n
+    s.n -= n
+    s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
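
The sketches below are illustrative companions to the patch, not part of it. Every name in them (Span, slice_prefix, tokenize_span, load_specials) is an invented stand-in; Python's built-in hash() stands in for the MurmurHash-based hash64(), and plain dicts and lists stand in for the dense_hash_map tables and the NULL-terminated LexemeC* arrays.

The point of the new String struct is to hash a substring once and carry the 64-bit key alongside the (pointer, length) view, so both hash tables can be probed without re-hashing or copying characters. A minimal pure-Python sketch of String, string_from_slice, and string_slice_prefix:

class Span:
    """A view of text[start:end] plus its precomputed key (cf. struct String)."""
    def __init__(self, text, start, end):
        self.text = text
        self.start = start
        self.end = end
        self.key = hash(text[start:end])  # cf. hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)

    @property
    def n(self):
        # Number of characters left in the view, like String.n.
        return self.end - self.start


def slice_prefix(span, n):
    """Split off the first n characters and advance the span past them
    (cf. string_slice_prefix); the remainder gets a fresh key."""
    assert span.n >= n
    prefix = Span(span.text, span.start, span.start + n)
    span.start += n
    span.key = hash(span.text[span.start:span.end])
    return prefix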
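With keys precomputed, the rewritten _tokenize memoises whole whitespace-delimited chunks: probe specials, then the cache; on a miss, split the chunk left-to-right, splicing in any special-cased piece, then record the full lexeme sequence under the chunk's key so the next occurrence is a single lookup. A rough Python equivalent of that control flow, reusing the Span sketch above:

def tokenize_span(span, tokens, specials, cache, lexicon, split_one):
    cached = specials.get(span.key)
    if cached is None:
        cached = cache.get(span.key)
    if cached is not None:
        tokens.extend(cached)               # hit: emit without re-splitting
        return
    key = span.key                          # save before the span is consumed
    first = len(tokens)
    while span.n > 0:
        n = split_one(span)                 # length of the next piece
        prefix = slice_prefix(span, n)
        special = specials.get(prefix.key)
        if special is not None:
            tokens.extend(special)          # special-cased piece, e.g. "n't"
        else:
            tokens.append(lexicon(prefix))  # ordinary lexicon lookup
    cache[key] = tokens[first:]             # memoise the whole chunk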
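The specials table is populated once from the language's tokenization rules, which is what lets the loop above splice in precomputed sequences. A toy end-to-end run under the same stand-ins, using a trivial split_one (take the whole remainder, like the base Language._split_one) and a lexicon that just returns the piece's text:

def load_specials(token_rules, specials, lexicon):
    """Cf. _load_special_tokenization: key each special-cased chunk
    to its precomputed lexeme sequence."""
    for chunk, substrings in token_rules:
        specials[hash(chunk)] = [lexicon(Span(s, 0, len(s))) for s in substrings]


def lexicon(span):
    return span.text[span.start:span.end]  # toy lexeme id: the text itself


specials, cache, tokens = {}, {}, []
load_specials([("don't", ["do", "n't"])], specials, lexicon)
for chunk in "I don't care".split():
    tokenize_span(Span(chunk, 0, len(chunk)), tokens, specials, cache,
                  lexicon, split_one=lambda span: span.n)
print(tokens)  # ['I', 'do', "n't", 'care']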