* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string

Matthew Honnibal 2015-07-13 21:46:02 +02:00
parent 6eef0bf9ab
commit 67641f3b58
4 changed files with 49 additions and 34 deletions
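
In short: instead of handing the raw string (spaces included) to the Doc and threading a character offset (idx) through every push_back call, the tokenizer now records a boolean spacy field on each TokenC, meaning "this token is followed by a single space", and Doc.push_back derives each token's character offset from the previous token. The sketch below is a plain-Python illustration of that bookkeeping, using made-up SketchDoc/SketchToken classes rather than spaCy's actual types.

# Minimal plain-Python sketch of the new contract (illustrative only, not
# spaCy's API): each token carries a trailing-space flag, and character
# offsets are derived from the previous token instead of being passed in.
class SketchToken:
    def __init__(self, text, idx, spacy):
        self.text = text    # token string (stands in for t.lex)
        self.idx = idx      # character offset into the original string
        self.spacy = spacy  # True if a single space follows this token

class SketchDoc:
    def __init__(self):
        self.tokens = []

    def push_back(self, text, has_space):
        if not self.tokens:
            idx = 0
        else:
            prev = self.tokens[-1]
            # Mirrors: t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
            idx = prev.idx + len(prev.text) + prev.spacy
        self.tokens.append(SketchToken(text, idx, has_space))
        # Mirrors: return t.idx + t.lex.length + t.spacy
        return idx + len(text) + has_space

    def text(self):
        # The spacy flags are enough to reproduce the original string.
        return ''.join(t.text + ' ' * t.spacy for t in self.tokens)

doc = SketchDoc()
for word, space in [('Hello', False), (',', True), ('world', False)]:
    doc.push_back(word, space)
assert [t.idx for t in doc.tokens] == [0, 5, 7]
assert doc.text() == 'Hello, world'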


@@ -29,7 +29,7 @@ cdef class Tokenizer:
     cpdef Doc tokens_from_list(self, list strings)
 
-    cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1
+    cdef int _try_cache(self, hash_t key, Doc tokens) except -1
     cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1
     cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes,
                                 vector[LexemeC*] *suffixes) except NULL


@@ -39,16 +39,17 @@ cdef class Tokenizer:
         return cls(vocab, rules, prefix_re, suffix_re, infix_re)
 
     cpdef Doc tokens_from_list(self, list strings):
-        cdef int length = sum([len(s) for s in strings])
-        cdef Doc tokens = Doc(self.vocab, ' '.join(strings))
-        if length == 0:
+        cdef Doc tokens = Doc(self.vocab)
+        if sum([len(s) for s in strings]) == 0:
             return tokens
         cdef UniStr string_struct
         cdef unicode py_string
         cdef int idx = 0
         for i, py_string in enumerate(strings):
             slice_unicode(&string_struct, py_string, 0, len(py_string))
-            tokens.push_back(idx, <const LexemeC*>self.vocab.get(tokens.mem, &string_struct))
+            # Note that we pass tokens.mem here --- the Doc object has ownership
+            tokens.push_back(
+                <const LexemeC*>self.vocab.get(tokens.mem, &string_struct), True)
             idx += len(py_string) + 1
         return tokens
@@ -73,7 +74,7 @@ cdef class Tokenizer:
             tokens (Doc): A Doc object, giving access to a sequence of LexemeCs.
         """
         cdef int length = len(string)
-        cdef Doc tokens = Doc(self.vocab, string)
+        cdef Doc tokens = Doc(self.vocab)
         if length == 0:
             return tokens
         cdef int i = 0
@@ -86,32 +87,39 @@ cdef class Tokenizer:
             if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                 if start < i:
                     slice_unicode(&span, chars, start, i)
-                    cache_hit = self._try_cache(start, span.key, tokens)
+                    cache_hit = self._try_cache(span.key, tokens)
                     if not cache_hit:
                         self._tokenize(tokens, &span, start, i)
                 in_ws = not in_ws
                 start = i
                 if chars[i] == ' ':
+                    tokens.data[tokens.length - 1].spacy = True
                     start += 1
             i += 1
         if start < i:
             slice_unicode(&span, chars, start, i)
-            cache_hit = self._try_cache(start, span.key, tokens)
+            cache_hit = self._try_cache(span.key, tokens)
             if not cache_hit:
                 self._tokenize(tokens, &span, start, i)
+        tokens.data[tokens.length - 1].spacy = string[-1] == ' '
         return tokens
 
-    cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1:
+    cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
         cached = <_Cached*>self._cache.get(key)
         if cached == NULL:
             return False
         cdef int i
+        cdef int less_one = cached.length-1
         if cached.is_lex:
-            for i in range(cached.length):
-                idx = tokens.push_back(idx, cached.data.lexemes[i])
+            for i in range(less_one):
+                # There's a space at the end of the chunk.
+                tokens.push_back(cached.data.lexemes[i], False)
+            tokens.push_back(cached.data.lexemes[less_one], False)
         else:
-            for i in range(cached.length):
-                idx = tokens.push_back(idx, &cached.data.tokens[i])
+            for i in range(less_one):
+                tokens.push_back(&cached.data.tokens[i], False)
+            tokens.push_back(&cached.data.tokens[less_one], False)
         return True
 
     cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1:
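
The whitespace scan above is where the spacy flags get set: when a chunk boundary is a space character, the most recently pushed token is marked spacy = True, and the final token's flag comes from whether the input ends in a space. Below is a rough plain-Python rendering of just that scanning logic (an assumed helper, not spaCy code); it returns (chunk, has_trailing_space) pairs instead of pushing tokens.

# Illustrative rendering of the scan in __call__ above: split on whitespace
# transitions and record whether each chunk is followed by a single space.
def scan_chunks(string):
    chunks = []
    i = 0
    start = 0
    in_ws = string[0].isspace() if string else False
    while i < len(string):
        if string[i].isspace() != in_ws:
            if start < i:
                chunks.append([string[start:i], False])
            in_ws = not in_ws
            start = i
            if string[i] == ' ':
                if chunks:
                    # Mirrors: tokens.data[tokens.length - 1].spacy = True
                    chunks[-1][1] = True
                start += 1
        i += 1
    if start < i:
        chunks.append([string[start:i], False])
    if chunks:
        # Mirrors: tokens.data[tokens.length - 1].spacy = string[-1] == ' '
        chunks[-1][1] = string[-1] == ' '
    return [tuple(c) for c in chunks]

assert scan_chunks("Hello, world") == [("Hello,", True), ("world", False)]
assert scan_chunks("Hello world ") == [("Hello", True), ("world", True)]
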
@@ -171,36 +179,39 @@ cdef class Tokenizer:
                             vector[const LexemeC*] *prefixes,
                             vector[const LexemeC*] *suffixes) except -1:
         cdef bint cache_hit
+        cdef bint is_spacy
         cdef int split
         cdef const LexemeC* const* lexemes
-        cdef LexemeC* lexeme
+        cdef const LexemeC* lexeme
         cdef UniStr span
         cdef int i
+        # Have to calculate is_spacy here, i.e. does the token have a trailing
+        # space. There are no spaces *between* the tokens we attach
+        # here, and there *is* a space after the last token.
         if prefixes.size():
             for i in range(prefixes.size()):
-                idx = tokens.push_back(idx, prefixes[0][i])
+                tokens.push_back(prefixes[0][i], False)
         if string.n != 0:
-            cache_hit = self._try_cache(idx, string.key, tokens)
+            cache_hit = self._try_cache(string.key, tokens)
             if cache_hit:
-                # Get last idx
-                idx = tokens.data[tokens.length - 1].idx
-                # Increment by last length
-                idx += tokens.data[tokens.length - 1].lex.length
+                pass
             else:
                 split = self._find_infix(string.chars, string.n)
                 if split == 0 or split == -1:
-                    idx = tokens.push_back(idx, self.vocab.get(tokens.mem, string))
+                    tokens.push_back(self.vocab.get(tokens.mem, string), False)
                 else:
+                    # Append the beginning, afix, end of the infix token
                     slice_unicode(&span, string.chars, 0, split)
-                    idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
+                    tokens.push_back(self.vocab.get(tokens.mem, &span), False)
                     slice_unicode(&span, string.chars, split, split+1)
-                    idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
+                    tokens.push_back(self.vocab.get(tokens.mem, &span), False)
                     slice_unicode(&span, string.chars, split + 1, string.n)
-                    idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
+                    tokens.push_back(self.vocab.get(tokens.mem, &span), False)
         cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
-            idx = tokens.push_back(idx, deref(it))
+            lexeme = deref(it)
             preinc(it)
+            tokens.push_back(lexeme, False)
 
     cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1:
         cdef int i
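
Within _attach_tokens, every prefix, the chunk body (or its infix pieces), and every suffix is pushed with a trailing-space flag of False, since whitespace can never occur inside a whitespace-delimited chunk; only the chunk's last token may be followed by a space, and that flag is set by the caller. A loose plain-Python sketch of the idea, with made-up PREFIXES/SUFFIXES sets and helper names that are not spaCy's internals:

# Illustrative sketch: sub-tokens of one chunk never get has_space=True here;
# the trailing space, if any, is attached to the chunk's final token only.
PREFIXES = ('"', '(', '[')
SUFFIXES = ('"', ')', ']', '.', ',', '!', '?')

def split_affixes(chunk):
    prefixes, suffixes = [], []
    while chunk.startswith(PREFIXES):
        prefixes.append(chunk[0])
        chunk = chunk[1:]
    while chunk.endswith(SUFFIXES):
        suffixes.insert(0, chunk[-1])
        chunk = chunk[:-1]
    return prefixes, chunk, suffixes

def attach_tokens(tokens, chunk, followed_by_space):
    prefixes, body, suffixes = split_affixes(chunk)
    for piece in prefixes + ([body] if body else []) + suffixes:
        tokens.append([piece, False])   # no spaces between the pieces
    if followed_by_space and tokens:
        tokens[-1][1] = True            # the space trails the last piece

tokens = []
attach_tokens(tokens, '(Hello),', True)
attach_tokens(tokens, 'world!', False)
assert ''.join(t + ' ' * s for t, s in tokens) == '(Hello), world!'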


@@ -26,7 +26,7 @@ cdef class Doc:
     cdef int length
     cdef int max_length
 
-    cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
+    cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1
 
     cpdef np.ndarray to_array(self, object features)


@@ -173,7 +173,7 @@ cdef class Doc:
                 start = i
         yield Span(self, start, self.length)
 
-    cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
+    cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
         if self.length == self.max_length:
             self._realloc(self.length * 2)
         cdef TokenC* t = &self.data[self.length]
@@ -181,9 +181,13 @@
             t[0] = lex_or_tok[0]
         else:
             t.lex = lex_or_tok
-        t.idx = idx
+        if self.length == 0:
+            t.idx = 0
+        else:
+            t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
+        t.spacy = has_space
         self.length += 1
-        return idx + t.lex.length
+        return t.idx + t.lex.length + t.spacy
 
     @cython.boundscheck(False)
     cpdef np.ndarray to_array(self, object py_attr_ids):
@@ -375,11 +379,11 @@ cdef class Doc:
             string += vocab.strings[lex.orth]
             if space:
                 string += u' '
-        cdef Doc doc = Doc(vocab, string)
+        cdef Doc doc = Doc(vocab)
+        cdef bint has_space = False
         cdef int idx = 0
         for i, id_ in enumerate(ids):
-            doc.push_back(idx, vocab.lexemes[id_])
-            idx += vocab.lexemes[id_].length
-            if spaces[i]:
-                idx += 1
+            lex = vocab.lexemes[id_]
+            has_space = spaces[i]
+            doc.push_back(lex, has_space)
         return doc