* Refactor tokenizer to set the 'spacy' field on TokenC instead of passing a string

This commit is contained in:
Matthew Honnibal 2015-07-13 21:46:02 +02:00
parent 6eef0bf9ab
commit 67641f3b58
4 changed files with 49 additions and 34 deletions

View File

@ -29,7 +29,7 @@ cdef class Tokenizer:
cpdef Doc tokens_from_list(self, list strings)
cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1
cdef int _try_cache(self, hash_t key, Doc tokens) except -1
cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1
cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except NULL

View File

@ -39,16 +39,17 @@ cdef class Tokenizer:
return cls(vocab, rules, prefix_re, suffix_re, infix_re)
cpdef Doc tokens_from_list(self, list strings):
cdef int length = sum([len(s) for s in strings])
cdef Doc tokens = Doc(self.vocab, ' '.join(strings))
if length == 0:
cdef Doc tokens = Doc(self.vocab)
if sum([len(s) for s in strings]) == 0:
return tokens
cdef UniStr string_struct
cdef unicode py_string
cdef int idx = 0
for i, py_string in enumerate(strings):
slice_unicode(&string_struct, py_string, 0, len(py_string))
tokens.push_back(idx, <const LexemeC*>self.vocab.get(tokens.mem, &string_struct))
# Note that we pass tokens.mem here --- the Doc object has ownership
tokens.push_back(
<const LexemeC*>self.vocab.get(tokens.mem, &string_struct), True)
idx += len(py_string) + 1
return tokens
@ -73,7 +74,7 @@ cdef class Tokenizer:
tokens (Doc): A Doc object, giving access to a sequence of LexemeCs.
"""
cdef int length = len(string)
cdef Doc tokens = Doc(self.vocab, string)
cdef Doc tokens = Doc(self.vocab)
if length == 0:
return tokens
cdef int i = 0
@ -86,32 +87,39 @@ cdef class Tokenizer:
if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
if start < i:
slice_unicode(&span, chars, start, i)
cache_hit = self._try_cache(start, span.key, tokens)
cache_hit = self._try_cache(span.key, tokens)
if not cache_hit:
self._tokenize(tokens, &span, start, i)
in_ws = not in_ws
start = i
if chars[i] == ' ':
tokens.data[tokens.length - 1].spacy = True
start += 1
i += 1
if start < i:
slice_unicode(&span, chars, start, i)
cache_hit = self._try_cache(start, span.key, tokens)
cache_hit = self._try_cache(span.key, tokens)
if not cache_hit:
self._tokenize(tokens, &span, start, i)
tokens.data[tokens.length - 1].spacy = string[-1] == ' '
return tokens
cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1:
cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
cached = <_Cached*>self._cache.get(key)
if cached == NULL:
return False
cdef int i
cdef int less_one = cached.length-1
if cached.is_lex:
for i in range(cached.length):
idx = tokens.push_back(idx, cached.data.lexemes[i])
for i in range(less_one):
# There's a space at the end of the chunk.
tokens.push_back(cached.data.lexemes[i], False)
tokens.push_back(cached.data.lexemes[less_one], False)
else:
for i in range(cached.length):
idx = tokens.push_back(idx, &cached.data.tokens[i])
for i in range(less_one):
tokens.push_back(&cached.data.tokens[i], False)
tokens.push_back(&cached.data.tokens[less_one], False)
return True
cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1:
@ -171,36 +179,39 @@ cdef class Tokenizer:
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes) except -1:
cdef bint cache_hit
cdef bint is_spacy
cdef int split
cdef const LexemeC* const* lexemes
cdef LexemeC* lexeme
cdef const LexemeC* lexeme
cdef UniStr span
cdef int i
# Have to calculate is_spacy here, i.e. does the token have a trailing
# space. There are no spaces *between* the tokens we attach
# here, and there *is* a space after the last token.
if prefixes.size():
for i in range(prefixes.size()):
idx = tokens.push_back(idx, prefixes[0][i])
tokens.push_back(prefixes[0][i], False)
if string.n != 0:
cache_hit = self._try_cache(idx, string.key, tokens)
cache_hit = self._try_cache(string.key, tokens)
if cache_hit:
# Get last idx
idx = tokens.data[tokens.length - 1].idx
# Increment by last length
idx += tokens.data[tokens.length - 1].lex.length
pass
else:
split = self._find_infix(string.chars, string.n)
if split == 0 or split == -1:
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, string))
tokens.push_back(self.vocab.get(tokens.mem, string), False)
else:
# Append the beginning, affix, end of the infix token
slice_unicode(&span, string.chars, 0, split)
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
tokens.push_back(self.vocab.get(tokens.mem, &span), False)
slice_unicode(&span, string.chars, split, split+1)
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
tokens.push_back(self.vocab.get(tokens.mem, &span), False)
slice_unicode(&span, string.chars, split + 1, string.n)
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
tokens.push_back(self.vocab.get(tokens.mem, &span), False)
cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
while it != suffixes.rend():
idx = tokens.push_back(idx, deref(it))
lexeme = deref(it)
preinc(it)
tokens.push_back(lexeme, False)
cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1:
cdef int i

View File

@ -26,7 +26,7 @@ cdef class Doc:
cdef int length
cdef int max_length
cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1
cpdef np.ndarray to_array(self, object features)

View File

@ -173,7 +173,7 @@ cdef class Doc:
start = i
yield Span(self, start, self.length)
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
if self.length == self.max_length:
self._realloc(self.length * 2)
cdef TokenC* t = &self.data[self.length]
@ -181,9 +181,13 @@ cdef class Doc:
t[0] = lex_or_tok[0]
else:
t.lex = lex_or_tok
t.idx = idx
if self.length == 0:
t.idx = 0
else:
t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
t.spacy = has_space
self.length += 1
return idx + t.lex.length
return t.idx + t.lex.length + t.spacy
@cython.boundscheck(False)
cpdef np.ndarray to_array(self, object py_attr_ids):
@ -375,11 +379,11 @@ cdef class Doc:
string += vocab.strings[lex.orth]
if space:
string += u' '
cdef Doc doc = Doc(vocab, string)
cdef Doc doc = Doc(vocab)
cdef bint has_space = False
cdef int idx = 0
for i, id_ in enumerate(ids):
doc.push_back(idx, vocab.lexemes[id_])
idx += vocab.lexemes[id_].length
if spaces[i]:
idx += 1
lex = vocab.lexemes[id_]
has_space = spaces[i]
doc.push_back(lex, has_space)
return doc