mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
* Refactor tokenizer, to set the 'spacy' field on TokenC instead of passing a string
This commit is contained in:
parent
6eef0bf9ab
commit
67641f3b58
|
@ -29,7 +29,7 @@ cdef class Tokenizer:
|
||||||
|
|
||||||
cpdef Doc tokens_from_list(self, list strings)
|
cpdef Doc tokens_from_list(self, list strings)
|
||||||
|
|
||||||
cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1
|
cdef int _try_cache(self, hash_t key, Doc tokens) except -1
|
||||||
cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1
|
cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1
|
||||||
cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes,
|
cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes,
|
||||||
vector[LexemeC*] *suffixes) except NULL
|
vector[LexemeC*] *suffixes) except NULL
|
||||||
|
|
|
@ -39,16 +39,17 @@ cdef class Tokenizer:
|
||||||
return cls(vocab, rules, prefix_re, suffix_re, infix_re)
|
return cls(vocab, rules, prefix_re, suffix_re, infix_re)
|
||||||
|
|
||||||
cpdef Doc tokens_from_list(self, list strings):
|
cpdef Doc tokens_from_list(self, list strings):
|
||||||
cdef int length = sum([len(s) for s in strings])
|
cdef Doc tokens = Doc(self.vocab)
|
||||||
cdef Doc tokens = Doc(self.vocab, ' '.join(strings))
|
if sum([len(s) for s in strings]) == 0:
|
||||||
if length == 0:
|
|
||||||
return tokens
|
return tokens
|
||||||
cdef UniStr string_struct
|
cdef UniStr string_struct
|
||||||
cdef unicode py_string
|
cdef unicode py_string
|
||||||
cdef int idx = 0
|
cdef int idx = 0
|
||||||
for i, py_string in enumerate(strings):
|
for i, py_string in enumerate(strings):
|
||||||
slice_unicode(&string_struct, py_string, 0, len(py_string))
|
slice_unicode(&string_struct, py_string, 0, len(py_string))
|
||||||
tokens.push_back(idx, <const LexemeC*>self.vocab.get(tokens.mem, &string_struct))
|
# Note that we pass tokens.mem here --- the Doc object has ownership
|
||||||
|
tokens.push_back(
|
||||||
|
<const LexemeC*>self.vocab.get(tokens.mem, &string_struct), True)
|
||||||
idx += len(py_string) + 1
|
idx += len(py_string) + 1
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
|
@ -73,7 +74,7 @@ cdef class Tokenizer:
|
||||||
tokens (Doc): A Doc object, giving access to a sequence of LexemeCs.
|
tokens (Doc): A Doc object, giving access to a sequence of LexemeCs.
|
||||||
"""
|
"""
|
||||||
cdef int length = len(string)
|
cdef int length = len(string)
|
||||||
cdef Doc tokens = Doc(self.vocab, string)
|
cdef Doc tokens = Doc(self.vocab)
|
||||||
if length == 0:
|
if length == 0:
|
||||||
return tokens
|
return tokens
|
||||||
cdef int i = 0
|
cdef int i = 0
|
||||||
|
@ -86,32 +87,39 @@ cdef class Tokenizer:
|
||||||
if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
|
if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
|
||||||
if start < i:
|
if start < i:
|
||||||
slice_unicode(&span, chars, start, i)
|
slice_unicode(&span, chars, start, i)
|
||||||
cache_hit = self._try_cache(start, span.key, tokens)
|
cache_hit = self._try_cache(span.key, tokens)
|
||||||
if not cache_hit:
|
if not cache_hit:
|
||||||
self._tokenize(tokens, &span, start, i)
|
self._tokenize(tokens, &span, start, i)
|
||||||
in_ws = not in_ws
|
in_ws = not in_ws
|
||||||
start = i
|
start = i
|
||||||
if chars[i] == ' ':
|
if chars[i] == ' ':
|
||||||
|
tokens.data[tokens.length - 1].spacy = True
|
||||||
start += 1
|
start += 1
|
||||||
i += 1
|
i += 1
|
||||||
if start < i:
|
if start < i:
|
||||||
slice_unicode(&span, chars, start, i)
|
slice_unicode(&span, chars, start, i)
|
||||||
cache_hit = self._try_cache(start, span.key, tokens)
|
cache_hit = self._try_cache(span.key, tokens)
|
||||||
if not cache_hit:
|
if not cache_hit:
|
||||||
self._tokenize(tokens, &span, start, i)
|
self._tokenize(tokens, &span, start, i)
|
||||||
|
|
||||||
|
tokens.data[tokens.length - 1].spacy = string[-1] == ' '
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1:
|
cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
|
||||||
cached = <_Cached*>self._cache.get(key)
|
cached = <_Cached*>self._cache.get(key)
|
||||||
if cached == NULL:
|
if cached == NULL:
|
||||||
return False
|
return False
|
||||||
cdef int i
|
cdef int i
|
||||||
|
cdef int less_one = cached.length-1
|
||||||
if cached.is_lex:
|
if cached.is_lex:
|
||||||
for i in range(cached.length):
|
for i in range(less_one):
|
||||||
idx = tokens.push_back(idx, cached.data.lexemes[i])
|
# There's a space at the end of the chunk.
|
||||||
|
tokens.push_back(cached.data.lexemes[i], False)
|
||||||
|
tokens.push_back(cached.data.lexemes[less_one], False)
|
||||||
else:
|
else:
|
||||||
for i in range(cached.length):
|
for i in range(less_one):
|
||||||
idx = tokens.push_back(idx, &cached.data.tokens[i])
|
tokens.push_back(&cached.data.tokens[i], False)
|
||||||
|
tokens.push_back(&cached.data.tokens[less_one], False)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1:
|
cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1:
|
||||||
|
@ -171,36 +179,39 @@ cdef class Tokenizer:
|
||||||
vector[const LexemeC*] *prefixes,
|
vector[const LexemeC*] *prefixes,
|
||||||
vector[const LexemeC*] *suffixes) except -1:
|
vector[const LexemeC*] *suffixes) except -1:
|
||||||
cdef bint cache_hit
|
cdef bint cache_hit
|
||||||
|
cdef bint is_spacy
|
||||||
cdef int split
|
cdef int split
|
||||||
cdef const LexemeC* const* lexemes
|
cdef const LexemeC* const* lexemes
|
||||||
cdef LexemeC* lexeme
|
cdef const LexemeC* lexeme
|
||||||
cdef UniStr span
|
cdef UniStr span
|
||||||
cdef int i
|
cdef int i
|
||||||
|
# Have to calculate is_spacy here, i.e. does the token have a trailing
|
||||||
|
# space. There are no spaces *between* the tokens we attach
|
||||||
|
# here, and there *is* a space after the last token.
|
||||||
if prefixes.size():
|
if prefixes.size():
|
||||||
for i in range(prefixes.size()):
|
for i in range(prefixes.size()):
|
||||||
idx = tokens.push_back(idx, prefixes[0][i])
|
tokens.push_back(prefixes[0][i], False)
|
||||||
if string.n != 0:
|
if string.n != 0:
|
||||||
cache_hit = self._try_cache(idx, string.key, tokens)
|
cache_hit = self._try_cache(string.key, tokens)
|
||||||
if cache_hit:
|
if cache_hit:
|
||||||
# Get last idx
|
pass
|
||||||
idx = tokens.data[tokens.length - 1].idx
|
|
||||||
# Increment by last length
|
|
||||||
idx += tokens.data[tokens.length - 1].lex.length
|
|
||||||
else:
|
else:
|
||||||
split = self._find_infix(string.chars, string.n)
|
split = self._find_infix(string.chars, string.n)
|
||||||
if split == 0 or split == -1:
|
if split == 0 or split == -1:
|
||||||
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, string))
|
tokens.push_back(self.vocab.get(tokens.mem, string), False)
|
||||||
else:
|
else:
|
||||||
|
# Append the beginning, affix, end of the infix token
|
||||||
slice_unicode(&span, string.chars, 0, split)
|
slice_unicode(&span, string.chars, 0, split)
|
||||||
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
|
tokens.push_back(self.vocab.get(tokens.mem, &span), False)
|
||||||
slice_unicode(&span, string.chars, split, split+1)
|
slice_unicode(&span, string.chars, split, split+1)
|
||||||
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
|
tokens.push_back(self.vocab.get(tokens.mem, &span), False)
|
||||||
slice_unicode(&span, string.chars, split + 1, string.n)
|
slice_unicode(&span, string.chars, split + 1, string.n)
|
||||||
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
|
tokens.push_back(self.vocab.get(tokens.mem, &span), False)
|
||||||
cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
|
cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
|
||||||
while it != suffixes.rend():
|
while it != suffixes.rend():
|
||||||
idx = tokens.push_back(idx, deref(it))
|
lexeme = deref(it)
|
||||||
preinc(it)
|
preinc(it)
|
||||||
|
tokens.push_back(lexeme, False)
|
||||||
|
|
||||||
cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1:
|
cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1:
|
||||||
cdef int i
|
cdef int i
|
||||||
|
|
|
@ -26,7 +26,7 @@ cdef class Doc:
|
||||||
cdef int length
|
cdef int length
|
||||||
cdef int max_length
|
cdef int max_length
|
||||||
|
|
||||||
cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
|
cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1
|
||||||
|
|
||||||
cpdef np.ndarray to_array(self, object features)
|
cpdef np.ndarray to_array(self, object features)
|
||||||
|
|
||||||
|
|
|
@ -173,7 +173,7 @@ cdef class Doc:
|
||||||
start = i
|
start = i
|
||||||
yield Span(self, start, self.length)
|
yield Span(self, start, self.length)
|
||||||
|
|
||||||
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
|
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
|
||||||
if self.length == self.max_length:
|
if self.length == self.max_length:
|
||||||
self._realloc(self.length * 2)
|
self._realloc(self.length * 2)
|
||||||
cdef TokenC* t = &self.data[self.length]
|
cdef TokenC* t = &self.data[self.length]
|
||||||
|
@ -181,9 +181,13 @@ cdef class Doc:
|
||||||
t[0] = lex_or_tok[0]
|
t[0] = lex_or_tok[0]
|
||||||
else:
|
else:
|
||||||
t.lex = lex_or_tok
|
t.lex = lex_or_tok
|
||||||
t.idx = idx
|
if self.length == 0:
|
||||||
|
t.idx = 0
|
||||||
|
else:
|
||||||
|
t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
|
||||||
|
t.spacy = has_space
|
||||||
self.length += 1
|
self.length += 1
|
||||||
return idx + t.lex.length
|
return t.idx + t.lex.length + t.spacy
|
||||||
|
|
||||||
@cython.boundscheck(False)
|
@cython.boundscheck(False)
|
||||||
cpdef np.ndarray to_array(self, object py_attr_ids):
|
cpdef np.ndarray to_array(self, object py_attr_ids):
|
||||||
|
@ -375,11 +379,11 @@ cdef class Doc:
|
||||||
string += vocab.strings[lex.orth]
|
string += vocab.strings[lex.orth]
|
||||||
if space:
|
if space:
|
||||||
string += u' '
|
string += u' '
|
||||||
cdef Doc doc = Doc(vocab, string)
|
cdef Doc doc = Doc(vocab)
|
||||||
|
cdef bint has_space = False
|
||||||
cdef int idx = 0
|
cdef int idx = 0
|
||||||
for i, id_ in enumerate(ids):
|
for i, id_ in enumerate(ids):
|
||||||
doc.push_back(idx, vocab.lexemes[id_])
|
lex = vocab.lexemes[id_]
|
||||||
idx += vocab.lexemes[id_].length
|
has_space = spaces[i]
|
||||||
if spaces[i]:
|
doc.push_back(lex, has_space)
|
||||||
idx += 1
|
|
||||||
return doc
|
return doc
|
||||||
|
|
Loading…
Reference in New Issue
Block a user