Changed tokenizer to add the infix even when infix_start equals the start offset

This commit is contained in:
Felix Sonntag 2017-11-19 15:14:40 +01:00
parent 8be3392302
commit 33b0f86de3

View File

@@ -241,9 +241,8 @@ cdef class Tokenizer:
for match in matches: for match in matches:
infix_start = match.start() infix_start = match.start()
infix_end = match.end() infix_end = match.end()
if infix_start == start:
continue
if infix_start != start:
span = string[start:infix_start] span = string[start:infix_start]
tokens.push_back(self.vocab.get(tokens.mem, span), False) tokens.push_back(self.vocab.get(tokens.mem, span), False)