Changed tokenizer to add the infix even when infix_start equals the start offset

This commit is contained in:
Felix Sonntag 2017-11-19 15:14:40 +01:00
parent 8be3392302
commit 33b0f86de3

View File

@ -241,11 +241,10 @@ cdef class Tokenizer:
for match in matches:
infix_start = match.start()
infix_end = match.end()
if infix_start == start:
continue
span = string[start:infix_start]
tokens.push_back(self.vocab.get(tokens.mem, span), False)
if infix_start != start:
span = string[start:infix_start]
tokens.push_back(self.vocab.get(tokens.mem, span), False)
if infix_start != infix_end:
# If infix_start != infix_end, it means the infix