mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
* Fix Issue #360: Tokenizer failed when the infix regex matched the start of the string while trying to tokenize multi-infix tokens.
This commit is contained in:
parent
eab2376547
commit
cc8bf62208
|
@ -24,6 +24,12 @@ def test_ellipsis(en_tokenizer):
|
|||
tokens = en_tokenizer('best...known')
|
||||
assert len(tokens) == 3
|
||||
|
||||
def test_big_ellipsis(en_tokenizer):
    """Regression test for Issue #360: a long run of dots between tokens
    must not break the tokenizer; the input should split into more than
    two tokens rather than failing or collapsing."""
    doc = en_tokenizer(u'$45...............Asking')
    assert len(doc) > 2
|
||||
|
||||
|
||||
|
||||
def test_email(en_tokenizer):
|
||||
tokens = en_tokenizer('hello@example.com')
|
||||
|
|
|
@ -227,6 +227,8 @@ cdef class Tokenizer:
|
|||
for match in matches:
|
||||
infix_start = match.start()
|
||||
infix_end = match.end()
|
||||
if infix_start == start:
|
||||
continue
|
||||
span = string[start:infix_start]
|
||||
tokens.push_back(self.vocab.get(tokens.mem, span), False)
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user