* Fix infixed commas in tokenizer, re Issue #326. Need to benchmark on empirical data to make sure this doesn't break other cases.

Matthew Honnibal 2016-04-14 11:36:03 +02:00
parent 0f957dd586
commit 6f82065761
2 changed files with 8 additions and 0 deletions


@@ -3,3 +3,4 @@
 (?<=[a-zA-Z])-(?=[a-zA-z])
 (?<=[a-zA-Z])--(?=[a-zA-z])
 (?<=[0-9])-(?=[0-9])
+(?<=[A-Za-z]),(?=[A-Za-z])
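
The added rule is zero-width-anchored: the lookbehind and lookahead each require a letter, so only the comma itself is matched, and digit-flanked commas such as "10,000" are left alone. A minimal standalone sketch with Python's re module (not spaCy's actual tokenizer loop, which compiles all infix rules together) shows the effect:

    import re

    # The infix rule added in this commit: a comma flanked by letters on
    # both sides. The lookarounds are zero-width, so only the comma matches.
    # Wrapping the pattern in a capturing group makes re.split keep the
    # separator, mimicking the tokenizer emitting the comma as its own token.
    INFIX_COMMA = re.compile(r"((?<=[A-Za-z]),(?=[A-Za-z]))")

    print(INFIX_COMMA.split(u"Hello,world"))  # [u'Hello', u',', u'world']
    print(INFIX_COMMA.split(u"10,000"))       # [u'10,000'] -- digits don't match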


@@ -47,3 +47,10 @@ def test_double_hyphen(en_tokenizer):
     assert tokens[8].text == u'--'
     assert tokens[9].text == u'people'
 
+
+def test_infix_comma(en_tokenizer):
+    # Re issue #326
+    tokens = en_tokenizer(u'Hello,world')
+    assert tokens[0].text == u'Hello'
+    assert tokens[1].text == u','
+    assert tokens[2].text == u'world'
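
To exercise the fix end-to-end, something like the following should work (a hedged sketch assuming the spacy.en.English entry point of this era; the en_tokenizer test fixture wraps the same tokenizer):

    from spacy.en import English

    nlp = English()                      # load the English pipeline data
    doc = nlp.tokenizer(u'Hello,world')  # tokenize only, skip tagger/parser
    print([t.text for t in doc])
    # expected after this fix: [u'Hello', u',', u'world']

Running pytest with -k test_infix_comma over the test directory should pick up the new case.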