mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
* Fix tokenization of infix commas, re Issue #326. This still needs to be benchmarked on empirical data to make sure it doesn't break other cases.
This commit is contained in:
parent
0f957dd586
commit
6f82065761
|
@ -3,3 +3,4 @@
|
|||
(?<=[a-zA-Z])-(?=[a-zA-Z])
|
||||
(?<=[a-zA-Z])--(?=[a-zA-Z])
|
||||
(?<=[0-9])-(?=[0-9])
|
||||
(?<=[A-Za-z]),(?=[A-Za-z])
|
||||
|
|
|
@ -47,3 +47,10 @@ def test_double_hyphen(en_tokenizer):
|
|||
assert tokens[8].text == u'--'
|
||||
assert tokens[9].text == u'people'
|
||||
|
||||
|
||||
def test_infix_comma(en_tokenizer):
    """Check that a comma between two letters is split off as its own token.

    Re issue #326: tokenizes ``u'Hello,world'`` and verifies the text of the
    first three tokens, in order.
    """
    doc = en_tokenizer(u'Hello,world')
    expected_texts = (u'Hello', u',', u'world')
    # Index one position at a time so that a too-short result fails at the
    # first missing token, and any extra trailing tokens are not inspected.
    for position, expected in enumerate(expected_texts):
        assert doc[position].text == expected
|
||||
|
|
Loading…
Reference in New Issue
Block a user