mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 02:16:32 +03:00
* Fix infix commas in the tokenizer (re: Issue #326). This still needs to be benchmarked on empirical data to make sure it doesn't break other cases.
This commit is contained in:
parent
0f957dd586
commit
6f82065761
|
@ -3,3 +3,4 @@
|
||||||
(?<=[a-zA-Z])-(?=[a-zA-Z])
|
(?<=[a-zA-Z])-(?=[a-zA-Z])
|
||||||
(?<=[a-zA-Z])--(?=[a-zA-Z])
|
(?<=[a-zA-Z])--(?=[a-zA-Z])
|
||||||
(?<=[0-9])-(?=[0-9])
|
(?<=[0-9])-(?=[0-9])
|
||||||
|
(?<=[A-Za-z]),(?=[A-Za-z])
|
||||||
|
|
|
@ -47,3 +47,10 @@ def test_double_hyphen(en_tokenizer):
|
||||||
assert tokens[8].text == u'--'
|
assert tokens[8].text == u'--'
|
||||||
assert tokens[9].text == u'people'
|
assert tokens[9].text == u'people'
|
||||||
|
|
||||||
|
|
||||||
|
def test_infix_comma(en_tokenizer):
|
||||||
|
# Re issue #326
|
||||||
|
tokens = en_tokenizer(u'Hello,world')
|
||||||
|
assert tokens[0].text == u'Hello'
|
||||||
|
assert tokens[1].text == u','
|
||||||
|
assert tokens[2].text == u'world'
|
||||||
|
|
Loading…
Reference in New Issue
Block a user