* Fix infixed commas in tokenizer, re Issue #326. Need to benchmark on empirical data to make sure this doesn't break other cases.

Matthew Honnibal 2016-04-14 11:36:03 +02:00
parent 0f957dd586
commit 6f82065761
2 changed files with 8 additions and 0 deletions


@@ -3,3 +3,4 @@
 (?<=[a-zA-Z])-(?=[a-zA-z])
 (?<=[a-zA-Z])--(?=[a-zA-z])
 (?<=[0-9])-(?=[0-9])
+(?<=[A-Za-z]),(?=[A-Za-z])
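
For intuition, here is a minimal sketch of what the added rule does, using Python's re module rather than spaCy's actual tokenizer machinery (the INFIX_COMMA and split_infix names are illustrative, not from the repo). The lookbehind and lookahead are zero-width, so only the comma itself is consumed; wrapping the pattern in a capturing group makes re.split keep the comma as its own token. Digit-adjacent commas such as "10,000" don't match, which is exactly the kind of case the benchmark mentioned in the commit message is meant to guard.

import re

# Sketch only, not spaCy's tokenizer: the infix rule added above,
# wrapped in a capturing group so re.split keeps the separator.
INFIX_COMMA = re.compile(r"((?<=[A-Za-z]),(?=[A-Za-z]))")

def split_infix(text):
    # The lookarounds are zero-width, so the split consumes only the
    # comma itself; filter out the empty strings re.split can yield.
    return [t for t in INFIX_COMMA.split(text) if t]

print(split_infix(u"Hello,world"))  # ['Hello', ',', 'world']
print(split_infix(u"10,000"))       # ['10,000'] -- digits don't trigger the rule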


@@ -47,3 +47,10 @@ def test_double_hyphen(en_tokenizer):
     assert tokens[8].text == u'--'
     assert tokens[9].text == u'people'
 
+
+def test_infix_comma(en_tokenizer):
+    # Re issue #326
+    tokens = en_tokenizer(u'Hello,world')
+    assert tokens[0].text == u'Hello'
+    assert tokens[1].text == u','
+    assert tokens[2].text == u'world'
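
For context, en_tokenizer is a pytest fixture supplied by the test suite's conftest.py; it yields the English tokenizer so tests can call it directly on a string. A rough stand-in, assuming a current spaCy install rather than the 2016 tree this commit targets, could look like:

import pytest
import spacy

# Hypothetical stand-in for the en_tokenizer fixture; the real fixture
# is defined in the test suite's conftest.py, not here.
@pytest.fixture
def en_tokenizer():
    # spacy.blank("en") builds a bare English pipeline; its .tokenizer
    # applies the prefix/suffix/infix rules, including the one above.
    return spacy.blank("en").tokenizer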