* Add xfail test for Issue #225: tokenization with non-whitespace delimiters

Matthew Honnibal 2016-01-19 13:20:14 +01:00
parent 7abe653223
commit 515493c675
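
Note on the marker used below: `@pytest.mark.xfail` tells pytest that a test is expected to fail. The test still runs, but a failure is reported as "xfail" rather than an error, and an unexpected pass is reported as "xpass". A minimal, self-contained sketch of the mechanism (the naive tokenizer here is an illustrative stand-in, not spaCy's):

    import pytest

    def naive_tokenize(text):
        # Hypothetical stand-in: splitting on whitespace alone cannot
        # separate tokens joined by punctuation, e.g. "Tuesday.Mr."
        return text.split()

    @pytest.mark.xfail
    def test_non_whitespace_delimiter():
        # Fails today; pytest reports it as xfail instead of a failure.
        assert naive_tokenize('Today is Tuesday.Mr.') == \
            ['Today', 'is', 'Tuesday', '.', 'Mr.']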


@@ -116,12 +116,13 @@ def test_cnts5(en_tokenizer):
     tokens = en_tokenizer(text)
     assert len(tokens) == 11
 
-# TODO: This is currently difficult --- infix interferes here.
-#def test_mr(en_tokenizer):
-#    text = """Today is Tuesday.Mr."""
-#    tokens = en_tokenizer(text)
-#    assert len(tokens) == 5
-#    assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']
+
+@pytest.mark.xfail
+def test_mr(en_tokenizer):
+    text = """Today is Tuesday.Mr."""
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 5
+    assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']
 
 
 def test_cnts6(en_tokenizer):
@@ -148,6 +149,16 @@ def test_two_whitespace(en_tokenizer):
     tokens = en_tokenizer(orig_str)
     assert repr(tokens.text_with_ws) == repr(orig_str)
 
+
+@pytest.mark.xfail
+def test_em_dash_infix(en_tokenizer):
+    # Re Issue #225
+    tokens = en_tokenizer('''Will this road take me to Puddleton?\u2014No, '''
+                          '''you'll have to walk there.\u2014Ariel.''')
+    assert tokens[6].text == 'Puddleton'
+    assert tokens[7].text == '?'
+    assert tokens[8].text == '\u2014'
+
 #def test_cnts7():
 #    text = 'But then the 6,000-year ice age came...'
 #    tokens = EN.tokenize(text)
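
The new test_em_dash_infix encodes the behaviour requested in Issue #225: U+2014 (em dash) should act as an infix delimiter even with no surrounding whitespace. A rough sketch of the expected token boundaries, using a regex split as a crude stand-in for spaCy's infix rules (it only mirrors the three asserted positions; 'No,' and 'there.' are left unsplit):

    import re

    text = ('Will this road take me to Puddleton?\u2014No, '
            "you'll have to walk there.\u2014Ariel.")

    # Split out em dashes and question marks as their own tokens;
    # re.split keeps captured delimiters, and the filter drops the
    # None/empty entries produced by whitespace-only matches.
    tokens = [t for t in re.split(r'(\u2014|\?)|\s+', text) if t]
    assert tokens[6] == 'Puddleton'
    assert tokens[7] == '?'
    assert tokens[8] == '\u2014'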