mirror of https://github.com/explosion/spaCy.git
Add new tests for fancier tokenization cases

parent 6fc06bfe2f
commit db191361ee
@@ -1,3 +1,4 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
 from spacy.en import EN
@@ -17,19 +18,19 @@ def test_two_words():
 
 def test_punct():
     tokens = EN.tokenize('hello, possums.')
-    assert len(tokens) == 3
+    assert len(tokens) == 4
     assert tokens[0].string == EN.lexicon.lookup('hello').string
     assert tokens[1].string == EN.lexicon.lookup(',').string
-    assert tokens[2].string == EN.lexicon.lookup('possums.').string
+    assert tokens[2].string == EN.lexicon.lookup('possums').string
     assert tokens[1].string != EN.lexicon.lookup('hello').string
 
 
 def test_digits():
     lex_ids = EN.tokenize('The year: 1984.')
-    assert lex_ids.string(3) == "1984."
-    assert len(lex_ids) == 4
+    assert lex_ids.string(3) == "1984"
+    assert len(lex_ids) == 5
     assert lex_ids[0].string == EN.lexicon.lookup('The').string
-    assert lex_ids[3].string == EN.lexicon.lookup('1984.').string
+    assert lex_ids[3].string == EN.lexicon.lookup('1984').string
 
 
 def test_contraction():
@@ -65,3 +66,39 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
 
     tokens = EN.tokenize(text)
     assert len(tokens) > 5
+
+
+def test_cnts1():
+    text = u"""The U.S. Army likes Shock and Awe."""
+    tokens = EN.tokenize(text)
+    assert len(tokens) == 8
+
+def test_cnts2():
+    text = u"""U.N. regulations are not a part of their concern."""
+    tokens = EN.tokenize(text)
+    assert len(tokens) == 10
+
+def test_cnts3():
+    text = u"“Isn't it?”"
+    tokens = EN.tokenize(text)
+    assert len(tokens) == 6
+
+def test_cnts4():
+    text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
+    tokens = EN.tokenize(text)
+    assert len(tokens) == 15
+
+def test_cnts5():
+    text = """'Me too!', Mr. P. Delaware cried. """
+    tokens = EN.tokenize(text)
+    assert len(tokens) == 11
+
+def test_cnts6():
+    text = u'They ran about 10km.'
+    tokens = EN.tokenize(text)
+    assert len(tokens) == 6
+
+def test_cnts7():
+    text = 'But then the 6,000-year ice age came...'
+    tokens = EN.tokenize(text)
+    assert len(tokens) == 8
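
For readers skimming the diff: a minimal sketch of the behaviour the updated assertions encode, assuming the 2014-era spacy.en interface used above. Only EN.tokenize, the .string attribute, the .string(i) accessor, and EN.lexicon.lookup appear in the tests themselves; everything else here is illustrative.

    # Sketch only: mirrors the API exercised by the tests above
    # (the early spacy.en module from this commit, not the modern spacy.load API).
    from spacy.en import EN

    # Trailing punctuation is now split off, so 'hello, possums.' yields
    # four tokens -- hello / , / possums / . -- rather than three.
    tokens = EN.tokenize('hello, possums.')
    assert len(tokens) == 4
    assert tokens[2].string == EN.lexicon.lookup('possums').string

    # Likewise '1984.' becomes '1984' + '.', giving five tokens in total.
    lex_ids = EN.tokenize('The year: 1984.')
    assert len(lex_ids) == 5
    assert lex_ids.string(3) == "1984"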