* Set up tokenizer/ tests properly, using a session-scoped fixture to avoid long load/unload times. Tokenizer tests now complete in 20 seconds.

Matthew Honnibal 2015-06-07 17:24:49 +02:00
parent 1d5f20fdda
commit 877abb0e5b
13 changed files with 126 additions and 190 deletions

View File

@@ -0,0 +1,11 @@
+import pytest
+from spacy.en import English
+
+
+@pytest.fixture(scope="session")
+def EN():
+    return English(load_vectors=False)
+
+@pytest.fixture(scope="session")
+def en_tokenizer(EN):
+    return EN.tokenizer
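Because the fixture is declared with scope="session", pytest constructs English(load_vectors=False) once per test run and hands the same tokenizer to every test module that sits below this conftest.py; a test opts in simply by naming the fixture as a parameter. A minimal sketch of the pattern, for illustration only (the test name and input string below are not part of this commit):

def test_shared_tokenizer(en_tokenizer):
    # en_tokenizer comes from the session-scoped fixture in conftest.py;
    # no per-test English() construction happens here.
    tokens = en_tokenizer("Hello world")
    assert len(tokens) == 2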

View File

@@ -1,34 +1,31 @@
 from __future__ import unicode_literals
 import pytest
-from spacy.en import English
-
-EN = English()
 
 
-def test_possess():
-    tokens = EN("Mike's", parse=False, tag=False)
-    assert EN.vocab.strings[tokens[0].orth] == "Mike"
-    assert EN.vocab.strings[tokens[1].orth] == "'s"
+def test_possess(en_tokenizer):
+    tokens = en_tokenizer("Mike's")
+    assert en_tokenizer.vocab.strings[tokens[0].orth] == "Mike"
+    assert en_tokenizer.vocab.strings[tokens[1].orth] == "'s"
     assert len(tokens) == 2
 
 
-def test_apostrophe():
-    tokens = EN("schools'", parse=False, tag=False)
+def test_apostrophe(en_tokenizer):
+    tokens = en_tokenizer("schools'")
     assert len(tokens) == 2
     assert tokens[1].orth_ == "'"
     assert tokens[0].orth_ == "schools"
 
 
-def test_LL():
-    tokens = EN("we'll", parse=False)
+def test_LL(en_tokenizer):
+    tokens = en_tokenizer("we'll")
     assert len(tokens) == 2
     assert tokens[1].orth_ == "'ll"
     assert tokens[1].lemma_ == "will"
     assert tokens[0].orth_ == "we"
 
 
-def test_aint():
-    tokens = EN("ain't", parse=False)
+def test_aint(en_tokenizer):
+    tokens = en_tokenizer("ain't")
     assert len(tokens) == 2
     assert tokens[0].orth_ == "ai"
     assert tokens[0].lemma_ == "be"
@@ -36,19 +33,19 @@ def test_aint():
     assert tokens[1].lemma_ == "not"
 
 
-def test_capitalized():
-    tokens = EN("can't", parse=False)
+def test_capitalized(en_tokenizer):
+    tokens = en_tokenizer("can't")
     assert len(tokens) == 2
-    tokens = EN("Can't", parse=False)
+    tokens = en_tokenizer("Can't")
     assert len(tokens) == 2
-    tokens = EN("Ain't", parse=False)
+    tokens = en_tokenizer("Ain't")
     assert len(tokens) == 2
     assert tokens[0].orth_ == "Ai"
     assert tokens[0].lemma_ == "be"
 
 
-def test_punct():
-    tokens = EN("We've", parse=False)
+def test_punct(en_tokenizer):
+    tokens = en_tokenizer("We've")
     assert len(tokens) == 2
-    tokens = EN("``We've", parse=False)
+    tokens = en_tokenizer("``We've")
     assert len(tokens) == 3

View File

@@ -1,17 +1,10 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
-
-
-@pytest.fixture
-def EN():
-    return English()
-
 
-def test_tweebo_challenge(EN):
+def test_tweebo_challenge(en_tokenizer):
     text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
-    tokens = EN(text, parse=False, tag=False)
+    tokens = en_tokenizer(text)
     assert tokens[0].orth_ == ":o"
     assert tokens[1].orth_ == ":/"
     assert tokens[2].orth_ == ":'("
@@ -36,7 +29,7 @@ def test_tweebo_challenge(EN):
     assert tokens[21].orth_ == '....'
 
 
-def test_false_positive(EN):
+def test_false_positive(en_tokenizer):
     text = "example:)"
-    tokens = EN(text, parse=False, tag=False)
+    tokens = en_tokenizer(text)
     assert len(tokens) == 3

View File

@@ -3,18 +3,11 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
-
-
-@pytest.fixture
-def nlp():
-    nlp = English()
-    return nlp.tokenizer
-
 
-def test_simple_punct(nlp):
+def test_simple_punct(en_tokenizer):
     text = 'to walk, do foo'
-    tokens = nlp(text)
+    tokens = en_tokenizer(text)
     assert tokens[0].idx == 0
     assert tokens[1].idx == 3
     assert tokens[2].idx == 7
@@ -22,9 +15,9 @@ def test_simple_punct(nlp):
     assert tokens[4].idx == 12
 
 
-def test_complex_punct(nlp):
+def test_complex_punct(en_tokenizer):
     text = 'Tom (D., Ill.)!'
-    tokens = nlp(text)
+    tokens = en_tokenizer(text)
     assert tokens[0].idx == 0
     assert len(tokens[0]) == 3
     assert tokens[1].idx == 4

View File

@@ -2,17 +2,13 @@ from __future__ import unicode_literals
 import pytest
-from spacy.en import English
-
-
-EN = English()
-
-
-def test_hyphen():
-    tokens = EN.tokenizer('best-known')
+
+
+def test_hyphen(en_tokenizer):
+    tokens = en_tokenizer('best-known')
     assert len(tokens) == 3
 
 
-def test_period():
-    tokens = EN.tokenizer('best.Known')
+def test_period(en_tokenizer):
+    tokens = en_tokenizer('best.Known')
     assert len(tokens) == 3
-    tokens = EN.tokenizer('zombo.com')
+    tokens = en_tokenizer('zombo.com')
     assert len(tokens) == 1

View File

@@ -1,14 +1,9 @@
 from __future__ import unicode_literals
-import pytest
-
-from spacy.en import English
 
 
-def test_only_pre1():
-    EN = English()
-    assert len(EN("(")) == 1
+def test_only_pre1(en_tokenizer):
+    assert len(en_tokenizer("(")) == 1
 
 
-def test_only_pre2():
-    EN = English()
-    assert len(EN("((")) == 2
+def test_only_pre2(en_tokenizer):
+    assert len(en_tokenizer("((")) == 2

View File

@@ -1,7 +1,4 @@
 from __future__ import unicode_literals
 
-from spacy.en import English
-
-
 import pytest
 
@@ -10,42 +7,37 @@ def close_puncts():
     return [')', ']', '}', '*']
 
 
-@pytest.fixture
-def EN():
-    return English()
-
-
-def test_close(close_puncts, EN):
+def test_close(close_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        tokens = EN(string, parse=False, tag=False)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 2
         assert tokens[1].string == p
         assert tokens[0].string == word_str
 
 
-def test_two_different_close(close_puncts, EN):
+def test_two_different_close(close_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        tokens = EN(string, parse=False, tag=False)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 3
         assert tokens[0].string == word_str
         assert tokens[1].string == p
         assert tokens[2].string == "'"
 
 
-def test_three_same_close(close_puncts, EN):
+def test_three_same_close(close_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = EN(string, tag=False, parse=False)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 4
         assert tokens[0].string == word_str
         assert tokens[1].string == p
 
 
-def test_double_end_quote(EN):
-    assert len(EN("Hello''", tag=False, parse=False)) == 2
-    assert len(EN("''", tag=False, parse=False)) == 1
+def test_double_end_quote(en_tokenizer):
+    assert len(en_tokenizer("Hello''")) == 2
+    assert len(en_tokenizer("''")) == 1

View File

@@ -1,7 +1,5 @@
 from __future__ import unicode_literals
 
-from spacy.en import English
-
 import pytest
 
 
@@ -10,44 +8,39 @@ def open_puncts():
     return ['(', '[', '{', '*']
 
 
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_open(open_puncts, EN):
+def test_open(open_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 2
         assert tokens[0].orth_ == p
         assert tokens[1].orth_ == word_str
 
 
-def test_two_different_open(open_puncts, EN):
+def test_two_different_open(open_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 3
         assert tokens[0].orth_ == p
         assert tokens[1].orth_ == "`"
         assert tokens[2].orth_ == word_str
 
 
-def test_three_same_open(open_puncts, EN):
+def test_three_same_open(open_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 4
         assert tokens[0].orth_ == p
         assert tokens[3].orth_ == word_str
 
 
-def test_open_appostrophe(EN):
+def test_open_appostrophe(en_tokenizer):
     string = "'The"
-    tokens = EN(string)
+    tokens = en_tokenizer(string)
     assert len(tokens) == 2
     assert tokens[0].orth_ == "'"

View File

@@ -3,50 +3,44 @@ and suffix punctuation."""
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
-
-
-@pytest.fixture
-def EN():
-    return English().tokenizer
 
 
-def test_no_special(EN):
-    assert len(EN("(can)")) == 3
+def test_no_special(en_tokenizer):
+    assert len(en_tokenizer("(can)")) == 3
 
 
-def test_no_punct(EN):
-    assert len(EN("can't")) == 2
+def test_no_punct(en_tokenizer):
+    assert len(en_tokenizer("can't")) == 2
 
 
-def test_prefix(EN):
-    assert len(EN("(can't")) == 3
+def test_prefix(en_tokenizer):
+    assert len(en_tokenizer("(can't")) == 3
 
 
-def test_suffix(EN):
-    assert len(EN("can't)")) == 3
+def test_suffix(en_tokenizer):
+    assert len(en_tokenizer("can't)")) == 3
 
 
-def test_wrap(EN):
-    assert len(EN("(can't)")) == 4
+def test_wrap(en_tokenizer):
+    assert len(en_tokenizer("(can't)")) == 4
 
 
-def test_uneven_wrap(EN):
-    assert len(EN("(can't?)")) == 5
+def test_uneven_wrap(en_tokenizer):
+    assert len(en_tokenizer("(can't?)")) == 5
 
 
-def test_prefix_interact(EN):
-    assert len(EN("U.S.")) == 1
-    assert len(EN("us.")) == 2
-    assert len(EN("(U.S.")) == 2
+def test_prefix_interact(en_tokenizer):
+    assert len(en_tokenizer("U.S.")) == 1
+    assert len(en_tokenizer("us.")) == 2
+    assert len(en_tokenizer("(U.S.")) == 2
 
 
-def test_suffix_interact(EN):
-    assert len(EN("U.S.)")) == 2
+def test_suffix_interact(en_tokenizer):
+    assert len(en_tokenizer("U.S.)")) == 2
 
 
-def test_even_wrap_interact(EN):
-    assert len(EN("(U.S.)")) == 3
+def test_even_wrap_interact(en_tokenizer):
+    assert len(en_tokenizer("(U.S.)")) == 3
 
 
-def test_uneven_wrap_interact(EN):
-    assert len(EN("(U.S.?)")) == 4
+def test_uneven_wrap_interact(en_tokenizer):
+    assert len(en_tokenizer("(U.S.?)")) == 4

View File

@@ -1,7 +1,4 @@
 from __future__ import unicode_literals
 
-from spacy.en import English
-
-
 import pytest
 
@@ -10,27 +7,22 @@ def paired_puncts():
     return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
 
 
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_token(paired_puncts, EN):
+def test_token(paired_puncts, en_tokenizer):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 3
         assert tokens[0].orth_ == open_
         assert tokens[1].orth_ == word_str
         assert tokens[2].orth_ == close_
 
 
-def test_two_different(paired_puncts, EN):
+def test_two_different(paired_puncts, en_tokenizer):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 5
         assert tokens[0].orth_ == "`"
         assert tokens[1].orth_ == open_

View File

@@ -3,32 +3,25 @@ from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
-
-
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
 
-def test_no_word(EN):
-    tokens = EN(u'')
+def test_no_word(en_tokenizer):
+    tokens = en_tokenizer(u'')
     assert len(tokens) == 0
 
 
-def test_single_word(EN):
-    tokens = EN(u'hello')
+def test_single_word(en_tokenizer):
+    tokens = en_tokenizer(u'hello')
     assert tokens[0].orth_ == 'hello'
 
 
-def test_two_words(EN):
-    tokens = EN('hello possums')
+def test_two_words(en_tokenizer):
+    tokens = en_tokenizer('hello possums')
     assert len(tokens) == 2
     assert tokens[0].orth_ != tokens[1].orth_
 
 
-def test_punct(EN):
-    tokens = EN('hello, possums.')
+def test_punct(en_tokenizer):
+    tokens = en_tokenizer('hello, possums.')
     assert len(tokens) == 4
     assert tokens[0].orth_ == 'hello'
     assert tokens[1].orth_ == ','
@@ -36,34 +29,34 @@ def test_punct(EN):
     assert tokens[1].orth_ != 'hello'
 
 
-def test_digits(EN):
-    tokens = EN('The year: 1984.')
+def test_digits(en_tokenizer):
+    tokens = en_tokenizer('The year: 1984.')
     assert len(tokens) == 5
-    assert tokens[0].orth == EN.vocab['The'].orth
-    assert tokens[3].orth == EN.vocab['1984'].orth
+    assert tokens[0].orth == en_tokenizer.vocab['The'].orth
+    assert tokens[3].orth == en_tokenizer.vocab['1984'].orth
 
 
-def test_contraction(EN):
-    tokens = EN("don't giggle")
+def test_contraction(en_tokenizer):
+    tokens = en_tokenizer("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].orth == EN.vocab["n't"].orth
-    tokens = EN("i said don't!")
+    assert tokens[1].orth == en_tokenizer.vocab["n't"].orth
+    tokens = en_tokenizer("i said don't!")
     assert len(tokens) == 5
-    assert tokens[4].orth == EN.vocab['!'].orth
+    assert tokens[4].orth == en_tokenizer.vocab['!'].orth
 
 
-def test_contraction_punct(EN):
-    tokens = EN("(can't")
+def test_contraction_punct(en_tokenizer):
+    tokens = en_tokenizer("(can't")
     assert len(tokens) == 3
-    tokens = EN("`ain't")
+    tokens = en_tokenizer("`ain't")
     assert len(tokens) == 3
-    tokens = EN('''"isn't''')
+    tokens = en_tokenizer('''"isn't''')
     assert len(tokens) == 3
-    tokens = EN("can't!")
+    tokens = en_tokenizer("can't!")
     assert len(tokens) == 3
 
 
-def test_sample(EN):
+def test_sample(en_tokenizer):
     text = """Tributes pour in for late British Labour Party leader
 
 Tributes poured in from around the world Thursday
@@ -75,7 +68,7 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
 
 "Mr. Smith, throughout his distinguished"""
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     assert len(tokens) > 5

View File

@@ -1,16 +1,9 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
-
-
-@pytest.fixture
-def EN():
-    return English()
-
 
-def test1(EN):
+def test1(en_tokenizer):
     words = ['JAPAN', 'GET', 'LUCKY']
-    tokens = EN.tokenizer.tokens_from_list(words)
+    tokens = en_tokenizer.tokens_from_list(words)
     assert len(tokens) == 3
     assert tokens[0].orth_ == 'JAPAN'

View File

@@ -1,41 +1,35 @@
 """Test that tokens are created correctly for whitespace."""
 from __future__ import unicode_literals
-from spacy.en import English
 import pytest
 
 
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_single_space(EN):
-    tokens = EN('hello possums')
+def test_single_space(en_tokenizer):
+    tokens = en_tokenizer('hello possums')
     assert len(tokens) == 2
 
 
-def test_double_space(EN):
-    tokens = EN('hello  possums')
+def test_double_space(en_tokenizer):
+    tokens = en_tokenizer('hello  possums')
     assert len(tokens) == 3
     assert tokens[1].orth_ == ' '
 
 
-def test_newline(EN):
-    tokens = EN('hello\npossums')
+def test_newline(en_tokenizer):
+    tokens = en_tokenizer('hello\npossums')
     assert len(tokens) == 3
 
 
-def test_newline_space(EN):
-    tokens = EN('hello \npossums')
+def test_newline_space(en_tokenizer):
+    tokens = en_tokenizer('hello \npossums')
    assert len(tokens) == 3
 
 
-def test_newline_double_space(EN):
-    tokens = EN('hello  \npossums')
+def test_newline_double_space(en_tokenizer):
+    tokens = en_tokenizer('hello  \npossums')
     assert len(tokens) == 3
 
 
-def test_newline_space_wrap(EN):
-    tokens = EN('hello \n possums')
+def test_newline_space_wrap(en_tokenizer):
+    tokens = en_tokenizer('hello \n possums')
     assert len(tokens) == 3