Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-26 18:06:29 +03:00)
* Set up tokenizer/ tests properly, using a session-scoped fixture to avoid long load/unload times. Tokenizer tests now complete in 20 seconds.
commit 877abb0e5b · parent 1d5f20fdda
tests/tokenizer/conftest.py (new file, +11 lines)
@@ -0,0 +1,11 @@
+import pytest
+from spacy.en import English
+
+
+@pytest.fixture(scope="session")
+def EN():
+    return English(load_vectors=False)
+
+@pytest.fixture(scope="session")
+def en_tokenizer(EN):
+    return EN.tokenizer
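The conftest.py above is what makes the change work: pytest builds a session-scoped fixture once per test run and hands the same object to every test that requests it, so English(load_vectors=False) is constructed a single time instead of once per test module. The sketch below illustrates the mechanism; it is not part of the commit, and the fixture and counter names (expensive_resource, constructions) are invented for illustration.

import pytest

constructions = []


@pytest.fixture(scope="session")
def expensive_resource():
    # Stands in here for English(load_vectors=False), which is slow to build.
    constructions.append(1)
    return object()


def test_first_use(expensive_resource):
    # The fixture body ran exactly once to satisfy this request.
    assert len(constructions) == 1


def test_second_use(expensive_resource):
    # Same session, same instance: the fixture body did not run again.
    assert len(constructions) == 1

The en_tokenizer fixture layers on top of EN in the same way, so tests that only need the tokenizer still share that single English instance.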
@@ -1,34 +1,31 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
-
-EN = English()
-
-def test_possess():
-    tokens = EN("Mike's", parse=False, tag=False)
-    assert EN.vocab.strings[tokens[0].orth] == "Mike"
-    assert EN.vocab.strings[tokens[1].orth] == "'s"
+
+def test_possess(en_tokenizer):
+    tokens = en_tokenizer("Mike's")
+    assert en_tokenizer.vocab.strings[tokens[0].orth] == "Mike"
+    assert en_tokenizer.vocab.strings[tokens[1].orth] == "'s"
     assert len(tokens) == 2
 
 
-def test_apostrophe():
-    tokens = EN("schools'", parse=False, tag=False)
+def test_apostrophe(en_tokenizer):
+    tokens = en_tokenizer("schools'")
     assert len(tokens) == 2
     assert tokens[1].orth_ == "'"
     assert tokens[0].orth_ == "schools"
 
 
-def test_LL():
-    tokens = EN("we'll", parse=False)
+def test_LL(en_tokenizer):
+    tokens = en_tokenizer("we'll")
     assert len(tokens) == 2
     assert tokens[1].orth_ == "'ll"
     assert tokens[1].lemma_ == "will"
     assert tokens[0].orth_ == "we"
 
 
-def test_aint():
-    tokens = EN("ain't", parse=False)
+def test_aint(en_tokenizer):
+    tokens = en_tokenizer("ain't")
     assert len(tokens) == 2
     assert tokens[0].orth_ == "ai"
     assert tokens[0].lemma_ == "be"
@@ -36,19 +33,19 @@ def test_aint():
     assert tokens[1].lemma_ == "not"
 
 
-def test_capitalized():
-    tokens = EN("can't", parse=False)
+def test_capitalized(en_tokenizer):
+    tokens = en_tokenizer("can't")
     assert len(tokens) == 2
-    tokens = EN("Can't", parse=False)
+    tokens = en_tokenizer("Can't")
     assert len(tokens) == 2
-    tokens = EN("Ain't", parse=False)
+    tokens = en_tokenizer("Ain't")
     assert len(tokens) == 2
     assert tokens[0].orth_ == "Ai"
     assert tokens[0].lemma_ == "be"
 
 
-def test_punct():
-    tokens = EN("We've", parse=False)
+def test_punct(en_tokenizer):
+    tokens = en_tokenizer("We've")
     assert len(tokens) == 2
-    tokens = EN("``We've", parse=False)
+    tokens = en_tokenizer("``We've")
     assert len(tokens) == 3
@@ -1,17 +1,10 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
-
-
-@pytest.fixture
-def EN():
-    return English()
-
-
-def test_tweebo_challenge(EN):
+
+def test_tweebo_challenge(en_tokenizer):
     text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
-    tokens = EN(text, parse=False, tag=False)
+    tokens = en_tokenizer(text)
     assert tokens[0].orth_ == ":o"
     assert tokens[1].orth_ == ":/"
     assert tokens[2].orth_ == ":'("
@@ -36,7 +29,7 @@ def test_tweebo_challenge(EN):
     assert tokens[21].orth_ == '....'
 
 
-def test_false_positive(EN):
+def test_false_positive(en_tokenizer):
     text = "example:)"
-    tokens = EN(text, parse=False, tag=False)
+    tokens = en_tokenizer(text)
     assert len(tokens) == 3
@@ -3,18 +3,11 @@
 from __future__ import unicode_literals
 
 import pytest
-from spacy.en import English
-
-
-@pytest.fixture
-def nlp():
-    nlp = English()
-    return nlp.tokenizer
-
-
-def test_simple_punct(nlp):
+
+
+def test_simple_punct(en_tokenizer):
     text = 'to walk, do foo'
-    tokens = nlp(text)
+    tokens = en_tokenizer(text)
     assert tokens[0].idx == 0
     assert tokens[1].idx == 3
     assert tokens[2].idx == 7
@@ -22,9 +15,9 @@ def test_simple_punct(nlp):
     assert tokens[4].idx == 12
 
 
-def test_complex_punct(nlp):
+def test_complex_punct(en_tokenizer):
     text = 'Tom (D., Ill.)!'
-    tokens = nlp(text)
+    tokens = en_tokenizer(text)
     assert tokens[0].idx == 0
     assert len(tokens[0]) == 3
     assert tokens[1].idx == 4
@@ -2,17 +2,13 @@ from __future__ import unicode_literals
 
 import pytest
 
-from spacy.en import English
-
-EN = English()
-
-def test_hyphen():
-    tokens = EN.tokenizer('best-known')
+def test_hyphen(en_tokenizer):
+    tokens = en_tokenizer('best-known')
     assert len(tokens) == 3
 
 
-def test_period():
-    tokens = EN.tokenizer('best.Known')
+def test_period(en_tokenizer):
+    tokens = en_tokenizer('best.Known')
     assert len(tokens) == 3
-    tokens = EN.tokenizer('zombo.com')
+    tokens = en_tokenizer('zombo.com')
     assert len(tokens) == 1
@@ -1,14 +1,9 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
-
-
-def test_only_pre1():
-    EN = English()
-    assert len(EN("(")) == 1
+
+def test_only_pre1(en_tokenizer):
+    assert len(en_tokenizer("(")) == 1
 
-
-def test_only_pre2():
-    EN = English()
-    assert len(EN("((")) == 2
+def test_only_pre2(en_tokenizer):
+    assert len(en_tokenizer("((")) == 2
@@ -1,7 +1,4 @@
 from __future__ import unicode_literals
 
-from spacy.en import English
-
 import pytest
 
-
@@ -10,42 +7,37 @@ def close_puncts():
     return [')', ']', '}', '*']
 
 
-@pytest.fixture
-def EN():
-    return English()
-
-
-def test_close(close_puncts, EN):
+def test_close(close_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        tokens = EN(string, parse=False, tag=False)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 2
         assert tokens[1].string == p
         assert tokens[0].string == word_str
 
 
-def test_two_different_close(close_puncts, EN):
+def test_two_different_close(close_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        tokens = EN(string, parse=False, tag=False)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 3
         assert tokens[0].string == word_str
         assert tokens[1].string == p
         assert tokens[2].string == "'"
 
 
-def test_three_same_close(close_puncts, EN):
+def test_three_same_close(close_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = EN(string, tag=False, parse=False)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 4
         assert tokens[0].string == word_str
         assert tokens[1].string == p
 
 
-def test_double_end_quote(EN):
-    assert len(EN("Hello''", tag=False, parse=False)) == 2
-    assert len(EN("''", tag=False, parse=False)) == 1
+def test_double_end_quote(en_tokenizer):
+    assert len(en_tokenizer("Hello''")) == 2
+    assert len(en_tokenizer("''")) == 1
@@ -1,7 +1,5 @@
 from __future__ import unicode_literals
 
-from spacy.en import English
-
 import pytest
 
 
@@ -10,44 +8,39 @@ def open_puncts():
     return ['(', '[', '{', '*']
 
 
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_open(open_puncts, EN):
+def test_open(open_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 2
         assert tokens[0].orth_ == p
         assert tokens[1].orth_ == word_str
 
 
-def test_two_different_open(open_puncts, EN):
+def test_two_different_open(open_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 3
         assert tokens[0].orth_ == p
         assert tokens[1].orth_ == "`"
         assert tokens[2].orth_ == word_str
 
 
-def test_three_same_open(open_puncts, EN):
+def test_three_same_open(open_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 4
         assert tokens[0].orth_ == p
         assert tokens[3].orth_ == word_str
 
 
-def test_open_appostrophe(EN):
+def test_open_appostrophe(en_tokenizer):
     string = "'The"
-    tokens = EN(string)
+    tokens = en_tokenizer(string)
     assert len(tokens) == 2
     assert tokens[0].orth_ == "'"
@@ -3,50 +3,44 @@ and suffix punctuation."""
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
 
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_no_special(EN):
-    assert len(EN("(can)")) == 3
+def test_no_special(en_tokenizer):
+    assert len(en_tokenizer("(can)")) == 3
 
 
-def test_no_punct(EN):
-    assert len(EN("can't")) == 2
+def test_no_punct(en_tokenizer):
+    assert len(en_tokenizer("can't")) == 2
 
 
-def test_prefix(EN):
-    assert len(EN("(can't")) == 3
+def test_prefix(en_tokenizer):
+    assert len(en_tokenizer("(can't")) == 3
 
 
-def test_suffix(EN):
-    assert len(EN("can't)")) == 3
+def test_suffix(en_tokenizer):
+    assert len(en_tokenizer("can't)")) == 3
 
 
-def test_wrap(EN):
-    assert len(EN("(can't)")) == 4
+def test_wrap(en_tokenizer):
+    assert len(en_tokenizer("(can't)")) == 4
 
 
-def test_uneven_wrap(EN):
-    assert len(EN("(can't?)")) == 5
+def test_uneven_wrap(en_tokenizer):
+    assert len(en_tokenizer("(can't?)")) == 5
 
 
-def test_prefix_interact(EN):
-    assert len(EN("U.S.")) == 1
-    assert len(EN("us.")) == 2
-    assert len(EN("(U.S.")) == 2
+def test_prefix_interact(en_tokenizer):
+    assert len(en_tokenizer("U.S.")) == 1
+    assert len(en_tokenizer("us.")) == 2
+    assert len(en_tokenizer("(U.S.")) == 2
 
 
-def test_suffix_interact(EN):
-    assert len(EN("U.S.)")) == 2
+def test_suffix_interact(en_tokenizer):
+    assert len(en_tokenizer("U.S.)")) == 2
 
 
-def test_even_wrap_interact(EN):
-    assert len(EN("(U.S.)")) == 3
+def test_even_wrap_interact(en_tokenizer):
+    assert len(en_tokenizer("(U.S.)")) == 3
 
 
-def test_uneven_wrap_interact(EN):
-    assert len(EN("(U.S.?)")) == 4
+def test_uneven_wrap_interact(en_tokenizer):
+    assert len(en_tokenizer("(U.S.?)")) == 4
@@ -1,7 +1,4 @@
 from __future__ import unicode_literals
 
-from spacy.en import English
-
 import pytest
 
-
@@ -10,27 +7,22 @@ def paired_puncts():
     return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
 
 
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_token(paired_puncts, EN):
+def test_token(paired_puncts, en_tokenizer):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 3
         assert tokens[0].orth_ == open_
         assert tokens[1].orth_ == word_str
         assert tokens[2].orth_ == close_
 
 
-def test_two_different(paired_puncts, EN):
+def test_two_different(paired_puncts, en_tokenizer):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 5
         assert tokens[0].orth_ == "`"
         assert tokens[1].orth_ == open_
@@ -3,32 +3,25 @@ from __future__ import unicode_literals
 
 import pytest
 
-from spacy.en import English
-
-
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
 
-def test_no_word(EN):
-    tokens = EN(u'')
+def test_no_word(en_tokenizer):
+    tokens = en_tokenizer(u'')
     assert len(tokens) == 0
 
 
-def test_single_word(EN):
-    tokens = EN(u'hello')
+def test_single_word(en_tokenizer):
+    tokens = en_tokenizer(u'hello')
     assert tokens[0].orth_ == 'hello'
 
 
-def test_two_words(EN):
-    tokens = EN('hello possums')
+def test_two_words(en_tokenizer):
+    tokens = en_tokenizer('hello possums')
     assert len(tokens) == 2
     assert tokens[0].orth_ != tokens[1].orth_
 
 
-def test_punct(EN):
-    tokens = EN('hello, possums.')
+def test_punct(en_tokenizer):
+    tokens = en_tokenizer('hello, possums.')
     assert len(tokens) == 4
     assert tokens[0].orth_ == 'hello'
     assert tokens[1].orth_ == ','
@@ -36,34 +29,34 @@ def test_punct(EN):
     assert tokens[1].orth_ != 'hello'
 
 
-def test_digits(EN):
-    tokens = EN('The year: 1984.')
+def test_digits(en_tokenizer):
+    tokens = en_tokenizer('The year: 1984.')
     assert len(tokens) == 5
-    assert tokens[0].orth == EN.vocab['The'].orth
-    assert tokens[3].orth == EN.vocab['1984'].orth
+    assert tokens[0].orth == en_tokenizer.vocab['The'].orth
+    assert tokens[3].orth == en_tokenizer.vocab['1984'].orth
 
 
-def test_contraction(EN):
-    tokens = EN("don't giggle")
+def test_contraction(en_tokenizer):
+    tokens = en_tokenizer("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].orth == EN.vocab["n't"].orth
-    tokens = EN("i said don't!")
+    assert tokens[1].orth == en_tokenizer.vocab["n't"].orth
+    tokens = en_tokenizer("i said don't!")
     assert len(tokens) == 5
-    assert tokens[4].orth == EN.vocab['!'].orth
+    assert tokens[4].orth == en_tokenizer.vocab['!'].orth
 
 
-def test_contraction_punct(EN):
-    tokens = EN("(can't")
+def test_contraction_punct(en_tokenizer):
+    tokens = en_tokenizer("(can't")
     assert len(tokens) == 3
-    tokens = EN("`ain't")
+    tokens = en_tokenizer("`ain't")
    assert len(tokens) == 3
-    tokens = EN('''"isn't''')
+    tokens = en_tokenizer('''"isn't''')
     assert len(tokens) == 3
-    tokens = EN("can't!")
+    tokens = en_tokenizer("can't!")
     assert len(tokens) == 3
 
 
-def test_sample(EN):
+def test_sample(en_tokenizer):
     text = """Tributes pour in for late British Labour Party leader
 
 Tributes poured in from around the world Thursday
@@ -75,7 +68,7 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
 
 "Mr. Smith, throughout his distinguished"""
 
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     assert len(tokens) > 5
@@ -1,16 +1,9 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
-
-
-@pytest.fixture
-def EN():
-    return English()
-
-
-def test1(EN):
+
+def test1(en_tokenizer):
     words = ['JAPAN', 'GET', 'LUCKY']
-    tokens = EN.tokenizer.tokens_from_list(words)
+    tokens = en_tokenizer.tokens_from_list(words)
     assert len(tokens) == 3
     assert tokens[0].orth_ == 'JAPAN'
@@ -1,41 +1,35 @@
 """Test that tokens are created correctly for whitespace."""
 from __future__ import unicode_literals
 
-from spacy.en import English
 import pytest
 
 
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_single_space(EN):
-    tokens = EN('hello possums')
+def test_single_space(en_tokenizer):
+    tokens = en_tokenizer('hello possums')
     assert len(tokens) == 2
 
 
-def test_double_space(EN):
-    tokens = EN('hello  possums')
+def test_double_space(en_tokenizer):
+    tokens = en_tokenizer('hello  possums')
     assert len(tokens) == 3
     assert tokens[1].orth_ == ' '
 
 
-def test_newline(EN):
-    tokens = EN('hello\npossums')
+def test_newline(en_tokenizer):
+    tokens = en_tokenizer('hello\npossums')
     assert len(tokens) == 3
 
 
-def test_newline_space(EN):
-    tokens = EN('hello \npossums')
+def test_newline_space(en_tokenizer):
+    tokens = en_tokenizer('hello \npossums')
     assert len(tokens) == 3
 
 
-def test_newline_double_space(EN):
-    tokens = EN('hello  \npossums')
+def test_newline_double_space(en_tokenizer):
+    tokens = en_tokenizer('hello  \npossums')
     assert len(tokens) == 3
 
 
-def test_newline_space_wrap(EN):
-    tokens = EN('hello \n possums')
+def test_newline_space_wrap(en_tokenizer):
+    tokens = en_tokenizer('hello \n possums')
     assert len(tokens) == 3