* Set up tokenizer/ tests properly, using a session-scoped fixture to avoid long load/unload times. Tokenizer tests now complete in 20 seconds.

Matthew Honnibal 2015-06-07 17:24:49 +02:00
parent 1d5f20fdda
commit 877abb0e5b
13 changed files with 126 additions and 190 deletions
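For context on the technique named in the commit message: pytest builds a fixture declared with scope="session" once for the entire test run and caches it, so every test that lists the fixture as a parameter shares the same object rather than re-constructing it. A minimal, self-contained sketch of the pattern (the slow_resource name and the time.sleep stand-in are illustrative, not part of this commit):

import time

import pytest


@pytest.fixture(scope="session")
def slow_resource():
    # Stand-in for an expensive constructor such as English();
    # pytest runs this body once per session and caches the result.
    time.sleep(2)  # simulate a slow model load
    return {"loaded": True}


def test_first(slow_resource):
    # The first test to request the fixture triggers construction.
    assert slow_resource["loaded"]


def test_second(slow_resource):
    # Reuses the cached object; the two-second "load" does not rerun.
    assert slow_resource["loaded"]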

View File

@@ -0,0 +1,11 @@
+import pytest
+from spacy.en import English
+
+@pytest.fixture(scope="session")
+def EN():
+    return English(load_vectors=False)
+
+
+@pytest.fixture(scope="session")
+def en_tokenizer(EN):
+    return EN.tokenizer
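Because the fixtures above live in a conftest.py, pytest discovers them automatically: every test module below can simply declare en_tokenizer (or EN) as a parameter, with no import or setup code of its own, which is what lets each file delete its local English() construction.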

View File

@@ -1,34 +1,31 @@
 from __future__ import unicode_literals
 import pytest
-from spacy.en import English
-EN = English()
-def test_possess():
-    tokens = EN("Mike's", parse=False, tag=False)
-    assert EN.vocab.strings[tokens[0].orth] == "Mike"
-    assert EN.vocab.strings[tokens[1].orth] == "'s"
+def test_possess(en_tokenizer):
+    tokens = en_tokenizer("Mike's")
+    assert en_tokenizer.vocab.strings[tokens[0].orth] == "Mike"
+    assert en_tokenizer.vocab.strings[tokens[1].orth] == "'s"
     assert len(tokens) == 2
-def test_apostrophe():
-    tokens = EN("schools'", parse=False, tag=False)
+def test_apostrophe(en_tokenizer):
+    tokens = en_tokenizer("schools'")
     assert len(tokens) == 2
     assert tokens[1].orth_ == "'"
     assert tokens[0].orth_ == "schools"
-def test_LL():
-    tokens = EN("we'll", parse=False)
+def test_LL(en_tokenizer):
+    tokens = en_tokenizer("we'll")
     assert len(tokens) == 2
     assert tokens[1].orth_ == "'ll"
     assert tokens[1].lemma_ == "will"
     assert tokens[0].orth_ == "we"
-def test_aint():
-    tokens = EN("ain't", parse=False)
+def test_aint(en_tokenizer):
+    tokens = en_tokenizer("ain't")
     assert len(tokens) == 2
     assert tokens[0].orth_ == "ai"
     assert tokens[0].lemma_ == "be"
@@ -36,19 +33,19 @@ def test_aint():
     assert tokens[1].lemma_ == "not"
-def test_capitalized():
-    tokens = EN("can't", parse=False)
+def test_capitalized(en_tokenizer):
+    tokens = en_tokenizer("can't")
     assert len(tokens) == 2
-    tokens = EN("Can't", parse=False)
+    tokens = en_tokenizer("Can't")
     assert len(tokens) == 2
-    tokens = EN("Ain't", parse=False)
+    tokens = en_tokenizer("Ain't")
     assert len(tokens) == 2
     assert tokens[0].orth_ == "Ai"
     assert tokens[0].lemma_ == "be"
-def test_punct():
-    tokens = EN("We've", parse=False)
+def test_punct(en_tokenizer):
+    tokens = en_tokenizer("We've")
     assert len(tokens) == 2
-    tokens = EN("``We've", parse=False)
+    tokens = en_tokenizer("``We've")
     assert len(tokens) == 3

View File

@@ -1,17 +1,10 @@
 from __future__ import unicode_literals
 import pytest
-from spacy.en import English
-@pytest.fixture
-def EN():
-    return English()
-def test_tweebo_challenge(EN):
+def test_tweebo_challenge(en_tokenizer):
     text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
-    tokens = EN(text, parse=False, tag=False)
+    tokens = en_tokenizer(text)
     assert tokens[0].orth_ == ":o"
     assert tokens[1].orth_ == ":/"
     assert tokens[2].orth_ == ":'("
@@ -36,7 +29,7 @@ def test_tweebo_challenge(EN):
     assert tokens[21].orth_ == '....'
-def test_false_positive(EN):
+def test_false_positive(en_tokenizer):
     text = "example:)"
-    tokens = EN(text, parse=False, tag=False)
+    tokens = en_tokenizer(text)
     assert len(tokens) == 3

View File

@@ -3,18 +3,11 @@
 from __future__ import unicode_literals
 import pytest
-from spacy.en import English
-@pytest.fixture
-def nlp():
-    nlp = English()
-    return nlp.tokenizer
-def test_simple_punct(nlp):
+def test_simple_punct(en_tokenizer):
     text = 'to walk, do foo'
-    tokens = nlp(text)
+    tokens = en_tokenizer(text)
     assert tokens[0].idx == 0
     assert tokens[1].idx == 3
     assert tokens[2].idx == 7
@@ -22,9 +15,9 @@ def test_simple_punct(nlp):
     assert tokens[4].idx == 12
-def test_complex_punct(nlp):
+def test_complex_punct(en_tokenizer):
     text = 'Tom (D., Ill.)!'
-    tokens = nlp(text)
+    tokens = en_tokenizer(text)
     assert tokens[0].idx == 0
     assert len(tokens[0]) == 3
     assert tokens[1].idx == 4

View File

@@ -2,17 +2,13 @@ from __future__ import unicode_literals
 import pytest
-from spacy.en import English
-EN = English()
-def test_hyphen():
-    tokens = EN.tokenizer('best-known')
+def test_hyphen(en_tokenizer):
+    tokens = en_tokenizer('best-known')
     assert len(tokens) == 3
-def test_period():
-    tokens = EN.tokenizer('best.Known')
+def test_period(en_tokenizer):
+    tokens = en_tokenizer('best.Known')
     assert len(tokens) == 3
-    tokens = EN.tokenizer('zombo.com')
+    tokens = en_tokenizer('zombo.com')
     assert len(tokens) == 1

View File

@@ -1,14 +1,9 @@
 from __future__ import unicode_literals
 import pytest
-from spacy.en import English
-def test_only_pre1():
-    EN = English()
-    assert len(EN("(")) == 1
+def test_only_pre1(en_tokenizer):
+    assert len(en_tokenizer("(")) == 1
-def test_only_pre2():
-    EN = English()
-    assert len(EN("((")) == 2
+def test_only_pre2(en_tokenizer):
+    assert len(en_tokenizer("((")) == 2

View File

@@ -1,7 +1,4 @@
 from __future__ import unicode_literals
-from spacy.en import English
 import pytest
@@ -10,42 +7,37 @@ def close_puncts():
     return [')', ']', '}', '*']
-@pytest.fixture
-def EN():
-    return English()
-def test_close(close_puncts, EN):
+def test_close(close_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        tokens = EN(string, parse=False, tag=False)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 2
         assert tokens[1].string == p
         assert tokens[0].string == word_str
-def test_two_different_close(close_puncts, EN):
+def test_two_different_close(close_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        tokens = EN(string, parse=False, tag=False)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 3
         assert tokens[0].string == word_str
         assert tokens[1].string == p
         assert tokens[2].string == "'"
-def test_three_same_close(close_puncts, EN):
+def test_three_same_close(close_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = EN(string, tag=False, parse=False)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 4
         assert tokens[0].string == word_str
         assert tokens[1].string == p
-def test_double_end_quote(EN):
-    assert len(EN("Hello''", tag=False, parse=False)) == 2
-    assert len(EN("''", tag=False, parse=False)) == 1
+def test_double_end_quote(en_tokenizer):
+    assert len(en_tokenizer("Hello''")) == 2
+    assert len(en_tokenizer("''")) == 1

View File

@@ -1,7 +1,5 @@
 from __future__ import unicode_literals
-from spacy.en import English
 import pytest
@@ -10,44 +8,39 @@ def open_puncts():
     return ['(', '[', '{', '*']
-@pytest.fixture
-def EN():
-    return English().tokenizer
-def test_open(open_puncts, EN):
+def test_open(open_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 2
         assert tokens[0].orth_ == p
         assert tokens[1].orth_ == word_str
-def test_two_different_open(open_puncts, EN):
+def test_two_different_open(open_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 3
         assert tokens[0].orth_ == p
         assert tokens[1].orth_ == "`"
         assert tokens[2].orth_ == word_str
-def test_three_same_open(open_puncts, EN):
+def test_three_same_open(open_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 4
         assert tokens[0].orth_ == p
         assert tokens[3].orth_ == word_str
-def test_open_appostrophe(EN):
+def test_open_appostrophe(en_tokenizer):
     string = "'The"
-    tokens = EN(string)
+    tokens = en_tokenizer(string)
     assert len(tokens) == 2
     assert tokens[0].orth_ == "'"

View File

@@ -3,50 +3,44 @@ and suffix punctuation."""
 from __future__ import unicode_literals
 import pytest
-from spacy.en import English
-@pytest.fixture
-def EN():
-    return English().tokenizer
-def test_no_special(EN):
-    assert len(EN("(can)")) == 3
+def test_no_special(en_tokenizer):
+    assert len(en_tokenizer("(can)")) == 3
-def test_no_punct(EN):
-    assert len(EN("can't")) == 2
+def test_no_punct(en_tokenizer):
+    assert len(en_tokenizer("can't")) == 2
-def test_prefix(EN):
-    assert len(EN("(can't")) == 3
+def test_prefix(en_tokenizer):
+    assert len(en_tokenizer("(can't")) == 3
-def test_suffix(EN):
-    assert len(EN("can't)")) == 3
+def test_suffix(en_tokenizer):
+    assert len(en_tokenizer("can't)")) == 3
-def test_wrap(EN):
-    assert len(EN("(can't)")) == 4
+def test_wrap(en_tokenizer):
+    assert len(en_tokenizer("(can't)")) == 4
-def test_uneven_wrap(EN):
-    assert len(EN("(can't?)")) == 5
+def test_uneven_wrap(en_tokenizer):
+    assert len(en_tokenizer("(can't?)")) == 5
-def test_prefix_interact(EN):
-    assert len(EN("U.S.")) == 1
-    assert len(EN("us.")) == 2
-    assert len(EN("(U.S.")) == 2
+def test_prefix_interact(en_tokenizer):
+    assert len(en_tokenizer("U.S.")) == 1
+    assert len(en_tokenizer("us.")) == 2
+    assert len(en_tokenizer("(U.S.")) == 2
-def test_suffix_interact(EN):
-    assert len(EN("U.S.)")) == 2
+def test_suffix_interact(en_tokenizer):
+    assert len(en_tokenizer("U.S.)")) == 2
-def test_even_wrap_interact(EN):
-    assert len(EN("(U.S.)")) == 3
+def test_even_wrap_interact(en_tokenizer):
+    assert len(en_tokenizer("(U.S.)")) == 3
-def test_uneven_wrap_interact(EN):
-    assert len(EN("(U.S.?)")) == 4
+def test_uneven_wrap_interact(en_tokenizer):
+    assert len(en_tokenizer("(U.S.?)")) == 4

View File

@@ -1,7 +1,4 @@
 from __future__ import unicode_literals
-from spacy.en import English
 import pytest
@@ -10,27 +7,22 @@ def paired_puncts():
     return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
-@pytest.fixture
-def EN():
-    return English().tokenizer
-def test_token(paired_puncts, EN):
+def test_token(paired_puncts, en_tokenizer):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 3
         assert tokens[0].orth_ == open_
         assert tokens[1].orth_ == word_str
         assert tokens[2].orth_ == close_
-def test_two_different(paired_puncts, EN):
+def test_two_different(paired_puncts, en_tokenizer):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 5
         assert tokens[0].orth_ == "`"
         assert tokens[1].orth_ == open_

View File

@@ -3,32 +3,25 @@ from __future__ import unicode_literals
 import pytest
-from spacy.en import English
-@pytest.fixture
-def EN():
-    return English().tokenizer
-def test_no_word(EN):
-    tokens = EN(u'')
+def test_no_word(en_tokenizer):
+    tokens = en_tokenizer(u'')
     assert len(tokens) == 0
-def test_single_word(EN):
-    tokens = EN(u'hello')
+def test_single_word(en_tokenizer):
+    tokens = en_tokenizer(u'hello')
     assert tokens[0].orth_ == 'hello'
-def test_two_words(EN):
-    tokens = EN('hello possums')
+def test_two_words(en_tokenizer):
+    tokens = en_tokenizer('hello possums')
     assert len(tokens) == 2
     assert tokens[0].orth_ != tokens[1].orth_
-def test_punct(EN):
-    tokens = EN('hello, possums.')
+def test_punct(en_tokenizer):
+    tokens = en_tokenizer('hello, possums.')
     assert len(tokens) == 4
     assert tokens[0].orth_ == 'hello'
     assert tokens[1].orth_ == ','
@@ -36,34 +29,34 @@ def test_punct(EN):
     assert tokens[1].orth_ != 'hello'
-def test_digits(EN):
-    tokens = EN('The year: 1984.')
+def test_digits(en_tokenizer):
+    tokens = en_tokenizer('The year: 1984.')
     assert len(tokens) == 5
-    assert tokens[0].orth == EN.vocab['The'].orth
-    assert tokens[3].orth == EN.vocab['1984'].orth
+    assert tokens[0].orth == en_tokenizer.vocab['The'].orth
+    assert tokens[3].orth == en_tokenizer.vocab['1984'].orth
-def test_contraction(EN):
-    tokens = EN("don't giggle")
+def test_contraction(en_tokenizer):
+    tokens = en_tokenizer("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].orth == EN.vocab["n't"].orth
-    tokens = EN("i said don't!")
+    assert tokens[1].orth == en_tokenizer.vocab["n't"].orth
+    tokens = en_tokenizer("i said don't!")
     assert len(tokens) == 5
-    assert tokens[4].orth == EN.vocab['!'].orth
+    assert tokens[4].orth == en_tokenizer.vocab['!'].orth
-def test_contraction_punct(EN):
-    tokens = EN("(can't")
+def test_contraction_punct(en_tokenizer):
+    tokens = en_tokenizer("(can't")
     assert len(tokens) == 3
-    tokens = EN("`ain't")
+    tokens = en_tokenizer("`ain't")
     assert len(tokens) == 3
-    tokens = EN('''"isn't''')
+    tokens = en_tokenizer('''"isn't''')
     assert len(tokens) == 3
-    tokens = EN("can't!")
+    tokens = en_tokenizer("can't!")
     assert len(tokens) == 3
-def test_sample(EN):
+def test_sample(en_tokenizer):
     text = """Tributes pour in for late British Labour Party leader
 Tributes poured in from around the world Thursday
@@ -75,7 +68,7 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
 "Mr. Smith, throughout his distinguished"""
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     assert len(tokens) > 5

View File

@@ -1,16 +1,9 @@
 from __future__ import unicode_literals
 import pytest
-from spacy.en import English
-@pytest.fixture
-def EN():
-    return English()
-def test1(EN):
+def test1(en_tokenizer):
     words = ['JAPAN', 'GET', 'LUCKY']
-    tokens = EN.tokenizer.tokens_from_list(words)
+    tokens = en_tokenizer.tokens_from_list(words)
     assert len(tokens) == 3
     assert tokens[0].orth_ == 'JAPAN'

View File

@@ -1,41 +1,35 @@
 """Test that tokens are created correctly for whitespace."""
 from __future__ import unicode_literals
-from spacy.en import English
 import pytest
-@pytest.fixture
-def EN():
-    return English().tokenizer
-def test_single_space(EN):
-    tokens = EN('hello possums')
+def test_single_space(en_tokenizer):
+    tokens = en_tokenizer('hello possums')
     assert len(tokens) == 2
-def test_double_space(EN):
-    tokens = EN('hello  possums')
+def test_double_space(en_tokenizer):
+    tokens = en_tokenizer('hello  possums')
     assert len(tokens) == 3
     assert tokens[1].orth_ == ' '
-def test_newline(EN):
-    tokens = EN('hello\npossums')
+def test_newline(en_tokenizer):
+    tokens = en_tokenizer('hello\npossums')
     assert len(tokens) == 3
-def test_newline_space(EN):
-    tokens = EN('hello \npossums')
+def test_newline_space(en_tokenizer):
+    tokens = en_tokenizer('hello \npossums')
     assert len(tokens) == 3
-def test_newline_double_space(EN):
-    tokens = EN('hello  \npossums')
+def test_newline_double_space(en_tokenizer):
+    tokens = en_tokenizer('hello  \npossums')
     assert len(tokens) == 3
-def test_newline_space_wrap(EN):
-    tokens = EN('hello \n possums')
+def test_newline_space_wrap(en_tokenizer):
+    tokens = en_tokenizer('hello \n possums')
     assert len(tokens) == 3
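A note on the design choice: session scope trades isolation for speed. All of the tests above now share a single English instance, so a test that mutated the vocab or the tokenizer's rules would leak that state into later tests; the tests here only read from the tokenizer, which is what makes the shared fixture safe. If a module ever needed a fresh pipeline, narrowing the fixture scope is the usual escape hatch. A hypothetical variant, not part of this commit:

import pytest

from spacy.en import English


@pytest.fixture(scope="module")
def fresh_en_tokenizer():
    # Hypothetical: rebuilt once per test module instead of once per
    # session, restoring isolation between modules at a speed cost.
    return English(load_vectors=False).tokenizer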