* Set up tokenizer/ tests properly, using a session-scoped fixture to avoid long load/unload times. Tokenizer tests now complete in 20 seconds.

Matthew Honnibal 2015-06-07 17:24:49 +02:00
parent 1d5f20fdda
commit 877abb0e5b
13 changed files with 126 additions and 190 deletions

View File

@@ -0,0 +1,11 @@
+import pytest
+from spacy.en import English
+
+
+@pytest.fixture(scope="session")
+def EN():
+    return English(load_vectors=False)
+
+@pytest.fixture(scope="session")
+def en_tokenizer(EN):
+    return EN.tokenizer
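Because the fixture is declared with scope="session", pytest constructs English(load_vectors=False) once per test run and hands the same tokenizer to every test module that sits below this conftest.py; a test opts in simply by naming the fixture as a parameter. A minimal sketch of the pattern, for illustration only (the test name and input string below are not part of this commit):

def test_shared_tokenizer(en_tokenizer):
    # en_tokenizer comes from the session-scoped fixture in conftest.py;
    # no per-test English() construction happens here.
    tokens = en_tokenizer("Hello world")
    assert len(tokens) == 2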

View File

@@ -1,34 +1,31 @@
 from __future__ import unicode_literals
 import pytest
-from spacy.en import English
-
-EN = English()
 
 
-def test_possess():
-    tokens = EN("Mike's", parse=False, tag=False)
-    assert EN.vocab.strings[tokens[0].orth] == "Mike"
-    assert EN.vocab.strings[tokens[1].orth] == "'s"
+def test_possess(en_tokenizer):
+    tokens = en_tokenizer("Mike's")
+    assert en_tokenizer.vocab.strings[tokens[0].orth] == "Mike"
+    assert en_tokenizer.vocab.strings[tokens[1].orth] == "'s"
     assert len(tokens) == 2
 
 
-def test_apostrophe():
-    tokens = EN("schools'", parse=False, tag=False)
+def test_apostrophe(en_tokenizer):
+    tokens = en_tokenizer("schools'")
     assert len(tokens) == 2
     assert tokens[1].orth_ == "'"
     assert tokens[0].orth_ == "schools"
 
 
-def test_LL():
-    tokens = EN("we'll", parse=False)
+def test_LL(en_tokenizer):
+    tokens = en_tokenizer("we'll")
     assert len(tokens) == 2
     assert tokens[1].orth_ == "'ll"
     assert tokens[1].lemma_ == "will"
     assert tokens[0].orth_ == "we"
 
 
-def test_aint():
-    tokens = EN("ain't", parse=False)
+def test_aint(en_tokenizer):
+    tokens = en_tokenizer("ain't")
     assert len(tokens) == 2
     assert tokens[0].orth_ == "ai"
     assert tokens[0].lemma_ == "be"
@@ -36,19 +33,19 @@ def test_aint():
     assert tokens[1].lemma_ == "not"
 
 
-def test_capitalized():
-    tokens = EN("can't", parse=False)
+def test_capitalized(en_tokenizer):
+    tokens = en_tokenizer("can't")
     assert len(tokens) == 2
-    tokens = EN("Can't", parse=False)
+    tokens = en_tokenizer("Can't")
     assert len(tokens) == 2
-    tokens = EN("Ain't", parse=False)
+    tokens = en_tokenizer("Ain't")
     assert len(tokens) == 2
     assert tokens[0].orth_ == "Ai"
     assert tokens[0].lemma_ == "be"
 
 
-def test_punct():
-    tokens = EN("We've", parse=False)
+def test_punct(en_tokenizer):
+    tokens = en_tokenizer("We've")
     assert len(tokens) == 2
-    tokens = EN("``We've", parse=False)
+    tokens = en_tokenizer("``We've")
     assert len(tokens) == 3

View File

@@ -1,17 +1,10 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
-
-
-@pytest.fixture
-def EN():
-    return English()
-
 
-def test_tweebo_challenge(EN):
+def test_tweebo_challenge(en_tokenizer):
     text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
-    tokens = EN(text, parse=False, tag=False)
+    tokens = en_tokenizer(text)
     assert tokens[0].orth_ == ":o"
     assert tokens[1].orth_ == ":/"
     assert tokens[2].orth_ == ":'("
@@ -36,7 +29,7 @@ def test_tweebo_challenge(EN):
     assert tokens[21].orth_ == '....'
 
 
-def test_false_positive(EN):
+def test_false_positive(en_tokenizer):
     text = "example:)"
-    tokens = EN(text, parse=False, tag=False)
+    tokens = en_tokenizer(text)
     assert len(tokens) == 3

View File

@@ -3,18 +3,11 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
-
-
-@pytest.fixture
-def nlp():
-    nlp = English()
-    return nlp.tokenizer
-
 
-def test_simple_punct(nlp):
+def test_simple_punct(en_tokenizer):
     text = 'to walk, do foo'
-    tokens = nlp(text)
+    tokens = en_tokenizer(text)
     assert tokens[0].idx == 0
     assert tokens[1].idx == 3
     assert tokens[2].idx == 7
@@ -22,9 +15,9 @@ def test_simple_punct(nlp):
     assert tokens[4].idx == 12
 
 
-def test_complex_punct(nlp):
+def test_complex_punct(en_tokenizer):
     text = 'Tom (D., Ill.)!'
-    tokens = nlp(text)
+    tokens = en_tokenizer(text)
     assert tokens[0].idx == 0
     assert len(tokens[0]) == 3
     assert tokens[1].idx == 4

View File

@@ -2,17 +2,13 @@ from __future__ import unicode_literals
 import pytest
-from spacy.en import English
-
-
-EN = English()
-
-
-def test_hyphen():
-    tokens = EN.tokenizer('best-known')
+
+
+def test_hyphen(en_tokenizer):
+    tokens = en_tokenizer('best-known')
     assert len(tokens) == 3
 
 
-def test_period():
-    tokens = EN.tokenizer('best.Known')
+def test_period(en_tokenizer):
+    tokens = en_tokenizer('best.Known')
     assert len(tokens) == 3
-    tokens = EN.tokenizer('zombo.com')
+    tokens = en_tokenizer('zombo.com')
     assert len(tokens) == 1

View File

@@ -1,14 +1,9 @@
 from __future__ import unicode_literals
-import pytest
-
-from spacy.en import English
 
 
-def test_only_pre1():
-    EN = English()
-    assert len(EN("(")) == 1
+def test_only_pre1(en_tokenizer):
+    assert len(en_tokenizer("(")) == 1
 
 
-def test_only_pre2():
-    EN = English()
-    assert len(EN("((")) == 2
+def test_only_pre2(en_tokenizer):
+    assert len(en_tokenizer("((")) == 2

View File

@@ -1,7 +1,4 @@
 from __future__ import unicode_literals
 
-from spacy.en import English
-
-
 import pytest
 
@@ -10,42 +7,37 @@ def close_puncts():
     return [')', ']', '}', '*']
 
 
-@pytest.fixture
-def EN():
-    return English()
-
-
-def test_close(close_puncts, EN):
+def test_close(close_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        tokens = EN(string, parse=False, tag=False)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 2
         assert tokens[1].string == p
         assert tokens[0].string == word_str
 
 
-def test_two_different_close(close_puncts, EN):
+def test_two_different_close(close_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        tokens = EN(string, parse=False, tag=False)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 3
         assert tokens[0].string == word_str
         assert tokens[1].string == p
         assert tokens[2].string == "'"
 
 
-def test_three_same_close(close_puncts, EN):
+def test_three_same_close(close_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = EN(string, tag=False, parse=False)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 4
         assert tokens[0].string == word_str
         assert tokens[1].string == p
 
 
-def test_double_end_quote(EN):
-    assert len(EN("Hello''", tag=False, parse=False)) == 2
-    assert len(EN("''", tag=False, parse=False)) == 1
+def test_double_end_quote(en_tokenizer):
+    assert len(en_tokenizer("Hello''")) == 2
+    assert len(en_tokenizer("''")) == 1

View File

@@ -1,7 +1,5 @@
 from __future__ import unicode_literals
 
-from spacy.en import English
-
 import pytest
 
 
@@ -10,44 +8,39 @@ def open_puncts():
     return ['(', '[', '{', '*']
 
 
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_open(open_puncts, EN):
+def test_open(open_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 2
         assert tokens[0].orth_ == p
         assert tokens[1].orth_ == word_str
 
 
-def test_two_different_open(open_puncts, EN):
+def test_two_different_open(open_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 3
         assert tokens[0].orth_ == p
         assert tokens[1].orth_ == "`"
         assert tokens[2].orth_ == word_str
 
 
-def test_three_same_open(open_puncts, EN):
+def test_three_same_open(open_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 4
         assert tokens[0].orth_ == p
         assert tokens[3].orth_ == word_str
 
 
-def test_open_appostrophe(EN):
+def test_open_appostrophe(en_tokenizer):
     string = "'The"
-    tokens = EN(string)
+    tokens = en_tokenizer(string)
     assert len(tokens) == 2
     assert tokens[0].orth_ == "'"

View File

@@ -3,50 +3,44 @@ and suffix punctuation."""
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
-
-
-@pytest.fixture
-def EN():
-    return English().tokenizer
 
 
-def test_no_special(EN):
-    assert len(EN("(can)")) == 3
+def test_no_special(en_tokenizer):
+    assert len(en_tokenizer("(can)")) == 3
 
 
-def test_no_punct(EN):
-    assert len(EN("can't")) == 2
+def test_no_punct(en_tokenizer):
+    assert len(en_tokenizer("can't")) == 2
 
 
-def test_prefix(EN):
-    assert len(EN("(can't")) == 3
+def test_prefix(en_tokenizer):
+    assert len(en_tokenizer("(can't")) == 3
 
 
-def test_suffix(EN):
-    assert len(EN("can't)")) == 3
+def test_suffix(en_tokenizer):
+    assert len(en_tokenizer("can't)")) == 3
 
 
-def test_wrap(EN):
-    assert len(EN("(can't)")) == 4
+def test_wrap(en_tokenizer):
+    assert len(en_tokenizer("(can't)")) == 4
 
 
-def test_uneven_wrap(EN):
-    assert len(EN("(can't?)")) == 5
+def test_uneven_wrap(en_tokenizer):
+    assert len(en_tokenizer("(can't?)")) == 5
 
 
-def test_prefix_interact(EN):
-    assert len(EN("U.S.")) == 1
-    assert len(EN("us.")) == 2
-    assert len(EN("(U.S.")) == 2
+def test_prefix_interact(en_tokenizer):
+    assert len(en_tokenizer("U.S.")) == 1
+    assert len(en_tokenizer("us.")) == 2
+    assert len(en_tokenizer("(U.S.")) == 2
 
 
-def test_suffix_interact(EN):
-    assert len(EN("U.S.)")) == 2
+def test_suffix_interact(en_tokenizer):
+    assert len(en_tokenizer("U.S.)")) == 2
 
 
-def test_even_wrap_interact(EN):
-    assert len(EN("(U.S.)")) == 3
+def test_even_wrap_interact(en_tokenizer):
+    assert len(en_tokenizer("(U.S.)")) == 3
 
 
-def test_uneven_wrap_interact(EN):
-    assert len(EN("(U.S.?)")) == 4
+def test_uneven_wrap_interact(en_tokenizer):
+    assert len(en_tokenizer("(U.S.?)")) == 4

View File

@@ -1,7 +1,4 @@
 from __future__ import unicode_literals
 
-from spacy.en import English
-
-
 import pytest
 
@@ -10,27 +7,22 @@ def paired_puncts():
     return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
 
 
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_token(paired_puncts, EN):
+def test_token(paired_puncts, en_tokenizer):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 3
         assert tokens[0].orth_ == open_
         assert tokens[1].orth_ == word_str
         assert tokens[2].orth_ == close_
 
 
-def test_two_different(paired_puncts, EN):
+def test_two_different(paired_puncts, en_tokenizer):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 5
         assert tokens[0].orth_ == "`"
         assert tokens[1].orth_ == open_

View File

@@ -3,32 +3,25 @@ from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
-
-
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
 
-def test_no_word(EN):
-    tokens = EN(u'')
+def test_no_word(en_tokenizer):
+    tokens = en_tokenizer(u'')
     assert len(tokens) == 0
 
 
-def test_single_word(EN):
-    tokens = EN(u'hello')
+def test_single_word(en_tokenizer):
+    tokens = en_tokenizer(u'hello')
     assert tokens[0].orth_ == 'hello'
 
 
-def test_two_words(EN):
-    tokens = EN('hello possums')
+def test_two_words(en_tokenizer):
+    tokens = en_tokenizer('hello possums')
     assert len(tokens) == 2
     assert tokens[0].orth_ != tokens[1].orth_
 
 
-def test_punct(EN):
-    tokens = EN('hello, possums.')
+def test_punct(en_tokenizer):
+    tokens = en_tokenizer('hello, possums.')
     assert len(tokens) == 4
     assert tokens[0].orth_ == 'hello'
     assert tokens[1].orth_ == ','
@@ -36,34 +29,34 @@ def test_punct(EN):
     assert tokens[1].orth_ != 'hello'
 
 
-def test_digits(EN):
-    tokens = EN('The year: 1984.')
+def test_digits(en_tokenizer):
+    tokens = en_tokenizer('The year: 1984.')
     assert len(tokens) == 5
-    assert tokens[0].orth == EN.vocab['The'].orth
-    assert tokens[3].orth == EN.vocab['1984'].orth
+    assert tokens[0].orth == en_tokenizer.vocab['The'].orth
+    assert tokens[3].orth == en_tokenizer.vocab['1984'].orth
 
 
-def test_contraction(EN):
-    tokens = EN("don't giggle")
+def test_contraction(en_tokenizer):
+    tokens = en_tokenizer("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].orth == EN.vocab["n't"].orth
-    tokens = EN("i said don't!")
+    assert tokens[1].orth == en_tokenizer.vocab["n't"].orth
+    tokens = en_tokenizer("i said don't!")
     assert len(tokens) == 5
-    assert tokens[4].orth == EN.vocab['!'].orth
+    assert tokens[4].orth == en_tokenizer.vocab['!'].orth
 
 
-def test_contraction_punct(EN):
-    tokens = EN("(can't")
+def test_contraction_punct(en_tokenizer):
+    tokens = en_tokenizer("(can't")
     assert len(tokens) == 3
-    tokens = EN("`ain't")
+    tokens = en_tokenizer("`ain't")
     assert len(tokens) == 3
-    tokens = EN('''"isn't''')
+    tokens = en_tokenizer('''"isn't''')
     assert len(tokens) == 3
-    tokens = EN("can't!")
+    tokens = en_tokenizer("can't!")
     assert len(tokens) == 3
 
 
-def test_sample(EN):
+def test_sample(en_tokenizer):
     text = """Tributes pour in for late British Labour Party leader
 
 Tributes poured in from around the world Thursday
@@ -75,7 +68,7 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
 
 "Mr. Smith, throughout his distinguished"""
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     assert len(tokens) > 5

View File

@@ -1,16 +1,9 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
-
-
-@pytest.fixture
-def EN():
-    return English()
-
 
-def test1(EN):
+def test1(en_tokenizer):
     words = ['JAPAN', 'GET', 'LUCKY']
-    tokens = EN.tokenizer.tokens_from_list(words)
+    tokens = en_tokenizer.tokens_from_list(words)
     assert len(tokens) == 3
     assert tokens[0].orth_ == 'JAPAN'

View File

@@ -1,41 +1,35 @@
 """Test that tokens are created correctly for whitespace."""
 from __future__ import unicode_literals
-from spacy.en import English
 import pytest
 
 
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_single_space(EN):
-    tokens = EN('hello possums')
+def test_single_space(en_tokenizer):
+    tokens = en_tokenizer('hello possums')
     assert len(tokens) == 2
 
 
-def test_double_space(EN):
-    tokens = EN('hello  possums')
+def test_double_space(en_tokenizer):
+    tokens = en_tokenizer('hello  possums')
     assert len(tokens) == 3
     assert tokens[1].orth_ == ' '
 
 
-def test_newline(EN):
-    tokens = EN('hello\npossums')
+def test_newline(en_tokenizer):
+    tokens = en_tokenizer('hello\npossums')
     assert len(tokens) == 3
 
 
-def test_newline_space(EN):
-    tokens = EN('hello \npossums')
+def test_newline_space(en_tokenizer):
+    tokens = en_tokenizer('hello \npossums')
    assert len(tokens) == 3
 
 
-def test_newline_double_space(EN):
-    tokens = EN('hello  \npossums')
+def test_newline_double_space(en_tokenizer):
+    tokens = en_tokenizer('hello  \npossums')
     assert len(tokens) == 3
 
 
-def test_newline_space_wrap(EN):
-    tokens = EN('hello \n possums')
+def test_newline_space_wrap(en_tokenizer):
+    tokens = en_tokenizer('hello \n possums')
     assert len(tokens) == 3