diff --git a/tests/tokenizer/conftest.py b/tests/tokenizer/conftest.py
new file mode 100644
index 000000000..ccaa741a0
--- /dev/null
+++ b/tests/tokenizer/conftest.py
@@ -0,0 +1,11 @@
+import pytest
+from spacy.en import English
+
+
+@pytest.fixture(scope="session")
+def EN():
+    return English(load_vectors=False)
+
+@pytest.fixture(scope="session")
+def en_tokenizer(EN):
+    return EN.tokenizer
diff --git a/tests/tokenizer/test_contractions.py b/tests/tokenizer/test_contractions.py
index 3d0ee11ee..1d12e3d22 100644
--- a/tests/tokenizer/test_contractions.py
+++ b/tests/tokenizer/test_contractions.py
@@ -1,34 +1,31 @@
 from __future__ import unicode_literals
 import pytest

-from spacy.en import English

-EN = English()
-
-def test_possess():
-    tokens = EN("Mike's", parse=False, tag=False)
-    assert EN.vocab.strings[tokens[0].orth] == "Mike"
-    assert EN.vocab.strings[tokens[1].orth] == "'s"
+def test_possess(en_tokenizer):
+    tokens = en_tokenizer("Mike's")
+    assert en_tokenizer.vocab.strings[tokens[0].orth] == "Mike"
+    assert en_tokenizer.vocab.strings[tokens[1].orth] == "'s"
     assert len(tokens) == 2


-def test_apostrophe():
-    tokens = EN("schools'", parse=False, tag=False)
+def test_apostrophe(en_tokenizer):
+    tokens = en_tokenizer("schools'")
     assert len(tokens) == 2
     assert tokens[1].orth_ == "'"
     assert tokens[0].orth_ == "schools"


-def test_LL():
-    tokens = EN("we'll", parse=False)
+def test_LL(en_tokenizer):
+    tokens = en_tokenizer("we'll")
     assert len(tokens) == 2
     assert tokens[1].orth_ == "'ll"
     assert tokens[1].lemma_ == "will"
     assert tokens[0].orth_ == "we"


-def test_aint():
-    tokens = EN("ain't", parse=False)
+def test_aint(en_tokenizer):
+    tokens = en_tokenizer("ain't")
     assert len(tokens) == 2
     assert tokens[0].orth_ == "ai"
     assert tokens[0].lemma_ == "be"
@@ -36,19 +33,19 @@ def test_aint():
     assert tokens[1].lemma_ == "not"


-def test_capitalized():
-    tokens = EN("can't", parse=False)
+def test_capitalized(en_tokenizer):
+    tokens = en_tokenizer("can't")
     assert len(tokens) == 2
-    tokens = EN("Can't", parse=False)
+    tokens = en_tokenizer("Can't")
     assert len(tokens) == 2
-    tokens = EN("Ain't", parse=False)
+    tokens = en_tokenizer("Ain't")
     assert len(tokens) == 2
     assert tokens[0].orth_ == "Ai"
     assert tokens[0].lemma_ == "be"


-def test_punct():
-    tokens = EN("We've", parse=False)
+def test_punct(en_tokenizer):
+    tokens = en_tokenizer("We've")
     assert len(tokens) == 2
-    tokens = EN("``We've", parse=False)
+    tokens = en_tokenizer("``We've")
     assert len(tokens) == 3
diff --git a/tests/tokenizer/test_emoticons.py b/tests/tokenizer/test_emoticons.py
index 2b250a328..e0022dbbd 100644
--- a/tests/tokenizer/test_emoticons.py
+++ b/tests/tokenizer/test_emoticons.py
@@ -1,17 +1,10 @@
 from __future__ import unicode_literals
 import pytest


-from spacy.en import English
-
-@pytest.fixture
-def EN():
-    return English()
-
-
-def test_tweebo_challenge(EN):
+def test_tweebo_challenge(en_tokenizer):
     text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
-    tokens = EN(text, parse=False, tag=False)
+    tokens = en_tokenizer(text)
     assert tokens[0].orth_ == ":o"
     assert tokens[1].orth_ == ":/"
     assert tokens[2].orth_ == ":'("
@@ -36,7 +29,7 @@ def test_tweebo_challenge(EN):
     assert tokens[21].orth_ == '....'


-def test_false_positive(EN):
+def test_false_positive(en_tokenizer):
     text = "example:)"
-    tokens = EN(text, parse=False, tag=False)
+    tokens = en_tokenizer(text)
     assert len(tokens) == 3
diff --git a/tests/tokenizer/test_indices.py b/tests/tokenizer/test_indices.py
index ecd2e610c..5df7bcc59 100644
--- a/tests/tokenizer/test_indices.py
+++ b/tests/tokenizer/test_indices.py
@@ -3,18 +3,11 @@
 from __future__ import unicode_literals
 import pytest

-from spacy.en import English


-@pytest.fixture
-def nlp():
-    nlp = English()
-    return nlp.tokenizer
-
-
-def test_simple_punct(nlp):
+def test_simple_punct(en_tokenizer):
     text = 'to walk, do foo'
-    tokens = nlp(text)
+    tokens = en_tokenizer(text)
     assert tokens[0].idx == 0
     assert tokens[1].idx == 3
     assert tokens[2].idx == 7
@@ -22,9 +15,9 @@ def test_simple_punct(nlp):
     assert tokens[4].idx == 12


-def test_complex_punct(nlp):
+def test_complex_punct(en_tokenizer):
     text = 'Tom (D., Ill.)!'
-    tokens = nlp(text)
+    tokens = en_tokenizer(text)
     assert tokens[0].idx == 0
     assert len(tokens[0]) == 3
     assert tokens[1].idx == 4
diff --git a/tests/tokenizer/test_infix.py b/tests/tokenizer/test_infix.py
index 5c6558009..2e9fbe1e4 100644
--- a/tests/tokenizer/test_infix.py
+++ b/tests/tokenizer/test_infix.py
@@ -2,17 +2,13 @@
 from __future__ import unicode_literals
 import pytest

-from spacy.en import English
-
-EN = English()
-
-def test_hyphen():
-    tokens = EN.tokenizer('best-known')
+def test_hyphen(en_tokenizer):
+    tokens = en_tokenizer('best-known')
     assert len(tokens) == 3


-def test_period():
-    tokens = EN.tokenizer('best.Known')
+def test_period(en_tokenizer):
+    tokens = en_tokenizer('best.Known')
     assert len(tokens) == 3
-    tokens = EN.tokenizer('zombo.com')
+    tokens = en_tokenizer('zombo.com')
     assert len(tokens) == 1
diff --git a/tests/tokenizer/test_only_punct.py b/tests/tokenizer/test_only_punct.py
index a09beb9ef..12c958088 100644
--- a/tests/tokenizer/test_only_punct.py
+++ b/tests/tokenizer/test_only_punct.py
@@ -1,14 +1,9 @@
 from __future__ import unicode_literals
-import pytest
-
-from spacy.en import English


-def test_only_pre1():
-    EN = English()
-    assert len(EN("(")) == 1
+def test_only_pre1(en_tokenizer):
+    assert len(en_tokenizer("(")) == 1


-def test_only_pre2():
-    EN = English()
-    assert len(EN("((")) == 2
+def test_only_pre2(en_tokenizer):
+    assert len(en_tokenizer("((")) == 2
diff --git a/tests/tokenizer/test_post_punct.py b/tests/tokenizer/test_post_punct.py
index 95b32f261..ff1120c63 100644
--- a/tests/tokenizer/test_post_punct.py
+++ b/tests/tokenizer/test_post_punct.py
@@ -1,7 +1,4 @@
 from __future__ import unicode_literals
-
-from spacy.en import English
-
 import pytest


@@ -10,42 +7,37 @@ def close_puncts():
     return [')', ']', '}', '*']


-@pytest.fixture
-def EN():
-    return English()
-
-
-def test_close(close_puncts, EN):
+def test_close(close_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        tokens = EN(string, parse=False, tag=False)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 2
         assert tokens[1].string == p
         assert tokens[0].string == word_str


-def test_two_different_close(close_puncts, EN):
+def test_two_different_close(close_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        tokens = EN(string, parse=False, tag=False)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 3
         assert tokens[0].string == word_str
         assert tokens[1].string == p
         assert tokens[2].string == "'"


-def test_three_same_close(close_puncts, EN):
+def test_three_same_close(close_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = EN(string, tag=False, parse=False)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 4
         assert tokens[0].string == word_str
         assert tokens[1].string == p


-def test_double_end_quote(EN):
-    assert len(EN("Hello''", tag=False, parse=False)) == 2
-    assert len(EN("''", tag=False, parse=False)) == 1
+def test_double_end_quote(en_tokenizer):
+    assert len(en_tokenizer("Hello''")) == 2
+    assert len(en_tokenizer("''")) == 1
diff --git a/tests/tokenizer/test_pre_punct.py b/tests/tokenizer/test_pre_punct.py
index 1a987e59b..9aec1dc7b 100644
--- a/tests/tokenizer/test_pre_punct.py
+++ b/tests/tokenizer/test_pre_punct.py
@@ -1,7 +1,5 @@
 from __future__ import unicode_literals

-from spacy.en import English
-
 import pytest


@@ -10,44 +8,39 @@ def open_puncts():
     return ['(', '[', '{', '*']


-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_open(open_puncts, EN):
+def test_open(open_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 2
         assert tokens[0].orth_ == p
         assert tokens[1].orth_ == word_str


-def test_two_different_open(open_puncts, EN):
+def test_two_different_open(open_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 3
         assert tokens[0].orth_ == p
         assert tokens[1].orth_ == "`"
         assert tokens[2].orth_ == word_str


-def test_three_same_open(open_puncts, EN):
+def test_three_same_open(open_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 4
         assert tokens[0].orth_ == p
         assert tokens[3].orth_ == word_str


-def test_open_appostrophe(EN):
+def test_open_appostrophe(en_tokenizer):
     string = "'The"
-    tokens = EN(string)
+    tokens = en_tokenizer(string)
     assert len(tokens) == 2
     assert tokens[0].orth_ == "'"
diff --git a/tests/tokenizer/test_special_affix.py b/tests/tokenizer/test_special_affix.py
index cdc6a6d78..62cf114f1 100644
--- a/tests/tokenizer/test_special_affix.py
+++ b/tests/tokenizer/test_special_affix.py
@@ -3,50 +3,44 @@ and suffix punctuation."""
 from __future__ import unicode_literals
 import pytest


-from spacy.en import English
-@pytest.fixture
-def EN():
-    return English().tokenizer
+def test_no_special(en_tokenizer):
+    assert len(en_tokenizer("(can)")) == 3


-def test_no_special(EN):
-    assert len(EN("(can)")) == 3
+def test_no_punct(en_tokenizer):
+    assert len(en_tokenizer("can't")) == 2


-def test_no_punct(EN):
-    assert len(EN("can't")) == 2
+def test_prefix(en_tokenizer):
+    assert len(en_tokenizer("(can't")) == 3


-def test_prefix(EN):
-    assert len(EN("(can't")) == 3
+def test_suffix(en_tokenizer):
+    assert len(en_tokenizer("can't)")) == 3


-def test_suffix(EN):
-    assert len(EN("can't)")) == 3
+def test_wrap(en_tokenizer):
+    assert len(en_tokenizer("(can't)")) == 4


-def test_wrap(EN):
-    assert len(EN("(can't)")) == 4
+def test_uneven_wrap(en_tokenizer):
+    assert len(en_tokenizer("(can't?)")) == 5


-def test_uneven_wrap(EN):
-    assert len(EN("(can't?)")) == 5
+def test_prefix_interact(en_tokenizer):
+    assert len(en_tokenizer("U.S.")) == 1
+    assert len(en_tokenizer("us.")) == 2
+    assert len(en_tokenizer("(U.S.")) == 2


-def test_prefix_interact(EN):
-    assert len(EN("U.S.")) == 1
-    assert len(EN("us.")) == 2
-    assert len(EN("(U.S.")) == 2
+def test_suffix_interact(en_tokenizer):
+    assert len(en_tokenizer("U.S.)")) == 2


-def test_suffix_interact(EN):
-    assert len(EN("U.S.)")) == 2
+def test_even_wrap_interact(en_tokenizer):
+    assert len(en_tokenizer("(U.S.)")) == 3


-def test_even_wrap_interact(EN):
-    assert len(EN("(U.S.)")) == 3
-
-
-def test_uneven_wrap_interact(EN):
-    assert len(EN("(U.S.?)")) == 4
+def test_uneven_wrap_interact(en_tokenizer):
+    assert len(en_tokenizer("(U.S.?)")) == 4
diff --git a/tests/tokenizer/test_surround_punct.py b/tests/tokenizer/test_surround_punct.py
index fb6a6beb1..7c7a50904 100644
--- a/tests/tokenizer/test_surround_punct.py
+++ b/tests/tokenizer/test_surround_punct.py
@@ -1,7 +1,4 @@
 from __future__ import unicode_literals
-
-from spacy.en import English
-
 import pytest


@@ -10,27 +7,22 @@ def paired_puncts():
     return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]


-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_token(paired_puncts, EN):
+def test_token(paired_puncts, en_tokenizer):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 3
         assert tokens[0].orth_ == open_
         assert tokens[1].orth_ == word_str
         assert tokens[2].orth_ == close_


-def test_two_different(paired_puncts, EN):
+def test_two_different(paired_puncts, en_tokenizer):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 5
         assert tokens[0].orth_ == "`"
         assert tokens[1].orth_ == open_
diff --git a/tests/tokenizer/test_tokenizer.py b/tests/tokenizer/test_tokenizer.py
index 26d24b063..804e09114 100644
--- a/tests/tokenizer/test_tokenizer.py
+++ b/tests/tokenizer/test_tokenizer.py
@@ -3,32 +3,25 @@
 from __future__ import unicode_literals

 import pytest

-from spacy.en import English
-
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_no_word(EN):
-    tokens = EN(u'')
+def test_no_word(en_tokenizer):
+    tokens = en_tokenizer(u'')
     assert len(tokens) == 0


-def test_single_word(EN):
-    tokens = EN(u'hello')
+def test_single_word(en_tokenizer):
+    tokens = en_tokenizer(u'hello')
     assert tokens[0].orth_ == 'hello'


-def test_two_words(EN):
-    tokens = EN('hello possums')
+def test_two_words(en_tokenizer):
+    tokens = en_tokenizer('hello possums')
     assert len(tokens) == 2
     assert tokens[0].orth_ != tokens[1].orth_


-def test_punct(EN):
-    tokens = EN('hello, possums.')
+def test_punct(en_tokenizer):
+    tokens = en_tokenizer('hello, possums.')
     assert len(tokens) == 4
     assert tokens[0].orth_ == 'hello'
     assert tokens[1].orth_ == ','
@@ -36,34 +29,34 @@ def test_punct(EN):
     assert tokens[1].orth_ != 'hello'


-def test_digits(EN):
-    tokens = EN('The year: 1984.')
+def test_digits(en_tokenizer):
+    tokens = en_tokenizer('The year: 1984.')
     assert len(tokens) == 5
-    assert tokens[0].orth == EN.vocab['The'].orth
-    assert tokens[3].orth == EN.vocab['1984'].orth
+    assert tokens[0].orth == en_tokenizer.vocab['The'].orth
+    assert tokens[3].orth == en_tokenizer.vocab['1984'].orth


-def test_contraction(EN):
-    tokens = EN("don't giggle")
+def test_contraction(en_tokenizer):
+    tokens = en_tokenizer("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].orth == EN.vocab["n't"].orth
-    tokens = EN("i said don't!")
+    assert tokens[1].orth == en_tokenizer.vocab["n't"].orth
+    tokens = en_tokenizer("i said don't!")
     assert len(tokens) == 5
-    assert tokens[4].orth == EN.vocab['!'].orth
+    assert tokens[4].orth == en_tokenizer.vocab['!'].orth


-def test_contraction_punct(EN):
-    tokens = EN("(can't")
+def test_contraction_punct(en_tokenizer):
+    tokens = en_tokenizer("(can't")
     assert len(tokens) == 3
-    tokens = EN("`ain't")
+    tokens = en_tokenizer("`ain't")
     assert len(tokens) == 3
-    tokens = EN('''"isn't''')
+    tokens = en_tokenizer('''"isn't''')
     assert len(tokens) == 3
-    tokens = EN("can't!")
+    tokens = en_tokenizer("can't!")
     assert len(tokens) == 3


-def test_sample(EN):
+def test_sample(en_tokenizer):
     text = """Tributes pour in for late British Labour Party leader

 Tributes poured in from around the world Thursday
@@ -75,7 +68,7 @@
 untimely death" of the rapier-tongued Scottish barrister and parliamentarian.

 "Mr. Smith, throughout his distinguished"""

-    tokens = EN(text)
+    tokens = en_tokenizer(text)

     assert len(tokens) > 5
diff --git a/tests/tokenizer/test_tokens_from_list.py b/tests/tokenizer/test_tokens_from_list.py
index 5bb5d7d69..c30326c54 100644
--- a/tests/tokenizer/test_tokens_from_list.py
+++ b/tests/tokenizer/test_tokens_from_list.py
@@ -1,16 +1,9 @@
 from __future__ import unicode_literals
 import pytest


-from spacy.en import English
-
-@pytest.fixture
-def EN():
-    return English()
-
-
-def test1(EN):
+def test1(en_tokenizer):
     words = ['JAPAN', 'GET', 'LUCKY']
-    tokens = EN.tokenizer.tokens_from_list(words)
+    tokens = en_tokenizer.tokens_from_list(words)
     assert len(tokens) == 3
     assert tokens[0].orth_ == 'JAPAN'
diff --git a/tests/tokenizer/test_whitespace.py b/tests/tokenizer/test_whitespace.py
index eb87881dd..9d3fb7f5d 100644
--- a/tests/tokenizer/test_whitespace.py
+++ b/tests/tokenizer/test_whitespace.py
@@ -1,41 +1,35 @@
 """Test that tokens are created correctly for whitespace."""
 from __future__ import unicode_literals

-from spacy.en import English
 import pytest


-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_single_space(EN):
-    tokens = EN('hello possums')
+def test_single_space(en_tokenizer):
+    tokens = en_tokenizer('hello possums')
     assert len(tokens) == 2


-def test_double_space(EN):
-    tokens = EN('hello  possums')
+def test_double_space(en_tokenizer):
+    tokens = en_tokenizer('hello  possums')
     assert len(tokens) == 3
     assert tokens[1].orth_ == ' '


-def test_newline(EN):
-    tokens = EN('hello\npossums')
+def test_newline(en_tokenizer):
+    tokens = en_tokenizer('hello\npossums')
    assert len(tokens) == 3


-def test_newline_space(EN):
-    tokens = EN('hello \npossums')
+def test_newline_space(en_tokenizer):
+    tokens = en_tokenizer('hello \npossums')
     assert len(tokens) == 3


-def test_newline_double_space(EN):
-    tokens = EN('hello  \npossums')
+def test_newline_double_space(en_tokenizer):
+    tokens = en_tokenizer('hello  \npossums')
     assert len(tokens) == 3


-def test_newline_space_wrap(EN):
-    tokens = EN('hello \n possums')
+def test_newline_space_wrap(en_tokenizer):
+    tokens = en_tokenizer('hello \n possums')
     assert len(tokens) == 3