Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-26 18:06:29 +03:00)
* Set up tokenizer/ tests properly, using a session-scoped fixture to avoid long load/unload times. Tokenizer tests now complete in 20 seconds.
commit 877abb0e5b · parent 1d5f20fdda
tests/tokenizer/conftest.py (new file, +11 lines)
@@ -0,0 +1,11 @@
+import pytest
+from spacy.en import English
+
+
+@pytest.fixture(scope="session")
+def EN():
+    return English(load_vectors=False)
+
+@pytest.fixture(scope="session")
+def en_tokenizer(EN):
+    return EN.tokenizer
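The conftest.py above is what makes the change work: pytest builds a session-scoped fixture once per test run and hands the same object to every test that requests it, so English(load_vectors=False) is constructed a single time instead of once per test module. The sketch below illustrates the mechanism; it is not part of the commit, and the fixture and counter names (expensive_resource, constructions) are invented for illustration.

import pytest

constructions = []


@pytest.fixture(scope="session")
def expensive_resource():
    # Stands in here for English(load_vectors=False), which is slow to build.
    constructions.append(1)
    return object()


def test_first_use(expensive_resource):
    # The fixture body ran exactly once to satisfy this request.
    assert len(constructions) == 1


def test_second_use(expensive_resource):
    # Same session, same instance: the fixture body did not run again.
    assert len(constructions) == 1

The en_tokenizer fixture layers on top of EN in the same way, so tests that only need the tokenizer still share that single English instance.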
@@ -1,34 +1,31 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
-
-EN = English()
-
-def test_possess():
-    tokens = EN("Mike's", parse=False, tag=False)
-    assert EN.vocab.strings[tokens[0].orth] == "Mike"
-    assert EN.vocab.strings[tokens[1].orth] == "'s"
+
+def test_possess(en_tokenizer):
+    tokens = en_tokenizer("Mike's")
+    assert en_tokenizer.vocab.strings[tokens[0].orth] == "Mike"
+    assert en_tokenizer.vocab.strings[tokens[1].orth] == "'s"
     assert len(tokens) == 2
 
 
-def test_apostrophe():
-    tokens = EN("schools'", parse=False, tag=False)
+def test_apostrophe(en_tokenizer):
+    tokens = en_tokenizer("schools'")
     assert len(tokens) == 2
     assert tokens[1].orth_ == "'"
     assert tokens[0].orth_ == "schools"
 
 
-def test_LL():
-    tokens = EN("we'll", parse=False)
+def test_LL(en_tokenizer):
+    tokens = en_tokenizer("we'll")
     assert len(tokens) == 2
     assert tokens[1].orth_ == "'ll"
     assert tokens[1].lemma_ == "will"
     assert tokens[0].orth_ == "we"
 
 
-def test_aint():
-    tokens = EN("ain't", parse=False)
+def test_aint(en_tokenizer):
+    tokens = en_tokenizer("ain't")
     assert len(tokens) == 2
     assert tokens[0].orth_ == "ai"
     assert tokens[0].lemma_ == "be"
@@ -36,19 +33,19 @@ def test_aint():
     assert tokens[1].lemma_ == "not"
 
 
-def test_capitalized():
-    tokens = EN("can't", parse=False)
+def test_capitalized(en_tokenizer):
+    tokens = en_tokenizer("can't")
     assert len(tokens) == 2
-    tokens = EN("Can't", parse=False)
+    tokens = en_tokenizer("Can't")
     assert len(tokens) == 2
-    tokens = EN("Ain't", parse=False)
+    tokens = en_tokenizer("Ain't")
     assert len(tokens) == 2
     assert tokens[0].orth_ == "Ai"
     assert tokens[0].lemma_ == "be"
 
 
-def test_punct():
-    tokens = EN("We've", parse=False)
+def test_punct(en_tokenizer):
+    tokens = en_tokenizer("We've")
     assert len(tokens) == 2
-    tokens = EN("``We've", parse=False)
+    tokens = en_tokenizer("``We've")
     assert len(tokens) == 3
@@ -1,17 +1,10 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
-
-
-@pytest.fixture
-def EN():
-    return English()
-
-
-def test_tweebo_challenge(EN):
+
+def test_tweebo_challenge(en_tokenizer):
     text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
-    tokens = EN(text, parse=False, tag=False)
+    tokens = en_tokenizer(text)
     assert tokens[0].orth_ == ":o"
     assert tokens[1].orth_ == ":/"
     assert tokens[2].orth_ == ":'("
@@ -36,7 +29,7 @@ def test_tweebo_challenge(EN):
     assert tokens[21].orth_ == '....'
 
 
-def test_false_positive(EN):
+def test_false_positive(en_tokenizer):
     text = "example:)"
-    tokens = EN(text, parse=False, tag=False)
+    tokens = en_tokenizer(text)
     assert len(tokens) == 3
@@ -3,18 +3,11 @@
 from __future__ import unicode_literals
 
 import pytest
-from spacy.en import English
-
-
-@pytest.fixture
-def nlp():
-    nlp = English()
-    return nlp.tokenizer
-
-
-def test_simple_punct(nlp):
+
+
+def test_simple_punct(en_tokenizer):
     text = 'to walk, do foo'
-    tokens = nlp(text)
+    tokens = en_tokenizer(text)
     assert tokens[0].idx == 0
     assert tokens[1].idx == 3
     assert tokens[2].idx == 7
@@ -22,9 +15,9 @@ def test_simple_punct(nlp):
     assert tokens[4].idx == 12
 
 
-def test_complex_punct(nlp):
+def test_complex_punct(en_tokenizer):
     text = 'Tom (D., Ill.)!'
-    tokens = nlp(text)
+    tokens = en_tokenizer(text)
     assert tokens[0].idx == 0
     assert len(tokens[0]) == 3
     assert tokens[1].idx == 4
@@ -2,17 +2,13 @@ from __future__ import unicode_literals
 
 import pytest
 
-from spacy.en import English
-
-EN = English()
-
-def test_hyphen():
-    tokens = EN.tokenizer('best-known')
+def test_hyphen(en_tokenizer):
+    tokens = en_tokenizer('best-known')
     assert len(tokens) == 3
 
 
-def test_period():
-    tokens = EN.tokenizer('best.Known')
+def test_period(en_tokenizer):
+    tokens = en_tokenizer('best.Known')
     assert len(tokens) == 3
-    tokens = EN.tokenizer('zombo.com')
+    tokens = en_tokenizer('zombo.com')
     assert len(tokens) == 1
@@ -1,14 +1,9 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
-
-
-def test_only_pre1():
-    EN = English()
-    assert len(EN("(")) == 1
+
+def test_only_pre1(en_tokenizer):
+    assert len(en_tokenizer("(")) == 1
 
-
-def test_only_pre2():
-    EN = English()
-    assert len(EN("((")) == 2
+def test_only_pre2(en_tokenizer):
+    assert len(en_tokenizer("((")) == 2
@@ -1,7 +1,4 @@
 from __future__ import unicode_literals
 
-from spacy.en import English
-
 import pytest
 
-
@@ -10,42 +7,37 @@ def close_puncts():
     return [')', ']', '}', '*']
 
 
-@pytest.fixture
-def EN():
-    return English()
-
-
-def test_close(close_puncts, EN):
+def test_close(close_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        tokens = EN(string, parse=False, tag=False)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 2
         assert tokens[1].string == p
         assert tokens[0].string == word_str
 
 
-def test_two_different_close(close_puncts, EN):
+def test_two_different_close(close_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        tokens = EN(string, parse=False, tag=False)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 3
         assert tokens[0].string == word_str
         assert tokens[1].string == p
         assert tokens[2].string == "'"
 
 
-def test_three_same_close(close_puncts, EN):
+def test_three_same_close(close_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = EN(string, tag=False, parse=False)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 4
         assert tokens[0].string == word_str
         assert tokens[1].string == p
 
 
-def test_double_end_quote(EN):
-    assert len(EN("Hello''", tag=False, parse=False)) == 2
-    assert len(EN("''", tag=False, parse=False)) == 1
+def test_double_end_quote(en_tokenizer):
+    assert len(en_tokenizer("Hello''")) == 2
+    assert len(en_tokenizer("''")) == 1
@@ -1,7 +1,5 @@
 from __future__ import unicode_literals
 
-from spacy.en import English
-
 import pytest
 
 
@@ -10,44 +8,39 @@ def open_puncts():
     return ['(', '[', '{', '*']
 
 
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_open(open_puncts, EN):
+def test_open(open_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 2
         assert tokens[0].orth_ == p
         assert tokens[1].orth_ == word_str
 
 
-def test_two_different_open(open_puncts, EN):
+def test_two_different_open(open_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 3
         assert tokens[0].orth_ == p
         assert tokens[1].orth_ == "`"
         assert tokens[2].orth_ == word_str
 
 
-def test_three_same_open(open_puncts, EN):
+def test_three_same_open(open_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 4
         assert tokens[0].orth_ == p
         assert tokens[3].orth_ == word_str
 
 
-def test_open_appostrophe(EN):
+def test_open_appostrophe(en_tokenizer):
     string = "'The"
-    tokens = EN(string)
+    tokens = en_tokenizer(string)
     assert len(tokens) == 2
     assert tokens[0].orth_ == "'"
@@ -3,50 +3,44 @@ and suffix punctuation."""
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
 
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_no_special(EN):
-    assert len(EN("(can)")) == 3
+def test_no_special(en_tokenizer):
+    assert len(en_tokenizer("(can)")) == 3
 
 
-def test_no_punct(EN):
-    assert len(EN("can't")) == 2
+def test_no_punct(en_tokenizer):
+    assert len(en_tokenizer("can't")) == 2
 
 
-def test_prefix(EN):
-    assert len(EN("(can't")) == 3
+def test_prefix(en_tokenizer):
+    assert len(en_tokenizer("(can't")) == 3
 
 
-def test_suffix(EN):
-    assert len(EN("can't)")) == 3
+def test_suffix(en_tokenizer):
+    assert len(en_tokenizer("can't)")) == 3
 
 
-def test_wrap(EN):
-    assert len(EN("(can't)")) == 4
+def test_wrap(en_tokenizer):
+    assert len(en_tokenizer("(can't)")) == 4
 
 
-def test_uneven_wrap(EN):
-    assert len(EN("(can't?)")) == 5
+def test_uneven_wrap(en_tokenizer):
+    assert len(en_tokenizer("(can't?)")) == 5
 
 
-def test_prefix_interact(EN):
-    assert len(EN("U.S.")) == 1
-    assert len(EN("us.")) == 2
-    assert len(EN("(U.S.")) == 2
+def test_prefix_interact(en_tokenizer):
+    assert len(en_tokenizer("U.S.")) == 1
+    assert len(en_tokenizer("us.")) == 2
+    assert len(en_tokenizer("(U.S.")) == 2
 
 
-def test_suffix_interact(EN):
-    assert len(EN("U.S.)")) == 2
+def test_suffix_interact(en_tokenizer):
+    assert len(en_tokenizer("U.S.)")) == 2
 
 
-def test_even_wrap_interact(EN):
-    assert len(EN("(U.S.)")) == 3
+def test_even_wrap_interact(en_tokenizer):
+    assert len(en_tokenizer("(U.S.)")) == 3
 
 
-def test_uneven_wrap_interact(EN):
-    assert len(EN("(U.S.?)")) == 4
+def test_uneven_wrap_interact(en_tokenizer):
+    assert len(en_tokenizer("(U.S.?)")) == 4
@@ -1,7 +1,4 @@
 from __future__ import unicode_literals
 
-from spacy.en import English
-
 import pytest
 
-
@@ -10,27 +7,22 @@ def paired_puncts():
     return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
 
 
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_token(paired_puncts, EN):
+def test_token(paired_puncts, en_tokenizer):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 3
         assert tokens[0].orth_ == open_
         assert tokens[1].orth_ == word_str
         assert tokens[2].orth_ == close_
 
 
-def test_two_different(paired_puncts, EN):
+def test_two_different(paired_puncts, en_tokenizer):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 5
         assert tokens[0].orth_ == "`"
         assert tokens[1].orth_ == open_
@@ -3,32 +3,25 @@ from __future__ import unicode_literals
 
 import pytest
 
-from spacy.en import English
-
-
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
 
-def test_no_word(EN):
-    tokens = EN(u'')
+def test_no_word(en_tokenizer):
+    tokens = en_tokenizer(u'')
     assert len(tokens) == 0
 
 
-def test_single_word(EN):
-    tokens = EN(u'hello')
+def test_single_word(en_tokenizer):
+    tokens = en_tokenizer(u'hello')
     assert tokens[0].orth_ == 'hello'
 
 
-def test_two_words(EN):
-    tokens = EN('hello possums')
+def test_two_words(en_tokenizer):
+    tokens = en_tokenizer('hello possums')
     assert len(tokens) == 2
     assert tokens[0].orth_ != tokens[1].orth_
 
 
-def test_punct(EN):
-    tokens = EN('hello, possums.')
+def test_punct(en_tokenizer):
+    tokens = en_tokenizer('hello, possums.')
     assert len(tokens) == 4
     assert tokens[0].orth_ == 'hello'
     assert tokens[1].orth_ == ','
@@ -36,34 +29,34 @@ def test_punct(EN):
     assert tokens[1].orth_ != 'hello'
 
 
-def test_digits(EN):
-    tokens = EN('The year: 1984.')
+def test_digits(en_tokenizer):
+    tokens = en_tokenizer('The year: 1984.')
     assert len(tokens) == 5
-    assert tokens[0].orth == EN.vocab['The'].orth
-    assert tokens[3].orth == EN.vocab['1984'].orth
+    assert tokens[0].orth == en_tokenizer.vocab['The'].orth
+    assert tokens[3].orth == en_tokenizer.vocab['1984'].orth
 
 
-def test_contraction(EN):
-    tokens = EN("don't giggle")
+def test_contraction(en_tokenizer):
+    tokens = en_tokenizer("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].orth == EN.vocab["n't"].orth
-    tokens = EN("i said don't!")
+    assert tokens[1].orth == en_tokenizer.vocab["n't"].orth
+    tokens = en_tokenizer("i said don't!")
     assert len(tokens) == 5
-    assert tokens[4].orth == EN.vocab['!'].orth
+    assert tokens[4].orth == en_tokenizer.vocab['!'].orth
 
 
-def test_contraction_punct(EN):
-    tokens = EN("(can't")
+def test_contraction_punct(en_tokenizer):
+    tokens = en_tokenizer("(can't")
     assert len(tokens) == 3
-    tokens = EN("`ain't")
+    tokens = en_tokenizer("`ain't")
    assert len(tokens) == 3
-    tokens = EN('''"isn't''')
+    tokens = en_tokenizer('''"isn't''')
     assert len(tokens) == 3
-    tokens = EN("can't!")
+    tokens = en_tokenizer("can't!")
     assert len(tokens) == 3
 
 
-def test_sample(EN):
+def test_sample(en_tokenizer):
     text = """Tributes pour in for late British Labour Party leader
 
 Tributes poured in from around the world Thursday
@@ -75,7 +68,7 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
 
 "Mr. Smith, throughout his distinguished"""
 
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     assert len(tokens) > 5
@@ -1,16 +1,9 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
-
-
-@pytest.fixture
-def EN():
-    return English()
-
-
-def test1(EN):
+
+def test1(en_tokenizer):
     words = ['JAPAN', 'GET', 'LUCKY']
-    tokens = EN.tokenizer.tokens_from_list(words)
+    tokens = en_tokenizer.tokens_from_list(words)
     assert len(tokens) == 3
     assert tokens[0].orth_ == 'JAPAN'
@@ -1,41 +1,35 @@
 """Test that tokens are created correctly for whitespace."""
 from __future__ import unicode_literals
 
-from spacy.en import English
 import pytest
 
 
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_single_space(EN):
-    tokens = EN('hello possums')
+def test_single_space(en_tokenizer):
+    tokens = en_tokenizer('hello possums')
     assert len(tokens) == 2
 
 
-def test_double_space(EN):
-    tokens = EN('hello  possums')
+def test_double_space(en_tokenizer):
+    tokens = en_tokenizer('hello  possums')
     assert len(tokens) == 3
     assert tokens[1].orth_ == ' '
 
 
-def test_newline(EN):
-    tokens = EN('hello\npossums')
+def test_newline(en_tokenizer):
+    tokens = en_tokenizer('hello\npossums')
     assert len(tokens) == 3
 
 
-def test_newline_space(EN):
-    tokens = EN('hello \npossums')
+def test_newline_space(en_tokenizer):
+    tokens = en_tokenizer('hello \npossums')
     assert len(tokens) == 3
 
 
-def test_newline_double_space(EN):
-    tokens = EN('hello  \npossums')
+def test_newline_double_space(en_tokenizer):
+    tokens = en_tokenizer('hello  \npossums')
     assert len(tokens) == 3
 
 
-def test_newline_space_wrap(EN):
-    tokens = EN('hello \n possums')
+def test_newline_space_wrap(en_tokenizer):
+    tokens = en_tokenizer('hello \n possums')
     assert len(tokens) == 3