mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 05:01:02 +03:00 
			
		
		
		
	* Set up tokenizer/ tests properly, using a session-scoped fixture to avoid long load/unload times. Tokenizer tests now complete in 20 seconds.
This commit is contained in:
		
							parent
							
								
									1d5f20fdda
								
							
						
					
					
						commit
						877abb0e5b
					
				
							
								
								
									
										11
									
								
								tests/tokenizer/conftest.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								tests/tokenizer/conftest.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,11 @@ | |||
| import pytest | ||||
| from spacy.en import English | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture(scope="session") | ||||
| def EN(): | ||||
|     return English(load_vectors=False) | ||||
| 
 | ||||
| @pytest.fixture(scope="session") | ||||
| def en_tokenizer(EN): | ||||
|     return EN.tokenizer | ||||
|  | @ -1,34 +1,31 @@ | |||
| from __future__ import unicode_literals | ||||
| import pytest | ||||
| 
 | ||||
| from spacy.en import English | ||||
| 
 | ||||
| EN = English() | ||||
| 
 | ||||
| def test_possess(): | ||||
|     tokens = EN("Mike's", parse=False, tag=False) | ||||
|     assert EN.vocab.strings[tokens[0].orth] == "Mike" | ||||
|     assert EN.vocab.strings[tokens[1].orth] == "'s" | ||||
| def test_possess(en_tokenizer): | ||||
|     tokens = en_tokenizer("Mike's") | ||||
|     assert en_tokenizer.vocab.strings[tokens[0].orth] == "Mike" | ||||
|     assert en_tokenizer.vocab.strings[tokens[1].orth] == "'s" | ||||
|     assert len(tokens) == 2 | ||||
| 
 | ||||
| 
 | ||||
| def test_apostrophe(): | ||||
|     tokens = EN("schools'", parse=False, tag=False) | ||||
| def test_apostrophe(en_tokenizer): | ||||
|     tokens = en_tokenizer("schools'") | ||||
|     assert len(tokens) == 2 | ||||
|     assert tokens[1].orth_ == "'" | ||||
|     assert tokens[0].orth_ == "schools" | ||||
| 
 | ||||
| 
 | ||||
| def test_LL(): | ||||
|     tokens = EN("we'll", parse=False) | ||||
| def test_LL(en_tokenizer): | ||||
|     tokens = en_tokenizer("we'll") | ||||
|     assert len(tokens) == 2 | ||||
|     assert tokens[1].orth_ == "'ll" | ||||
|     assert tokens[1].lemma_ == "will" | ||||
|     assert tokens[0].orth_ == "we" | ||||
| 
 | ||||
| 
 | ||||
| def test_aint(): | ||||
|     tokens = EN("ain't", parse=False) | ||||
| def test_aint(en_tokenizer): | ||||
|     tokens = en_tokenizer("ain't") | ||||
|     assert len(tokens) == 2 | ||||
|     assert tokens[0].orth_ == "ai" | ||||
|     assert tokens[0].lemma_ == "be" | ||||
|  | @ -36,19 +33,19 @@ def test_aint(): | |||
|     assert tokens[1].lemma_ == "not" | ||||
| 
 | ||||
| 
 | ||||
| def test_capitalized(): | ||||
|     tokens = EN("can't", parse=False) | ||||
| def test_capitalized(en_tokenizer): | ||||
|     tokens = en_tokenizer("can't") | ||||
|     assert len(tokens) == 2 | ||||
|     tokens = EN("Can't", parse=False) | ||||
|     tokens = en_tokenizer("Can't") | ||||
|     assert len(tokens) == 2 | ||||
|     tokens = EN("Ain't", parse=False) | ||||
|     tokens = en_tokenizer("Ain't") | ||||
|     assert len(tokens) == 2 | ||||
|     assert tokens[0].orth_ == "Ai" | ||||
|     assert tokens[0].lemma_ == "be" | ||||
| 
 | ||||
| 
 | ||||
| def test_punct(): | ||||
|     tokens = EN("We've", parse=False) | ||||
| def test_punct(en_tokenizer): | ||||
|     tokens = en_tokenizer("We've") | ||||
|     assert len(tokens) == 2 | ||||
|     tokens = EN("``We've", parse=False) | ||||
|     tokens = en_tokenizer("``We've") | ||||
|     assert len(tokens) == 3 | ||||
|  |  | |||
|  | @ -1,17 +1,10 @@ | |||
| from __future__ import unicode_literals | ||||
| import pytest | ||||
| 
 | ||||
| from spacy.en import English | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def EN(): | ||||
|     return English() | ||||
| 
 | ||||
| 
 | ||||
| def test_tweebo_challenge(EN): | ||||
| def test_tweebo_challenge(en_tokenizer): | ||||
|     text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ....""" | ||||
|     tokens = EN(text, parse=False, tag=False) | ||||
|     tokens = en_tokenizer(text) | ||||
|     assert tokens[0].orth_ == ":o" | ||||
|     assert tokens[1].orth_ == ":/" | ||||
|     assert tokens[2].orth_ == ":'(" | ||||
|  | @ -36,7 +29,7 @@ def test_tweebo_challenge(EN): | |||
|     assert tokens[21].orth_ == '....' | ||||
| 
 | ||||
| 
 | ||||
| def test_false_positive(EN): | ||||
| def test_false_positive(en_tokenizer): | ||||
|     text = "example:)" | ||||
|     tokens = EN(text, parse=False, tag=False) | ||||
|     tokens = en_tokenizer(text) | ||||
|     assert len(tokens) == 3 | ||||
|  |  | |||
|  | @ -3,18 +3,11 @@ | |||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import pytest | ||||
| from spacy.en import English | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def nlp(): | ||||
|     nlp = English() | ||||
|     return nlp.tokenizer | ||||
| 
 | ||||
| 
 | ||||
| def test_simple_punct(nlp): | ||||
| def test_simple_punct(en_tokenizer): | ||||
|     text = 'to walk, do foo' | ||||
|     tokens = nlp(text) | ||||
|     tokens = en_tokenizer(text) | ||||
|     assert tokens[0].idx == 0 | ||||
|     assert tokens[1].idx == 3 | ||||
|     assert tokens[2].idx == 7 | ||||
|  | @ -22,9 +15,9 @@ def test_simple_punct(nlp): | |||
|     assert tokens[4].idx == 12 | ||||
| 
 | ||||
| 
 | ||||
| def test_complex_punct(nlp): | ||||
| def test_complex_punct(en_tokenizer): | ||||
|     text = 'Tom (D., Ill.)!' | ||||
|     tokens = nlp(text) | ||||
|     tokens = en_tokenizer(text) | ||||
|     assert tokens[0].idx == 0 | ||||
|     assert len(tokens[0]) == 3 | ||||
|     assert tokens[1].idx == 4 | ||||
|  |  | |||
|  | @ -2,17 +2,13 @@ from __future__ import unicode_literals | |||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
| from spacy.en import English | ||||
| 
 | ||||
| EN = English() | ||||
| 
 | ||||
| def test_hyphen(): | ||||
|     tokens = EN.tokenizer('best-known') | ||||
| def test_hyphen(en_tokenizer): | ||||
|     tokens = en_tokenizer('best-known') | ||||
|     assert len(tokens) == 3 | ||||
| 
 | ||||
| 
 | ||||
| def test_period(): | ||||
|     tokens = EN.tokenizer('best.Known') | ||||
| def test_period(en_tokenizer): | ||||
|     tokens = en_tokenizer('best.Known') | ||||
|     assert len(tokens) == 3 | ||||
|     tokens = EN.tokenizer('zombo.com') | ||||
|     tokens = en_tokenizer('zombo.com') | ||||
|     assert len(tokens) == 1 | ||||
|  |  | |||
|  | @ -1,14 +1,9 @@ | |||
| from __future__ import unicode_literals | ||||
| import pytest | ||||
| 
 | ||||
| from spacy.en import English | ||||
| 
 | ||||
| 
 | ||||
| def test_only_pre1(): | ||||
|     EN = English() | ||||
|     assert len(EN("(")) == 1 | ||||
| def test_only_pre1(en_tokenizer): | ||||
|     assert len(en_tokenizer("(")) == 1 | ||||
| 
 | ||||
| 
 | ||||
| def test_only_pre2(): | ||||
|     EN = English() | ||||
|     assert len(EN("((")) == 2 | ||||
| def test_only_pre2(en_tokenizer): | ||||
|     assert len(en_tokenizer("((")) == 2 | ||||
|  |  | |||
|  | @ -1,7 +1,4 @@ | |||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from spacy.en import English | ||||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
| 
 | ||||
|  | @ -10,42 +7,37 @@ def close_puncts(): | |||
|     return [')', ']', '}', '*'] | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def EN(): | ||||
|     return English() | ||||
| 
 | ||||
| 
 | ||||
| def test_close(close_puncts, EN): | ||||
| def test_close(close_puncts, en_tokenizer): | ||||
|     word_str = 'Hello' | ||||
|     for p in close_puncts: | ||||
|         string = word_str + p | ||||
|         tokens = EN(string, parse=False, tag=False) | ||||
|         tokens = en_tokenizer(string) | ||||
|         assert len(tokens) == 2 | ||||
|         assert tokens[1].string == p | ||||
|         assert tokens[0].string == word_str | ||||
| 
 | ||||
| 
 | ||||
| def test_two_different_close(close_puncts, EN): | ||||
| def test_two_different_close(close_puncts, en_tokenizer): | ||||
|     word_str = 'Hello' | ||||
|     for p in close_puncts: | ||||
|         string = word_str + p + "'" | ||||
|         tokens = EN(string, parse=False, tag=False) | ||||
|         tokens = en_tokenizer(string) | ||||
|         assert len(tokens) == 3 | ||||
|         assert tokens[0].string == word_str | ||||
|         assert tokens[1].string == p | ||||
|         assert tokens[2].string == "'" | ||||
| 
 | ||||
| 
 | ||||
| def test_three_same_close(close_puncts, EN): | ||||
| def test_three_same_close(close_puncts, en_tokenizer): | ||||
|     word_str = 'Hello' | ||||
|     for p in close_puncts: | ||||
|         string = word_str + p + p + p | ||||
|         tokens = EN(string, tag=False, parse=False) | ||||
|         tokens = en_tokenizer(string) | ||||
|         assert len(tokens) == 4 | ||||
|         assert tokens[0].string == word_str | ||||
|         assert tokens[1].string == p | ||||
| 
 | ||||
| 
 | ||||
| def test_double_end_quote(EN): | ||||
|     assert len(EN("Hello''", tag=False, parse=False)) == 2 | ||||
|     assert len(EN("''", tag=False, parse=False)) == 1 | ||||
| def test_double_end_quote(en_tokenizer): | ||||
|     assert len(en_tokenizer("Hello''")) == 2 | ||||
|     assert len(en_tokenizer("''")) == 1 | ||||
|  |  | |||
|  | @ -1,7 +1,5 @@ | |||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from spacy.en import English | ||||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
| 
 | ||||
|  | @ -10,44 +8,39 @@ def open_puncts(): | |||
|     return ['(', '[', '{', '*'] | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def EN(): | ||||
|     return English().tokenizer | ||||
| 
 | ||||
| 
 | ||||
| def test_open(open_puncts, EN): | ||||
| def test_open(open_puncts, en_tokenizer): | ||||
|     word_str = 'Hello' | ||||
|     for p in open_puncts: | ||||
|         string = p + word_str | ||||
|         tokens = EN(string) | ||||
|         tokens = en_tokenizer(string) | ||||
|         assert len(tokens) == 2 | ||||
|         assert tokens[0].orth_ == p | ||||
|         assert tokens[1].orth_ == word_str | ||||
| 
 | ||||
| 
 | ||||
| def test_two_different_open(open_puncts, EN): | ||||
| def test_two_different_open(open_puncts, en_tokenizer): | ||||
|     word_str = 'Hello' | ||||
|     for p in open_puncts: | ||||
|         string = p + "`" + word_str | ||||
|         tokens = EN(string) | ||||
|         tokens = en_tokenizer(string) | ||||
|         assert len(tokens) == 3 | ||||
|         assert tokens[0].orth_ == p | ||||
|         assert tokens[1].orth_ == "`" | ||||
|         assert tokens[2].orth_ == word_str | ||||
| 
 | ||||
| 
 | ||||
| def test_three_same_open(open_puncts, EN): | ||||
| def test_three_same_open(open_puncts, en_tokenizer): | ||||
|     word_str = 'Hello' | ||||
|     for p in open_puncts: | ||||
|         string = p + p + p + word_str | ||||
|         tokens = EN(string) | ||||
|         tokens = en_tokenizer(string) | ||||
|         assert len(tokens) == 4 | ||||
|         assert tokens[0].orth_ == p | ||||
|         assert tokens[3].orth_ == word_str | ||||
| 
 | ||||
| 
 | ||||
| def test_open_appostrophe(EN): | ||||
| def test_open_appostrophe(en_tokenizer): | ||||
|     string = "'The" | ||||
|     tokens = EN(string) | ||||
|     tokens = en_tokenizer(string) | ||||
|     assert len(tokens) == 2 | ||||
|     assert tokens[0].orth_ == "'" | ||||
|  |  | |||
|  | @ -3,50 +3,44 @@ and suffix punctuation.""" | |||
| from __future__ import unicode_literals | ||||
| import pytest | ||||
| 
 | ||||
| from spacy.en import English | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def EN(): | ||||
|     return English().tokenizer | ||||
| def test_no_special(en_tokenizer): | ||||
|     assert len(en_tokenizer("(can)")) == 3 | ||||
| 
 | ||||
| 
 | ||||
| def test_no_special(EN): | ||||
|     assert len(EN("(can)")) == 3 | ||||
| def test_no_punct(en_tokenizer): | ||||
|     assert len(en_tokenizer("can't")) == 2 | ||||
| 
 | ||||
| 
 | ||||
| def test_no_punct(EN): | ||||
|     assert len(EN("can't")) == 2 | ||||
| def test_prefix(en_tokenizer): | ||||
|     assert len(en_tokenizer("(can't")) == 3 | ||||
| 
 | ||||
| 
 | ||||
| def test_prefix(EN): | ||||
|     assert len(EN("(can't")) == 3 | ||||
| def test_suffix(en_tokenizer): | ||||
|     assert len(en_tokenizer("can't)")) == 3 | ||||
| 
 | ||||
| 
 | ||||
| def test_suffix(EN): | ||||
|     assert len(EN("can't)")) == 3 | ||||
| def test_wrap(en_tokenizer): | ||||
|     assert len(en_tokenizer("(can't)")) == 4 | ||||
| 
 | ||||
| 
 | ||||
| def test_wrap(EN): | ||||
|     assert len(EN("(can't)")) == 4 | ||||
| def test_uneven_wrap(en_tokenizer): | ||||
|     assert len(en_tokenizer("(can't?)")) == 5 | ||||
| 
 | ||||
| 
 | ||||
| def test_uneven_wrap(EN): | ||||
|     assert len(EN("(can't?)")) == 5 | ||||
| def test_prefix_interact(en_tokenizer): | ||||
|     assert len(en_tokenizer("U.S.")) == 1 | ||||
|     assert len(en_tokenizer("us.")) == 2 | ||||
|     assert len(en_tokenizer("(U.S.")) == 2 | ||||
| 
 | ||||
| 
 | ||||
| def test_prefix_interact(EN): | ||||
|     assert len(EN("U.S.")) == 1 | ||||
|     assert len(EN("us.")) == 2 | ||||
|     assert len(EN("(U.S.")) == 2 | ||||
| def test_suffix_interact(en_tokenizer): | ||||
|     assert len(en_tokenizer("U.S.)")) == 2 | ||||
| 
 | ||||
| 
 | ||||
| def test_suffix_interact(EN): | ||||
|     assert len(EN("U.S.)")) == 2 | ||||
| def test_even_wrap_interact(en_tokenizer): | ||||
|     assert len(en_tokenizer("(U.S.)")) == 3 | ||||
| 
 | ||||
| 
 | ||||
| def test_even_wrap_interact(EN): | ||||
|     assert len(EN("(U.S.)")) == 3 | ||||
| 
 | ||||
| 
 | ||||
| def test_uneven_wrap_interact(EN): | ||||
|     assert len(EN("(U.S.?)")) == 4 | ||||
| def test_uneven_wrap_interact(en_tokenizer): | ||||
|     assert len(en_tokenizer("(U.S.?)")) == 4 | ||||
|  |  | |||
|  | @ -1,7 +1,4 @@ | |||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from spacy.en import English | ||||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
| 
 | ||||
|  | @ -10,27 +7,22 @@ def paired_puncts(): | |||
|     return [('(', ')'),  ('[', ']'), ('{', '}'), ('*', '*')] | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def EN(): | ||||
|     return English().tokenizer | ||||
| 
 | ||||
| 
 | ||||
| def test_token(paired_puncts, EN): | ||||
| def test_token(paired_puncts, en_tokenizer): | ||||
|     word_str = 'Hello' | ||||
|     for open_, close_ in paired_puncts: | ||||
|         string = open_ + word_str + close_ | ||||
|         tokens = EN(string) | ||||
|         tokens = en_tokenizer(string) | ||||
|         assert len(tokens) == 3 | ||||
|         assert tokens[0].orth_ == open_ | ||||
|         assert tokens[1].orth_ == word_str | ||||
|         assert tokens[2].orth_ == close_ | ||||
| 
 | ||||
| 
 | ||||
| def test_two_different(paired_puncts, EN): | ||||
| def test_two_different(paired_puncts, en_tokenizer): | ||||
|     word_str = 'Hello' | ||||
|     for open_, close_ in paired_puncts: | ||||
|         string = "`" + open_ + word_str + close_ + "'" | ||||
|         tokens = EN(string) | ||||
|         tokens = en_tokenizer(string) | ||||
|         assert len(tokens) == 5 | ||||
|         assert tokens[0].orth_ == "`" | ||||
|         assert tokens[1].orth_ == open_ | ||||
|  |  | |||
|  | @ -3,32 +3,25 @@ from __future__ import unicode_literals | |||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
| from spacy.en import English | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def EN(): | ||||
|     return English().tokenizer | ||||
| 
 | ||||
| 
 | ||||
| def test_no_word(EN): | ||||
|     tokens = EN(u'') | ||||
| def test_no_word(en_tokenizer): | ||||
|     tokens = en_tokenizer(u'') | ||||
|     assert len(tokens) == 0 | ||||
| 
 | ||||
| 
 | ||||
| def test_single_word(EN): | ||||
|     tokens = EN(u'hello') | ||||
| def test_single_word(en_tokenizer): | ||||
|     tokens = en_tokenizer(u'hello') | ||||
|     assert tokens[0].orth_ == 'hello' | ||||
| 
 | ||||
| 
 | ||||
| def test_two_words(EN): | ||||
|     tokens = EN('hello possums') | ||||
| def test_two_words(en_tokenizer): | ||||
|     tokens = en_tokenizer('hello possums') | ||||
|     assert len(tokens) == 2 | ||||
|     assert tokens[0].orth_ != tokens[1].orth_ | ||||
| 
 | ||||
| 
 | ||||
| def test_punct(EN): | ||||
|     tokens = EN('hello, possums.') | ||||
| def test_punct(en_tokenizer): | ||||
|     tokens = en_tokenizer('hello, possums.') | ||||
|     assert len(tokens) == 4 | ||||
|     assert tokens[0].orth_ == 'hello' | ||||
|     assert tokens[1].orth_ == ',' | ||||
|  | @ -36,34 +29,34 @@ def test_punct(EN): | |||
|     assert tokens[1].orth_ != 'hello' | ||||
| 
 | ||||
| 
 | ||||
| def test_digits(EN): | ||||
|     tokens = EN('The year: 1984.') | ||||
| def test_digits(en_tokenizer): | ||||
|     tokens = en_tokenizer('The year: 1984.') | ||||
|     assert len(tokens) == 5 | ||||
|     assert tokens[0].orth == EN.vocab['The'].orth | ||||
|     assert tokens[3].orth == EN.vocab['1984'].orth | ||||
|     assert tokens[0].orth == en_tokenizer.vocab['The'].orth | ||||
|     assert tokens[3].orth == en_tokenizer.vocab['1984'].orth | ||||
| 
 | ||||
| 
 | ||||
| def test_contraction(EN): | ||||
|     tokens = EN("don't giggle") | ||||
| def test_contraction(en_tokenizer): | ||||
|     tokens = en_tokenizer("don't giggle") | ||||
|     assert len(tokens) == 3 | ||||
|     assert tokens[1].orth == EN.vocab["n't"].orth | ||||
|     tokens = EN("i said don't!") | ||||
|     assert tokens[1].orth == en_tokenizer.vocab["n't"].orth | ||||
|     tokens = en_tokenizer("i said don't!") | ||||
|     assert len(tokens) == 5 | ||||
|     assert tokens[4].orth == EN.vocab['!'].orth | ||||
|     assert tokens[4].orth == en_tokenizer.vocab['!'].orth | ||||
| 
 | ||||
| 
 | ||||
| def test_contraction_punct(EN): | ||||
|     tokens = EN("(can't") | ||||
| def test_contraction_punct(en_tokenizer): | ||||
|     tokens = en_tokenizer("(can't") | ||||
|     assert len(tokens) == 3 | ||||
|     tokens = EN("`ain't") | ||||
|     tokens = en_tokenizer("`ain't") | ||||
|     assert len(tokens) == 3 | ||||
|     tokens = EN('''"isn't''') | ||||
|     tokens = en_tokenizer('''"isn't''') | ||||
|     assert len(tokens) == 3 | ||||
|     tokens = EN("can't!") | ||||
|     tokens = en_tokenizer("can't!") | ||||
|     assert len(tokens) == 3 | ||||
| 
 | ||||
| 
 | ||||
| def test_sample(EN): | ||||
| def test_sample(en_tokenizer): | ||||
|     text = """Tributes pour in for late British Labour Party leader | ||||
| 
 | ||||
| Tributes poured in from around the world Thursday | ||||
|  | @ -75,7 +68,7 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian. | |||
| 
 | ||||
| "Mr. Smith, throughout his distinguished""" | ||||
| 
 | ||||
|     tokens = EN(text) | ||||
|     tokens = en_tokenizer(text) | ||||
|     assert len(tokens) > 5 | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,16 +1,9 @@ | |||
| from __future__ import unicode_literals | ||||
| import pytest | ||||
| 
 | ||||
| from spacy.en import English | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def EN(): | ||||
|     return English() | ||||
| 
 | ||||
| 
 | ||||
| def test1(EN): | ||||
| def test1(en_tokenizer): | ||||
|     words = ['JAPAN', 'GET', 'LUCKY'] | ||||
|     tokens = EN.tokenizer.tokens_from_list(words) | ||||
|     tokens = en_tokenizer.tokens_from_list(words) | ||||
|     assert len(tokens) == 3 | ||||
|     assert tokens[0].orth_ == 'JAPAN' | ||||
|  |  | |||
|  | @ -1,41 +1,35 @@ | |||
| """Test that tokens are created correctly for whitespace.""" | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from spacy.en import English | ||||
| import pytest | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def EN(): | ||||
|     return English().tokenizer | ||||
| 
 | ||||
| 
 | ||||
| def test_single_space(EN): | ||||
|     tokens = EN('hello possums') | ||||
| def test_single_space(en_tokenizer): | ||||
|     tokens = en_tokenizer('hello possums') | ||||
|     assert len(tokens) == 2 | ||||
| 
 | ||||
| 
 | ||||
| def test_double_space(EN): | ||||
|     tokens = EN('hello  possums') | ||||
| def test_double_space(en_tokenizer): | ||||
|     tokens = en_tokenizer('hello  possums') | ||||
|     assert len(tokens) == 3 | ||||
|     assert tokens[1].orth_ == ' ' | ||||
| 
 | ||||
| 
 | ||||
| def test_newline(EN): | ||||
|     tokens = EN('hello\npossums') | ||||
| def test_newline(en_tokenizer): | ||||
|     tokens = en_tokenizer('hello\npossums') | ||||
|     assert len(tokens) == 3 | ||||
| 
 | ||||
| 
 | ||||
| def test_newline_space(EN): | ||||
|     tokens = EN('hello \npossums') | ||||
| def test_newline_space(en_tokenizer): | ||||
|     tokens = en_tokenizer('hello \npossums') | ||||
|     assert len(tokens) == 3 | ||||
| 
 | ||||
| 
 | ||||
| def test_newline_double_space(EN): | ||||
|     tokens = EN('hello  \npossums') | ||||
| def test_newline_double_space(en_tokenizer): | ||||
|     tokens = en_tokenizer('hello  \npossums') | ||||
|     assert len(tokens) == 3 | ||||
| 
 | ||||
| 
 | ||||
| def test_newline_space_wrap(EN): | ||||
|     tokens = EN('hello \n possums') | ||||
| def test_newline_space_wrap(en_tokenizer): | ||||
|     tokens = en_tokenizer('hello \n possums') | ||||
|     assert len(tokens) == 3 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user