Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-26 05:31:15 +03:00)
	* Set up tokenizer/ tests properly, using a session-scoped fixture to avoid long load/unload times. Tokenizer tests now complete in 20 seconds.
This commit is contained in:
parent 1d5f20fdda
commit 877abb0e5b

tests/tokenizer/conftest.py | 11 (new file)
@@ -0,0 +1,11 @@
+import pytest
+from spacy.en import English
+
+
+@pytest.fixture(scope="session")
+def EN():
+    return English(load_vectors=False)
+
+@pytest.fixture(scope="session")
+def en_tokenizer(EN):
+    return EN.tokenizer
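
The conftest.py above is what makes the speed-up work: a pytest fixture declared with scope="session" is built once for the whole test run and its return value is cached, and fixtures defined in a conftest.py are injected into any test in that directory that names them as a parameter, so all the modules below now share one English instance instead of each loading their own. A minimal sketch of that scoping behaviour, using hypothetical names rather than spaCy itself:

    import pytest

    CALLS = {"count": 0}


    @pytest.fixture(scope="session")
    def expensive_resource():
        # With scope="session" this body runs once for the whole test run;
        # the returned object is cached and handed to every requesting test.
        CALLS["count"] += 1
        return object()


    def test_first(expensive_resource):
        assert CALLS["count"] == 1


    def test_second(expensive_resource):
        # Still 1: the fixture was not rebuilt for this test.
        assert CALLS["count"] == 1

The same dependency mechanism lets en_tokenizer build on EN by naming it as a parameter, so the tokenizer handed to each test in the diffs below is a view onto that single session-wide English instance.
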
@@ -1,34 +1,31 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
 
-EN = English()
-
-def test_possess():
-    tokens = EN("Mike's", parse=False, tag=False)
-    assert EN.vocab.strings[tokens[0].orth] == "Mike"
-    assert EN.vocab.strings[tokens[1].orth] == "'s"
+def test_possess(en_tokenizer):
+    tokens = en_tokenizer("Mike's")
+    assert en_tokenizer.vocab.strings[tokens[0].orth] == "Mike"
+    assert en_tokenizer.vocab.strings[tokens[1].orth] == "'s"
     assert len(tokens) == 2
 
 
-def test_apostrophe():
-    tokens = EN("schools'", parse=False, tag=False)
+def test_apostrophe(en_tokenizer):
+    tokens = en_tokenizer("schools'")
     assert len(tokens) == 2
     assert tokens[1].orth_ == "'"
     assert tokens[0].orth_ == "schools"
 
 
-def test_LL():
-    tokens = EN("we'll", parse=False)
+def test_LL(en_tokenizer):
+    tokens = en_tokenizer("we'll")
     assert len(tokens) == 2
     assert tokens[1].orth_ == "'ll"
     assert tokens[1].lemma_ == "will"
     assert tokens[0].orth_ == "we"
 
 
-def test_aint():
-    tokens = EN("ain't", parse=False)
+def test_aint(en_tokenizer):
+    tokens = en_tokenizer("ain't")
     assert len(tokens) == 2
     assert tokens[0].orth_ == "ai"
     assert tokens[0].lemma_ == "be"
@@ -36,19 +33,19 @@ def test_aint():
     assert tokens[1].lemma_ == "not"
 
 
-def test_capitalized():
-    tokens = EN("can't", parse=False)
+def test_capitalized(en_tokenizer):
+    tokens = en_tokenizer("can't")
     assert len(tokens) == 2
-    tokens = EN("Can't", parse=False)
+    tokens = en_tokenizer("Can't")
     assert len(tokens) == 2
-    tokens = EN("Ain't", parse=False)
+    tokens = en_tokenizer("Ain't")
     assert len(tokens) == 2
     assert tokens[0].orth_ == "Ai"
     assert tokens[0].lemma_ == "be"
 
 
-def test_punct():
-    tokens = EN("We've", parse=False)
+def test_punct(en_tokenizer):
+    tokens = en_tokenizer("We've")
     assert len(tokens) == 2
-    tokens = EN("``We've", parse=False)
+    tokens = en_tokenizer("``We've")
     assert len(tokens) == 3

@@ -1,17 +1,10 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
 
-
-@pytest.fixture
-def EN():
-    return English()
-
-
-def test_tweebo_challenge(EN):
+def test_tweebo_challenge(en_tokenizer):
     text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
-    tokens = EN(text, parse=False, tag=False)
+    tokens = en_tokenizer(text)
     assert tokens[0].orth_ == ":o"
     assert tokens[1].orth_ == ":/"
     assert tokens[2].orth_ == ":'("
@@ -36,7 +29,7 @@ def test_tweebo_challenge(EN):
     assert tokens[21].orth_ == '....'
 
 
-def test_false_positive(EN):
+def test_false_positive(en_tokenizer):
     text = "example:)"
-    tokens = EN(text, parse=False, tag=False)
+    tokens = en_tokenizer(text)
     assert len(tokens) == 3

@@ -3,18 +3,11 @@
 from __future__ import unicode_literals
 
 import pytest
-from spacy.en import English
 
 
-@pytest.fixture
-def nlp():
-    nlp = English()
-    return nlp.tokenizer
-
-
-def test_simple_punct(nlp):
+def test_simple_punct(en_tokenizer):
     text = 'to walk, do foo'
-    tokens = nlp(text)
+    tokens = en_tokenizer(text)
     assert tokens[0].idx == 0
     assert tokens[1].idx == 3
     assert tokens[2].idx == 7
@@ -22,9 +15,9 @@ def test_simple_punct(nlp):
     assert tokens[4].idx == 12
 
 
-def test_complex_punct(nlp):
+def test_complex_punct(en_tokenizer):
     text = 'Tom (D., Ill.)!'
-    tokens = nlp(text)
+    tokens = en_tokenizer(text)
     assert tokens[0].idx == 0
     assert len(tokens[0]) == 3
     assert tokens[1].idx == 4

@@ -2,17 +2,13 @@ from __future__ import unicode_literals
 
 import pytest
 
-from spacy.en import English
-
-EN = English()
-
-def test_hyphen():
-    tokens = EN.tokenizer('best-known')
+def test_hyphen(en_tokenizer):
+    tokens = en_tokenizer('best-known')
     assert len(tokens) == 3
 
 
-def test_period():
-    tokens = EN.tokenizer('best.Known')
+def test_period(en_tokenizer):
+    tokens = en_tokenizer('best.Known')
     assert len(tokens) == 3
-    tokens = EN.tokenizer('zombo.com')
+    tokens = en_tokenizer('zombo.com')
     assert len(tokens) == 1

@@ -1,14 +1,9 @@
 from __future__ import unicode_literals
-import pytest
-
-from spacy.en import English
 
 
-def test_only_pre1():
-    EN = English()
-    assert len(EN("(")) == 1
+def test_only_pre1(en_tokenizer):
+    assert len(en_tokenizer("(")) == 1
 
 
-def test_only_pre2():
-    EN = English()
-    assert len(EN("((")) == 2
+def test_only_pre2(en_tokenizer):
+    assert len(en_tokenizer("((")) == 2

@@ -1,7 +1,4 @@
 from __future__ import unicode_literals
-
-from spacy.en import English
-
 import pytest
 
 
@@ -10,42 +7,37 @@ def close_puncts():
     return [')', ']', '}', '*']
 
 
-@pytest.fixture
-def EN():
-    return English()
-
-
-def test_close(close_puncts, EN):
+def test_close(close_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        tokens = EN(string, parse=False, tag=False)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 2
         assert tokens[1].string == p
         assert tokens[0].string == word_str
 
 
-def test_two_different_close(close_puncts, EN):
+def test_two_different_close(close_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        tokens = EN(string, parse=False, tag=False)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 3
         assert tokens[0].string == word_str
         assert tokens[1].string == p
         assert tokens[2].string == "'"
 
 
-def test_three_same_close(close_puncts, EN):
+def test_three_same_close(close_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = EN(string, tag=False, parse=False)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 4
         assert tokens[0].string == word_str
         assert tokens[1].string == p
 
 
-def test_double_end_quote(EN):
-    assert len(EN("Hello''", tag=False, parse=False)) == 2
-    assert len(EN("''", tag=False, parse=False)) == 1
+def test_double_end_quote(en_tokenizer):
+    assert len(en_tokenizer("Hello''")) == 2
+    assert len(en_tokenizer("''")) == 1

@@ -1,7 +1,5 @@
 from __future__ import unicode_literals
 
-from spacy.en import English
-
 import pytest
 
 
@@ -10,44 +8,39 @@ def open_puncts():
     return ['(', '[', '{', '*']
 
 
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_open(open_puncts, EN):
+def test_open(open_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
        assert len(tokens) == 2
         assert tokens[0].orth_ == p
         assert tokens[1].orth_ == word_str
 
 
-def test_two_different_open(open_puncts, EN):
+def test_two_different_open(open_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 3
         assert tokens[0].orth_ == p
         assert tokens[1].orth_ == "`"
         assert tokens[2].orth_ == word_str
 
 
-def test_three_same_open(open_puncts, EN):
+def test_three_same_open(open_puncts, en_tokenizer):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 4
         assert tokens[0].orth_ == p
         assert tokens[3].orth_ == word_str
 
 
-def test_open_appostrophe(EN):
+def test_open_appostrophe(en_tokenizer):
     string = "'The"
-    tokens = EN(string)
+    tokens = en_tokenizer(string)
     assert len(tokens) == 2
     assert tokens[0].orth_ == "'"

@@ -3,50 +3,44 @@ and suffix punctuation."""
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
 
-@pytest.fixture
-def EN():
-    return English().tokenizer
+def test_no_special(en_tokenizer):
+    assert len(en_tokenizer("(can)")) == 3
 
 
-def test_no_special(EN):
-    assert len(EN("(can)")) == 3
+def test_no_punct(en_tokenizer):
+    assert len(en_tokenizer("can't")) == 2
 
 
-def test_no_punct(EN):
-    assert len(EN("can't")) == 2
+def test_prefix(en_tokenizer):
+    assert len(en_tokenizer("(can't")) == 3
 
 
-def test_prefix(EN):
-    assert len(EN("(can't")) == 3
+def test_suffix(en_tokenizer):
+    assert len(en_tokenizer("can't)")) == 3
 
 
-def test_suffix(EN):
-    assert len(EN("can't)")) == 3
+def test_wrap(en_tokenizer):
+    assert len(en_tokenizer("(can't)")) == 4
 
 
-def test_wrap(EN):
-    assert len(EN("(can't)")) == 4
+def test_uneven_wrap(en_tokenizer):
+    assert len(en_tokenizer("(can't?)")) == 5
 
 
-def test_uneven_wrap(EN):
-    assert len(EN("(can't?)")) == 5
+def test_prefix_interact(en_tokenizer):
+    assert len(en_tokenizer("U.S.")) == 1
+    assert len(en_tokenizer("us.")) == 2
+    assert len(en_tokenizer("(U.S.")) == 2
 
 
-def test_prefix_interact(EN):
-    assert len(EN("U.S.")) == 1
-    assert len(EN("us.")) == 2
-    assert len(EN("(U.S.")) == 2
+def test_suffix_interact(en_tokenizer):
+    assert len(en_tokenizer("U.S.)")) == 2
 
 
-def test_suffix_interact(EN):
-    assert len(EN("U.S.)")) == 2
+def test_even_wrap_interact(en_tokenizer):
+    assert len(en_tokenizer("(U.S.)")) == 3
 
 
-def test_even_wrap_interact(EN):
-    assert len(EN("(U.S.)")) == 3
-
-
-def test_uneven_wrap_interact(EN):
-    assert len(EN("(U.S.?)")) == 4
+def test_uneven_wrap_interact(en_tokenizer):
+    assert len(en_tokenizer("(U.S.?)")) == 4

@@ -1,7 +1,4 @@
 from __future__ import unicode_literals
-
-from spacy.en import English
-
 import pytest
 
 
@@ -10,27 +7,22 @@ def paired_puncts():
     return [('(', ')'),  ('[', ']'), ('{', '}'), ('*', '*')]
 
 
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_token(paired_puncts, EN):
+def test_token(paired_puncts, en_tokenizer):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 3
         assert tokens[0].orth_ == open_
         assert tokens[1].orth_ == word_str
         assert tokens[2].orth_ == close_
 
 
-def test_two_different(paired_puncts, EN):
+def test_two_different(paired_puncts, en_tokenizer):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = EN(string)
+        tokens = en_tokenizer(string)
         assert len(tokens) == 5
         assert tokens[0].orth_ == "`"
         assert tokens[1].orth_ == open_

@@ -3,32 +3,25 @@ from __future__ import unicode_literals
 
 import pytest
 
-from spacy.en import English
 
-
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_no_word(EN):
-    tokens = EN(u'')
+def test_no_word(en_tokenizer):
+    tokens = en_tokenizer(u'')
     assert len(tokens) == 0
 
 
-def test_single_word(EN):
-    tokens = EN(u'hello')
+def test_single_word(en_tokenizer):
+    tokens = en_tokenizer(u'hello')
     assert tokens[0].orth_ == 'hello'
 
 
-def test_two_words(EN):
-    tokens = EN('hello possums')
+def test_two_words(en_tokenizer):
+    tokens = en_tokenizer('hello possums')
     assert len(tokens) == 2
     assert tokens[0].orth_ != tokens[1].orth_
 
 
-def test_punct(EN):
-    tokens = EN('hello, possums.')
+def test_punct(en_tokenizer):
+    tokens = en_tokenizer('hello, possums.')
     assert len(tokens) == 4
     assert tokens[0].orth_ == 'hello'
     assert tokens[1].orth_ == ','
@@ -36,34 +29,34 @@ def test_punct(EN):
     assert tokens[1].orth_ != 'hello'
 
 
-def test_digits(EN):
-    tokens = EN('The year: 1984.')
+def test_digits(en_tokenizer):
+    tokens = en_tokenizer('The year: 1984.')
     assert len(tokens) == 5
-    assert tokens[0].orth == EN.vocab['The'].orth
-    assert tokens[3].orth == EN.vocab['1984'].orth
+    assert tokens[0].orth == en_tokenizer.vocab['The'].orth
+    assert tokens[3].orth == en_tokenizer.vocab['1984'].orth
 
 
-def test_contraction(EN):
-    tokens = EN("don't giggle")
+def test_contraction(en_tokenizer):
+    tokens = en_tokenizer("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].orth == EN.vocab["n't"].orth
-    tokens = EN("i said don't!")
+    assert tokens[1].orth == en_tokenizer.vocab["n't"].orth
+    tokens = en_tokenizer("i said don't!")
     assert len(tokens) == 5
-    assert tokens[4].orth == EN.vocab['!'].orth
+    assert tokens[4].orth == en_tokenizer.vocab['!'].orth
 
 
-def test_contraction_punct(EN):
-    tokens = EN("(can't")
+def test_contraction_punct(en_tokenizer):
+    tokens = en_tokenizer("(can't")
     assert len(tokens) == 3
-    tokens = EN("`ain't")
+    tokens = en_tokenizer("`ain't")
     assert len(tokens) == 3
-    tokens = EN('''"isn't''')
+    tokens = en_tokenizer('''"isn't''')
     assert len(tokens) == 3
-    tokens = EN("can't!")
+    tokens = en_tokenizer("can't!")
     assert len(tokens) == 3
 
 
-def test_sample(EN):
+def test_sample(en_tokenizer):
     text = """Tributes pour in for late British Labour Party leader
 
 Tributes poured in from around the world Thursday
@@ -75,7 +68,7 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
 
 "Mr. Smith, throughout his distinguished"""
 
-    tokens = EN(text)
+    tokens = en_tokenizer(text)
     assert len(tokens) > 5
 
 

@@ -1,16 +1,9 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import English
 
-
-@pytest.fixture
-def EN():
-    return English()
-
-
-def test1(EN):
+def test1(en_tokenizer):
     words = ['JAPAN', 'GET', 'LUCKY']
-    tokens = EN.tokenizer.tokens_from_list(words)
+    tokens = en_tokenizer.tokens_from_list(words)
     assert len(tokens) == 3
     assert tokens[0].orth_ == 'JAPAN'

@@ -1,41 +1,35 @@
 """Test that tokens are created correctly for whitespace."""
 from __future__ import unicode_literals
 
-from spacy.en import English
 import pytest
 
 
-@pytest.fixture
-def EN():
-    return English().tokenizer
-
-
-def test_single_space(EN):
-    tokens = EN('hello possums')
+def test_single_space(en_tokenizer):
+    tokens = en_tokenizer('hello possums')
     assert len(tokens) == 2
 
 
-def test_double_space(EN):
-    tokens = EN('hello  possums')
+def test_double_space(en_tokenizer):
+    tokens = en_tokenizer('hello  possums')
     assert len(tokens) == 3
     assert tokens[1].orth_ == ' '
 
 
-def test_newline(EN):
-    tokens = EN('hello\npossums')
+def test_newline(en_tokenizer):
+    tokens = en_tokenizer('hello\npossums')
     assert len(tokens) == 3
 
 
-def test_newline_space(EN):
-    tokens = EN('hello \npossums')
+def test_newline_space(en_tokenizer):
+    tokens = en_tokenizer('hello \npossums')
     assert len(tokens) == 3
 
 
-def test_newline_double_space(EN):
-    tokens = EN('hello  \npossums')
+def test_newline_double_space(en_tokenizer):
+    tokens = en_tokenizer('hello  \npossums')
    assert len(tokens) == 3
 
 
-def test_newline_space_wrap(EN):
-    tokens = EN('hello \n possums')
+def test_newline_space_wrap(en_tokenizer):
+    tokens = en_tokenizer('hello \n possums')
     assert len(tokens) == 3