mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
Remove redundant test_tokenizer.py for English
This commit is contained in:
parent
8216ba599b
commit
5bb4081f52
|
@ -1,108 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
from os import path
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from ....util import utf8open
|
|
||||||
|
|
||||||
|
|
||||||
def test_tokenizer_handles_no_word(en_tokenizer):
    """An empty input string must tokenize to zero tokens."""
    doc = en_tokenizer("")
    assert len(doc) == 0
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["hello"])
def test_tokenizer_handles_single_word(en_tokenizer, text):
    """The first token of a one-word input must carry the input text."""
    first_token = en_tokenizer(text)[0]
    assert first_token.text == text
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["hello possums"])
def test_tokenizer_handles_two_words(en_tokenizer, text):
    """A two-word input must yield exactly two tokens with differing text."""
    doc = en_tokenizer(text)
    assert len(doc) == 2
    # Only index into the result once the length has been confirmed.
    first, second = doc[0], doc[1]
    assert first.text != second.text
|
|
||||||
|
|
||||||
|
|
||||||
def test_tokenizer_handles_punct(en_tokenizer):
    """Punctuation attached to words must come out as separate tokens.

    "hello, possums." is expected to produce four tokens, of which the
    first three are checked by text.
    """
    doc = en_tokenizer("hello, possums.")
    assert len(doc) == 4
    expected_prefix = ["hello", ",", "possums"]
    for position, expected_text in enumerate(expected_prefix):
        assert doc[position].text == expected_text
    # The comma must not have been merged into the preceding word.
    assert doc[1].text != "hello"
|
|
||||||
|
|
||||||
|
|
||||||
def test_tokenizer_handles_digits(en_tokenizer):
    """A number followed by a period must be split from that period.

    "The year: 1984." is expected to produce five tokens, with "The" at
    index 0 and "1984" at index 3.
    """
    doc = en_tokenizer("The year: 1984.")
    assert len(doc) == 5
    leading_word = doc[0].text
    assert leading_word == "The"
    year = doc[3].text
    assert year == "1984"
|
|
||||||
|
|
||||||
|
|
||||||
def test_tokenizer_handles_basic_contraction(en_tokenizer):
    """The contraction "don't" must be split so that "n't" is its own token."""
    # Contraction at the start of the text: three tokens, "n't" second.
    plain_doc = en_tokenizer("don't giggle")
    assert len(plain_doc) == 3
    assert plain_doc[1].text == "n't"

    # Contraction followed by punctuation: five tokens, "!" last.
    punct_doc = en_tokenizer("i said don't!")
    assert len(punct_doc) == 5
    assert punct_doc[4].text == "!"
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
def test_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
    """A contraction with one adjacent punctuation mark must give 3 tokens."""
    token_count = len(en_tokenizer(text))
    assert token_count == 3
|
|
||||||
|
|
||||||
|
|
||||||
def test_tokenizer_handles_long_text(en_tokenizer):
    """A multi-paragraph text must tokenize into more than five tokens."""
    # The literal starts at column 0 on continuation lines on purpose: any
    # indentation would become part of the text being tokenized.
    long_text = """Tributes pour in for late British Labour Party leader

Tributes poured in from around the world Thursday
to the late Labour Party leader John Smith, who died earlier from a massive
heart attack aged 55.

In Washington, the US State Department issued a statement regretting "the
untimely death" of the rapier-tongued Scottish barrister and parliamentarian.

"Mr. Smith, throughout his distinguished"""
    doc = en_tokenizer(long_text)
    assert len(doc) > 5
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('file_name', ["sun.txt"])
def test_tokenizer_handle_text_from_file(en_tokenizer, file_name):
    """Tokenize the contents of a text file located next to this module.

    The file must be non-empty and must produce more than 100 tokens.
    """
    loc = path.join(path.dirname(__file__), file_name)
    # Read inside a `with` block so the handle is closed deterministically
    # instead of being left open until garbage collection.
    # NOTE(review): assumes utf8open returns a regular file object that
    # supports the context-manager protocol - confirm against util.utf8open.
    with utf8open(loc) as file_:
        text = file_.read()
    assert len(text) != 0
    tokens = en_tokenizer(text)
    assert len(tokens) > 100
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,length', [
    ("The U.S. Army likes Shock and Awe.", 8),
    ("U.N. regulations are not a part of their concern.", 10),
    ("“Isn't it?”", 6),
    ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
    ("""'Me too!', Mr. P. Delaware cried. """, 11),
    ("They ran about 10km.", 6),
    # ("But then the 6,000-year ice age came...", 10)
])
def test_tokenizer_handles_cnts(en_tokenizer, text, length):
    """Each sample text must tokenize into exactly `length` tokens.

    The samples mix abbreviations, quotes, contractions and a number with
    an attached unit.
    """
    token_count = len(en_tokenizer(text))
    assert token_count == length
|
|
||||||
|
|
||||||
|
|
||||||
def test_tokenizer_suspected_freeing_strings(en_tokenizer):
    """Two tokenized texts sharing a first word must both still report it.

    Both texts are tokenized before either result is inspected, so the
    first result is checked after the tokenizer has processed another text.
    """
    first_doc = en_tokenizer("Betty Botter bought a pound of butter.")
    second_doc = en_tokenizer("Betty also bought a pound of butter.")
    for doc in (first_doc, second_doc):
        assert doc[0].text == "Betty"
|
|
Loading…
Reference in New Issue
Block a user