spaCy/spacy/tests/tokenizer/test_tokenizer.py

# coding: utf-8
from __future__ import unicode_literals

from os import path

import pytest

from spacy.util import utf8open
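

# The ``en_tokenizer`` fixture used throughout this module is provided by the
# test suite's shared conftest.py. A minimal sketch of what it might look
# like, assuming the English language class exposes its tokenizer directly:
#
#     import pytest
#     from spacy.en import English
#
#     @pytest.fixture
#     def en_tokenizer():
#         return English().tokenizer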


def test_tokenizer_handles_no_word(en_tokenizer):
    tokens = en_tokenizer("")
    assert len(tokens) == 0


@pytest.mark.parametrize('text', ["hello"])
def test_tokenizer_handles_single_word(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert tokens[0].text == text


@pytest.mark.parametrize('text', ["hello possums"])
def test_tokenizer_handles_two_words(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[0].text != tokens[1].text
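

# Medial and trailing punctuation should be split off as separate tokens:
# "hello, possums." -> ["hello", ",", "possums", "."].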
def test_tokenizer_handles_punct(en_tokenizer):
    text = "hello, possums."
    tokens = en_tokenizer(text)
    assert len(tokens) == 4
    assert tokens[0].text == "hello"
    assert tokens[1].text == ","
    assert tokens[2].text == "possums"
    assert tokens[1].text != "hello"


def test_tokenizer_handles_digits(en_tokenizer):
    text = "The year: 1984."
    tokens = en_tokenizer(text)
    assert len(tokens) == 5
    assert tokens[0].text == "The"
    assert tokens[3].text == "1984"
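

# English contractions are split in two: "don't" -> ["do", "n't"], with any
# trailing punctuation tokenized separately.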
def test_tokenizer_handles_basic_contraction(en_tokenizer):
    text = "don't giggle"
    tokens = en_tokenizer(text)
    assert len(tokens) == 3
    assert tokens[1].text == "n't"
    text = "i said don't!"
    tokens = en_tokenizer(text)
    assert len(tokens) == 5
    assert tokens[4].text == "!"
@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
def test_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3


def test_tokenizer_handles_long_text(en_tokenizer):
    text = """Tributes pour in for late British Labour Party leader

Tributes poured in from around the world Thursday
to the late Labour Party leader John Smith, who died earlier from a massive
heart attack aged 55.

In Washington, the US State Department issued a statement regretting "the
untimely death" of the rapier-tongued Scottish barrister and parliamentarian.

"Mr. Smith, throughout his distinguished"""

    tokens = en_tokenizer(text)
    assert len(tokens) > 5
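

# "sun.txt" is a fixture file expected to sit next to this module; the test
# checks that a longer real-world text tokenizes to a reasonable number of
# tokens.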
@pytest.mark.parametrize('file_name', ["sun.txt"])
def test_tokenizer_handle_text_from_file(en_tokenizer, file_name):
    loc = path.join(path.dirname(__file__), file_name)
    text = utf8open(loc).read()
    assert len(text) != 0

    tokens = en_tokenizer(text)
    assert len(tokens) > 100
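

# Each case pairs a text with its expected token count. The counts encode,
# among other things, that abbreviations such as "U.S.", "U.N." and "Mr."
# stay single tokens instead of being split at the period.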
@pytest.mark.parametrize('text,length', [
    ("The U.S. Army likes Shock and Awe.", 8),
    ("U.N. regulations are not a part of their concern.", 10),
    ("“Isn't it?”", 6),
    ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
    ("""'Me too!', Mr. P. Delaware cried. """, 11),
    ("They ran about 10km.", 6),
    # ("But then the 6,000-year ice age came...", 10)
])
def test_tokenizer_handles_cnts(en_tokenizer, text, length):
    tokens = en_tokenizer(text)
    assert len(tokens) == length
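

# Regression test: token texts must stay valid across repeated calls to the
# tokenizer, guarding against the suspected premature freeing of the
# underlying strings that the test name refers to.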
def test_tokenizer_suspected_freeing_strings(en_tokenizer):
    text1 = "Betty Botter bought a pound of butter."
    text2 = "Betty also bought a pound of butter."
    tokens1 = en_tokenizer(text1)
    tokens2 = en_tokenizer(text2)
    assert tokens1[0].text == "Betty"
    assert tokens2[0].text == "Betty"