2014-09-15 08:31:58 +04:00
|
|
|
# coding: utf-8
|
2014-07-07 06:23:46 +04:00
|
|
|
from __future__ import unicode_literals
|
2017-01-05 15:15:52 +03:00
|
|
|
from os import path
|
2014-07-07 06:23:46 +04:00
|
|
|
|
2014-12-21 12:38:27 +03:00
|
|
|
import pytest
|
2015-10-24 08:18:47 +03:00
|
|
|
|
2017-01-05 18:25:04 +03:00
|
|
|
from ....util import utf8open
|
2016-11-24 15:51:59 +03:00
|
|
|
|
2014-07-07 06:23:46 +04:00
|
|
|
|
2017-01-05 15:17:05 +03:00
|
|
|
def test_tokenizer_handles_no_word(en_tokenizer):
    """Tokenizing the empty string must produce zero tokens."""
    empty_doc = en_tokenizer("")
    assert len(empty_doc) == 0
|
|
|
|
|
2015-04-19 22:39:18 +03:00
|
|
|
|
2017-01-05 15:17:05 +03:00
|
|
|
@pytest.mark.parametrize('text', ["hello"])
def test_tokenizer_handles_single_word(en_tokenizer, text):
    """The first token of a one-word input must carry the input text itself."""
    first_token = en_tokenizer(text)[0]
    assert first_token.text == text
|
2014-07-07 06:23:46 +04:00
|
|
|
|
|
|
|
|
2017-01-05 15:17:05 +03:00
|
|
|
@pytest.mark.parametrize('text', ["hello possums"])
def test_tokenizer_handles_two_words(en_tokenizer, text):
    """A two-word input must give two tokens whose texts differ."""
    doc = en_tokenizer(text)
    assert len(doc) == 2
    # Index explicitly (rather than unpacking) so only positions 0 and 1
    # are touched.
    first = doc[0]
    second = doc[1]
    assert first.text != second.text
|
2014-07-07 06:23:46 +04:00
|
|
|
|
|
|
|
|
2017-01-05 15:17:05 +03:00
|
|
|
def test_tokenizer_handles_punct(en_tokenizer):
    """Check that punctuation is split off into tokens of its own.

    "hello, possums." must yield exactly four tokens: the two words, the
    comma between them and the trailing period.
    """
    text = "hello, possums."
    tokens = en_tokenizer(text)
    assert len(tokens) == 4
    assert tokens[0].text == "hello"
    assert tokens[1].text == ","
    assert tokens[2].text == "possums"
    # Verify the sentence-final period too, so that all four tokens are
    # pinned down. A separate `tokens[1].text != "hello"` check would add
    # nothing: it is already implied by the comma assertion above.
    assert tokens[3].text == "."
|
2014-07-07 06:23:46 +04:00
|
|
|
|
|
|
|
|
2017-01-05 15:17:05 +03:00
|
|
|
def test_tokenizer_handles_digits(en_tokenizer):
    """Expect five tokens from "The year: 1984.", with "1984" at index 3."""
    doc = en_tokenizer("The year: 1984.")
    assert len(doc) == 5
    assert doc[0].text == "The"
    assert doc[3].text == "1984"
|
2014-07-07 06:23:46 +04:00
|
|
|
|
|
|
|
|
2017-01-05 15:17:05 +03:00
|
|
|
def test_tokenizer_handles_basic_contraction(en_tokenizer):
    """Check token counts and selected tokens for inputs containing "don't"."""
    # Contraction followed by another word: "n't" is expected at index 1.
    doc = en_tokenizer("don't giggle")
    assert len(doc) == 3
    assert doc[1].text == "n't"

    # Contraction followed by punctuation: "!" is expected as the last token.
    doc = en_tokenizer("i said don't!")
    assert len(doc) == 5
    assert doc[4].text == "!"
|
2014-09-12 20:00:42 +04:00
|
|
|
|
2017-01-05 15:17:05 +03:00
|
|
|
|
|
|
|
@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
def test_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
    """Each contraction with one attached punctuation mark gives three tokens."""
    token_count = len(en_tokenizer(text))
    assert token_count == 3
|
|
|
|
|
2015-04-19 22:39:18 +03:00
|
|
|
|
2017-01-05 15:17:05 +03:00
|
|
|
def test_tokenizer_handles_long_text(en_tokenizer):
    """Tokenize a multi-paragraph excerpt and expect more than five tokens."""
    # The continuation lines start at column 0 on purpose: any indentation
    # here would become part of the string being tokenized.
    excerpt = """Tributes pour in for late British Labour Party leader

Tributes poured in from around the world Thursday
to the late Labour Party leader John Smith, who died earlier from a massive
heart attack aged 55.

In Washington, the US State Department issued a statement regretting "the
untimely death" of the rapier-tongued Scottish barrister and parliamentarian.

"Mr. Smith, throughout his distinguished"""
    doc = en_tokenizer(excerpt)
    assert len(doc) > 5
|
2014-09-15 08:31:58 +04:00
|
|
|
|
|
|
|
|
2017-01-05 15:15:52 +03:00
|
|
|
@pytest.mark.parametrize('file_name', ["sun.txt"])
def test_tokenizer_handle_text_from_file(en_tokenizer, file_name):
    """Tokenize the contents of a text file located next to this module.

    The file is resolved relative to this test module's directory, must be
    non-empty, and is expected to yield more than 100 tokens.
    """
    loc = path.join(path.dirname(__file__), file_name)
    # Read inside a context manager so the handle is closed deterministically
    # instead of being left open until garbage collection.
    # NOTE(review): assumes utf8open returns a regular file object that
    # supports the ``with`` protocol -- confirm against its definition.
    with utf8open(loc) as file_:
        text = file_.read()
    assert len(text) != 0
    tokens = en_tokenizer(text)
    assert len(tokens) > 100
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize('text,length', [
    ("The U.S. Army likes Shock and Awe.", 8),
    ("U.N. regulations are not a part of their concern.", 10),
    ("“Isn't it?”", 6),
    ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
    ("""'Me too!', Mr. P. Delaware cried. """, 11),
    ("They ran about 10km.", 6),
    # Case below is commented out in the parameter list and does not run:
    # ("But then the 6,000-year ice age came...", 10)
])
def test_tokenizer_handles_cnts(en_tokenizer, text, length):
    """Each sample text must be split into exactly ``length`` tokens."""
    doc = en_tokenizer(text)
    assert len(doc) == length
|
2016-01-19 15:20:14 +03:00
|
|
|
|
|
|
|
|
2017-01-05 15:16:55 +03:00
|
|
|
def test_tokenizer_suspected_freeing_strings(en_tokenizer):
    """Tokenize two sentences starting with "Betty" and re-check both results.

    Both texts are tokenized before any assertion runs, so the first result
    is inspected only after the second call has completed.
    NOTE(review): judging by the test name, this presumably guards against
    string data of an earlier result being released by a later call --
    confirm against the tokenizer implementation.
    """
    sentences = (
        "Betty Botter bought a pound of butter.",
        "Betty also bought a pound of butter.",
    )
    docs = [en_tokenizer(sentence) for sentence in sentences]
    for doc in docs:
        assert doc[0].text == "Betty"
|