# coding: utf-8
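# Tests for the English tokenizer: prefix punctuation, contractions,
# abbreviations, quoting, and whitespace handling.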
from __future__ import unicode_literals

import pytest
import io
import pickle
import cloudpickle
import tempfile

from ... import util
from ...language_data import TOKENIZER_PREFIXES


# Search function of the compiled English prefix regex; exercised directly
# by test_pre_punct_regex below.
en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search


# @pytest.mark.xfail
# def test_pickle(en_tokenizer):
#     file_ = io.BytesIO()
#     cloudpickle.dump(en_tokenizer, file_)
#     file_.seek(0)
#     loaded = pickle.load(file_)
#     assert loaded is not None


def test_pre_punct_regex():
    string = "(can't"
    match = en_search_prefixes(string)
    assert match.group() == "("


def test_no_word(en_tokenizer):
    tokens = en_tokenizer(u'')
    assert len(tokens) == 0


def test_single_word(en_tokenizer):
    tokens = en_tokenizer(u'hello')
    assert tokens[0].orth_ == 'hello'


def test_two_words(en_tokenizer):
    tokens = en_tokenizer('hello possums')
    assert len(tokens) == 2
    assert tokens[0].orth_ != tokens[1].orth_


def test_punct(en_tokenizer):
    tokens = en_tokenizer('hello, possums.')
    assert len(tokens) == 4
    assert tokens[0].orth_ == 'hello'
    assert tokens[1].orth_ == ','
    assert tokens[2].orth_ == 'possums'
    assert tokens[1].orth_ != 'hello'


def test_digits(en_tokenizer):
    tokens = en_tokenizer('The year: 1984.')
    assert len(tokens) == 5
    assert tokens[0].orth == en_tokenizer.vocab['The'].orth
    assert tokens[3].orth == en_tokenizer.vocab['1984'].orth


def test_contraction(en_tokenizer):
    tokens = en_tokenizer("don't giggle")
    assert len(tokens) == 3
    assert tokens[1].orth == en_tokenizer.vocab["n't"].orth
    tokens = en_tokenizer("i said don't!")
    assert len(tokens) == 5
    assert tokens[4].orth == en_tokenizer.vocab['!'].orth


def test_contraction_punct(en_tokenizer):
    tokens = [w.text for w in en_tokenizer("(can't")]
    assert tokens == ['(', 'ca', "n't"]
    tokens = en_tokenizer("`ain't")
    assert len(tokens) == 3
    tokens = en_tokenizer('''"isn't''')
    assert len(tokens) == 3
    tokens = en_tokenizer("can't!")
    assert len(tokens) == 3


def test_sample(en_tokenizer):
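    # Longer multi-paragraph sample; only sanity-check that a reasonable
    # number of tokens comes back.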
    text = """Tributes pour in for late British Labour Party leader

Tributes poured in from around the world Thursday
to the late Labour Party leader John Smith, who died earlier from a massive
heart attack aged 55.

In Washington, the US State Department issued a statement regretting "the
untimely death" of the rapier-tongued Scottish barrister and parliamentarian.

"Mr. Smith, throughout his distinguished"""

    tokens = en_tokenizer(text)
    assert len(tokens) > 5


def test_cnts1(en_tokenizer):
    text = u"""The U.S. Army likes Shock and Awe."""
    tokens = en_tokenizer(text)
    assert len(tokens) == 8


def test_cnts2(en_tokenizer):
    text = u"""U.N. regulations are not a part of their concern."""
    tokens = en_tokenizer(text)
    assert len(tokens) == 10


def test_cnts3(en_tokenizer):
    text = u"“Isn't it?”"
    tokens = en_tokenizer(text)
    words = [t.orth_ for t in tokens]
    assert len(words) == 6


def test_cnts4(en_tokenizer):
    text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
    tokens = en_tokenizer(text)
    words = [t.orth_ for t in tokens]
    assert len(words) == 15


def test_cnts5(en_tokenizer):
    text = """'Me too!', Mr. P. Delaware cried. """
    tokens = en_tokenizer(text)
    assert len(tokens) == 11


@pytest.mark.xfail
def test_mr(en_tokenizer):
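    # Expected failure: the trailing "Tuesday.Mr." should split into
    # 'Tuesday', '.', 'Mr.' but is not yet tokenized this way.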
    text = """Today is Tuesday.Mr."""
    tokens = en_tokenizer(text)
    assert len(tokens) == 5
    assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']


def test_cnts6(en_tokenizer):
    text = u'They ran about 10km.'
    tokens = en_tokenizer(text)
    words = [t.orth_ for t in tokens]
    assert len(words) == 6


def test_bracket_period(en_tokenizer):
    text = u'(And a 6a.m. run through Washington Park).'
    tokens = en_tokenizer(text)
    assert tokens[-1].orth_ == u'.'


def test_ie(en_tokenizer):
    text = u"It's mediocre i.e. bad."
    tokens = en_tokenizer(text)
    assert len(tokens) == 6
    assert tokens[3].orth_ == "i.e."


def test_two_whitespace(en_tokenizer):
    orig_str = u'there are 2 spaces after this  '
    tokens = en_tokenizer(orig_str)
    assert repr(tokens.text_with_ws) == repr(orig_str)


@pytest.mark.xfail
def test_em_dash_infix(en_tokenizer):
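    # Expected failure: the em dash (\u2014) should become its own infix token
    # between the surrounding words.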
    # Re Issue #225
    tokens = en_tokenizer('''Will this road take me to Puddleton?\u2014No, '''
                          '''you'll have to walk there.\u2014Ariel.''')
    assert tokens[6].text == 'Puddleton'
    assert tokens[7].text == '?'
    assert tokens[8].text == '\u2014'


#def test_cnts7():
#    text = 'But then the 6,000-year ice age came...'
#    tokens = EN.tokenize(text)
#    assert len(tokens) == 10