# coding: utf-8
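"""Tests for the English tokenizer, exercised via the en_tokenizer fixture."""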
from __future__ import unicode_literals
from os import path
import pytest
import io
import pickle
import cloudpickle
import tempfile
from ... import util
from ...language_data import TOKENIZER_PREFIXES
from spacy.util import utf8open

en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search

# @pytest.mark.xfail
# def test_pickle(en_tokenizer):
#     file_ = io.BytesIO()
#     cloudpickle.dump(en_tokenizer, file_)
#     file_.seek(0)
#     loaded = pickle.load(file_)
#     assert loaded is not None


def test_no_word(en_tokenizer):
    tokens = en_tokenizer(u'')
    assert len(tokens) == 0


def test_single_word(en_tokenizer):
    tokens = en_tokenizer(u'hello')
    assert tokens[0].orth_ == 'hello'


def test_two_words(en_tokenizer):
    tokens = en_tokenizer('hello possums')
    assert len(tokens) == 2
    assert tokens[0].orth_ != tokens[1].orth_


def test_punct(en_tokenizer):
    tokens = en_tokenizer('hello, possums.')
    assert len(tokens) == 4
    assert tokens[0].orth_ == 'hello'
    assert tokens[1].orth_ == ','
    assert tokens[2].orth_ == 'possums'
    assert tokens[1].orth_ != 'hello'


def test_digits(en_tokenizer):
    tokens = en_tokenizer('The year: 1984.')
    assert len(tokens) == 5
    assert tokens[0].orth == en_tokenizer.vocab['The'].orth
    assert tokens[3].orth == en_tokenizer.vocab['1984'].orth


def test_contraction(en_tokenizer):
    tokens = en_tokenizer("don't giggle")
    assert len(tokens) == 3
    assert tokens[1].orth == en_tokenizer.vocab["n't"].orth
    tokens = en_tokenizer("i said don't!")
    assert len(tokens) == 5
    assert tokens[4].orth == en_tokenizer.vocab['!'].orth


def test_sample(en_tokenizer):
    text = """Tributes pour in for late British Labour Party leader

Tributes poured in from around the world Thursday
to the late Labour Party leader John Smith, who died earlier from a massive
heart attack aged 55.

In Washington, the US State Department issued a statement regretting "the
untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
"Mr. Smith, throughout his distinguished"""

    tokens = en_tokenizer(text)
    assert len(tokens) > 5


def test_cnts1(en_tokenizer):
    text = u"""The U.S. Army likes Shock and Awe."""
    tokens = en_tokenizer(text)
    assert len(tokens) == 8


@pytest.mark.parametrize('file_name', ["sun.txt"])
def test_tokenizer_handle_text_from_file(en_tokenizer, file_name):
    loc = path.join(path.dirname(__file__), file_name)
    text = utf8open(loc).read()
    assert len(text) != 0
    tokens = en_tokenizer(text)
    # Rough sanity check only: the file holds several paragraphs of prose.
    assert len(tokens) > 100


def test_cnts2(en_tokenizer):
    text = u"""U.N. regulations are not a part of their concern."""
    tokens = en_tokenizer(text)
    assert len(tokens) == 10


def test_cnts3(en_tokenizer):
    text = u"“Isn't it?”"
    tokens = en_tokenizer(text)
    words = [t.orth_ for t in tokens]
    assert len(words) == 6


def test_cnts4(en_tokenizer):
    text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
    tokens = en_tokenizer(text)
    words = [t.orth_ for t in tokens]
    assert len(words) == 15


def test_cnts5(en_tokenizer):
    text = """'Me too!', Mr. P. Delaware cried. """
    tokens = en_tokenizer(text)
    assert len(tokens) == 11


@pytest.mark.xfail
def test_mr(en_tokenizer):
    text = """Today is Tuesday.Mr."""
    tokens = en_tokenizer(text)
    assert len(tokens) == 5
    assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']


def test_cnts6(en_tokenizer):
    text = u'They ran about 10km.'
    tokens = en_tokenizer(text)
    words = [t.orth_ for t in tokens]
    assert len(words) == 6


def test_bracket_period(en_tokenizer):
    text = u'(And a 6a.m. run through Washington Park).'
    tokens = en_tokenizer(text)
    assert tokens[-1].orth_ == u'.'


def test_ie(en_tokenizer):
    text = u"It's mediocre i.e. bad."
    tokens = en_tokenizer(text)
    assert len(tokens) == 6
    assert tokens[3].orth_ == "i.e."


# def test_cnts7():
#     text = 'But then the 6,000-year ice age came...'
#     tokens = EN.tokenize(text)
#     assert len(tokens) == 10


def test_tokenizer_suspected_freeing_strings(en_tokenizer):
    # Regression test: tokenizing a second text must not invalidate
    # strings referenced by tokens from the first.
    text1 = "Betty Botter bought a pound of butter."
    text2 = "Betty also bought a pound of butter."
    tokens1 = en_tokenizer(text1)
    tokens2 = en_tokenizer(text2)
    assert tokens1[0].text == "Betty"
    assert tokens2[0].text == "Betty"