# coding: utf-8
"""Test that tokens are created correctly for contractions."""

from __future__ import unicode_literals

import pytest
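

# The `en_tokenizer` fixture used throughout this module is assumed to be
# provided by the test suite's shared conftest.py. A minimal sketch of what
# such a fixture could look like (hypothetical; assumes a `spacy.lang.en`-style
# import path, and is left as a comment so it does not shadow the shared
# fixture):
#
#     import pytest
#     from spacy.lang.en import English
#
#     @pytest.fixture
#     def en_tokenizer():
#         return English().tokenizer
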
@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
def test_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
    tokens = en_tokenizer(text_poss)
    assert len(tokens) == 2
    assert tokens[0].text == text
    assert tokens[1].text == "'s"
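

# A bare trailing apostrophe, as in plural or s-final possessives, should be
# split off as a separate "'" token.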
@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
def test_tokenizer_splits_trailing_apos(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[0].text == text.split("'")[0]
    assert tokens[1].text == "'"
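

# Forms like "'em", "nothin'" and "ol'" are expected to be covered by
# tokenizer exceptions, so the apostrophe stays attached and no split happens.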
@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
def test_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].text == text
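

# "'ll" should be split off and, per the exception entries exercised here,
# carry the lemma "will".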
@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
def test_tokenizer_handles_ll_contraction(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[0].text == text.split("'")[0]
    assert tokens[1].text == "'ll"
    assert tokens[1].lemma_ == "will"
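

# Contractions should be handled the same way whether they are lowercase or
# titlecase ("can't" vs. "Can't").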
@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
def test_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
    tokens_lower = en_tokenizer(text_lower)
    tokens_title = en_tokenizer(text_title)
    assert tokens_title[0].text == tokens_lower[0].text.title()
    assert tokens_lower[0].text == tokens_title[0].text.lower()
    assert tokens_lower[1].text == tokens_title[1].text
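

# Splitting a pronoun + contraction should preserve the casing of both the
# pronoun and the contraction suffix.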
@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
@pytest.mark.parametrize('contraction', ["'ll", "'d"])
def test_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
    tokens = en_tokenizer(pron + contraction)
    assert tokens[0].text == pron
    assert tokens[1].text == contraction
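

# Ordinary words that merely look like apostrophe-less contractions ("Ill",
# "Hell", "Well") must not be split.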
@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
def test_tokenizer_excludes_ambiguous(en_tokenizer, exc):
    tokens = en_tokenizer(exc)
    assert len(tokens) == 1
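

# Surrounding punctuation ("``", ")") should be split off as separate tokens
# in addition to the contraction split.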
@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
def test_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
    tokens = en_tokenizer(wo_punct)
    assert len(tokens) == 2
    tokens = en_tokenizer(w_punct)
    assert len(tokens) == 3