# coding: utf-8
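# Tests for the English tokenizer: prefix punctuation, contractions,
# abbreviations, quoting, and whitespace handling.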
from __future__ import unicode_literals

import pytest
import io
import pickle
import cloudpickle
import tempfile

from ... import util
from ...language_data import TOKENIZER_PREFIXES


# Search function of the compiled English prefix regex; exercised directly
# by test_pre_punct_regex below.
en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search


# @pytest.mark.xfail
# def test_pickle(en_tokenizer):
#     file_ = io.BytesIO()
#     cloudpickle.dump(en_tokenizer, file_)
#     file_.seek(0)
#     loaded = pickle.load(file_)
#     assert loaded is not None


def test_pre_punct_regex():
    string = "(can't"
    match = en_search_prefixes(string)
    assert match.group() == "("


def test_no_word(en_tokenizer):
    tokens = en_tokenizer(u'')
    assert len(tokens) == 0


def test_single_word(en_tokenizer):
    tokens = en_tokenizer(u'hello')
    assert tokens[0].orth_ == 'hello'


def test_two_words(en_tokenizer):
    tokens = en_tokenizer('hello possums')
    assert len(tokens) == 2
    assert tokens[0].orth_ != tokens[1].orth_


def test_punct(en_tokenizer):
    tokens = en_tokenizer('hello, possums.')
    assert len(tokens) == 4
    assert tokens[0].orth_ == 'hello'
    assert tokens[1].orth_ == ','
    assert tokens[2].orth_ == 'possums'
    assert tokens[1].orth_ != 'hello'


def test_digits(en_tokenizer):
    tokens = en_tokenizer('The year: 1984.')
    assert len(tokens) == 5
    assert tokens[0].orth == en_tokenizer.vocab['The'].orth
    assert tokens[3].orth == en_tokenizer.vocab['1984'].orth


def test_contraction(en_tokenizer):
    tokens = en_tokenizer("don't giggle")
    assert len(tokens) == 3
    assert tokens[1].orth == en_tokenizer.vocab["n't"].orth
    tokens = en_tokenizer("i said don't!")
    assert len(tokens) == 5
    assert tokens[4].orth == en_tokenizer.vocab['!'].orth


def test_contraction_punct(en_tokenizer):
    tokens = [w.text for w in en_tokenizer("(can't")]
    assert tokens == ['(', 'ca', "n't"]
    tokens = en_tokenizer("`ain't")
    assert len(tokens) == 3
    tokens = en_tokenizer('''"isn't''')
    assert len(tokens) == 3
    tokens = en_tokenizer("can't!")
    assert len(tokens) == 3


def test_sample(en_tokenizer):
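    # Longer multi-paragraph sample; only sanity-check that a reasonable
    # number of tokens comes back.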
    text = """Tributes pour in for late British Labour Party leader

Tributes poured in from around the world Thursday
to the late Labour Party leader John Smith, who died earlier from a massive
heart attack aged 55.

In Washington, the US State Department issued a statement regretting "the
untimely death" of the rapier-tongued Scottish barrister and parliamentarian.

"Mr. Smith, throughout his distinguished"""

    tokens = en_tokenizer(text)
    assert len(tokens) > 5


def test_cnts1(en_tokenizer):
    text = u"""The U.S. Army likes Shock and Awe."""
    tokens = en_tokenizer(text)
    assert len(tokens) == 8


def test_cnts2(en_tokenizer):
    text = u"""U.N. regulations are not a part of their concern."""
    tokens = en_tokenizer(text)
    assert len(tokens) == 10


def test_cnts3(en_tokenizer):
    text = u"“Isn't it?”"
    tokens = en_tokenizer(text)
    words = [t.orth_ for t in tokens]
    assert len(words) == 6


def test_cnts4(en_tokenizer):
    text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
    tokens = en_tokenizer(text)
    words = [t.orth_ for t in tokens]
    assert len(words) == 15


def test_cnts5(en_tokenizer):
    text = """'Me too!', Mr. P. Delaware cried. """
    tokens = en_tokenizer(text)
    assert len(tokens) == 11


@pytest.mark.xfail
def test_mr(en_tokenizer):
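    # Expected failure: the trailing "Tuesday.Mr." should split into
    # 'Tuesday', '.', 'Mr.' but is not yet tokenized this way.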
    text = """Today is Tuesday.Mr."""
    tokens = en_tokenizer(text)
    assert len(tokens) == 5
    assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']


def test_cnts6(en_tokenizer):
    text = u'They ran about 10km.'
    tokens = en_tokenizer(text)
    words = [t.orth_ for t in tokens]
    assert len(words) == 6


def test_bracket_period(en_tokenizer):
    text = u'(And a 6a.m. run through Washington Park).'
    tokens = en_tokenizer(text)
    assert tokens[-1].orth_ == u'.'


def test_ie(en_tokenizer):
    text = u"It's mediocre i.e. bad."
    tokens = en_tokenizer(text)
    assert len(tokens) == 6
    assert tokens[3].orth_ == "i.e."


def test_two_whitespace(en_tokenizer):
    orig_str = u'there are 2 spaces after this  '
    tokens = en_tokenizer(orig_str)
    assert repr(tokens.text_with_ws) == repr(orig_str)


@pytest.mark.xfail
def test_em_dash_infix(en_tokenizer):
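    # Expected failure: the em dash (\u2014) should become its own infix token
    # between the surrounding words.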
    # Re Issue #225
    tokens = en_tokenizer('''Will this road take me to Puddleton?\u2014No, '''
                          '''you'll have to walk there.\u2014Ariel.''')
    assert tokens[6].text == 'Puddleton'
    assert tokens[7].text == '?'
    assert tokens[8].text == '\u2014'


#def test_cnts7():
#    text = 'But then the 6,000-year ice age came...'
#    tokens = EN.tokenize(text)
#    assert len(tokens) == 10