spaCy/spacy/tests/regression/test_issue2656.py

# coding: utf8
from __future__ import unicode_literals
from spacy.lang.en import English


def test_issue2656():
    """Check that punctuation following a decimal number becomes its own token.

    The sample sentence contains "40.3," and "10.0."; the trailing comma and
    full stop must be separated while each decimal number stays intact.
    """
    nlp = English()
    doc = nlp("I went for 40.3, and got home by 10.0.")
    expected_tokens = [
        "I",
        "went",
        "for",
        "40.3",
        ",",
        "and",
        "got",
        "home",
        "by",
        "10.0",
        ".",
    ]

    # Check the token count first so a wrong split is reported as a length
    # mismatch before the individual token texts are compared.
    assert len(doc) == 11
    assert [token.text for token in doc] == expected_tokens
Clean up of char classes, few tokenizer fixes and faster default French tokenizer (#3293) * splitting up latin unicode interval * removing hyphen as infix for French * adding failing test for issue 1235 * test for issue #3002 which now works * partial fix for issue #2070 * keep the hyphen as infix for French (as it was) * restore french expressions with hyphen as infix (as it was) * added succeeding unit test for Issue #2656 * Fix issue #2822 with custom Italian exception * Fix issue #2926 by allowing numbers right before infix / * splitting up latin unicode interval * removing hyphen as infix for French * adding failing test for issue 1235 * test for issue #3002 which now works * partial fix for issue #2070 * keep the hyphen as infix for French (as it was) * restore french expressions with hyphen as infix (as it was) * added succeeding unit test for Issue #2656 * Fix issue #2822 with custom Italian exception * Fix issue #2926 by allowing numbers right before infix / * remove duplicate * remove xfail for Issue #2179 fixed by Matt * adjust documentation and remove reference to regex lib 2019-02-21 00:10:13 +03:00			`# coding: utf8`
			`from __future__ import unicode_literals`
			`from spacy.lang.en import English`


			`def test_issue2656():`
			`""" Test that tokenizer correctly splits of punctuation after numbers with decimal points """`
			`text = "I went for 40.3, and got home by 10.0."`
			`nlp = English()`
			`doc = nlp(text)`

			`assert len(doc) == 11`

			`assert doc[0].text == "I"`
			`assert doc[1].text == "went"`
			`assert doc[2].text == "for"`
			`assert doc[3].text == "40.3"`
			`assert doc[4].text == ","`
			`assert doc[5].text == "and"`
			`assert doc[6].text == "got"`
			`assert doc[7].text == "home"`
			`assert doc[8].text == "by"`
			`assert doc[9].text == "10.0"`
			`assert doc[10].text == "."`