mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-28 02:46:35 +03:00
9a478b6db8
* splitting up latin unicode interval * removing hyphen as infix for French * adding failing test for issue 1235 * test for issue #3002 which now works * partial fix for issue #2070 * keep the hyphen as infix for French (as it was) * restore french expressions with hyphen as infix (as it was) * added succeeding unit test for Issue #2656 * Fix issue #2822 with custom Italian exception * Fix issue #2926 by allowing numbers right before infix / * splitting up latin unicode interval * removing hyphen as infix for French * adding failing test for issue 1235 * test for issue #3002 which now works * partial fix for issue #2070 * keep the hyphen as infix for French (as it was) * restore french expressions with hyphen as infix (as it was) * added succeeding unit test for Issue #2656 * Fix issue #2822 with custom Italian exception * Fix issue #2926 by allowing numbers right before infix / * remove duplicate * remove xfail for Issue #2179 fixed by Matt * adjust documentation and remove reference to regex lib
25 lines
681 B
Python
25 lines
681 B
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
from spacy.lang.en import English
|
|
|
|
|
|
def test_issue2656():
    """Check that punctuation following a decimal number is split off.

    Runs the English pipeline on a sentence containing two decimal numbers,
    one followed by a comma and one by the sentence-final period, and verifies
    that each number stays a single token while the trailing punctuation
    becomes a token of its own.
    """
    expected_tokens = [
        "I",
        "went",
        "for",
        "40.3",
        ",",
        "and",
        "got",
        "home",
        "by",
        "10.0",
        ".",
    ]
    tokenizer_pipeline = English()
    doc = tokenizer_pipeline("I went for 40.3, and got home by 10.0.")

    # Check the count first so a mis-split is reported as a length mismatch
    # before any individual token comparison runs.
    assert len(doc) == len(expected_tokens)
    for token, expected_text in zip(doc, expected_tokens):
        assert token.text == expected_text
|