mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 17:54:39 +03:00
* Add new tests for fancier tokenization cases
This commit is contained in:
parent
6fc06bfe2f
commit
db191361ee
|
@ -1,3 +1,4 @@
|
||||||
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from spacy.en import EN
|
from spacy.en import EN
|
||||||
|
@ -17,19 +18,19 @@ def test_two_words():
|
||||||
|
|
||||||
def test_punct():
|
def test_punct():
|
||||||
tokens = EN.tokenize('hello, possums.')
|
tokens = EN.tokenize('hello, possums.')
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 4
|
||||||
assert tokens[0].string == EN.lexicon.lookup('hello').string
|
assert tokens[0].string == EN.lexicon.lookup('hello').string
|
||||||
assert tokens[1].string == EN.lexicon.lookup(',').string
|
assert tokens[1].string == EN.lexicon.lookup(',').string
|
||||||
assert tokens[2].string == EN.lexicon.lookup('possums.').string
|
assert tokens[2].string == EN.lexicon.lookup('possums').string
|
||||||
assert tokens[1].string != EN.lexicon.lookup('hello').string
|
assert tokens[1].string != EN.lexicon.lookup('hello').string
|
||||||
|
|
||||||
|
|
||||||
def test_digits():
|
def test_digits():
|
||||||
lex_ids = EN.tokenize('The year: 1984.')
|
lex_ids = EN.tokenize('The year: 1984.')
|
||||||
assert lex_ids.string(3) == "1984."
|
assert lex_ids.string(3) == "1984"
|
||||||
assert len(lex_ids) == 4
|
assert len(lex_ids) == 5
|
||||||
assert lex_ids[0].string == EN.lexicon.lookup('The').string
|
assert lex_ids[0].string == EN.lexicon.lookup('The').string
|
||||||
assert lex_ids[3].string == EN.lexicon.lookup('1984.').string
|
assert lex_ids[3].string == EN.lexicon.lookup('1984').string
|
||||||
|
|
||||||
|
|
||||||
def test_contraction():
|
def test_contraction():
|
||||||
|
@ -65,3 +66,39 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
|
||||||
|
|
||||||
tokens = EN.tokenize(text)
|
tokens = EN.tokenize(text)
|
||||||
assert len(tokens) > 5
|
assert len(tokens) > 5
|
||||||
|
|
||||||
|
|
||||||
|
def test_cnts1():
    """Check the token count for a sentence containing 'U.S.'."""
    sentence = u"""The U.S. Army likes Shock and Awe."""
    n_tokens = len(EN.tokenize(sentence))
    # The English tokenizer is expected to yield exactly 8 tokens here.
    assert n_tokens == 8
|
||||||
|
|
||||||
|
def test_cnts2():
    """Check the token count for a sentence that starts with 'U.N.'."""
    sentence = u"""U.N. regulations are not a part of their concern."""
    n_tokens = len(EN.tokenize(sentence))
    # The English tokenizer is expected to yield exactly 10 tokens here.
    assert n_tokens == 10
|
||||||
|
|
||||||
|
def test_cnts3():
    """Check the token count for a curly-quoted question with "Isn't"."""
    sentence = u"“Isn't it?”"
    n_tokens = len(EN.tokenize(sentence))
    # The English tokenizer is expected to yield exactly 6 tokens here.
    assert n_tokens == 6
|
||||||
|
|
||||||
|
def test_cnts4():
    """Check the token count for quoted speech followed by 'Ms. Comble'."""
    # The trailing space inside the literal is part of the test input.
    sentence = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
    n_tokens = len(EN.tokenize(sentence))
    # The English tokenizer is expected to yield exactly 15 tokens here.
    assert n_tokens == 15
|
||||||
|
|
||||||
|
def test_cnts5():
    """Check the token count for quoted speech followed by 'Mr. P. Delaware'."""
    # The trailing space inside the literal is part of the test input.
    sentence = """'Me too!', Mr. P. Delaware cried. """
    n_tokens = len(EN.tokenize(sentence))
    # The English tokenizer is expected to yield exactly 11 tokens here.
    assert n_tokens == 11
|
||||||
|
|
||||||
|
def test_cnts6():
    """Check the token count for a sentence ending in '10km.'."""
    sentence = u'They ran about 10km.'
    n_tokens = len(EN.tokenize(sentence))
    # The English tokenizer is expected to yield exactly 6 tokens here.
    assert n_tokens == 6
|
||||||
|
|
||||||
|
def test_cnts7():
    """Check the token count for '6,000-year' plus a trailing ellipsis."""
    sentence = 'But then the 6,000-year ice age came...'
    n_tokens = len(EN.tokenize(sentence))
    # The English tokenizer is expected to yield exactly 8 tokens here.
    assert n_tokens == 8
|
||||||
|
|
Loading…
Reference in New Issue
Block a user