mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-03 21:24:11 +03:00
* Upd tokenizer test
This commit is contained in:
parent
69e3a07fa1
commit
0d9972f4b0
|
@ -1,22 +1,28 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from spacy.en import EN
|
import pytest
|
||||||
|
|
||||||
|
from spacy.en import English
|
||||||
|
|
||||||
|
|
||||||
def test_single_word():
|
@pytest.fixture
|
||||||
tokens = EN.tokenize(u'hello')
|
def EN():
|
||||||
|
return English(pos_tag=False, parse=False)
|
||||||
|
|
||||||
|
def test_single_word(EN):
|
||||||
|
tokens = EN(u'hello')
|
||||||
assert tokens[0].string == 'hello'
|
assert tokens[0].string == 'hello'
|
||||||
|
|
||||||
|
|
||||||
def test_two_words():
|
def test_two_words(EN):
|
||||||
tokens = EN.tokenize('hello possums')
|
tokens = EN('hello possums')
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
assert tokens[0].string != tokens[1].string
|
assert tokens[0].string != tokens[1].string
|
||||||
|
|
||||||
|
|
||||||
def test_punct():
|
def test_punct(EN):
|
||||||
tokens = EN.tokenize('hello, possums.')
|
tokens = EN('hello, possums.')
|
||||||
assert len(tokens) == 4
|
assert len(tokens) == 4
|
||||||
assert tokens[0].string == 'hello'
|
assert tokens[0].string == 'hello'
|
||||||
assert tokens[1].string == ','
|
assert tokens[1].string == ','
|
||||||
|
@ -24,33 +30,33 @@ def test_punct():
|
||||||
assert tokens[1].string != 'hello'
|
assert tokens[1].string != 'hello'
|
||||||
|
|
||||||
|
|
||||||
def test_digits():
|
def test_digits(EN):
|
||||||
tokens = EN.tokenize('The year: 1984.')
|
tokens = EN('The year: 1984.')
|
||||||
assert len(tokens) == 5
|
assert len(tokens) == 5
|
||||||
assert tokens[0].sic == EN.lexicon['The']['sic']
|
assert tokens[0].sic == EN.vocab['The']['sic']
|
||||||
assert tokens[3].sic == EN.lexicon['1984']['sic']
|
assert tokens[3].sic == EN.vocab['1984']['sic']
|
||||||
|
|
||||||
|
|
||||||
def test_contraction():
|
def test_contraction(EN):
|
||||||
tokens = EN.tokenize("don't giggle")
|
tokens = EN("don't giggle")
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
assert tokens[1].sic == EN.lexicon["n't"]['sic']
|
assert tokens[1].sic == EN.vocab["n't"]['sic']
|
||||||
tokens = EN.tokenize("i said don't!")
|
tokens = EN("i said don't!")
|
||||||
assert len(tokens) == 5
|
assert len(tokens) == 5
|
||||||
assert tokens[4].sic == EN.lexicon['!']['sic']
|
assert tokens[4].sic == EN.vocab['!']['sic']
|
||||||
|
|
||||||
|
|
||||||
def test_contraction_punct():
|
def test_contraction_punct(EN):
|
||||||
tokens = EN.tokenize("(can't")
|
tokens = EN("(can't")
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
tokens = EN.tokenize("`ain't")
|
tokens = EN("`ain't")
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
tokens = EN.tokenize('''"isn't''')
|
tokens = EN('''"isn't''')
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
tokens = EN.tokenize("can't!")
|
tokens = EN("can't!")
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
|
|
||||||
def test_sample():
|
def test_sample(EN):
|
||||||
text = """Tributes pour in for late British Labour Party leader
|
text = """Tributes pour in for late British Labour Party leader
|
||||||
|
|
||||||
Tributes poured in from around the world Thursday
|
Tributes poured in from around the world Thursday
|
||||||
|
@ -62,45 +68,45 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
|
||||||
|
|
||||||
"Mr. Smith, throughout his distinguished"""
|
"Mr. Smith, throughout his distinguished"""
|
||||||
|
|
||||||
tokens = EN.tokenize(text)
|
tokens = EN(text)
|
||||||
assert len(tokens) > 5
|
assert len(tokens) > 5
|
||||||
|
|
||||||
|
|
||||||
def test_cnts1():
|
def test_cnts1(EN):
|
||||||
text = u"""The U.S. Army likes Shock and Awe."""
|
text = u"""The U.S. Army likes Shock and Awe."""
|
||||||
tokens = EN.tokenize(text)
|
tokens = EN(text)
|
||||||
assert len(tokens) == 8
|
assert len(tokens) == 8
|
||||||
|
|
||||||
|
|
||||||
def test_cnts2():
|
def test_cnts2(EN):
|
||||||
text = u"""U.N. regulations are not a part of their concern."""
|
text = u"""U.N. regulations are not a part of their concern."""
|
||||||
tokens = EN.tokenize(text)
|
tokens = EN(text)
|
||||||
assert len(tokens) == 10
|
assert len(tokens) == 10
|
||||||
|
|
||||||
|
|
||||||
def test_cnts3():
|
def test_cnts3(EN):
|
||||||
text = u"“Isn't it?”"
|
text = u"“Isn't it?”"
|
||||||
tokens = EN.tokenize(text)
|
tokens = EN(text)
|
||||||
words = [t.string for t in tokens]
|
words = [t.string for t in tokens]
|
||||||
assert len(words) == 6
|
assert len(words) == 6
|
||||||
|
|
||||||
|
|
||||||
def test_cnts4():
|
def test_cnts4(EN):
|
||||||
text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
|
text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
|
||||||
tokens = EN.tokenize(text)
|
tokens = EN(text)
|
||||||
words = [t.string for t in tokens]
|
words = [t.string for t in tokens]
|
||||||
assert len(words) == 15
|
assert len(words) == 15
|
||||||
|
|
||||||
|
|
||||||
def test_cnts5():
|
def test_cnts5(EN):
|
||||||
text = """'Me too!', Mr. P. Delaware cried. """
|
text = """'Me too!', Mr. P. Delaware cried. """
|
||||||
tokens = EN.tokenize(text)
|
tokens = EN(text)
|
||||||
assert len(tokens) == 11
|
assert len(tokens) == 11
|
||||||
|
|
||||||
|
|
||||||
def test_cnts6():
|
def test_cnts6(EN):
|
||||||
text = u'They ran about 10km.'
|
text = u'They ran about 10km.'
|
||||||
tokens = EN.tokenize(text)
|
tokens = EN(text)
|
||||||
words = [t.string for t in tokens]
|
words = [t.string for t in tokens]
|
||||||
assert len(words) == 6
|
assert len(words) == 6
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user