* Update tokenizer test

This commit is contained in:
Matthew Honnibal 2014-12-21 20:38:27 +11:00
parent 69e3a07fa1
commit 0d9972f4b0

View File

@ -1,22 +1,28 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from spacy.en import EN import pytest
from spacy.en import English
def test_single_word(): @pytest.fixture
tokens = EN.tokenize(u'hello') def EN():
return English(pos_tag=False, parse=False)
def test_single_word(EN):
tokens = EN(u'hello')
assert tokens[0].string == 'hello' assert tokens[0].string == 'hello'
def test_two_words(): def test_two_words(EN):
tokens = EN.tokenize('hello possums') tokens = EN('hello possums')
assert len(tokens) == 2 assert len(tokens) == 2
assert tokens[0].string != tokens[1].string assert tokens[0].string != tokens[1].string
def test_punct(): def test_punct(EN):
tokens = EN.tokenize('hello, possums.') tokens = EN('hello, possums.')
assert len(tokens) == 4 assert len(tokens) == 4
assert tokens[0].string == 'hello' assert tokens[0].string == 'hello'
assert tokens[1].string == ',' assert tokens[1].string == ','
@ -24,33 +30,33 @@ def test_punct():
assert tokens[1].string != 'hello' assert tokens[1].string != 'hello'
def test_digits(): def test_digits(EN):
tokens = EN.tokenize('The year: 1984.') tokens = EN('The year: 1984.')
assert len(tokens) == 5 assert len(tokens) == 5
assert tokens[0].sic == EN.lexicon['The']['sic'] assert tokens[0].sic == EN.vocab['The']['sic']
assert tokens[3].sic == EN.lexicon['1984']['sic'] assert tokens[3].sic == EN.vocab['1984']['sic']
def test_contraction(): def test_contraction(EN):
tokens = EN.tokenize("don't giggle") tokens = EN("don't giggle")
assert len(tokens) == 3 assert len(tokens) == 3
assert tokens[1].sic == EN.lexicon["n't"]['sic'] assert tokens[1].sic == EN.vocab["n't"]['sic']
tokens = EN.tokenize("i said don't!") tokens = EN("i said don't!")
assert len(tokens) == 5 assert len(tokens) == 5
assert tokens[4].sic == EN.lexicon['!']['sic'] assert tokens[4].sic == EN.vocab['!']['sic']
def test_contraction_punct(): def test_contraction_punct(EN):
tokens = EN.tokenize("(can't") tokens = EN("(can't")
assert len(tokens) == 3 assert len(tokens) == 3
tokens = EN.tokenize("`ain't") tokens = EN("`ain't")
assert len(tokens) == 3 assert len(tokens) == 3
tokens = EN.tokenize('''"isn't''') tokens = EN('''"isn't''')
assert len(tokens) == 3 assert len(tokens) == 3
tokens = EN.tokenize("can't!") tokens = EN("can't!")
assert len(tokens) == 3 assert len(tokens) == 3
def test_sample(): def test_sample(EN):
text = """Tributes pour in for late British Labour Party leader text = """Tributes pour in for late British Labour Party leader
Tributes poured in from around the world Thursday Tributes poured in from around the world Thursday
@ -62,45 +68,45 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
"Mr. Smith, throughout his distinguished""" "Mr. Smith, throughout his distinguished"""
tokens = EN.tokenize(text) tokens = EN(text)
assert len(tokens) > 5 assert len(tokens) > 5
def test_cnts1(): def test_cnts1(EN):
text = u"""The U.S. Army likes Shock and Awe.""" text = u"""The U.S. Army likes Shock and Awe."""
tokens = EN.tokenize(text) tokens = EN(text)
assert len(tokens) == 8 assert len(tokens) == 8
def test_cnts2(): def test_cnts2(EN):
text = u"""U.N. regulations are not a part of their concern.""" text = u"""U.N. regulations are not a part of their concern."""
tokens = EN.tokenize(text) tokens = EN(text)
assert len(tokens) == 10 assert len(tokens) == 10
def test_cnts3(): def test_cnts3(EN):
text = u"“Isn't it?”" text = u"“Isn't it?”"
tokens = EN.tokenize(text) tokens = EN(text)
words = [t.string for t in tokens] words = [t.string for t in tokens]
assert len(words) == 6 assert len(words) == 6
def test_cnts4(): def test_cnts4(EN):
text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """ text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
tokens = EN.tokenize(text) tokens = EN(text)
words = [t.string for t in tokens] words = [t.string for t in tokens]
assert len(words) == 15 assert len(words) == 15
def test_cnts5(): def test_cnts5(EN):
text = """'Me too!', Mr. P. Delaware cried. """ text = """'Me too!', Mr. P. Delaware cried. """
tokens = EN.tokenize(text) tokens = EN(text)
assert len(tokens) == 11 assert len(tokens) == 11
def test_cnts6(): def test_cnts6(EN):
text = u'They ran about 10km.' text = u'They ran about 10km.'
tokens = EN.tokenize(text) tokens = EN(text)
words = [t.string for t in tokens] words = [t.string for t in tokens]
assert len(words) == 6 assert len(words) == 6