* More work on reorganising tests, using conftest.py

This commit is contained in:
Matthew Honnibal 2015-06-07 18:02:24 +02:00
parent 674ee5dde7
commit 88041f69d1
27 changed files with 94 additions and 271 deletions

View File

@ -1,31 +0,0 @@
import pytest
import os
from os import path
from spacy.munge.read_ontonotes import sgml_extract
# Load the sample SGML document stored next to this test module. The context
# manager closes the file handle as soon as the text has been read, rather
# than leaving it open until the garbage collector gets to it.
with open(path.join(path.dirname(__file__), 'web_sample1.sgm')) as _sample_file:
    text_data = _sample_file.read()
def test_example_extract():
    """Check every field sgml_extract returns for the bundled sample text."""
    extracted = sgml_extract(text_data)
    timestamp = '2006-09-24T10:41:00'

    # Document metadata.
    assert extracted['docid'] == 'blogspot.com_alaindewitt_20060924104100_ENG_20060924_104100'
    assert extracted['doctype'] == 'BLOG TEXT'
    assert extracted['datetime'] == timestamp
    assert extracted['headline'].strip() == 'Devastating Critique of the Arab World by One of Its Own'
    assert extracted['poster'] == 'Alain DeWitt'
    assert extracted['postdate'] == timestamp

    # Body text: check both ends, and that no '<' is left in it. A short
    # slice of the body is attached as the failure message.
    body = extracted['text']
    assert body.startswith('Thanks again to my fri'), body[:10]
    assert body.endswith(' tide will turn."'), body[-10:]
    assert '<' not in body, body[:10]
def test_directory():
    """Smoke-test sgml_extract on every file in a local OntoNotes directory.

    The directory is a hard-coded absolute path, so the test is skipped
    (instead of erroring in os.listdir) on machines where it does not exist.
    No assertion is made on the extracted result; the test only verifies
    that sgml_extract runs on each file without raising.
    """
    context_dir = '/usr/local/data/OntoNotes5/data/english/metadata/context/wb/sel'
    if not path.isdir(context_dir):
        pytest.skip('OntoNotes context directory not found: %s' % context_dir)
    for fn in os.listdir(context_dir):
        with open(path.join(context_dir, fn)) as file_:
            text = file_.read()
        sgml_extract(text)

View File

@ -1,46 +0,0 @@
from spacy.munge import read_ptb
import pytest
from os import path
# Locations of the two sample .parse files, resolved relative to the
# directory that contains this test module.
_TEST_DIR = path.dirname(__file__)
ptb_loc = path.join(_TEST_DIR, 'wsj_0001.parse')
file3_loc = path.join(_TEST_DIR, 'wsj_0003.parse')
@pytest.fixture
def ptb_text():
    """Return the full contents of the file at ``ptb_loc``."""
    # ptb_loc is already a complete path, so the former single-argument
    # path.join() call was a no-op. The context manager closes the handle
    # once the text has been read.
    with open(ptb_loc) as file_:
        return file_.read()
@pytest.fixture
def sentence_strings(ptb_text):
    """Return the result of read_ptb.split applied to the ptb_text fixture."""
    sentences = read_ptb.split(ptb_text)
    return sentences
def test_split(sentence_strings):
    """Check that splitting yields two sentences with the expected brackets."""
    assert len(sentence_strings) == 2
    # Both sentences must open and close with the same bracket sequences.
    for index in (0, 1):
        sentence = sentence_strings[index]
        assert sentence.startswith('(TOP (S (NP-SBJ')
        assert sentence.endswith('(. .)))')
def test_tree_read(sentence_strings):
    """Parse the first sample sentence and check its bracket spans.

    For reference, the original test spelled out the sentence as:
    "Pierre Vinken , 61 years old , will join the board as a nonexecutive
    director Nov. 29 ." That string was built but never compared against
    anything, so the unused locals have been dropped.
    """
    words, brackets = read_ptb.parse(sentence_strings[0])
    assert len(brackets) == 11
    # Each bracket unpacks as a (label, start, end) triple.
    starts = [start for _, start, _ in brackets]
    ends = [end for _, _, end in brackets]
    # Together the spans must cover the sentence from the first word to the last.
    assert min(starts) == 0
    assert max(ends) == len(words)
    # The final bracket is the sentence-level span.
    assert brackets[-1] == ('S', 0, len(words))
    assert ('NP-SBJ', 0, 7) in brackets
def test_traces():
    """Check the word count of the first sentence in the file at file3_loc.

    NOTE(review): judging by the test name, this presumably verifies that
    trace tokens are left out of the word list -- confirm against read_ptb.
    """
    # Call read_ptb.split directly. sentence_strings is a pytest fixture, and
    # pytest >= 4 raises an error when a fixture function is invoked as a
    # plain function; the fixture body is just this one call.
    with open(file3_loc) as file_:
        sent_strings = read_ptb.split(file_.read())
    words, _ = read_ptb.parse(sent_strings[0])
    assert len(words) == 36

View File

@ -1,7 +1,6 @@
"""Test the Token.conjuncts property""" """Test the Token.conjuncts property"""
from __future__ import unicode_literals from __future__ import unicode_literals
from spacy.en import English
import pytest import pytest
@ -9,9 +8,8 @@ def orths(tokens):
return [t.orth_ for t in tokens] return [t.orth_ for t in tokens]
def test_simple_two(): def test_simple_two(EN):
nlp = English() tokens = EN('I lost money and pride.', tag=True, parse=True)
tokens = nlp('I lost money and pride.', tag=True, parse=True)
pride = tokens[4] pride = tokens[4]
for t in tokens: for t in tokens:
print t.orth_, t.tag_, t.head.orth_ print t.orth_, t.tag_, t.head.orth_
@ -20,13 +18,12 @@ def test_simple_two():
assert orths(money.conjuncts) == ['money', 'pride'] assert orths(money.conjuncts) == ['money', 'pride']
def test_comma_three(): #def test_comma_three(EN):
nlp = English() # tokens = EN('I found my wallet, phone and keys.')
tokens = nlp('I found my wallet, phone and keys.') # keys = tokens[-2]
keys = tokens[-2] # assert orths(keys.conjuncts) == ['wallet', 'phone', 'keys']
assert orths(keys.conjuncts) == ['wallet', 'phone', 'keys'] # wallet = tokens[3]
wallet = tokens[3] # assert orths(wallet.conjuncts) == ['wallet', 'phone', 'keys']
assert orths(wallet.conjuncts) == ['wallet', 'phone', 'keys']
# This is failing due to parse errors # This is failing due to parse errors

View File

@ -1,11 +1,6 @@
from spacy.en import English
def test_simple_types(EN):
nlp = English() tokens = EN(u'Mr. Best flew to New York on Saturday morning.')
def test_simple_types():
tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
ents = list(tokens.ents) ents = list(tokens.ents)
assert ents[0].start == 1 assert ents[0].start == 1
assert ents[0].end == 2 assert ents[0].end == 2

View File

@ -1,11 +1,7 @@
from spacy.en import English
import pytest import pytest
NLP = English()
def test_root(EN):
def test_root(): tokens = EN(u"i don't have other assistance")
tokens = NLP(u"i don't have other assistance")
for t in tokens: for t in tokens:
assert t.dep != 0, t.orth_ assert t.dep != 0, t.orth_

View File

@ -2,8 +2,6 @@ from __future__ import unicode_literals
from os import path from os import path
import codecs import codecs
from spacy.en import English
import pytest import pytest
@ -14,13 +12,8 @@ def sun_text():
return text return text
@pytest.fixture def test_consistency(EN, sun_text):
def nlp(): tokens = EN(sun_text)
return English()
def test_consistency(nlp, sun_text):
tokens = nlp(sun_text)
for head in tokens: for head in tokens:
for child in head.lefts: for child in head.lefts:
assert child.head is head assert child.head is head
@ -28,8 +21,8 @@ def test_consistency(nlp, sun_text):
assert child.head is head assert child.head is head
def test_child_consistency(nlp, sun_text): def test_child_consistency(EN, sun_text):
tokens = nlp(sun_text) tokens = EN(sun_text)
lefts = {} lefts = {}
rights = {} rights = {}
@ -60,9 +53,9 @@ def test_child_consistency(nlp, sun_text):
assert not children assert not children
def test_edges(nlp): def test_edges(EN):
sun_text = u"Chemically, about three quarters of the Sun's mass consists of hydrogen, while the rest is mostly helium." sun_text = u"Chemically, about three quarters of the Sun's mass consists of hydrogen, while the rest is mostly helium."
tokens = nlp(sun_text) tokens = EN(sun_text)
for token in tokens: for token in tokens:
subtree = list(token.subtree) subtree = list(token.subtree)
debug = '\t'.join((token.orth_, token.left_edge.orth_, subtree[0].orth_)) debug = '\t'.join((token.orth_, token.left_edge.orth_, subtree[0].orth_))

View File

@ -1,14 +1,8 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from spacy.en import English
import pytest import pytest
@pytest.fixture
def EN():
return English()
def test_single_period(EN): def test_single_period(EN):
string = 'A test sentence.' string = 'A test sentence.'
words = EN(string) words = EN(string)

View File

@ -1,10 +1,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from spacy.en import English
EN = English()
def test_subtrees(): def test_subtrees(EN):
sent = EN('The four wheels on the bus turned quickly') sent = EN('The four wheels on the bus turned quickly')
wheels = sent[2] wheels = sent[2]
bus = sent[5] bus = sent[5]

View File

@ -1,14 +1,8 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest
from spacy.en import English def test_merge_tokens(en_nlp):
tokens = en_nlp(u'Los Angeles start.')
NLU = English()
def test_merge_tokens():
tokens = NLU(u'Los Angeles start.')
assert len(tokens) == 4 assert len(tokens) == 4
assert tokens[0].head.orth_ == 'Angeles' assert tokens[0].head.orth_ == 'Angeles'
assert tokens[1].head.orth_ == 'start' assert tokens[1].head.orth_ == 'start'
@ -18,8 +12,8 @@ def test_merge_tokens():
assert tokens[0].head.orth_ == 'start' assert tokens[0].head.orth_ == 'start'
def test_merge_heads(): def test_merge_heads(en_nlp):
tokens = NLU(u'I found a pilates class near work.') tokens = en_nlp(u'I found a pilates class near work.')
assert len(tokens) == 8 assert len(tokens) == 8
tokens.merge(tokens[3].idx, tokens[4].idx + len(tokens[4]), tokens[4].tag_, tokens.merge(tokens[3].idx, tokens[4].idx + len(tokens[4]), tokens[4].tag_,
'pilates class', 'O') 'pilates class', 'O')
@ -34,4 +28,4 @@ def test_merge_heads():
def test_issue_54(): def test_issue_54():
text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).' text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).'
tokens = NLU(text, merge_mwes=True) tokens = en_nlp(text, merge_mwes=True)

View File

@ -1,16 +1,12 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from spacy.en import English
import pytest import pytest
@pytest.fixture @pytest.fixture
def doc(): def doc(en_nlp):
EN = English() return en_nlp('This is a sentence. This is another sentence. And a third.')
return EN('This is a sentence. This is another sentence. And a third.')
def test_sent_spans(doc): def test_sent_spans(doc):

View File

@ -1,13 +1,9 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from spacy.en import English
import pytest import pytest
NLU = English()
def test_am_pm(en_nlp):
def test_am_pm():
numbers = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'] numbers = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
variants = ['a.m.', 'am', 'p.m.', 'pm'] variants = ['a.m.', 'am', 'p.m.', 'pm']
spaces = ['', ' '] spaces = ['', ' ']
@ -15,7 +11,7 @@ def test_am_pm():
for var in variants: for var in variants:
for space in spaces: for space in spaces:
string = u"The meeting was at %s%s%s wasn't it?" % (num, space, var) string = u"The meeting was at %s%s%s wasn't it?" % (num, space, var)
tokens = NLU(string, merge_mwes=True) tokens = en_nlp(string, merge_mwes=True)
assert tokens[4].orth_ == '%s%s%s' % (num, space, var) assert tokens[4].orth_ == '%s%s%s' % (num, space, var)
ents = list(tokens.ents) ents = list(tokens.ents)
assert len(ents) == 1 assert len(ents) == 1

View File

@ -4,9 +4,6 @@ from spacy.en import English
import pytest import pytest
@pytest.fixture
def EN():
return English()
@pytest.fixture @pytest.fixture
def tagged(EN): def tagged(EN):

View File

@ -5,12 +5,6 @@ import pytest
from spacy.en import English from spacy.en import English
@pytest.fixture
def EN():
return English()
@pytest.fixture @pytest.fixture
def morph_exc(): def morph_exc():
return { return {
@ -18,9 +12,11 @@ def morph_exc():
} }
def test_load_exc(EN, morph_exc): def test_load_exc(morph_exc):
EN.tagger.load_morph_exceptions(morph_exc) # Do this local as we want to modify it
tokens = EN('I like his style.', tag=True, parse=False) nlp = English()
nlp.tagger.load_morph_exceptions(morph_exc)
tokens = nlp('I like his style.', tag=True, parse=False)
his = tokens[2] his = tokens[2]
assert his.tag_ == 'PRP$' assert his.tag_ == 'PRP$'
assert his.lemma_ == '-PRP-' assert his.lemma_ == '-PRP-'

View File

@ -2,9 +2,8 @@ from spacy.en import English
import six import six
def test_tag_names(): def test_tag_names(EN):
nlp = English() tokens = EN(u'I ate pizzas with anchovies.', parse=False, tag=True)
tokens = nlp(u'I ate pizzas with anchovies.', parse=True, tag=True)
pizza = tokens[2] pizza = tokens[2]
assert type(pizza.pos) == int assert type(pizza.pos) == int
assert isinstance(pizza.pos_, six.text_type) assert isinstance(pizza.pos_, six.text_type)

View File

@ -2,10 +2,6 @@ import pytest
from spacy.en import English from spacy.en import English
@pytest.fixture(scope="session") @pytest.fixture(scope="module")
def EN():
return English(load_vectors=False)
@pytest.fixture(scope="session")
def en_tokenizer(EN): def en_tokenizer(EN):
return EN.tokenizer return EN.tokenizer

View File

@ -0,0 +1,9 @@
"""Test suspected freeing of strings"""
from __future__ import unicode_literals
def test_one(en_tokenizer):
    """Tokenize two sentences that share a first word and check it each time."""
    first_doc = en_tokenizer('Betty Botter bought a pound of butter.')
    assert first_doc[0].orth_ == 'Betty'
    # first_doc stays referenced while the second sentence is tokenized.
    second_doc = en_tokenizer('Betty also bought a pound of butter.')
    assert second_doc[0].orth_ == 'Betty'

View File

@ -1,6 +1,5 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from spacy.en import English
from spacy.util import utf8open from spacy.util import utf8open
import pytest import pytest
@ -16,8 +15,7 @@ def sun_txt():
return utf8open(loc).read() return utf8open(loc).read()
def test_tokenize(sun_txt): def test_tokenize(sun_txt, EN):
nlp = English()
assert len(sun_txt) != 0 assert len(sun_txt) != 0
tokens = nlp(sun_txt) tokens = nlp(sun_txt)
assert True assert len(tokens) > 100

View File

@ -3,13 +3,10 @@ from __future__ import unicode_literals
import pytest import pytest
from spacy.en import English
from spacy.en import attrs from spacy.en import attrs
EN = English() def test_attr_of_token(EN):
def test_attr_of_token():
text = u'An example sentence.' text = u'An example sentence.'
tokens = EN(text, tag=True, parse=False) tokens = EN(text, tag=True, parse=False)
example = EN.vocab[u'example'] example = EN.vocab[u'example']
@ -18,7 +15,7 @@ def test_attr_of_token():
assert feats_array[0][0] != feats_array[0][1] assert feats_array[0][0] != feats_array[0][1]
def test_tag(): def test_tag(EN):
text = u'A nice sentence.' text = u'A nice sentence.'
tokens = EN(text) tokens = EN(text)
assert tokens[0].tag != tokens[1].tag != tokens[2].tag != tokens[3].tag assert tokens[0].tag != tokens[1].tag != tokens[2].tag != tokens[3].tag
@ -29,7 +26,7 @@ def test_tag():
assert feats_array[3][1] == tokens[3].tag assert feats_array[3][1] == tokens[3].tag
def test_dep(): def test_dep(EN):
text = u'A nice sentence.' text = u'A nice sentence.'
tokens = EN(text) tokens = EN(text)
feats_array = tokens.to_array((attrs.ORTH, attrs.DEP)) feats_array = tokens.to_array((attrs.ORTH, attrs.DEP))

View File

@ -6,16 +6,10 @@ from spacy.en.attrs import IS_STOP
import pytest import pytest
nlp = English()
def test_strings(EN):
@pytest.fixture tokens = EN(u'Give it back! He pleaded.')
def token(): token = tokens[0]
tokens = nlp(u'Give it back! He pleaded.')
return tokens[0]
def test_strings(token):
assert token.orth_ == 'Give' assert token.orth_ == 'Give'
assert token.lower_ == 'give' assert token.lower_ == 'give'
assert token.shape_ == 'Xxxx' assert token.shape_ == 'Xxxx'
@ -27,13 +21,16 @@ def test_strings(token):
assert token.dep_ == 'ROOT' assert token.dep_ == 'ROOT'
def test_flags(token): def test_flags(EN):
tokens = EN(u'Give it back! He pleaded.')
token = tokens[0]
assert token.check_flag(IS_ALPHA) assert token.check_flag(IS_ALPHA)
assert not token.check_flag(IS_DIGIT) assert not token.check_flag(IS_DIGIT)
# TODO: Test more of these, esp. if a bug is found # TODO: Test more of these, esp. if a bug is found
def test_single_token_string(): def test_single_token_string(EN):
nlp = English()
tokens = nlp(u'foobar') tokens = EN(u'foobar')
assert tokens[0].string == 'foobar' assert tokens[0].string == 'foobar'

View File

@ -4,9 +4,11 @@ import gc
from spacy.en import English from spacy.en import English
# Let this have its own instances, as we have to be careful about memory here
# that's the point, after all
def get_orphan_token(text, i): def get_orphan_token(text, i):
nlp = English() nlp = English(load_vectors=False)
tokens = nlp(text) tokens = nlp(text)
gc.collect() gc.collect()
token = tokens[i] token = tokens[i]

View File

@ -1,16 +1,10 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from spacy.en import English
import pytest import pytest
@pytest.fixture def test_getitem(EN):
def tokens(): tokens = EN(u'Give it back! He pleaded.')
nlp = English()
return nlp(u'Give it back! He pleaded.')
def test_getitem(tokens):
assert tokens[0].orth_ == 'Give' assert tokens[0].orth_ == 'Give'
assert tokens[-1].orth_ == '.' assert tokens[-1].orth_ == '.'
with pytest.raises(IndexError): with pytest.raises(IndexError):

View File

@ -5,11 +5,6 @@ from spacy.en import English
import pytest import pytest
@pytest.fixture
def EN():
return English()
def test_vec(EN): def test_vec(EN):
hype = EN.vocab['hype'] hype = EN.vocab['hype']
assert hype.orth_ == 'hype' assert hype.orth_ == 'hype'

View File

@ -1,19 +1,12 @@
import pytest import pytest
from spacy.en import English
def test_range_iter(en_vocab):
for i in range(len(en_vocab)):
lex = en_vocab[i]
@pytest.fixture def test_iter(en_vocab):
def EN():
return English()
def test_range_iter(EN):
for i in range(len(EN.vocab)):
lex = EN.vocab[i]
def test_iter(EN):
i = 0 i = 0
for lex in EN.vocab: for lex in en_vocab:
i += 1 i += 1

View File

@ -2,28 +2,22 @@ from __future__ import unicode_literals
import pytest import pytest
from spacy.en import English
from spacy.en.attrs import * from spacy.en.attrs import *
@pytest.fixture def test_is_alpha(en_vocab):
def EN(): the = en_vocab['the']
return English()
def test_is_alpha(EN):
the = EN.vocab['the']
assert the.flags & (1 << IS_ALPHA) assert the.flags & (1 << IS_ALPHA)
year = EN.vocab['1999'] year = en_vocab['1999']
assert not year.flags & (1 << IS_ALPHA) assert not year.flags & (1 << IS_ALPHA)
mixed = EN.vocab['hello1'] mixed = en_vocab['hello1']
assert not mixed.flags & (1 << IS_ALPHA) assert not mixed.flags & (1 << IS_ALPHA)
def test_is_digit(EN): def test_is_digit(en_vocab):
the = EN.vocab['the'] the = en_vocab['the']
assert not the.flags & (1 << IS_DIGIT) assert not the.flags & (1 << IS_DIGIT)
year = EN.vocab['1999'] year = en_vocab['1999']
assert year.flags & (1 << IS_DIGIT) assert year.flags & (1 << IS_DIGIT)
mixed = EN.vocab['hello1'] mixed = en_vocab['hello1']
assert not mixed.flags & (1 << IS_DIGIT) assert not mixed.flags & (1 << IS_DIGIT)

View File

@ -1,18 +0,0 @@
"""Test suspected freeing of strings"""
from __future__ import unicode_literals
import pytest
from spacy.en import English
@pytest.fixture
def EN():
    """Build and return a new English() instance."""
    pipeline = English()
    return pipeline
def test_one(EN):
    """Process two sentences that share a first word and check it each time."""
    first_doc = EN('Betty Botter bought a pound of butter.')
    assert first_doc[0].orth_ == 'Betty'
    # first_doc stays referenced while the second sentence is processed.
    second_doc = EN('Betty also bought a pound of butter.')
    assert second_doc[0].orth_ == 'Betty'

View File

@ -1,34 +1,27 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest
from spacy.en import English
def test_neq(en_vocab):
addr = en_vocab['Hello']
assert en_vocab['bye'].orth != addr.orth
@pytest.fixture def test_eq(en_vocab):
def EN(): addr = en_vocab['Hello']
return English() assert en_vocab['Hello'].orth == addr.orth
def test_neq(EN): def test_case_neq(en_vocab):
addr = EN.vocab['Hello'] addr = en_vocab['Hello']
assert EN.vocab['bye'].orth != addr.orth assert en_vocab['hello'].orth != addr.orth
def test_eq(EN): def test_punct_neq(en_vocab):
addr = EN.vocab['Hello'] addr = en_vocab['Hello']
assert EN.vocab['Hello'].orth == addr.orth assert en_vocab['Hello,'].orth != addr.orth
def test_case_neq(EN): def test_shape_attr(en_vocab):
addr = EN.vocab['Hello'] example = en_vocab['example']
assert EN.vocab['hello'].orth != addr.orth
def test_punct_neq(EN):
addr = EN.vocab['Hello']
assert EN.vocab['Hello,'].orth != addr.orth
def test_shape_attr(EN):
example = EN.vocab['example']
assert example.orth != example.shape assert example.orth != example.shape