Modernise Doc API tests and don't depend on models

Ines Montani 2017-01-11 18:05:36 +01:00
parent 8bf3bb5c44
commit 6e883f4c00


@@ -1,39 +1,37 @@
+# coding: utf-8
 from __future__ import unicode_literals
-from spacy.tokens import Doc
-from spacy.en import English
-import numpy
-from spacy.attrs import HEAD
-from ...vocab import Vocab
-from ...tokens.doc import Doc
+from ..util import get_doc
 import pytest
+import numpy


-def test_tokens_compare_by_string_position():
-    vocab = Vocab()
-    doc = Doc(vocab, [u'one', u'two', u'three'])
+@pytest.mark.parametrize('text', [["one", "two", "three"]])
+def test_doc_api_compare_by_string_position(en_vocab, text):
+    doc = get_doc(en_vocab, text)
     # Get the tokens in this order, so their ID ordering doesn't match the idx
-    three = doc[-1]
-    two = doc[-2]
-    one = doc[-1]
-    one, two, three = doc
-    assert one < two < three
-    assert not one > two
-    assert two > one
-    assert two <= three
-    assert three >= one
+    token3 = doc[-1]
+    token2 = doc[-2]
+    token1 = doc[-1]
+    token1, token2, token3 = doc
+    assert token1 < token2 < token3
+    assert not token1 > token2
+    assert token2 > token1
+    assert token2 <= token3
+    assert token3 >= token1


-@pytest.mark.models
-def test_getitem(EN):
-    tokens = EN(u'Give it back! He pleaded.')
-    assert tokens[0].orth_ == 'Give'
-    assert tokens[-1].orth_ == '.'
+def test_doc_api_getitem(en_tokenizer):
+    text = "Give it back! He pleaded."
+    tokens = en_tokenizer(text)
+    assert tokens[0].text == 'Give'
+    assert tokens[-1].text == '.'
     with pytest.raises(IndexError):
         tokens[len(tokens)]

     def to_str(span):
-        return '/'.join(token.orth_ for token in span)
+        return '/'.join(token.text for token in span)

     span = tokens[1:1]
     assert not to_str(span)
@@ -99,68 +97,54 @@ def test_getitem(EN):
     assert subspan.start == subspan.end == 4 and not to_str(subspan)


-@pytest.mark.models
-def test_serialize(EN):
-    tokens = EN(u'Give it back! He pleaded.')
-    packed = tokens.to_bytes()
-    new_tokens = Doc(EN.vocab).from_bytes(packed)
+@pytest.mark.parametrize('text', ["Give it back! He pleaded.",
+                                  " Give it back! He pleaded. "])
+def test_doc_api_serialize(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    new_tokens = get_doc(tokens.vocab).from_bytes(tokens.to_bytes())
     assert tokens.string == new_tokens.string
-    assert [t.orth_ for t in tokens] == [t.orth_ for t in new_tokens]
+    assert [t.text for t in tokens] == [t.text for t in new_tokens]
     assert [t.orth for t in tokens] == [t.orth for t in new_tokens]


-@pytest.mark.models
-def test_serialize_whitespace(EN):
-    tokens = EN(u' Give it back! He pleaded. ')
-    packed = tokens.to_bytes()
-    new_tokens = Doc(EN.vocab).from_bytes(packed)
-    assert tokens.string == new_tokens.string
-    assert [t.orth_ for t in tokens] == [t.orth_ for t in new_tokens]
-    assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
-
-
-def test_set_ents(EN):
-    tokens = EN.tokenizer(u'I use goggle chrone to surf the web')
+def test_doc_api_set_ents(en_tokenizer):
+    text = "I use goggle chrone to surf the web"
+    tokens = en_tokenizer(text)
     assert len(tokens.ents) == 0
-    tokens.ents = [(EN.vocab.strings['PRODUCT'], 2, 4)]
+    tokens.ents = [(tokens.vocab.strings['PRODUCT'], 2, 4)]
     assert len(list(tokens.ents)) == 1
     assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0]
-    ent = tokens.ents[0]
-    assert ent.label_ == 'PRODUCT'
-    assert ent.start == 2
-    assert ent.end == 4
+    assert tokens.ents[0].label_ == 'PRODUCT'
+    assert tokens.ents[0].start == 2
+    assert tokens.ents[0].end == 4


-def test_merge(EN):
-    doc = EN('WKRO played songs by the beach boys all night')
-    assert len(doc) == 9
+def test_doc_api_merge(en_tokenizer):
+    text = "WKRO played songs by the beach boys all night"
     # merge 'The Beach Boys'
+    doc = en_tokenizer(text)
+    assert len(doc) == 9
     doc.merge(doc[4].idx, doc[6].idx + len(doc[6]), 'NAMED', 'LEMMA', 'TYPE')
     assert len(doc) == 7
     assert doc[4].text == 'the beach boys'
     assert doc[4].text_with_ws == 'the beach boys '
     assert doc[4].tag_ == 'NAMED'

-
-def test_merge_end_string(EN):
-    doc = EN('WKRO played songs by the beach boys all night')
+    # merge 'all night'
+    doc = en_tokenizer(text)
     assert len(doc) == 9
-    # merge 'The Beach Boys'
     doc.merge(doc[7].idx, doc[8].idx + len(doc[8]), 'NAMED', 'LEMMA', 'TYPE')
     assert len(doc) == 8
     assert doc[7].text == 'all night'
     assert doc[7].text_with_ws == 'all night'


-@pytest.mark.models
-def test_merge_children(EN):
+def test_doc_api_merge_children(en_tokenizer):
     """Test that attachments work correctly after merging."""
-    doc = EN('WKRO played songs by the beach boys all night')
-    # merge 'The Beach Boys'
+    text = "WKRO played songs by the beach boys all night"
+    doc = en_tokenizer(text)
+    assert len(doc) == 9
     doc.merge(doc[4].idx, doc[6].idx + len(doc[6]), 'NAMED', 'LEMMA', 'TYPE')
     for word in doc:
@@ -170,28 +154,31 @@ def test_merge_children(EN):
         assert word in list(word.head.rights)


-def test_merge_hang():
-    text = 'through North and South Carolina'
-    EN = English(parser=False)
-    doc = EN(text, tag=True)
-    heads = numpy.asarray([[0, 3, -1, -2, -4]], dtype='int32')
-    doc.from_array([HEAD], heads.T)
+def test_doc_api_merge_hang(en_tokenizer):
+    text = "through North and South Carolina"
+    doc = en_tokenizer(text)
     doc.merge(18, 32, '', '', 'ORG')
     doc.merge(8, 32, '', '', 'ORG')


-def test_sents_empty_string(EN):
-    doc = EN(u'')
+def test_doc_api_sents_empty_string(en_tokenizer):
+    doc = en_tokenizer("")
     doc.is_parsed = True
     sents = list(doc.sents)
     assert len(sents) == 0


-@pytest.mark.models
-def test_runtime_error(EN):
+def test_doc_api_runtime_error(en_tokenizer):
     # Example that caused run-time error while parsing Reddit
-    text = u'67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school'
-    doc = EN(text)
+    text = "67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school"
+    deps = ['nsubj', 'prep', 'amod', 'pobj', 'ROOT', 'amod', 'attr', '',
+            'nummod', 'prep', 'det', 'amod', 'pobj', 'acl', 'prep', 'prep',
+            'pobj', '', 'nummod', 'prep', 'det', 'amod', 'pobj', 'aux', 'neg',
+            'ROOT', 'amod', 'dobj']
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], deps=deps)
     nps = []
     for np in doc.noun_chunks:
         while len(np) > 1 and np[0].dep_ not in ('advmod', 'amod', 'compound'):
@@ -205,20 +192,28 @@ def test_runtime_error(EN):
         doc.merge(*np)


-@pytest.mark.models
-def test_right_edge(EN):
+def test_doc_api_right_edge(en_tokenizer):
     # Test for bug occurring from Unshift action, causing incorrect right edge
-    doc = EN(u'''I have proposed to myself, for the sake of such as live '''
-             u'''under the government of the Romans, to translate those books '''
-             u'''into the Greek tongue.''')
-    token = doc[6]
-    assert token.text == u'for'
-    subtree = [w.text for w in token.subtree]
-    assert subtree == [u'for' , u'the', u'sake', u'of', u'such', u'as', u'live', u'under', u'the', u'government', u'of', u'the', u'Romans', u',']
-    assert token.right_edge.text == u','
+    text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue."
+    heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1,
+             -2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26]
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
+    assert doc[6].text == 'for'
+    subtree = [w.text for w in doc[6].subtree]
+    assert subtree == ['for' , 'the', 'sake', 'of', 'such', 'as', 'live', 'under', 'the', 'government', 'of', 'the', 'Romans', ',']
+    assert doc[6].right_edge.text == ','


-@pytest.mark.vectors
-def test_has_vector(EN):
-    doc = EN(u'''apple orange pear''')
+@pytest.mark.parametrize('text,vectors', [
+    ("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"])
+])
+def test_doc_api_has_vector(en_tokenizer, text_file, text, vectors):
+    text_file.write('\n'.join(vectors))
+    text_file.seek(0)
+    vector_length = en_tokenizer.vocab.load_vectors(text_file)
+    assert vector_length == 3
+    doc = en_tokenizer(text)
     assert doc.has_vector
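
The modernised tests lean on shared test helpers instead of a loaded English model: the get_doc() helper imported from ..util, plus the en_vocab, en_tokenizer and text_file fixtures defined in the suite's conftest. None of those are part of this diff. The sketch below is only a rough reconstruction of what they might look like, inferred from how the tests call them; the fixture bodies and the get_doc() signature are assumptions, not the actual spaCy implementation.

# Rough sketch -- reconstructed from how the tests above use these helpers.
# Fixture internals and the get_doc() signature are assumptions.
from io import StringIO

import numpy
import pytest

from spacy.attrs import HEAD, DEP
from spacy.en import English
from spacy.tokens import Doc


@pytest.fixture
def en_tokenizer():
    # rule-based tokenizer only -- no tagger/parser/NER models are loaded
    return English.Defaults.create_tokenizer()


@pytest.fixture
def en_vocab():
    return English.Defaults.create_vocab()


@pytest.fixture
def text_file():
    # in-memory "file" used by the word-vector test
    return StringIO()


def get_doc(vocab, words=[], heads=None, deps=None):
    """Build a Doc straight from a vocab and a word list, optionally
    attaching dependency annotations, so no statistical model is needed."""
    doc = Doc(vocab, words)
    cols, values = [], []
    if heads is not None:
        cols.append(HEAD)                                    # relative offsets
        values.append(heads)
    if deps is not None:
        cols.append(DEP)                                     # label string IDs
        values.append([vocab.strings[dep] for dep in deps])
    if cols:
        doc.from_array(cols, numpy.asarray(values, dtype='int32').T)
    return doc

With helpers along these lines, a test such as test_doc_api_right_edge can hand-specify the dependency structure it needs rather than wait for a parser model, which is what makes the @pytest.mark.models markers unnecessary.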