Tidy up retokenizer tests

Ines Montani 2019-02-24 14:14:11 +01:00
parent 723e27cb8c
commit d8f69d592f
3 changed files with 90 additions and 93 deletions
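
For orientation: all three files exercise spaCy's Doc.retokenize() context manager, merging or splitting tokens in place. Below is a minimal, self-contained sketch of the merge usage the renamed tests cover; it builds a Doc from a blank Vocab instead of the en_tokenizer fixture used in the suite, so it is an illustration rather than a line from the diff (assumes spaCy v2.1+).

# A minimal sketch of retokenizer.merge (assumes spaCy v2.1+).
# The Doc is built from a blank Vocab, not the en_tokenizer fixture.
from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=["the", "beach", "boys", "all", "night"])
with doc.retokenize() as retokenizer:
    # Merge "the beach boys" into one token and set attributes on it.
    retokenizer.merge(doc[0:3], attrs={"LEMMA": "beach boys"})
assert len(doc) == 3
assert doc[0].text == "the beach boys"
assert doc[0].lemma_ == "beach boys"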

View File

@@ -6,7 +6,6 @@ import pytest
import numpy
from spacy.tokens import Doc
from spacy.vocab import Vocab
-from spacy.attrs import LEMMA
from spacy.errors import ModelsWarning
from ..util import get_doc
@@ -139,81 +138,6 @@ def test_doc_api_set_ents(en_tokenizer):
    assert tokens.ents[0].end == 4
-
-
-def test_doc_api_merge(en_tokenizer):
-    text = "WKRO played songs by the beach boys all night"
-    attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
-    # merge both with bulk merge
-    doc = en_tokenizer(text)
-    assert len(doc) == 9
-    with doc.retokenize() as retokenizer:
-        retokenizer.merge(doc[4:7], attrs=attrs)
-        retokenizer.merge(doc[7:9], attrs=attrs)
-    assert len(doc) == 6
-    assert doc[4].text == "the beach boys"
-    assert doc[4].text_with_ws == "the beach boys "
-    assert doc[4].tag_ == "NAMED"
-    assert doc[5].text == "all night"
-    assert doc[5].text_with_ws == "all night"
-    assert doc[5].tag_ == "NAMED"
-
-
-def test_doc_api_merge_children(en_tokenizer):
-    """Test that attachments work correctly after merging."""
-    text = "WKRO played songs by the beach boys all night"
-    attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
-    doc = en_tokenizer(text)
-    assert len(doc) == 9
-    with doc.retokenize() as retokenizer:
-        retokenizer.merge(doc[4:7], attrs=attrs)
-    for word in doc:
-        if word.i < word.head.i:
-            assert word in list(word.head.lefts)
-        elif word.i > word.head.i:
-            assert word in list(word.head.rights)
-
-
-def test_doc_api_merge_hang(en_tokenizer):
-    text = "through North and South Carolina"
-    doc = en_tokenizer(text)
-    with doc.retokenize() as retokenizer:
-        retokenizer.merge(doc[3:5], attrs={"lemma": "", "ent_type": "ORG"})
-        retokenizer.merge(doc[1:2], attrs={"lemma": "", "ent_type": "ORG"})
-
-
-def test_doc_api_retokenizer(en_tokenizer):
-    doc = en_tokenizer("WKRO played songs by the beach boys all night")
-    with doc.retokenize() as retokenizer:
-        retokenizer.merge(doc[4:7])
-    assert len(doc) == 7
-    assert doc[4].text == "the beach boys"
-
-
-def test_doc_api_retokenizer_attrs(en_tokenizer):
-    doc = en_tokenizer("WKRO played songs by the beach boys all night")
-    # test both string and integer attributes and values
-    attrs = {LEMMA: "boys", "ENT_TYPE": doc.vocab.strings["ORG"]}
-    with doc.retokenize() as retokenizer:
-        retokenizer.merge(doc[4:7], attrs=attrs)
-    assert len(doc) == 7
-    assert doc[4].text == "the beach boys"
-    assert doc[4].lemma_ == "boys"
-    assert doc[4].ent_type_ == "ORG"
-
-
-@pytest.mark.xfail
-def test_doc_api_retokenizer_lex_attrs(en_tokenizer):
-    """Test that lexical attributes can be changed (see #2390)."""
-    doc = en_tokenizer("WKRO played beach boys songs")
-    assert not any(token.is_stop for token in doc)
-    with doc.retokenize() as retokenizer:
-        retokenizer.merge(doc[2:4], attrs={"LEMMA": "boys", "IS_STOP": True})
-    assert doc[2].text == "beach boys"
-    assert doc[2].lemma_ == "boys"
-    assert doc[2].is_stop
-    new_doc = Doc(doc.vocab, words=["beach boys"])
-    assert new_doc[0].is_stop


def test_doc_api_sents_empty_string(en_tokenizer):
    doc = en_tokenizer("")
    doc.is_parsed = True

View File
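
The merge tests moved into this file also check that merge attributes can be given either as string names or as their integer IDs. A minimal sketch of that usage, again built on a blank Vocab and not taken from the diff (assumes spaCy v2.1+; "ORG" is added to the StringStore explicitly here):

# Sketch: merge attrs may use string names or integer IDs from spacy.attrs.
from spacy.attrs import LEMMA
from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=["the", "beach", "boys"])
attrs = {LEMMA: "boys", "ENT_TYPE": doc.vocab.strings.add("ORG")}
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[0:3], attrs=attrs)
assert doc[0].lemma_ == "boys"
assert doc[0].ent_type_ == "ORG"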

@@ -1,14 +1,89 @@
# coding: utf-8
from __future__ import unicode_literals

+import pytest
+from spacy.attrs import LEMMA
from spacy.vocab import Vocab
from spacy.tokens import Doc
-import pytest

from ..util import get_doc


-def test_spans_merge_tokens(en_tokenizer):
+def test_doc_retokenize_merge(en_tokenizer):
+    text = "WKRO played songs by the beach boys all night"
+    attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
+    doc = en_tokenizer(text)
+    assert len(doc) == 9
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[4:7], attrs=attrs)
+        retokenizer.merge(doc[7:9], attrs=attrs)
+    assert len(doc) == 6
+    assert doc[4].text == "the beach boys"
+    assert doc[4].text_with_ws == "the beach boys "
+    assert doc[4].tag_ == "NAMED"
+    assert doc[5].text == "all night"
+    assert doc[5].text_with_ws == "all night"
+    assert doc[5].tag_ == "NAMED"
+
+
+def test_doc_retokenize_merge_children(en_tokenizer):
+    """Test that attachments work correctly after merging."""
+    text = "WKRO played songs by the beach boys all night"
+    attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
+    doc = en_tokenizer(text)
+    assert len(doc) == 9
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[4:7], attrs=attrs)
+    for word in doc:
+        if word.i < word.head.i:
+            assert word in list(word.head.lefts)
+        elif word.i > word.head.i:
+            assert word in list(word.head.rights)
+
+
+def test_doc_retokenize_merge_hang(en_tokenizer):
+    text = "through North and South Carolina"
+    doc = en_tokenizer(text)
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[3:5], attrs={"lemma": "", "ent_type": "ORG"})
+        retokenizer.merge(doc[1:2], attrs={"lemma": "", "ent_type": "ORG"})
+
+
+def test_doc_retokenize_retokenizer(en_tokenizer):
+    doc = en_tokenizer("WKRO played songs by the beach boys all night")
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[4:7])
+    assert len(doc) == 7
+    assert doc[4].text == "the beach boys"
+
+
+def test_doc_retokenize_retokenizer_attrs(en_tokenizer):
+    doc = en_tokenizer("WKRO played songs by the beach boys all night")
+    # test both string and integer attributes and values
+    attrs = {LEMMA: "boys", "ENT_TYPE": doc.vocab.strings["ORG"]}
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[4:7], attrs=attrs)
+    assert len(doc) == 7
+    assert doc[4].text == "the beach boys"
+    assert doc[4].lemma_ == "boys"
+    assert doc[4].ent_type_ == "ORG"
+
+
+@pytest.mark.xfail
+def test_doc_retokenize_lex_attrs(en_tokenizer):
+    """Test that lexical attributes can be changed (see #2390)."""
+    doc = en_tokenizer("WKRO played beach boys songs")
+    assert not any(token.is_stop for token in doc)
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[2:4], attrs={"LEMMA": "boys", "IS_STOP": True})
+    assert doc[2].text == "beach boys"
+    assert doc[2].lemma_ == "boys"
+    assert doc[2].is_stop
+    new_doc = Doc(doc.vocab, words=["beach boys"])
+    assert new_doc[0].is_stop
+
+
+def test_doc_retokenize_spans_merge_tokens(en_tokenizer):
    text = "Los Angeles start."
    heads = [1, 1, 0, -1]
    tokens = en_tokenizer(text)
@@ -25,7 +100,7 @@ def test_spans_merge_tokens(en_tokenizer):
    assert doc[0].ent_type_ == "GPE"


-def test_spans_merge_heads(en_tokenizer):
+def test_doc_retokenize_spans_merge_heads(en_tokenizer):
    text = "I found a pilates class near work."
    heads = [1, 0, 2, 1, -3, -1, -1, -6]
    tokens = en_tokenizer(text)
@@ -43,7 +118,7 @@ def test_spans_merge_heads(en_tokenizer):
    assert doc[5].head.i == 4


-def test_spans_merge_non_disjoint(en_tokenizer):
+def test_doc_retokenize_spans_merge_non_disjoint(en_tokenizer):
    text = "Los Angeles start."
    doc = en_tokenizer(text)
    with pytest.raises(ValueError):
@@ -58,7 +133,7 @@ def test_spans_merge_non_disjoint(en_tokenizer):
            )


-def test_span_np_merges(en_tokenizer):
+def test_doc_retokenize_span_np_merges(en_tokenizer):
    text = "displaCy is a parse tool built with Javascript"
    heads = [1, 0, 2, 1, -3, -1, -1, -1]
    tokens = en_tokenizer(text)
@@ -87,7 +162,7 @@ def test_span_np_merges(en_tokenizer):
            retokenizer.merge(ent)


-def test_spans_entity_merge(en_tokenizer):
+def test_doc_retokenize_spans_entity_merge(en_tokenizer):
    # fmt: off
    text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n"
    heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2, -13, -1]
@@ -108,7 +183,7 @@ def test_spans_entity_merge(en_tokenizer):
    assert len(doc) == 15


-def test_spans_entity_merge_iob():
+def test_doc_retokenize_spans_entity_merge_iob():
    # Test entity IOB stays consistent after merging
    words = ["a", "b", "c", "d", "e"]
    doc = Doc(Vocab(), words=words)
@@ -147,7 +222,7 @@ def test_spans_entity_merge_iob():
    assert doc[4].ent_iob_ == "I"


-def test_spans_sentence_update_after_merge(en_tokenizer):
+def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
    # fmt: off
    text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale."
    heads = [1, 1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2, -7]
@@ -155,7 +230,6 @@ def test_spans_sentence_update_after_merge(en_tokenizer):
            'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
            'compound', 'dobj', 'punct']
    # fmt: on
-
    tokens = en_tokenizer(text)
    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    sent1, sent2 = list(doc.sents)
@@ -169,7 +243,7 @@ def test_spans_sentence_update_after_merge(en_tokenizer):
    assert len(sent2) == init_len2 - 1


-def test_spans_subtree_size_check(en_tokenizer):
+def test_doc_retokenize_spans_subtree_size_check(en_tokenizer):
    # fmt: off
    text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale"
    heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2]
@@ -177,7 +251,6 @@ def test_spans_subtree_size_check(en_tokenizer):
            "nsubj", "relcl", "prep", "pobj", "cc", "conj", "compound",
            "dobj"]
    # fmt: on
-
    tokens = en_tokenizer(text)
    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    sent1 = list(doc.sents)[0]

View File
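
The hunks below rename the split tests. For orientation, a minimal sketch of retokenizer.split, not taken from the diff (assumes spaCy v2.1+; the token texts and heads are illustrative only):

# Sketch: split one token into two subtokens; the orths must join back to the
# original token text, and one head is given per new subtoken.
from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=["LosAngeles", "start", "."])
with doc.retokenize() as retokenizer:
    retokenizer.split(doc[0], ["Los", "Angeles"], heads=[doc[1], doc[1]])
assert len(doc) == 4
assert [t.text for t in doc] == ["Los", "Angeles", "start", "."]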

@@ -8,7 +8,7 @@ from spacy.tokens import Doc
from ..util import get_doc


-def test_doc_split(en_vocab):
+def test_doc_retokenize_split(en_vocab):
    words = ["LosAngeles", "start", "."]
    heads = [1, 1, 0]
    doc = get_doc(en_vocab, words=words, heads=heads)
@@ -41,7 +41,7 @@ def test_doc_split(en_vocab):
    assert len(str(doc)) == 19


-def test_split_dependencies(en_vocab):
+def test_doc_retokenize_split_dependencies(en_vocab):
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    dep1 = doc.vocab.strings.add("amod")
    dep2 = doc.vocab.strings.add("subject")
@@ -56,7 +56,7 @@ def test_split_dependencies(en_vocab):
    assert doc[1].dep == dep2


-def test_split_heads_error(en_vocab):
+def test_doc_retokenize_split_heads_error(en_vocab):
    doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
    # Not enough heads
    with pytest.raises(ValueError):
@@ -69,7 +69,7 @@ def test_split_heads_error(en_vocab):
            retokenizer.split(doc[0], ["Los", "Angeles"], [doc[1], doc[1], doc[1]])


-def test_spans_entity_merge_iob():
+def test_doc_retokenize_spans_entity_split_iob():
    # Test entity IOB stays consistent after merging
    words = ["abc", "d", "e"]
    doc = Doc(Vocab(), words=words)
@@ -84,7 +84,7 @@ def test_spans_entity_merge_iob():
    assert doc[3].ent_iob_ == "I"


-def test_spans_sentence_update_after_merge(en_vocab):
+def test_doc_retokenize_spans_sentence_update_after_split(en_vocab):
    # fmt: off
    words = ["StewartLee", "is", "a", "stand", "up", "comedian", ".", "He",
             "lives", "in", "England", "and", "loves", "JoePasquale", "."]
@@ -114,7 +114,7 @@ def test_spans_sentence_update_after_merge(en_vocab):
    assert len(sent2) == init_len2 + 1


-def test_split_orths_mismatch(en_vocab):
+def test_doc_retokenize_split_orths_mismatch(en_vocab):
    """Test that the regular retokenizer.split raises an error if the orths
    don't match the original token text. There might still be a method that
    allows this, but for the default use cases, merging and splitting should