From d8f69d592fa16b02464dafe0b47e14968fc0d8e4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 24 Feb 2019 14:14:11 +0100 Subject: [PATCH] Tidy up retokenizer tests --- spacy/tests/doc/test_doc_api.py | 76 --------------- ...span_merge.py => test_retokenize_merge.py} | 95 ++++++++++++++++--- ..._doc_split.py => test_retokenize_split.py} | 12 +-- 3 files changed, 90 insertions(+), 93 deletions(-) rename spacy/tests/doc/{test_span_merge.py => test_retokenize_merge.py} (66%) rename spacy/tests/doc/{test_doc_split.py => test_retokenize_split.py} (92%) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 878ecd240..1c3c948c3 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -6,7 +6,6 @@ import pytest import numpy from spacy.tokens import Doc from spacy.vocab import Vocab -from spacy.attrs import LEMMA from spacy.errors import ModelsWarning from ..util import get_doc @@ -139,81 +138,6 @@ def test_doc_api_set_ents(en_tokenizer): assert tokens.ents[0].end == 4 -def test_doc_api_merge(en_tokenizer): - text = "WKRO played songs by the beach boys all night" - attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} - # merge both with bulk merge - doc = en_tokenizer(text) - assert len(doc) == 9 - with doc.retokenize() as retokenizer: - retokenizer.merge(doc[4:7], attrs=attrs) - retokenizer.merge(doc[7:9], attrs=attrs) - assert len(doc) == 6 - assert doc[4].text == "the beach boys" - assert doc[4].text_with_ws == "the beach boys " - assert doc[4].tag_ == "NAMED" - assert doc[5].text == "all night" - assert doc[5].text_with_ws == "all night" - assert doc[5].tag_ == "NAMED" - - -def test_doc_api_merge_children(en_tokenizer): - """Test that attachments work correctly after merging.""" - text = "WKRO played songs by the beach boys all night" - attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} - doc = en_tokenizer(text) - assert len(doc) == 9 - with doc.retokenize() as retokenizer: - retokenizer.merge(doc[4:7], attrs=attrs) - for word in doc: - if word.i < word.head.i: - assert word in list(word.head.lefts) - elif word.i > word.head.i: - assert word in list(word.head.rights) - - -def test_doc_api_merge_hang(en_tokenizer): - text = "through North and South Carolina" - doc = en_tokenizer(text) - with doc.retokenize() as retokenizer: - retokenizer.merge(doc[3:5], attrs={"lemma": "", "ent_type": "ORG"}) - retokenizer.merge(doc[1:2], attrs={"lemma": "", "ent_type": "ORG"}) - - -def test_doc_api_retokenizer(en_tokenizer): - doc = en_tokenizer("WKRO played songs by the beach boys all night") - with doc.retokenize() as retokenizer: - retokenizer.merge(doc[4:7]) - assert len(doc) == 7 - assert doc[4].text == "the beach boys" - - -def test_doc_api_retokenizer_attrs(en_tokenizer): - doc = en_tokenizer("WKRO played songs by the beach boys all night") - # test both string and integer attributes and values - attrs = {LEMMA: "boys", "ENT_TYPE": doc.vocab.strings["ORG"]} - with doc.retokenize() as retokenizer: - retokenizer.merge(doc[4:7], attrs=attrs) - assert len(doc) == 7 - assert doc[4].text == "the beach boys" - assert doc[4].lemma_ == "boys" - assert doc[4].ent_type_ == "ORG" - - -@pytest.mark.xfail -def test_doc_api_retokenizer_lex_attrs(en_tokenizer): - """Test that lexical attributes can be changed (see #2390).""" - doc = en_tokenizer("WKRO played beach boys songs") - assert not any(token.is_stop for token in doc) - with doc.retokenize() as retokenizer: - retokenizer.merge(doc[2:4], attrs={"LEMMA": "boys", "IS_STOP": True}) - assert doc[2].text == "beach boys" - assert doc[2].lemma_ == "boys" - assert doc[2].is_stop - new_doc = Doc(doc.vocab, words=["beach boys"]) - assert new_doc[0].is_stop - - def test_doc_api_sents_empty_string(en_tokenizer): doc = en_tokenizer("") doc.is_parsed = True diff --git a/spacy/tests/doc/test_span_merge.py b/spacy/tests/doc/test_retokenize_merge.py similarity index 66% rename from spacy/tests/doc/test_span_merge.py rename to spacy/tests/doc/test_retokenize_merge.py index 87d475f1f..8c1b2a25a 100644 --- a/spacy/tests/doc/test_span_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -1,14 +1,89 @@ # coding: utf-8 from __future__ import unicode_literals +import pytest +from spacy.attrs import LEMMA from spacy.vocab import Vocab from spacy.tokens import Doc -import pytest from ..util import get_doc -def test_spans_merge_tokens(en_tokenizer): +def test_doc_retokenize_merge(en_tokenizer): + text = "WKRO played songs by the beach boys all night" + attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} + doc = en_tokenizer(text) + assert len(doc) == 9 + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[4:7], attrs=attrs) + retokenizer.merge(doc[7:9], attrs=attrs) + assert len(doc) == 6 + assert doc[4].text == "the beach boys" + assert doc[4].text_with_ws == "the beach boys " + assert doc[4].tag_ == "NAMED" + assert doc[5].text == "all night" + assert doc[5].text_with_ws == "all night" + assert doc[5].tag_ == "NAMED" + + +def test_doc_retokenize_merge_children(en_tokenizer): + """Test that attachments work correctly after merging.""" + text = "WKRO played songs by the beach boys all night" + attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} + doc = en_tokenizer(text) + assert len(doc) == 9 + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[4:7], attrs=attrs) + for word in doc: + if word.i < word.head.i: + assert word in list(word.head.lefts) + elif word.i > word.head.i: + assert word in list(word.head.rights) + + +def test_doc_retokenize_merge_hang(en_tokenizer): + text = "through North and South Carolina" + doc = en_tokenizer(text) + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[3:5], attrs={"lemma": "", "ent_type": "ORG"}) + retokenizer.merge(doc[1:2], attrs={"lemma": "", "ent_type": "ORG"}) + + +def test_doc_retokenize_retokenizer(en_tokenizer): + doc = en_tokenizer("WKRO played songs by the beach boys all night") + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[4:7]) + assert len(doc) == 7 + assert doc[4].text == "the beach boys" + + +def test_doc_retokenize_retokenizer_attrs(en_tokenizer): + doc = en_tokenizer("WKRO played songs by the beach boys all night") + # test both string and integer attributes and values + attrs = {LEMMA: "boys", "ENT_TYPE": doc.vocab.strings["ORG"]} + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[4:7], attrs=attrs) + assert len(doc) == 7 + assert doc[4].text == "the beach boys" + assert doc[4].lemma_ == "boys" + assert doc[4].ent_type_ == "ORG" + + +@pytest.mark.xfail +def test_doc_retokenize_lex_attrs(en_tokenizer): + """Test that lexical attributes can be changed (see #2390).""" + doc = en_tokenizer("WKRO played beach boys songs") + assert not any(token.is_stop for token in doc) + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[2:4], attrs={"LEMMA": "boys", "IS_STOP": True}) + assert doc[2].text == "beach boys" + assert doc[2].lemma_ == "boys" + assert doc[2].is_stop + new_doc = Doc(doc.vocab, words=["beach boys"]) + assert new_doc[0].is_stop + + +def test_doc_retokenize_spans_merge_tokens(en_tokenizer): text = "Los Angeles start." heads = [1, 1, 0, -1] tokens = en_tokenizer(text) @@ -25,7 +100,7 @@ def test_spans_merge_tokens(en_tokenizer): assert doc[0].ent_type_ == "GPE" -def test_spans_merge_heads(en_tokenizer): +def test_doc_retokenize_spans_merge_heads(en_tokenizer): text = "I found a pilates class near work." heads = [1, 0, 2, 1, -3, -1, -1, -6] tokens = en_tokenizer(text) @@ -43,7 +118,7 @@ def test_spans_merge_heads(en_tokenizer): assert doc[5].head.i == 4 -def test_spans_merge_non_disjoint(en_tokenizer): +def test_doc_retokenize_spans_merge_non_disjoint(en_tokenizer): text = "Los Angeles start." doc = en_tokenizer(text) with pytest.raises(ValueError): @@ -58,7 +133,7 @@ def test_spans_merge_non_disjoint(en_tokenizer): ) -def test_span_np_merges(en_tokenizer): +def test_doc_retokenize_span_np_merges(en_tokenizer): text = "displaCy is a parse tool built with Javascript" heads = [1, 0, 2, 1, -3, -1, -1, -1] tokens = en_tokenizer(text) @@ -87,7 +162,7 @@ def test_span_np_merges(en_tokenizer): retokenizer.merge(ent) -def test_spans_entity_merge(en_tokenizer): +def test_doc_retokenize_spans_entity_merge(en_tokenizer): # fmt: off text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n" heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2, -13, -1] @@ -108,7 +183,7 @@ def test_spans_entity_merge(en_tokenizer): assert len(doc) == 15 -def test_spans_entity_merge_iob(): +def test_doc_retokenize_spans_entity_merge_iob(): # Test entity IOB stays consistent after merging words = ["a", "b", "c", "d", "e"] doc = Doc(Vocab(), words=words) @@ -147,7 +222,7 @@ def test_spans_entity_merge_iob(): assert doc[4].ent_iob_ == "I" -def test_spans_sentence_update_after_merge(en_tokenizer): +def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer): # fmt: off text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale." heads = [1, 1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2, -7] @@ -155,7 +230,6 @@ def test_spans_sentence_update_after_merge(en_tokenizer): 'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj', 'compound', 'dobj', 'punct'] # fmt: on - tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) sent1, sent2 = list(doc.sents) @@ -169,7 +243,7 @@ def test_spans_sentence_update_after_merge(en_tokenizer): assert len(sent2) == init_len2 - 1 -def test_spans_subtree_size_check(en_tokenizer): +def test_doc_retokenize_spans_subtree_size_check(en_tokenizer): # fmt: off text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale" heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2] @@ -177,7 +251,6 @@ def test_spans_subtree_size_check(en_tokenizer): "nsubj", "relcl", "prep", "pobj", "cc", "conj", "compound", "dobj"] # fmt: on - tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) sent1 = list(doc.sents)[0] diff --git a/spacy/tests/doc/test_doc_split.py b/spacy/tests/doc/test_retokenize_split.py similarity index 92% rename from spacy/tests/doc/test_doc_split.py rename to spacy/tests/doc/test_retokenize_split.py index 3999aabca..b93a781f7 100644 --- a/spacy/tests/doc/test_doc_split.py +++ b/spacy/tests/doc/test_retokenize_split.py @@ -8,7 +8,7 @@ from spacy.tokens import Doc from ..util import get_doc -def test_doc_split(en_vocab): +def test_doc_retokenize_split(en_vocab): words = ["LosAngeles", "start", "."] heads = [1, 1, 0] doc = get_doc(en_vocab, words=words, heads=heads) @@ -41,7 +41,7 @@ def test_doc_split(en_vocab): assert len(str(doc)) == 19 -def test_split_dependencies(en_vocab): +def test_doc_retokenize_split_dependencies(en_vocab): doc = Doc(en_vocab, words=["LosAngeles", "start", "."]) dep1 = doc.vocab.strings.add("amod") dep2 = doc.vocab.strings.add("subject") @@ -56,7 +56,7 @@ def test_split_dependencies(en_vocab): assert doc[1].dep == dep2 -def test_split_heads_error(en_vocab): +def test_doc_retokenize_split_heads_error(en_vocab): doc = Doc(en_vocab, words=["LosAngeles", "start", "."]) # Not enough heads with pytest.raises(ValueError): @@ -69,7 +69,7 @@ def test_split_heads_error(en_vocab): retokenizer.split(doc[0], ["Los", "Angeles"], [doc[1], doc[1], doc[1]]) -def test_spans_entity_merge_iob(): +def test_doc_retokenize_spans_entity_split_iob(): # Test entity IOB stays consistent after merging words = ["abc", "d", "e"] doc = Doc(Vocab(), words=words) @@ -84,7 +84,7 @@ def test_spans_entity_merge_iob(): assert doc[3].ent_iob_ == "I" -def test_spans_sentence_update_after_merge(en_vocab): +def test_doc_retokenize_spans_sentence_update_after_split(en_vocab): # fmt: off words = ["StewartLee", "is", "a", "stand", "up", "comedian", ".", "He", "lives", "in", "England", "and", "loves", "JoePasquale", "."] @@ -114,7 +114,7 @@ def test_spans_sentence_update_after_merge(en_vocab): assert len(sent2) == init_len2 + 1 -def test_split_orths_mismatch(en_vocab): +def test_doc_retokenize_split_orths_mismatch(en_vocab): """Test that the regular retokenizer.split raises an error if the orths don't match the original token text. There might still be a method that allows this, but for the default use cases, merging and splitting should