mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
Tidy up retokenizer tests
This commit is contained in:
parent
723e27cb8c
commit
d8f69d592f
|
@ -6,7 +6,6 @@ import pytest
|
|||
import numpy
|
||||
from spacy.tokens import Doc
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.attrs import LEMMA
|
||||
from spacy.errors import ModelsWarning
|
||||
|
||||
from ..util import get_doc
|
||||
|
@ -139,81 +138,6 @@ def test_doc_api_set_ents(en_tokenizer):
|
|||
assert tokens.ents[0].end == 4
|
||||
|
||||
|
||||
def test_doc_api_merge(en_tokenizer):
|
||||
text = "WKRO played songs by the beach boys all night"
|
||||
attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
|
||||
# merge both with bulk merge
|
||||
doc = en_tokenizer(text)
|
||||
assert len(doc) == 9
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[4:7], attrs=attrs)
|
||||
retokenizer.merge(doc[7:9], attrs=attrs)
|
||||
assert len(doc) == 6
|
||||
assert doc[4].text == "the beach boys"
|
||||
assert doc[4].text_with_ws == "the beach boys "
|
||||
assert doc[4].tag_ == "NAMED"
|
||||
assert doc[5].text == "all night"
|
||||
assert doc[5].text_with_ws == "all night"
|
||||
assert doc[5].tag_ == "NAMED"
|
||||
|
||||
|
||||
def test_doc_api_merge_children(en_tokenizer):
|
||||
"""Test that attachments work correctly after merging."""
|
||||
text = "WKRO played songs by the beach boys all night"
|
||||
attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
|
||||
doc = en_tokenizer(text)
|
||||
assert len(doc) == 9
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[4:7], attrs=attrs)
|
||||
for word in doc:
|
||||
if word.i < word.head.i:
|
||||
assert word in list(word.head.lefts)
|
||||
elif word.i > word.head.i:
|
||||
assert word in list(word.head.rights)
|
||||
|
||||
|
||||
def test_doc_api_merge_hang(en_tokenizer):
|
||||
text = "through North and South Carolina"
|
||||
doc = en_tokenizer(text)
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[3:5], attrs={"lemma": "", "ent_type": "ORG"})
|
||||
retokenizer.merge(doc[1:2], attrs={"lemma": "", "ent_type": "ORG"})
|
||||
|
||||
|
||||
def test_doc_api_retokenizer(en_tokenizer):
|
||||
doc = en_tokenizer("WKRO played songs by the beach boys all night")
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[4:7])
|
||||
assert len(doc) == 7
|
||||
assert doc[4].text == "the beach boys"
|
||||
|
||||
|
||||
def test_doc_api_retokenizer_attrs(en_tokenizer):
|
||||
doc = en_tokenizer("WKRO played songs by the beach boys all night")
|
||||
# test both string and integer attributes and values
|
||||
attrs = {LEMMA: "boys", "ENT_TYPE": doc.vocab.strings["ORG"]}
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[4:7], attrs=attrs)
|
||||
assert len(doc) == 7
|
||||
assert doc[4].text == "the beach boys"
|
||||
assert doc[4].lemma_ == "boys"
|
||||
assert doc[4].ent_type_ == "ORG"
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_doc_api_retokenizer_lex_attrs(en_tokenizer):
|
||||
"""Test that lexical attributes can be changed (see #2390)."""
|
||||
doc = en_tokenizer("WKRO played beach boys songs")
|
||||
assert not any(token.is_stop for token in doc)
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[2:4], attrs={"LEMMA": "boys", "IS_STOP": True})
|
||||
assert doc[2].text == "beach boys"
|
||||
assert doc[2].lemma_ == "boys"
|
||||
assert doc[2].is_stop
|
||||
new_doc = Doc(doc.vocab, words=["beach boys"])
|
||||
assert new_doc[0].is_stop
|
||||
|
||||
|
||||
def test_doc_api_sents_empty_string(en_tokenizer):
|
||||
doc = en_tokenizer("")
|
||||
doc.is_parsed = True
|
||||
|
|
|
@ -1,14 +1,89 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.attrs import LEMMA
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.tokens import Doc
|
||||
import pytest
|
||||
|
||||
from ..util import get_doc
|
||||
|
||||
|
||||
def test_spans_merge_tokens(en_tokenizer):
|
||||
def test_doc_retokenize_merge(en_tokenizer):
|
||||
text = "WKRO played songs by the beach boys all night"
|
||||
attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
|
||||
doc = en_tokenizer(text)
|
||||
assert len(doc) == 9
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[4:7], attrs=attrs)
|
||||
retokenizer.merge(doc[7:9], attrs=attrs)
|
||||
assert len(doc) == 6
|
||||
assert doc[4].text == "the beach boys"
|
||||
assert doc[4].text_with_ws == "the beach boys "
|
||||
assert doc[4].tag_ == "NAMED"
|
||||
assert doc[5].text == "all night"
|
||||
assert doc[5].text_with_ws == "all night"
|
||||
assert doc[5].tag_ == "NAMED"
|
||||
|
||||
|
||||
def test_doc_retokenize_merge_children(en_tokenizer):
|
||||
"""Test that attachments work correctly after merging."""
|
||||
text = "WKRO played songs by the beach boys all night"
|
||||
attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
|
||||
doc = en_tokenizer(text)
|
||||
assert len(doc) == 9
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[4:7], attrs=attrs)
|
||||
for word in doc:
|
||||
if word.i < word.head.i:
|
||||
assert word in list(word.head.lefts)
|
||||
elif word.i > word.head.i:
|
||||
assert word in list(word.head.rights)
|
||||
|
||||
|
||||
def test_doc_retokenize_merge_hang(en_tokenizer):
|
||||
text = "through North and South Carolina"
|
||||
doc = en_tokenizer(text)
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[3:5], attrs={"lemma": "", "ent_type": "ORG"})
|
||||
retokenizer.merge(doc[1:2], attrs={"lemma": "", "ent_type": "ORG"})
|
||||
|
||||
|
||||
def test_doc_retokenize_retokenizer(en_tokenizer):
|
||||
doc = en_tokenizer("WKRO played songs by the beach boys all night")
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[4:7])
|
||||
assert len(doc) == 7
|
||||
assert doc[4].text == "the beach boys"
|
||||
|
||||
|
||||
def test_doc_retokenize_retokenizer_attrs(en_tokenizer):
|
||||
doc = en_tokenizer("WKRO played songs by the beach boys all night")
|
||||
# test both string and integer attributes and values
|
||||
attrs = {LEMMA: "boys", "ENT_TYPE": doc.vocab.strings["ORG"]}
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[4:7], attrs=attrs)
|
||||
assert len(doc) == 7
|
||||
assert doc[4].text == "the beach boys"
|
||||
assert doc[4].lemma_ == "boys"
|
||||
assert doc[4].ent_type_ == "ORG"
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_doc_retokenize_lex_attrs(en_tokenizer):
|
||||
"""Test that lexical attributes can be changed (see #2390)."""
|
||||
doc = en_tokenizer("WKRO played beach boys songs")
|
||||
assert not any(token.is_stop for token in doc)
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[2:4], attrs={"LEMMA": "boys", "IS_STOP": True})
|
||||
assert doc[2].text == "beach boys"
|
||||
assert doc[2].lemma_ == "boys"
|
||||
assert doc[2].is_stop
|
||||
new_doc = Doc(doc.vocab, words=["beach boys"])
|
||||
assert new_doc[0].is_stop
|
||||
|
||||
|
||||
def test_doc_retokenize_spans_merge_tokens(en_tokenizer):
|
||||
text = "Los Angeles start."
|
||||
heads = [1, 1, 0, -1]
|
||||
tokens = en_tokenizer(text)
|
||||
|
@ -25,7 +100,7 @@ def test_spans_merge_tokens(en_tokenizer):
|
|||
assert doc[0].ent_type_ == "GPE"
|
||||
|
||||
|
||||
def test_spans_merge_heads(en_tokenizer):
|
||||
def test_doc_retokenize_spans_merge_heads(en_tokenizer):
|
||||
text = "I found a pilates class near work."
|
||||
heads = [1, 0, 2, 1, -3, -1, -1, -6]
|
||||
tokens = en_tokenizer(text)
|
||||
|
@ -43,7 +118,7 @@ def test_spans_merge_heads(en_tokenizer):
|
|||
assert doc[5].head.i == 4
|
||||
|
||||
|
||||
def test_spans_merge_non_disjoint(en_tokenizer):
|
||||
def test_doc_retokenize_spans_merge_non_disjoint(en_tokenizer):
|
||||
text = "Los Angeles start."
|
||||
doc = en_tokenizer(text)
|
||||
with pytest.raises(ValueError):
|
||||
|
@ -58,7 +133,7 @@ def test_spans_merge_non_disjoint(en_tokenizer):
|
|||
)
|
||||
|
||||
|
||||
def test_span_np_merges(en_tokenizer):
|
||||
def test_doc_retokenize_span_np_merges(en_tokenizer):
|
||||
text = "displaCy is a parse tool built with Javascript"
|
||||
heads = [1, 0, 2, 1, -3, -1, -1, -1]
|
||||
tokens = en_tokenizer(text)
|
||||
|
@ -87,7 +162,7 @@ def test_span_np_merges(en_tokenizer):
|
|||
retokenizer.merge(ent)
|
||||
|
||||
|
||||
def test_spans_entity_merge(en_tokenizer):
|
||||
def test_doc_retokenize_spans_entity_merge(en_tokenizer):
|
||||
# fmt: off
|
||||
text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n"
|
||||
heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2, -13, -1]
|
||||
|
@ -108,7 +183,7 @@ def test_spans_entity_merge(en_tokenizer):
|
|||
assert len(doc) == 15
|
||||
|
||||
|
||||
def test_spans_entity_merge_iob():
|
||||
def test_doc_retokenize_spans_entity_merge_iob():
|
||||
# Test entity IOB stays consistent after merging
|
||||
words = ["a", "b", "c", "d", "e"]
|
||||
doc = Doc(Vocab(), words=words)
|
||||
|
@ -147,7 +222,7 @@ def test_spans_entity_merge_iob():
|
|||
assert doc[4].ent_iob_ == "I"
|
||||
|
||||
|
||||
def test_spans_sentence_update_after_merge(en_tokenizer):
|
||||
def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
|
||||
# fmt: off
|
||||
text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale."
|
||||
heads = [1, 1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2, -7]
|
||||
|
@ -155,7 +230,6 @@ def test_spans_sentence_update_after_merge(en_tokenizer):
|
|||
'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
|
||||
'compound', 'dobj', 'punct']
|
||||
# fmt: on
|
||||
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
sent1, sent2 = list(doc.sents)
|
||||
|
@ -169,7 +243,7 @@ def test_spans_sentence_update_after_merge(en_tokenizer):
|
|||
assert len(sent2) == init_len2 - 1
|
||||
|
||||
|
||||
def test_spans_subtree_size_check(en_tokenizer):
|
||||
def test_doc_retokenize_spans_subtree_size_check(en_tokenizer):
|
||||
# fmt: off
|
||||
text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale"
|
||||
heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2]
|
||||
|
@ -177,7 +251,6 @@ def test_spans_subtree_size_check(en_tokenizer):
|
|||
"nsubj", "relcl", "prep", "pobj", "cc", "conj", "compound",
|
||||
"dobj"]
|
||||
# fmt: on
|
||||
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
sent1 = list(doc.sents)[0]
|
|
@ -8,7 +8,7 @@ from spacy.tokens import Doc
|
|||
from ..util import get_doc
|
||||
|
||||
|
||||
def test_doc_split(en_vocab):
|
||||
def test_doc_retokenize_split(en_vocab):
|
||||
words = ["LosAngeles", "start", "."]
|
||||
heads = [1, 1, 0]
|
||||
doc = get_doc(en_vocab, words=words, heads=heads)
|
||||
|
@ -41,7 +41,7 @@ def test_doc_split(en_vocab):
|
|||
assert len(str(doc)) == 19
|
||||
|
||||
|
||||
def test_split_dependencies(en_vocab):
|
||||
def test_doc_retokenize_split_dependencies(en_vocab):
|
||||
doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
|
||||
dep1 = doc.vocab.strings.add("amod")
|
||||
dep2 = doc.vocab.strings.add("subject")
|
||||
|
@ -56,7 +56,7 @@ def test_split_dependencies(en_vocab):
|
|||
assert doc[1].dep == dep2
|
||||
|
||||
|
||||
def test_split_heads_error(en_vocab):
|
||||
def test_doc_retokenize_split_heads_error(en_vocab):
|
||||
doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
|
||||
# Not enough heads
|
||||
with pytest.raises(ValueError):
|
||||
|
@ -69,7 +69,7 @@ def test_split_heads_error(en_vocab):
|
|||
retokenizer.split(doc[0], ["Los", "Angeles"], [doc[1], doc[1], doc[1]])
|
||||
|
||||
|
||||
def test_spans_entity_merge_iob():
|
||||
def test_doc_retokenize_spans_entity_split_iob():
|
||||
# Test entity IOB stays consistent after merging
|
||||
words = ["abc", "d", "e"]
|
||||
doc = Doc(Vocab(), words=words)
|
||||
|
@ -84,7 +84,7 @@ def test_spans_entity_merge_iob():
|
|||
assert doc[3].ent_iob_ == "I"
|
||||
|
||||
|
||||
def test_spans_sentence_update_after_merge(en_vocab):
|
||||
def test_doc_retokenize_spans_sentence_update_after_split(en_vocab):
|
||||
# fmt: off
|
||||
words = ["StewartLee", "is", "a", "stand", "up", "comedian", ".", "He",
|
||||
"lives", "in", "England", "and", "loves", "JoePasquale", "."]
|
||||
|
@ -114,7 +114,7 @@ def test_spans_sentence_update_after_merge(en_vocab):
|
|||
assert len(sent2) == init_len2 + 1
|
||||
|
||||
|
||||
def test_split_orths_mismatch(en_vocab):
|
||||
def test_doc_retokenize_split_orths_mismatch(en_vocab):
|
||||
"""Test that the regular retokenizer.split raises an error if the orths
|
||||
don't match the original token text. There might still be a method that
|
||||
allows this, but for the default use cases, merging and splitting should
|
Loading…
Reference in New Issue
Block a user