Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2019-02-15 14:03:54 +01:00
commit 2dbc61bc26

View File

@ -8,60 +8,43 @@ from spacy.tokens import Doc
from ..util import get_doc from ..util import get_doc
def test_doc_split(en_tokenizer): def test_doc_split(en_vocab):
text = "LosAngeles start." words = ["LosAngeles", "start", "."]
heads = [1, 1, 0] heads = [1, 1, 0]
tokens = en_tokenizer(text) doc = get_doc(en_vocab, words=words, heads=heads)
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
assert len(doc) == 3 assert len(doc) == 3
assert len(str(doc)) == 19 assert len(str(doc)) == 19
assert doc[0].head.text == "start" assert doc[0].head.text == "start"
assert doc[1].head.text == "." assert doc[1].head.text == "."
with doc.retokenize() as retokenizer: with doc.retokenize() as retokenizer:
retokenizer.split( attrs = {"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"}
doc[0], retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], attrs=attrs)
["Los", "Angeles"],
[1, 0],
attrs={"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"},
)
assert len(doc) == 4 assert len(doc) == 4
assert doc[0].text == "Los" assert doc[0].text == "Los"
assert doc[0].head.text == "Angeles" assert doc[0].head.text == "Angeles"
assert doc[0].idx == 0 assert doc[0].idx == 0
assert doc[1].idx == 3 assert doc[1].idx == 3
assert doc[1].text == "Angeles" assert doc[1].text == "Angeles"
assert doc[1].head.text == "start" assert doc[1].head.text == "start"
assert doc[2].text == "start" assert doc[2].text == "start"
assert doc[2].head.text == "." assert doc[2].head.text == "."
assert doc[3].text == "." assert doc[3].text == "."
assert doc[3].head.text == "." assert doc[3].head.text == "."
assert len(str(doc)) == 19 assert len(str(doc)) == 19
def test_split_dependencies(en_tokenizer): def test_split_dependencies(en_vocab):
text = "LosAngeles start." doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens])
dep1 = doc.vocab.strings.add("amod") dep1 = doc.vocab.strings.add("amod")
dep2 = doc.vocab.strings.add("subject") dep2 = doc.vocab.strings.add("subject")
with doc.retokenize() as retokenizer: with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], [dep1, dep2]) retokenizer.split(doc[0], ["Los", "Angeles"], [1, 0], [dep1, dep2])
assert doc[0].dep == dep1 assert doc[0].dep == dep1
assert doc[1].dep == dep2 assert doc[1].dep == dep2
def test_split_heads_error(en_tokenizer): def test_split_heads_error(en_vocab):
text = "LosAngeles start." doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens])
# Not enough heads # Not enough heads
with pytest.raises(ValueError): with pytest.raises(ValueError):
with doc.retokenize() as retokenizer: with doc.retokenize() as retokenizer:
@ -83,6 +66,19 @@ def test_split_heads_error(en_tokenizer):
retokenizer.split(doc[0], ["Los", "Angeles"], [0, 0]) retokenizer.split(doc[0], ["Los", "Angeles"], [0, 0])
@pytest.mark.xfail
def test_split_heads_out_of_bounds(en_vocab):
"""Test that the retokenizer raises an error for out-of-bounds heads. The
indices are relative, so head 1 for "Angeles" would be the token following
it, which is out-of-bounds. Previously, the retokenizer would accept this
and spaCy would then fail later.
"""
doc = Doc(en_vocab, words=["Start", "LosAngeles"])
with pytest.raises(ValueError):
with doc.retokenize() as retokenizer:
retokenizer.split(doc[1], ["Los", "Angeles"], [0, 1])
def test_spans_entity_merge_iob(): def test_spans_entity_merge_iob():
# Test entity IOB stays consistent after merging # Test entity IOB stays consistent after merging
words = ["abc", "d", "e"] words = ["abc", "d", "e"]
@ -90,7 +86,6 @@ def test_spans_entity_merge_iob():
doc.ents = [(doc.vocab.strings.add("ent-abcd"), 0, 2)] doc.ents = [(doc.vocab.strings.add("ent-abcd"), 0, 2)]
assert doc[0].ent_iob_ == "B" assert doc[0].ent_iob_ == "B"
assert doc[1].ent_iob_ == "I" assert doc[1].ent_iob_ == "I"
with doc.retokenize() as retokenizer: with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["a", "b", "c"], [1, 1, 0]) retokenizer.split(doc[0], ["a", "b", "c"], [1, 1, 0])
assert doc[0].ent_iob_ == "B" assert doc[0].ent_iob_ == "B"
@ -99,16 +94,15 @@ def test_spans_entity_merge_iob():
assert doc[3].ent_iob_ == "I" assert doc[3].ent_iob_ == "I"
def test_spans_sentence_update_after_merge(en_tokenizer): def test_spans_sentence_update_after_merge(en_vocab):
# fmt: off # fmt: off
text = "StewartLee is a stand up comedian. He lives in England and loves JoePasquale." words = ["StewartLee", "is", "a", "stand", "up", "comedian", ".", "He",
"lives", "in", "England", "and", "loves", "JoePasquale", "."]
heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2] heads = [1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2]
deps = ["nsubj", "ROOT", "det", "amod", "prt", "attr", "punct", "nsubj", deps = ["nsubj", "ROOT", "det", "amod", "prt", "attr", "punct", "nsubj",
"ROOT", "prep", "pobj", "cc", "conj", "compound", "punct"] "ROOT", "prep", "pobj", "cc", "conj", "compound", "punct"]
# fmt: on # fmt: on
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
sent1, sent2 = list(doc.sents) sent1, sent2 = list(doc.sents)
init_len = len(sent1) init_len = len(sent1)
init_len2 = len(sent2) init_len2 = len(sent2)
@ -118,3 +112,17 @@ def test_spans_sentence_update_after_merge(en_tokenizer):
sent1, sent2 = list(doc.sents) sent1, sent2 = list(doc.sents)
assert len(sent1) == init_len + 1 assert len(sent1) == init_len + 1
assert len(sent2) == init_len2 + 1 assert len(sent2) == init_len2 + 1
@pytest.mark.xfail
def test_split_orths_mismatch(en_vocab):
"""Test that the regular retokenizer.split raises an error if the orths
don't match the original token text. There might still be a method that
allows this, but for the default use cases, merging and splitting should
always conform with spaCy's non-destructive tokenization policy. Otherwise,
it can lead to very confusing and unexpected results.
"""
doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
with pytest.raises(ValueError):
with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["L", "A"], [0, -1])