mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
1be8a4dab3
* Add support for kb_id to be displayed via displacy.serve. The current support is only limited to the manual option in displacy.render * Commit to check pre-commit hooks are run. * Update spacy/displacy/__init__.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Changes as per suggestions on the PR. * Update website/docs/api/top-level.md Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update website/docs/api/top-level.md Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * tag option as new from 3.2.1 onwards Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
144 lines
5.4 KiB
Python
144 lines
5.4 KiB
Python
import pytest
|
||
|
||
from spacy import displacy
|
||
from spacy.displacy.render import DependencyRenderer, EntityRenderer
|
||
from spacy.lang.fa import Persian
|
||
from spacy.tokens import Span, Doc
|
||
|
||
|
||
def test_displacy_parse_ents(en_vocab):
    """Check that parse_ents turns a Doc's entities into displaCy's dict format.

    Exercises one entity without a knowledge-base ID and one with kb_id "Q95".
    """
    tokens = ["But", "Google", "is", "starting", "from", "behind"]
    expected_text = "But Google is starting from behind "
    doc = Doc(en_vocab, words=tokens)

    # Entity without a kb_id: the expected record carries an empty kb_id.
    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
    parsed = displacy.parse_ents(doc)
    expected_plain = [
        {"start": 4, "end": 10, "label": "ORG", "kb_id": "", "kb_url": "#"}
    ]
    assert isinstance(parsed, dict)
    assert parsed["text"] == expected_text
    assert parsed["ents"] == expected_plain

    # Same span, now with a kb_id; no URL template is passed, so the
    # expected kb_url is still "#".
    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"], kb_id="Q95")]
    parsed = displacy.parse_ents(doc)
    expected_linked = [
        {"start": 4, "end": 10, "label": "ORG", "kb_id": "Q95", "kb_url": "#"}
    ]
    assert isinstance(parsed, dict)
    assert parsed["text"] == expected_text
    assert parsed["ents"] == expected_linked
|
||
|
||
|
||
def test_displacy_parse_ents_with_kb_id_options(en_vocab):
    """Check that parse_ents fills kb_url from the kb_url_template option."""
    tokens = ["But", "Google", "is", "starting", "from", "behind"]
    doc = Doc(en_vocab, words=tokens)
    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"], kb_id="Q95")]

    options = {"kb_url_template": "https://www.wikidata.org/wiki/{}"}
    parsed = displacy.parse_ents(doc, options)

    # The expected URL is the template with the entity's kb_id substituted in.
    expected_ent = {
        "start": 4,
        "end": 10,
        "label": "ORG",
        "kb_id": "Q95",
        "kb_url": "https://www.wikidata.org/wiki/Q95",
    }
    assert isinstance(parsed, dict)
    assert parsed["text"] == "But Google is starting from behind "
    assert parsed["ents"] == [expected_ent]
|
||
|
||
|
||
def test_displacy_parse_deps(en_vocab):
    """Check that parse_deps converts a parsed Doc into displaCy's words/arcs format."""
    words = ["This", "is", "a", "sentence"]
    heads = [1, 1, 3, 1]
    pos = ["DET", "VERB", "DET", "NOUN"]
    tags = ["DT", "VBZ", "DT", "NN"]
    dep_labels = ["nsubj", "ROOT", "det", "attr"]
    doc = Doc(
        en_vocab, words=words, heads=heads, pos=pos, tags=tags, deps=dep_labels
    )

    parsed = displacy.parse_deps(doc)
    assert isinstance(parsed, dict)

    # The "tag" field of each word is compared against the coarse-grained
    # `pos` values, not the fine-grained `tags`.
    expected_words = [
        {"lemma": None, "text": word, "tag": coarse}
        for word, coarse in zip(words, pos)
    ]
    assert parsed["words"] == expected_words

    expected_arcs = [
        {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
        {"start": 2, "end": 3, "label": "det", "dir": "left"},
        {"start": 1, "end": 3, "label": "attr", "dir": "right"},
    ]
    assert parsed["arcs"] == expected_arcs
|
||
|
||
|
||
def test_displacy_invalid_arcs():
    """Check that DependencyRenderer rejects an arc pointing outside the word list."""
    words = [{"text": "This", "tag": "DET"}, {"text": "is", "tag": "VERB"}]
    valid_arc = {"start": 0, "end": 1, "label": "nsubj", "dir": "left"}
    # Only two words exist (indices 0 and 1), so -1 and 2 are both out of range.
    broken_arc = {"start": -1, "end": 2, "label": "det", "dir": "left"}
    parsed = [{"words": words, "arcs": [valid_arc, broken_arc]}]

    renderer = DependencyRenderer()
    with pytest.raises(ValueError):
        renderer.render(parsed)
|
||
|
||
|
||
def test_displacy_spans(en_vocab):
    """Check that displaCy renders a Span (a slice of a Doc), not only a whole Doc."""
    tokens = ["But", "Google", "is", "starting", "from", "behind"]
    doc = Doc(en_vocab, words=tokens)
    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]

    span = doc[1:4]
    markup = displacy.render(span, style="ent")
    assert markup.startswith("<div")
|
||
|
||
|
||
def test_displacy_raises_for_wrong_type(en_vocab):
    """Check that displacy.render raises ValueError when given a plain string."""
    not_a_doc = "hello world"
    with pytest.raises(ValueError):
        displacy.render(not_a_doc)
|
||
|
||
|
||
def test_displacy_rtl():
    """Check that rendering a Persian Doc emits right-to-left markup in both styles."""
    # Source: http://www.sobhe.ir/hazm/ – is this correct?
    words = ["ما", "بسیار", "کتاب", "می\u200cخوانیم"]
    # These are (likely) wrong, but it's just for testing
    pos = ["PRO", "ADV", "N_PL", "V_SUB"]  # needs to match lang.fa.tag_map
    deps = ["foo", "bar", "foo", "baz"]
    heads = [1, 0, 3, 1]

    nlp = Persian()
    doc = Doc(nlp.vocab, words=words, tags=pos, heads=heads, deps=deps)
    doc.ents = [Span(doc, 1, 3, label="TEST")]
    lang_attr = f'lang="{nlp.lang}"'

    # Dependency view: both the CSS direction and the direction attribute
    # are expected, plus the language attribute on the page.
    dep_page = displacy.render(doc, page=True, style="dep")
    assert "direction: rtl" in dep_page
    assert 'direction="rtl"' in dep_page
    assert lang_attr in dep_page

    # Entity view: only the CSS direction and the language attribute are checked.
    ent_page = displacy.render(doc, page=True, style="ent")
    assert "direction: rtl" in ent_page
    assert lang_attr in ent_page
|
||
|
||
|
||
def test_displacy_render_wrapper(en_vocab):
    """Test that displaCy accepts custom rendering wrapper.

    The wrapper is installed globally through displacy.set_render_wrapper, so
    it is reset in a ``finally`` clause: a failing assertion must not leave
    the "TEST" wrapper active for tests that run afterwards.
    """

    def wrapper(html):
        return "TEST" + html + "TEST"

    displacy.set_render_wrapper(wrapper)
    try:
        doc = Doc(
            en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]
        )
        doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
        html = displacy.render(doc, style="ent")
        assert html.startswith("TEST<div")
        assert html.endswith("/div>TEST")
    finally:
        # Restore: swap in an identity wrapper so later renders are unchanged.
        displacy.set_render_wrapper(lambda html: html)
|
||
|
||
|
||
def test_displacy_options_case():
    """Check that EntityRenderer matches "ents" and "colors" options regardless of case."""
    # Options deliberately mix upper and lower case relative to the span labels.
    options = {"ents": ["foo", "BAR"], "colors": {"FOO": "red", "bar": "green"}}
    renderer = EntityRenderer(options)

    text = "abcd"
    labels = ["foo", "bar", "FOO", "BAR"]
    spans = [
        {"start": pos, "end": pos + 1, "label": labels[pos]}
        for pos in range(len(text))
    ]

    # NOTE(review): the rendered string "abcde" is one character longer than
    # `text`, which is only used to size the spans. Confirm this is intended.
    markup = renderer.render_ents("abcde", spans, None)
    chunks = markup.split("\n\n")

    # Each chunk is expected to hold one entity with its colour and label.
    expected = [("red", "foo"), ("green", "bar"), ("red", "FOO"), ("green", "BAR")]
    for idx, (color, label) in enumerate(expected):
        assert color in chunks[idx] and label in chunks[idx]
|