Add displacy support for overlapping Spans (#10332)

* Fix docstring for EntityRenderer

* Add warning in displacy if doc.spans are empty

* Implement parse_spans converter

One notable change here is that the default spans_key is sc, and
it's set by the user through the options.

* Implement SpanRenderer

Here, I implemented a SpanRenderer that looks similar to the
EntityRenderer except for some templates.  The spans_key, by default, is
set to sc, but can be configured in the options (see parse_spans). The
way I rendered these spans is per-token, i.e., I first check if each
token (1) belongs to a given span type and (2) a starting token of a
given span type. Once I have this information, I render them into the
markup.

* Fix mypy issues on typing

* Add tests for displacy spans support

* Update colors from RGB to hex

Co-authored-by: Ines Montani <ines@ines.io>

* Remove unnecessary CSS properties

* Add documentation for website

* Remove unnecesasry scripts

* Update wording on the documentation

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Put typing dependency on top of file

* Put back z-index so that spans overlap properly

* Make warning more explicit for spans_key

Co-authored-by: Ines Montani <ines@ines.io>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
Lj Miranda 2022-03-17 01:14:34 +08:00 committed by GitHub
parent e021dc6279
commit a79cd3542b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 501 additions and 15 deletions

View File

@ -4,10 +4,10 @@ spaCy's built in visualization suite for dependencies and named entities.
DOCS: https://spacy.io/api/top-level#displacy
USAGE: https://spacy.io/usage/visualizers
"""
from typing import Union, Iterable, Optional, Dict, Any, Callable
from typing import List, Union, Iterable, Optional, Dict, Any, Callable
import warnings
from .render import DependencyRenderer, EntityRenderer
from .render import DependencyRenderer, EntityRenderer, SpanRenderer
from ..tokens import Doc, Span
from ..errors import Errors, Warnings
from ..util import is_in_jupyter
@ -44,6 +44,7 @@ def render(
factories = {
"dep": (DependencyRenderer, parse_deps),
"ent": (EntityRenderer, parse_ents),
"span": (SpanRenderer, parse_spans),
}
if style not in factories:
raise ValueError(Errors.E087.format(style=style))
@ -203,6 +204,42 @@ def parse_ents(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
return {"text": doc.text, "ents": ents, "title": title, "settings": settings}
def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
"""Generate spans in [{start: i, end: i, label: 'label'}] format.
doc (Doc): Document to parse.
options (Dict[str, any]): Span-specific visualisation options.
RETURNS (dict): Generated span types keyed by text (original text) and spans.
"""
kb_url_template = options.get("kb_url_template", None)
spans_key = options.get("spans_key", "sc")
spans = [
{
"start": span.start_char,
"end": span.end_char,
"start_token": span.start,
"end_token": span.end,
"label": span.label_,
"kb_id": span.kb_id_ if span.kb_id_ else "",
"kb_url": kb_url_template.format(span.kb_id_) if kb_url_template else "#",
}
for span in doc.spans[spans_key]
]
tokens = [token.text for token in doc]
if not spans:
warnings.warn(Warnings.W117.format(spans_key=spans_key))
title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
settings = get_doc_settings(doc)
return {
"text": doc.text,
"spans": spans,
"title": title,
"settings": settings,
"tokens": tokens,
}
def set_render_wrapper(func: Callable[[str], str]) -> None:
"""Set an optional wrapper function that is called around the generated
HTML markup on displacy.render. This can be used to allow integration into

View File

@ -1,12 +1,15 @@
from typing import Dict, Any, List, Optional, Union
from typing import Any, Dict, List, Optional, Union
import uuid
import itertools
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
from .templates import TPL_ENTS, TPL_KB_LINK
from ..util import minify_html, escape_html, registry
from ..errors import Errors
from ..util import escape_html, minify_html, registry
from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS
from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS
from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN
from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL
from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS
from .templates import TPL_TITLE
DEFAULT_LANG = "en"
DEFAULT_DIR = "ltr"
@ -33,6 +36,168 @@ DEFAULT_LABEL_COLORS = {
}
class SpanRenderer:
"""Render Spans as SVGs."""
style = "span"
def __init__(self, options: Dict[str, Any] = {}) -> None:
"""Initialise span renderer
options (dict): Visualiser-specific options (colors, spans)
"""
# Set up the colors and overall look
colors = dict(DEFAULT_LABEL_COLORS)
user_colors = registry.displacy_colors.get_all()
for user_color in user_colors.values():
if callable(user_color):
# Since this comes from the function registry, we want to make
# sure we support functions that *return* a dict of colors
user_color = user_color()
if not isinstance(user_color, dict):
raise ValueError(Errors.E925.format(obj=type(user_color)))
colors.update(user_color)
colors.update(options.get("colors", {}))
self.default_color = DEFAULT_ENTITY_COLOR
self.colors = {label.upper(): color for label, color in colors.items()}
# Set up how the text and labels will be rendered
self.direction = DEFAULT_DIR
self.lang = DEFAULT_LANG
self.top_offset = options.get("top_offset", 40)
self.top_offset_step = options.get("top_offset_step", 17)
# Set up which templates will be used
template = options.get("template")
if template:
self.span_template = template["span"]
self.span_slice_template = template["slice"]
self.span_start_template = template["start"]
else:
if self.direction == "rtl":
self.span_template = TPL_SPAN_RTL
self.span_slice_template = TPL_SPAN_SLICE_RTL
self.span_start_template = TPL_SPAN_START_RTL
else:
self.span_template = TPL_SPAN
self.span_slice_template = TPL_SPAN_SLICE
self.span_start_template = TPL_SPAN_START
def render(
self, parsed: List[Dict[str, Any]], page: bool = False, minify: bool = False
) -> str:
"""Render complete markup.
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
RETURNS (str): Rendered HTML markup.
"""
rendered = []
for i, p in enumerate(parsed):
if i == 0:
settings = p.get("settings", {})
self.direction = settings.get("direction", DEFAULT_DIR)
self.lang = settings.get("lang", DEFAULT_LANG)
rendered.append(self.render_spans(p["tokens"], p["spans"], p.get("title")))
if page:
docs = "".join([TPL_FIGURE.format(content=doc) for doc in rendered])
markup = TPL_PAGE.format(content=docs, lang=self.lang, dir=self.direction)
else:
markup = "".join(rendered)
if minify:
return minify_html(markup)
return markup
def render_spans(
self,
tokens: List[str],
spans: List[Dict[str, Any]],
title: Optional[str],
) -> str:
"""Render span types in text.
Spans are rendered per-token, this means that for each token, we check if it's part
of a span slice (a member of a span type) or a span start (the starting token of a
given span type).
tokens (list): Individual tokens in the text
spans (list): Individual entity spans and their start, end, label, kb_id and kb_url.
title (str / None): Document title set in Doc.user_data['title'].
"""
per_token_info = []
for idx, token in enumerate(tokens):
# Identify if a token belongs to a Span (and which) and if it's a
# start token of said Span. We'll use this for the final HTML render
token_markup: Dict[str, Any] = {}
token_markup["text"] = token
entities = []
for span in spans:
ent = {}
if span["start_token"] <= idx < span["end_token"]:
ent["label"] = span["label"]
ent["is_start"] = True if idx == span["start_token"] else False
kb_id = span.get("kb_id", "")
kb_url = span.get("kb_url", "#")
ent["kb_link"] = (
TPL_KB_LINK.format(kb_id=kb_id, kb_url=kb_url) if kb_id else ""
)
entities.append(ent)
token_markup["entities"] = entities
per_token_info.append(token_markup)
markup = self._render_markup(per_token_info)
markup = TPL_SPANS.format(content=markup, dir=self.direction)
if title:
markup = TPL_TITLE.format(title=title) + markup
return markup
def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str:
"""Render the markup from per-token information"""
markup = ""
for token in per_token_info:
entities = sorted(token["entities"], key=lambda d: d["label"])
if entities:
slices = self._get_span_slices(token["entities"])
starts = self._get_span_starts(token["entities"])
markup += self.span_template.format(
text=token["text"], span_slices=slices, span_starts=starts
)
else:
markup += escape_html(token["text"] + " ")
return markup
def _get_span_slices(self, entities: List[Dict]) -> str:
"""Get the rendered markup of all Span slices"""
span_slices = []
for entity, step in zip(entities, itertools.count(step=self.top_offset_step)):
color = self.colors.get(entity["label"].upper(), self.default_color)
span_slice = self.span_slice_template.format(
bg=color, top_offset=self.top_offset + step
)
span_slices.append(span_slice)
return "".join(span_slices)
def _get_span_starts(self, entities: List[Dict]) -> str:
"""Get the rendered markup of all Span start tokens"""
span_starts = []
for entity, step in zip(entities, itertools.count(step=self.top_offset_step)):
color = self.colors.get(entity["label"].upper(), self.default_color)
span_start = (
self.span_start_template.format(
bg=color,
top_offset=self.top_offset + step,
label=entity["label"],
kb_link=entity["kb_link"],
)
if entity["is_start"]
else ""
)
span_starts.append(span_start)
return "".join(span_starts)
class DependencyRenderer:
"""Render dependency parses as SVGs."""
@ -242,7 +407,7 @@ class EntityRenderer:
style = "ent"
def __init__(self, options: Dict[str, Any] = {}) -> None:
"""Initialise dependency renderer.
"""Initialise entity renderer.
options (dict): Visualiser-specific options (colors, ents)
"""

View File

@ -62,6 +62,55 @@ TPL_ENT_RTL = """
</mark>
"""
TPL_SPANS = """
<div class="spans" style="line-height: 2.5; direction: {dir}">{content}</div>
"""
TPL_SPAN = """
<span style="font-weight: bold; display: inline-block; position: relative;">
{text}
{span_slices}
{span_starts}
</span>
"""
TPL_SPAN_SLICE = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
</span>
"""
TPL_SPAN_START = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
<span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
{label}{kb_link}
</span>
</span>
"""
TPL_SPAN_RTL = """
<span style="font-weight: bold; display: inline-block; position: relative;">
{text}
{span_slices}
{span_starts}
</span>
"""
TPL_SPAN_SLICE_RTL = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
</span>
"""
TPL_SPAN_START_RTL = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
<span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
{label}{kb_link}
</span>
</span>
"""
# Important: this needs to start with a space!
TPL_KB_LINK = """
<a style="text-decoration: none; color: inherit; font-weight: normal" href="{kb_url}">{kb_id}</a>

View File

@ -192,6 +192,10 @@ class Warnings(metaclass=ErrorsWithCodes):
W115 = ("Skipping {method}: the floret vector table cannot be modified. "
"Vectors are calculated from character ngrams.")
W116 = ("Unable to clean attribute '{attr}'.")
W117 = ("No spans to visualize found in Doc object with spans_key: '{spans_key}'. If this is "
"surprising to you, make sure the Doc was processed using a model "
"that supports span categorization, and check the `doc.spans[spans_key]` "
"property manually if necessary.")
class Errors(metaclass=ErrorsWithCodes):

View File

@ -96,6 +96,92 @@ def test_issue5838():
assert found == 4
def test_displacy_parse_spans(en_vocab):
"""Test that spans on a Doc are converted into displaCy's format."""
doc = Doc(en_vocab, words=["Welcome", "to", "the", "Bank", "of", "China"])
doc.spans["sc"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")]
spans = displacy.parse_spans(doc)
assert isinstance(spans, dict)
assert spans["text"] == "Welcome to the Bank of China "
assert spans["spans"] == [
{
"start": 15,
"end": 28,
"start_token": 3,
"end_token": 6,
"label": "ORG",
"kb_id": "",
"kb_url": "#",
},
{
"start": 23,
"end": 28,
"start_token": 5,
"end_token": 6,
"label": "GPE",
"kb_id": "",
"kb_url": "#",
},
]
def test_displacy_parse_spans_with_kb_id_options(en_vocab):
"""Test that spans with kb_id on a Doc are converted into displaCy's format"""
doc = Doc(en_vocab, words=["Welcome", "to", "the", "Bank", "of", "China"])
doc.spans["sc"] = [
Span(doc, 3, 6, "ORG", kb_id="Q790068"),
Span(doc, 5, 6, "GPE", kb_id="Q148"),
]
spans = displacy.parse_spans(
doc, {"kb_url_template": "https://wikidata.org/wiki/{}"}
)
assert isinstance(spans, dict)
assert spans["text"] == "Welcome to the Bank of China "
assert spans["spans"] == [
{
"start": 15,
"end": 28,
"start_token": 3,
"end_token": 6,
"label": "ORG",
"kb_id": "Q790068",
"kb_url": "https://wikidata.org/wiki/Q790068",
},
{
"start": 23,
"end": 28,
"start_token": 5,
"end_token": 6,
"label": "GPE",
"kb_id": "Q148",
"kb_url": "https://wikidata.org/wiki/Q148",
},
]
def test_displacy_parse_spans_different_spans_key(en_vocab):
"""Test that spans in a different spans key will be parsed"""
doc = Doc(en_vocab, words=["Welcome", "to", "the", "Bank", "of", "China"])
doc.spans["sc"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")]
doc.spans["custom"] = [Span(doc, 3, 6, "BANK")]
spans = displacy.parse_spans(doc, options={"spans_key": "custom"})
assert isinstance(spans, dict)
assert spans["text"] == "Welcome to the Bank of China "
assert spans["spans"] == [
{
"start": 15,
"end": 28,
"start_token": 3,
"end_token": 6,
"label": "BANK",
"kb_id": "",
"kb_url": "#",
}
]
def test_displacy_parse_ents(en_vocab):
"""Test that named entities on a Doc are converted into displaCy's format."""
doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])

View File

@ -320,12 +320,31 @@ If a setting is not present in the options, the default value will be used.
| `template` <Tag variant="new">2.2</Tag> | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](%%GITHUB_SPACY/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ |
| `kb_url_template` <Tag variant="new">3.2.1</Tag> | Optional template to construct the KB url for the entity to link to. Expects a python f-string format with single field to fill in. ~~Optional[str]~~ |
By default, displaCy comes with colors for all entity types used by
[spaCy's trained pipelines](/models). If you're using custom entity types, you
can use the `colors` setting to add your own colors for them. Your application
or pipeline package can also expose a
[`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy)
to add custom labels and their colors automatically.
#### Span Visualizer options {#displacy_options-span}
> #### Example
>
> ```python
> options = {"spans_key": "sc"}
> displacy.serve(doc, style="span", options=options)
> ```
| Name | Description |
|-----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|
| `spans_key` | Which spans key to render spans from. Default is `"sc"`. ~~str~~ |
| `templates` | Dictionary containing the keys `"span"`, `"slice"`, and `"start"`. These dictate how the overall span, a span slice, and the starting token will be rendered. ~~Optional[Dict[str, str]~~ |
| `kb_url_template` | Optional template to construct the KB url for the entity to link to. Expects a python f-string format with single field to fill in ~~Optional[str]~~ |
| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ |
By default, displaCy comes with colors for all entity types used by [spaCy's
trained pipelines](/models) for both entity and span visualizer. If you're
using custom entity types, you can use the `colors` setting to add your own
colors for them. Your application or pipeline package can also expose a
[`spacy_displacy_colors` entry
point](/usage/saving-loading#entry-points-displacy) to add custom labels and
their colors automatically.
By default, displaCy links to `#` for entities without a `kb_id` set on their
span. If you wish to link an entity to their URL then consider using the
@ -335,6 +354,7 @@ span. If you wish to link an entity to their URL then consider using the
should redirect you to their Wikidata page, in this case
`https://www.wikidata.org/wiki/Q95`.
## registry {#registry source="spacy/util.py" new="3"}
spaCy's function registry extends

View File

@ -0,0 +1,31 @@
<div class="spans"
style="line-height: 2.5; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; font-size: 18px; direction: ltr">
Welcome to the
<span style="font-weight: bold; display: inline-block; position: relative;">
Bank
<span
style="background: #ddd; top: 40px; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
</span>
<span
style="background: #ddd; top: 40px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
<span
style="background: #ddd; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
BANK
</span>
</span>
</span>
<span style="font-weight: bold; display: inline-block; position: relative;">
of
<span
style="background: #ddd; top: 40px; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
</span>
</span>
<span style="font-weight: bold; display: inline-block; position: relative;">
China
<span
style="background: #ddd; top: 40px; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
</span>
</span>
.
</div>

View File

@ -0,0 +1,41 @@
<div class="spans"
style="line-height: 2.5; direction: ltr; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; font-size: 18px">
Welcome to the
<span style="font-weight: bold; display: inline-block; position: relative;">
Bank
<span
style="background: #7aecec; top: 40px; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
</span>
<span
style="background: #7aecec; top: 40px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
<span
style="background: #7aecec; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
ORG
</span>
</span>
</span>
<span style="font-weight: bold; display: inline-block; position: relative;">
of
<span
style="background: #7aecec; top: 40px; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
</span>
</span>
<span style="font-weight: bold; display: inline-block; position: relative;">
China
<span
style="background: #7aecec; top: 40px; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
</span>
<span
style="background: #feca74; top: 57px; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
</span>
<span
style="background: #feca74; top: 57px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
<span
style="background: #feca74; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
GPE
</span>
</span>
</span>
.
</div>

View File

@ -167,6 +167,59 @@ This feature is especially handy if you're using displaCy to compare performance
at different stages of a process, e.g. during training. Here you could use the
title for a brief description of the text example and the number of iterations.
## Visualizing spans {#span}
The span visualizer, `span`, highlights overlapping spans in a text.
```python
### Span example
import spacy
from spacy import displacy
from spacy.tokens import Span
text = "Welcome to the Bank of China."
nlp = spacy.blank("en")
doc = nlp(text)
doc.spans["sc"] = [
Span(doc, 3, 6, "ORG"),
Span(doc, 5, 6, "GPE"),
]
displacy.serve(doc, style="span")
```
import DisplacySpanHtml from 'images/displacy-span.html'
<Iframe title="displaCy visualizer for entities" html={DisplacySpanHtml} height={180} />
The span visualizer lets you customize the following `options`:
| Argument | Description |
|-----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|
| `spans_key` | Which spans key to render spans from. Default is `"sc"`. ~~str~~ |
| `templates` | Dictionary containing the keys `"span"`, `"slice"`, and `"start"`. These dictate how the overall span, a span slice, and the starting token will be rendered. ~~Optional[Dict[str, str]~~ |
| `kb_url_template` | Optional template to construct the KB url for the entity to link to. Expects a python f-string format with single field to fill in ~~Optional[str]~~ |
| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ |
Because spans can be stored across different keys in `doc.spans`, you need to specify
which one displaCy should use with `spans_key` (`sc` is the default).
> #### Options example
>
> ```python
> doc.spans["custom"] = [Span(doc, 3, 6, "BANK")]
> options = {"spans_key": "custom"}
> displacy.serve(doc, style="span", options=options)
import DisplacySpanCustomHtml from 'images/displacy-span-custom.html'
<Iframe title="displaCy visualizer for spans (custom spans_key)" html={DisplacySpanCustomHtml} height={225} />
## Using displaCy in Jupyter notebooks {#jupyter}
displaCy is able to detect whether you're working in a