From 1be8a4dab305466cc731f1bd9124ae13df274d54 Mon Sep 17 00:00:00 2001 From: Narayan Acharya Date: Mon, 29 Nov 2021 11:13:26 -0500 Subject: [PATCH] Displacy serve entity linking support without `manual=True` support. (#9748) * Add support for kb_id to be displayed via displacy.serve. The current support is only limited to the manual option in displacy.render * Commit to check pre-commit hooks are run. * Update spacy/displacy/__init__.py Co-authored-by: Sofie Van Landeghem * Changes as per suggestions on the PR. * Update website/docs/api/top-level.md Co-authored-by: Sofie Van Landeghem * Update website/docs/api/top-level.md Co-authored-by: Sofie Van Landeghem * tag option as new from 3.2.1 onwards Co-authored-by: Sofie Van Landeghem Co-authored-by: svlandeg --- spacy/displacy/__init__.py | 12 ++++++++++-- spacy/tests/test_displacy.py | 36 +++++++++++++++++++++++++++++++++-- website/docs/api/top-level.md | 26 ++++++++++++++++--------- 3 files changed, 61 insertions(+), 13 deletions(-) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index d9418f675..25d530c83 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -181,11 +181,19 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: def parse_ents(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: """Generate named entities in [{start: i, end: i, label: 'label'}] format. - doc (Doc): Document do parse. + doc (Doc): Document to parse. + options (Dict[str, Any]): NER-specific visualisation options. RETURNS (dict): Generated entities keyed by text (original text) and ents. 
""" + kb_url_template = options.get("kb_url_template", None) ents = [ - {"start": ent.start_char, "end": ent.end_char, "label": ent.label_} + { + "start": ent.start_char, + "end": ent.end_char, + "label": ent.label_, + "kb_id": ent.kb_id_ if ent.kb_id_ else "", + "kb_url": kb_url_template.format(ent.kb_id_) if kb_url_template else "#", + } for ent in doc.ents ] if not ents: diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index 040dd657f..790925888 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -1,8 +1,9 @@ import pytest + from spacy import displacy from spacy.displacy.render import DependencyRenderer, EntityRenderer -from spacy.tokens import Span, Doc from spacy.lang.fa import Persian +from spacy.tokens import Span, Doc def test_displacy_parse_ents(en_vocab): @@ -12,7 +13,38 @@ def test_displacy_parse_ents(en_vocab): ents = displacy.parse_ents(doc) assert isinstance(ents, dict) assert ents["text"] == "But Google is starting from behind " - assert ents["ents"] == [{"start": 4, "end": 10, "label": "ORG"}] + assert ents["ents"] == [ + {"start": 4, "end": 10, "label": "ORG", "kb_id": "", "kb_url": "#"} + ] + + doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"], kb_id="Q95")] + ents = displacy.parse_ents(doc) + assert isinstance(ents, dict) + assert ents["text"] == "But Google is starting from behind " + assert ents["ents"] == [ + {"start": 4, "end": 10, "label": "ORG", "kb_id": "Q95", "kb_url": "#"} + ] + + +def test_displacy_parse_ents_with_kb_id_options(en_vocab): + """Test that named entities with kb_id on a Doc are converted into displaCy's format.""" + doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) + doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"], kb_id="Q95")] + + ents = displacy.parse_ents( + doc, {"kb_url_template": "https://www.wikidata.org/wiki/{}"} + ) + assert isinstance(ents, dict) + assert ents["text"] == "But Google is starting from behind " + 
assert ents["ents"] == [ + { + "start": 4, + "end": 10, + "label": "ORG", + "kb_id": "Q95", + "kb_url": "https://www.wikidata.org/wiki/Q95", + } + ] def test_displacy_parse_deps(en_vocab): diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 4361db4c0..be19f9c3a 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -313,11 +313,12 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="ent", options=options) > ``` -| Name | Description | -| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ | -| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ | -| `template` 2.2 | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](%%GITHUB_SPACY/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ | +| Name | Description | +| ------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ | +| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ | +| `template` 2.2 | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. 
See [`templates.py`](%%GITHUB_SPACY/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ | +| `kb_url_template` 3.2.1 | Optional template to construct the KB URL for the entity to link to. Expects a Python format string (`str.format` style) with a single field to fill in. ~~Optional[str]~~ | By default, displaCy comes with colors for all entity types used by [spaCy's trained pipelines](/models). If you're using custom entity types, you @@ -326,6 +327,14 @@ or pipeline package can also expose a [`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy) to add custom labels and their colors automatically. +By default, displaCy links to `#` for entities without a `kb_id` set on their +span. If you wish to link an entity to its URL then consider using the +`kb_url_template` option from above. For example, if the `kb_id` on a span is +`Q95` and this is a Wikidata identifier then this option can be set to +`https://www.wikidata.org/wiki/{}`. Clicking on your entity in the rendered HTML +should redirect you to its Wikidata page, in this case +`https://www.wikidata.org/wiki/Q95`. + ## registry {#registry source="spacy/util.py" new="3"} spaCy's function registry extends @@ -412,10 +421,10 @@ finished. To log each training step, a and the accuracy scores on the development set. The built-in, default logger is the ConsoleLogger, which prints results to the -console in tabular format. The +console in tabular format. The [spacy-loggers](https://github.com/explosion/spacy-loggers) package, included as -a dependency of spaCy, enables other loggers: currently it provides one that sends -results to a [Weights & Biases](https://www.wandb.com/) dashboard. +a dependency of spaCy, enables other loggers: currently it provides one that +sends results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of using one of the built-in loggers, you can [implement your own](/usage/training#custom-logging). @@ -466,7 +475,6 @@ start decreasing across epochs. 
- ## Readers {#readers} ### File readers {#file-readers source="github.com/explosion/srsly" new="3"}