Add displaCy data structures to docs (2) (#12875)

* Add data structures to docs

* Adjusted descriptions for more consistency

* Add _optional_ flag to parameters

* Add tests and adjust optional title key in doc

* Add title to dep visualizations

* fix typo

---------

Co-authored-by: thomashacker <EdwardSchmuhl@web.de>
This commit is contained in:
Sofie Van Landeghem 2023-07-31 10:47:57 +02:00 committed by GitHub
parent 49055ed7c8
commit c9e9dccf79
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 200 additions and 1 deletions

View File

@ -313,6 +313,8 @@ class DependencyRenderer:
self.lang = settings.get("lang", DEFAULT_LANG) self.lang = settings.get("lang", DEFAULT_LANG)
render_id = f"{id_prefix}-{i}" render_id = f"{id_prefix}-{i}"
svg = self.render_svg(render_id, p["words"], p["arcs"]) svg = self.render_svg(render_id, p["words"], p["arcs"])
if p.get("title"):
svg = TPL_TITLE.format(title=p.get("title")) + svg
rendered.append(svg) rendered.append(svg)
if page: if page:
content = "".join([TPL_FIGURE.format(content=svg) for svg in rendered]) content = "".join([TPL_FIGURE.format(content=svg) for svg in rendered])

View File

@ -350,6 +350,78 @@ def test_displacy_render_wrapper(en_vocab):
displacy.set_render_wrapper(lambda html: html) displacy.set_render_wrapper(lambda html: html)
def test_displacy_render_manual_dep():
    """Test displacy.render with manual data for dep style.

    Renders one hand-built dependency parse (words, arcs and a title) with
    ``manual=True`` and checks that every word's text and tag, as well as
    the title, appear in the generated markup.
    """
    parsed_dep = {
        "words": [
            {"text": "This", "tag": "DT"},
            {"text": "is", "tag": "VBZ"},
            {"text": "a", "tag": "DT"},
            {"text": "sentence", "tag": "NN"},
        ],
        "arcs": [
            {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
            {"start": 2, "end": 3, "label": "det", "dir": "left"},
            {"start": 1, "end": 3, "label": "attr", "dir": "right"},
        ],
        "title": "Title",
    }
    html = displacy.render([parsed_dep], style="dep", manual=True)
    for word in parsed_dep["words"]:
        assert word["text"] in html
        assert word["tag"] in html
    # The fixture supplies a title, so verify it is actually rendered;
    # the ent and span tests make the same check for their titled inputs.
    assert parsed_dep["title"] in html
def test_displacy_render_manual_ent():
    """Check that manually supplied entity data renders with style="ent".

    Two documents over the same text are rendered: one with an in-range
    entity and no title, and one whose entity offsets (-100 to 100) lie
    outside the 35-character text and which carries a title. The first
    entity label of each document must appear in the output, as must the
    title whenever one is given.
    """
    text = "But Google is starting from behind."
    docs = [
        {"text": text, "ents": [{"start": 4, "end": 10, "label": "ORG"}]},
        {
            "text": text,
            "ents": [{"start": -100, "end": 100, "label": "COMPANY"}],
            "title": "Title",
        },
    ]
    markup = displacy.render(docs, style="ent", manual=True)
    for doc in docs:
        first_label = doc["ents"][0]["label"]
        assert first_label in markup
        if "title" not in doc:
            continue
        assert doc["title"] in markup
def test_displacy_render_manual_span():
    """Check that manually supplied span data renders with style="span".

    The same tokenised sentence is rendered twice, once without and once
    with a title. The label of each document's first span must appear in
    the output, as must the title whenever one is given.
    """

    def make_doc(**extra):
        # Build fresh lists on every call so the two documents share no
        # mutable state, exactly like two independent literals would.
        doc = {
            "text": "Welcome to the Bank of China.",
            "spans": [
                {"start_token": 3, "end_token": 6, "label": "ORG"},
                {"start_token": 5, "end_token": 6, "label": "GPE"},
            ],
            "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."],
        }
        doc.update(extra)
        return doc

    docs = [make_doc(), make_doc(title="Title")]
    markup = displacy.render(docs, style="span", manual=True)
    for doc in docs:
        first_label = doc["spans"][0]["label"]
        assert first_label in markup
        if "title" not in doc:
            continue
        assert doc["title"] in markup
def test_displacy_options_case(): def test_displacy_options_case():
ents = ["foo", "BAR"] ents = ["foo", "BAR"]
colors = {"FOO": "red", "bar": "green"} colors = {"FOO": "red", "bar": "green"}

View File

@ -343,6 +343,130 @@ use with the `manual=True` argument in `displacy.render`.
| `options` | Span-specific visualisation options. ~~Dict[str, Any]~~ | | `options` | Span-specific visualisation options. ~~Dict[str, Any]~~ |
| **RETURNS** | Generated entities keyed by text (original text) and ents. ~~dict~~ | | **RETURNS** | Generated entities keyed by text (original text) and ents. ~~dict~~ |
### Visualizer data structures {id="displacy_structures"}
You can use displaCy's data format to manually render data. This can be useful
if you want to visualize output from other libraries. You can find examples of
displaCy's different data formats below.
> #### DEP example data structure
>
> ```json
> {
> "words": [
> { "text": "This", "tag": "DT" },
> { "text": "is", "tag": "VBZ" },
> { "text": "a", "tag": "DT" },
> { "text": "sentence", "tag": "NN" }
> ],
> "arcs": [
> { "start": 0, "end": 1, "label": "nsubj", "dir": "left" },
> { "start": 2, "end": 3, "label": "det", "dir": "left" },
> { "start": 1, "end": 3, "label": "attr", "dir": "right" }
> ]
> }
> ```
#### Dependency Visualizer data structure {id="structure-dep"}
| Dictionary Key | Description |
| -------------- | ----------------------------------------------------------------------------------------------------------- |
| `words` | List of dictionaries describing a word token (see structure below). ~~List[Dict[str, Any]]~~ |
| `arcs` | List of dictionaries describing the relations between words (see structure below). ~~List[Dict[str, Any]]~~ |
| _Optional_ | |
| `title` | Title of the visualization. ~~Optional[str]~~ |
| `settings` | Dependency Visualizer options (see [here](/api/top-level#displacy_options)). ~~Dict[str, Any]~~ |
<Accordion title="Words data structure">
| Dictionary Key | Description |
| -------------- | ---------------------------------------- |
| `text` | Text content of the word. ~~str~~ |
| `tag` | Fine-grained part-of-speech. ~~str~~ |
| _Optional_ | |
| `lemma` | Base form of the word. ~~Optional[str]~~ |
</Accordion>
<Accordion title="Arcs data structure">
| Dictionary Key | Description |
| -------------- | ---------------------------------------------------- |
| `start` | The index of the starting token. ~~int~~ |
| `end` | The index of the ending token. ~~int~~ |
| `label` | The type of dependency relation. ~~str~~ |
| `dir` | Direction of the relation (`left`, `right`). ~~str~~ |
</Accordion>
> #### ENT example data structure
>
> ```json
> {
> "text": "But Google is starting from behind.",
> "ents": [{ "start": 4, "end": 10, "label": "ORG" }]
> }
> ```
#### Named Entity Recognition data structure {id="structure-ent"}
| Dictionary Key | Description |
| -------------- | ------------------------------------------------------------------------------------------- |
| `text` | String representation of the document text. ~~str~~ |
| `ents` | List of dictionaries describing entities (see structure below). ~~List[Dict[str, Any]]~~ |
| _Optional_ | |
| `title` | Title of the visualization. ~~Optional[str]~~ |
| `settings` | Entity Visualizer options (see [here](/api/top-level#displacy_options)). ~~Dict[str, Any]~~ |
<Accordion title="Ents data structure">
| Dictionary Key | Description |
| -------------- | ---------------------------------------------------------------------- |
| `start` | The index of the first character of the entity. ~~int~~ |
| `end` | The index one past the last character of the entity (exclusive). ~~int~~ |
| `label` | Label attached to the entity. ~~str~~ |
| _Optional_ | |
| `kb_id` | `KnowledgeBase` ID. ~~str~~ |
| `kb_url` | `KnowledgeBase` URL. ~~str~~ |
</Accordion>
> #### SPAN example data structure
>
> ```json
> {
> "text": "Welcome to the Bank of China.",
> "spans": [
> { "start_token": 3, "end_token": 6, "label": "ORG" },
> { "start_token": 5, "end_token": 6, "label": "GPE" }
> ],
> "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."]
> }
> ```
#### Span Classification data structure {id="structure-span"}
| Dictionary Key | Description |
| -------------- | ----------------------------------------------------------------------------------------- |
| `text` | String representation of the document text. ~~str~~ |
| `spans` | List of dictionaries describing spans (see structure below). ~~List[Dict[str, Any]]~~ |
| `tokens` | List of word tokens. ~~List[str]~~ |
| _Optional_ | |
| `title` | Title of the visualization. ~~Optional[str]~~ |
| `settings` | Span Visualizer options (see [here](/api/top-level#displacy_options)). ~~Dict[str, Any]~~ |
<Accordion title="Spans data structure">
| Dictionary Key | Description |
| -------------- | ------------------------------------------------------------- |
| `start_token` | The index of the first token of the span in `tokens`. ~~int~~ |
| `end_token` | The index one past the last token of the span in `tokens` (exclusive). ~~int~~ |
| `label` | Label attached to the span. ~~str~~ |
| _Optional_ | |
| `kb_id` | `KnowledgeBase` ID. ~~str~~ |
| `kb_url` | `KnowledgeBase` URL. ~~str~~ |
</Accordion>
### Visualizer options {id="displacy_options"} ### Visualizer options {id="displacy_options"}
The `options` argument lets you specify additional settings for each visualizer. The `options` argument lets you specify additional settings for each visualizer.

View File

@ -349,7 +349,8 @@ or
[SyntaxNet](https://github.com/tensorflow/models/tree/master/research/syntaxnet). [SyntaxNet](https://github.com/tensorflow/models/tree/master/research/syntaxnet).
If you set `manual=True` on either `render()` or `serve()`, you can pass in data If you set `manual=True` on either `render()` or `serve()`, you can pass in data
in displaCy's format as a dictionary (instead of `Doc` objects). There are in displaCy's format as a dictionary (instead of `Doc` objects). There are
helper functions for converting `Doc` objects to displaCy's format for use with helper functions for converting `Doc` objects to
[displaCy's format](/api/top-level#displacy_structures) for use with
`manual=True`: [`displacy.parse_deps`](/api/top-level#displacy.parse_deps), `manual=True`: [`displacy.parse_deps`](/api/top-level#displacy.parse_deps),
[`displacy.parse_ents`](/api/top-level#displacy.parse_ents), and [`displacy.parse_ents`](/api/top-level#displacy.parse_ents), and
[`displacy.parse_spans`](/api/top-level#displacy.parse_spans). [`displacy.parse_spans`](/api/top-level#displacy.parse_spans).