add lemma option to displacy 'dep' visualiser (#5041)

* add lemma option to displacy 'dep' visualiser

* more compact list comprehension

* add option to doc

* fix test and add lemmas to util.get_doc

* fix capital

* remove lemma from get_doc

* cleanup
This commit is contained in:
Sofie Van Landeghem 2020-02-22 14:11:51 +01:00 committed by GitHub
parent 2164e71ea8
commit 479bd8d09f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 25 additions and 11 deletions

View File

@ -144,10 +144,12 @@ def parse_deps(orig_doc, options={}):
     for span, tag, lemma, ent_type in spans:
         attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type}
         retokenizer.merge(span, attrs=attrs)
-    if options.get("fine_grained"):
-        words = [{"text": w.text, "tag": w.tag_} for w in doc]
-    else:
-        words = [{"text": w.text, "tag": w.pos_} for w in doc]
+    fine_grained = options.get("fine_grained")
+    add_lemma = options.get("add_lemma")
+    words = [{"text": w.text,
+              "tag": w.tag_ if fine_grained else w.pos_,
+              "lemma": w.lemma_ if add_lemma else None} for w in doc]
     arcs = []
     for word in doc:
         if word.i < word.head.i:

View File

@ -3,7 +3,7 @@ from __future__ import unicode_literals
 import uuid

-from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS
+from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS, TPL_ENTS
 from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
 from ..util import minify_html, escape_html, registry
 from ..errors import Errors
@ -83,7 +83,7 @@ class DependencyRenderer(object):
         self.width = self.offset_x + len(words) * self.distance
         self.height = self.offset_y + 3 * self.word_spacing
         self.id = render_id
-        words = [self.render_word(w["text"], w["tag"], i) for i, w in enumerate(words)]
+        words = [self.render_word(w["text"], w["tag"], w.get("lemma", None), i) for i, w in enumerate(words)]
         arcs = [
             self.render_arrow(a["label"], a["start"], a["end"], a["dir"], i)
             for i, a in enumerate(arcs)
@ -101,7 +101,7 @@ class DependencyRenderer(object):
             lang=self.lang,
         )

-    def render_word(self, text, tag, i):
+    def render_word(self, text, tag, lemma, i,):
         """Render individual word.

         text (unicode): Word text.
@ -114,6 +114,8 @@ class DependencyRenderer(object):
if self.direction == "rtl": if self.direction == "rtl":
x = self.width - x x = self.width - x
html_text = escape_html(text) html_text = escape_html(text)
if lemma is not None:
return TPL_DEP_WORDS_LEMMA.format(text=html_text, tag=tag, lemma=lemma, x=x, y=y)
return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y) return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y)
def render_arrow(self, label, start, end, direction, i): def render_arrow(self, label, start, end, direction, i):

View File

@ -18,6 +18,15 @@ TPL_DEP_WORDS = """
""" """
TPL_DEP_WORDS_LEMMA = """
<text class="displacy-token" fill="currentColor" text-anchor="middle" y="{y}">
<tspan class="displacy-word" fill="currentColor" x="{x}">{text}</tspan>
<tspan class="displacy-lemma" dy="2em" fill="currentColor" x="{x}">{lemma}</tspan>
<tspan class="displacy-tag" dy="2em" fill="currentColor" x="{x}">{tag}</tspan>
</text>
"""
TPL_DEP_ARCS = """ TPL_DEP_ARCS = """
<g class="displacy-arrow"> <g class="displacy-arrow">
<path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="currentColor"/> <path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="currentColor"/>

View File

@ -31,10 +31,10 @@ def test_displacy_parse_deps(en_vocab):
     deps = displacy.parse_deps(doc)
     assert isinstance(deps, dict)
     assert deps["words"] == [
-        {"text": "This", "tag": "DET"},
-        {"text": "is", "tag": "AUX"},
-        {"text": "a", "tag": "DET"},
-        {"text": "sentence", "tag": "NOUN"},
+        {"lemma": None, "text": "This", "tag": "DET"},
+        {"lemma": None, "text": "is", "tag": "AUX"},
+        {"lemma": None, "text": "a", "tag": "DET"},
+        {"lemma": None, "text": "sentence", "tag": "NOUN"},
     ]
     assert deps["arcs"] == [
         {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},

View File

@ -239,6 +239,7 @@ If a setting is not present in the options, the default value will be used.
 | Name | Type | Description | Default |
 | ------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- |
 | `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` |
+| `add_lemma` | bool | Print the lemmas in a separate row below the token texts in the `dep` visualisation. | `False` |
 | `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` |
 | `collapse_phrases` | bool | Merge noun phrases into one token. | `False` |
 | `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |