From 479bd8d09fd22118463706363e7d23b0578ceea9 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sat, 22 Feb 2020 14:11:51 +0100 Subject: [PATCH] add lemma option to displacy 'dep' visualiser (#5041) * add lemma option to displacy 'dep' visualiser * more compact list comprehension * add option to doc * fix test and add lemmas to util.get_doc * fix capital * remove lemma from get_doc * cleanup --- spacy/displacy/__init__.py | 10 ++++++---- spacy/displacy/render.py | 8 +++++--- spacy/displacy/templates.py | 9 +++++++++ spacy/tests/test_displacy.py | 8 ++++---- website/docs/api/top-level.md | 1 + 5 files changed, 25 insertions(+), 11 deletions(-) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index c17b80aef..e13b0403b 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -144,10 +144,12 @@ def parse_deps(orig_doc, options={}): for span, tag, lemma, ent_type in spans: attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type} retokenizer.merge(span, attrs=attrs) - if options.get("fine_grained"): - words = [{"text": w.text, "tag": w.tag_} for w in doc] - else: - words = [{"text": w.text, "tag": w.pos_} for w in doc] + fine_grained = options.get("fine_grained") + add_lemma = options.get("add_lemma") + words = [{"text": w.text, + "tag": w.tag_ if fine_grained else w.pos_, + "lemma": w.lemma_ if add_lemma else None} for w in doc] + arcs = [] for word in doc: if word.i < word.head.i: diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index d6e33437b..68df324d6 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import uuid -from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS +from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS, TPL_ENTS from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE from ..util import minify_html, escape_html, registry from ..errors 
import Errors @@ -83,7 +83,7 @@ class DependencyRenderer(object): self.width = self.offset_x + len(words) * self.distance self.height = self.offset_y + 3 * self.word_spacing self.id = render_id - words = [self.render_word(w["text"], w["tag"], i) for i, w in enumerate(words)] + words = [self.render_word(w["text"], w["tag"], w.get("lemma", None), i) for i, w in enumerate(words)] arcs = [ self.render_arrow(a["label"], a["start"], a["end"], a["dir"], i) for i, a in enumerate(arcs) @@ -101,7 +101,7 @@ class DependencyRenderer(object): lang=self.lang, ) - def render_word(self, text, tag, i): + def render_word(self, text, tag, lemma, i,): """Render individual word. text (unicode): Word text. @@ -114,6 +114,8 @@ class DependencyRenderer(object): if self.direction == "rtl": x = self.width - x html_text = escape_html(text) + if lemma is not None: + return TPL_DEP_WORDS_LEMMA.format(text=html_text, tag=tag, lemma=lemma, x=x, y=y) return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y) def render_arrow(self, label, start, end, direction, i): diff --git a/spacy/displacy/templates.py b/spacy/displacy/templates.py index ade75d1d6..f29eab86f 100644 --- a/spacy/displacy/templates.py +++ b/spacy/displacy/templates.py @@ -18,6 +18,15 @@ TPL_DEP_WORDS = """ """ +TPL_DEP_WORDS_LEMMA = """ + + {text} + {lemma} + {tag} + +""" + + TPL_DEP_ARCS = """ diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index 2d1f1bd8f..d04c0506f 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -31,10 +31,10 @@ def test_displacy_parse_deps(en_vocab): deps = displacy.parse_deps(doc) assert isinstance(deps, dict) assert deps["words"] == [ - {"text": "This", "tag": "DET"}, - {"text": "is", "tag": "AUX"}, - {"text": "a", "tag": "DET"}, - {"text": "sentence", "tag": "NOUN"}, + {"lemma": None, "text": "This", "tag": "DET"}, + {"lemma": None, "text": "is", "tag": "AUX"}, + {"lemma": None, "text": "a", "tag": "DET"}, + {"lemma": None, "text": "sentence", 
"tag": "NOUN"}, ] assert deps["arcs"] == [ {"start": 0, "end": 1, "label": "nsubj", "dir": "left"}, diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 50ba0e3d9..266df87f0 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -239,6 +239,7 @@ If a setting is not present in the options, the default value will be used. | Name | Type | Description | Default | | ------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- | | `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` | +| `add_lemma` | bool | Print the lemmas in a separate row below the token texts in the `dep` visualisation. | `False` | | `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` | | `collapse_phrases` | bool | Merge noun phrases into one token. | `False` | | `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |