mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
add lemma option to displacy 'dep' visualiser (#5041)
* add lemma option to displacy 'dep' visualiser * more compact list comprehension * add option to doc * fix test and add lemmas to util.get_doc * fix capital * remove lemma from get_doc * cleanup
This commit is contained in:
parent
2164e71ea8
commit
479bd8d09f
|
@ -144,10 +144,12 @@ def parse_deps(orig_doc, options={}):
|
|||
for span, tag, lemma, ent_type in spans:
|
||||
attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type}
|
||||
retokenizer.merge(span, attrs=attrs)
|
||||
if options.get("fine_grained"):
|
||||
words = [{"text": w.text, "tag": w.tag_} for w in doc]
|
||||
else:
|
||||
words = [{"text": w.text, "tag": w.pos_} for w in doc]
|
||||
fine_grained = options.get("fine_grained")
|
||||
add_lemma = options.get("add_lemma")
|
||||
words = [{"text": w.text,
|
||||
"tag": w.tag_ if fine_grained else w.pos_,
|
||||
"lemma": w.lemma_ if add_lemma else None} for w in doc]
|
||||
|
||||
arcs = []
|
||||
for word in doc:
|
||||
if word.i < word.head.i:
|
||||
|
|
|
@ -3,7 +3,7 @@ from __future__ import unicode_literals
|
|||
|
||||
import uuid
|
||||
|
||||
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS
|
||||
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS, TPL_ENTS
|
||||
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
|
||||
from ..util import minify_html, escape_html, registry
|
||||
from ..errors import Errors
|
||||
|
@ -83,7 +83,7 @@ class DependencyRenderer(object):
|
|||
self.width = self.offset_x + len(words) * self.distance
|
||||
self.height = self.offset_y + 3 * self.word_spacing
|
||||
self.id = render_id
|
||||
words = [self.render_word(w["text"], w["tag"], i) for i, w in enumerate(words)]
|
||||
words = [self.render_word(w["text"], w["tag"], w.get("lemma", None), i) for i, w in enumerate(words)]
|
||||
arcs = [
|
||||
self.render_arrow(a["label"], a["start"], a["end"], a["dir"], i)
|
||||
for i, a in enumerate(arcs)
|
||||
|
@ -101,7 +101,7 @@ class DependencyRenderer(object):
|
|||
lang=self.lang,
|
||||
)
|
||||
|
||||
def render_word(self, text, tag, i):
|
||||
def render_word(self, text, tag, lemma, i,):
|
||||
"""Render individual word.
|
||||
|
||||
text (unicode): Word text.
|
||||
|
@ -114,6 +114,8 @@ class DependencyRenderer(object):
|
|||
if self.direction == "rtl":
|
||||
x = self.width - x
|
||||
html_text = escape_html(text)
|
||||
if lemma is not None:
|
||||
return TPL_DEP_WORDS_LEMMA.format(text=html_text, tag=tag, lemma=lemma, x=x, y=y)
|
||||
return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y)
|
||||
|
||||
def render_arrow(self, label, start, end, direction, i):
|
||||
|
|
|
@ -18,6 +18,15 @@ TPL_DEP_WORDS = """
|
|||
"""
|
||||
|
||||
|
||||
TPL_DEP_WORDS_LEMMA = """
|
||||
<text class="displacy-token" fill="currentColor" text-anchor="middle" y="{y}">
|
||||
<tspan class="displacy-word" fill="currentColor" x="{x}">{text}</tspan>
|
||||
<tspan class="displacy-lemma" dy="2em" fill="currentColor" x="{x}">{lemma}</tspan>
|
||||
<tspan class="displacy-tag" dy="2em" fill="currentColor" x="{x}">{tag}</tspan>
|
||||
</text>
|
||||
"""
|
||||
|
||||
|
||||
TPL_DEP_ARCS = """
|
||||
<g class="displacy-arrow">
|
||||
<path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="currentColor"/>
|
||||
|
|
|
@ -31,10 +31,10 @@ def test_displacy_parse_deps(en_vocab):
|
|||
deps = displacy.parse_deps(doc)
|
||||
assert isinstance(deps, dict)
|
||||
assert deps["words"] == [
|
||||
{"text": "This", "tag": "DET"},
|
||||
{"text": "is", "tag": "AUX"},
|
||||
{"text": "a", "tag": "DET"},
|
||||
{"text": "sentence", "tag": "NOUN"},
|
||||
{"lemma": None, "text": "This", "tag": "DET"},
|
||||
{"lemma": None, "text": "is", "tag": "AUX"},
|
||||
{"lemma": None, "text": "a", "tag": "DET"},
|
||||
{"lemma": None, "text": "sentence", "tag": "NOUN"},
|
||||
]
|
||||
assert deps["arcs"] == [
|
||||
{"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
|
||||
|
|
|
@ -239,6 +239,7 @@ If a setting is not present in the options, the default value will be used.
|
|||
| Name | Type | Description | Default |
|
||||
| ------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- |
|
||||
| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` |
|
||||
| `add_lemma`        | bool    | Print the lemmas in a separate row below the token texts in the `dep` visualisation.                            | `False`                 |
|
||||
| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` |
|
||||
| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` |
|
||||
| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |
|
||||
|
|
Loading…
Reference in New Issue
Block a user