mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 09:56:28 +03:00
add lemma option to displacy 'dep' visualiser (#5041)
* add lemma option to displacy 'dep' visualiser * more compact list comprehension * add option to doc * fix test and add lemmas to util.get_doc * fix capital * remove lemma from get_doc * cleanup
This commit is contained in:
parent
2164e71ea8
commit
479bd8d09f
|
@ -144,10 +144,12 @@ def parse_deps(orig_doc, options={}):
|
||||||
for span, tag, lemma, ent_type in spans:
|
for span, tag, lemma, ent_type in spans:
|
||||||
attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type}
|
attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type}
|
||||||
retokenizer.merge(span, attrs=attrs)
|
retokenizer.merge(span, attrs=attrs)
|
||||||
if options.get("fine_grained"):
|
fine_grained = options.get("fine_grained")
|
||||||
words = [{"text": w.text, "tag": w.tag_} for w in doc]
|
add_lemma = options.get("add_lemma")
|
||||||
else:
|
words = [{"text": w.text,
|
||||||
words = [{"text": w.text, "tag": w.pos_} for w in doc]
|
"tag": w.tag_ if fine_grained else w.pos_,
|
||||||
|
"lemma": w.lemma_ if add_lemma else None} for w in doc]
|
||||||
|
|
||||||
arcs = []
|
arcs = []
|
||||||
for word in doc:
|
for word in doc:
|
||||||
if word.i < word.head.i:
|
if word.i < word.head.i:
|
||||||
|
|
|
@ -3,7 +3,7 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS
|
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS, TPL_ENTS
|
||||||
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
|
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
|
||||||
from ..util import minify_html, escape_html, registry
|
from ..util import minify_html, escape_html, registry
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
|
@ -83,7 +83,7 @@ class DependencyRenderer(object):
|
||||||
self.width = self.offset_x + len(words) * self.distance
|
self.width = self.offset_x + len(words) * self.distance
|
||||||
self.height = self.offset_y + 3 * self.word_spacing
|
self.height = self.offset_y + 3 * self.word_spacing
|
||||||
self.id = render_id
|
self.id = render_id
|
||||||
words = [self.render_word(w["text"], w["tag"], i) for i, w in enumerate(words)]
|
words = [self.render_word(w["text"], w["tag"], w.get("lemma", None), i) for i, w in enumerate(words)]
|
||||||
arcs = [
|
arcs = [
|
||||||
self.render_arrow(a["label"], a["start"], a["end"], a["dir"], i)
|
self.render_arrow(a["label"], a["start"], a["end"], a["dir"], i)
|
||||||
for i, a in enumerate(arcs)
|
for i, a in enumerate(arcs)
|
||||||
|
@ -101,7 +101,7 @@ class DependencyRenderer(object):
|
||||||
lang=self.lang,
|
lang=self.lang,
|
||||||
)
|
)
|
||||||
|
|
||||||
def render_word(self, text, tag, i):
|
def render_word(self, text, tag, lemma, i,):
|
||||||
"""Render individual word.
|
"""Render individual word.
|
||||||
|
|
||||||
text (unicode): Word text.
|
text (unicode): Word text.
|
||||||
|
@ -114,6 +114,8 @@ class DependencyRenderer(object):
|
||||||
if self.direction == "rtl":
|
if self.direction == "rtl":
|
||||||
x = self.width - x
|
x = self.width - x
|
||||||
html_text = escape_html(text)
|
html_text = escape_html(text)
|
||||||
|
if lemma is not None:
|
||||||
|
return TPL_DEP_WORDS_LEMMA.format(text=html_text, tag=tag, lemma=lemma, x=x, y=y)
|
||||||
return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y)
|
return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y)
|
||||||
|
|
||||||
def render_arrow(self, label, start, end, direction, i):
|
def render_arrow(self, label, start, end, direction, i):
|
||||||
|
|
|
@ -18,6 +18,15 @@ TPL_DEP_WORDS = """
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
TPL_DEP_WORDS_LEMMA = """
|
||||||
|
<text class="displacy-token" fill="currentColor" text-anchor="middle" y="{y}">
|
||||||
|
<tspan class="displacy-word" fill="currentColor" x="{x}">{text}</tspan>
|
||||||
|
<tspan class="displacy-lemma" dy="2em" fill="currentColor" x="{x}">{lemma}</tspan>
|
||||||
|
<tspan class="displacy-tag" dy="2em" fill="currentColor" x="{x}">{tag}</tspan>
|
||||||
|
</text>
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
TPL_DEP_ARCS = """
|
TPL_DEP_ARCS = """
|
||||||
<g class="displacy-arrow">
|
<g class="displacy-arrow">
|
||||||
<path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="currentColor"/>
|
<path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="currentColor"/>
|
||||||
|
|
|
@ -31,10 +31,10 @@ def test_displacy_parse_deps(en_vocab):
|
||||||
deps = displacy.parse_deps(doc)
|
deps = displacy.parse_deps(doc)
|
||||||
assert isinstance(deps, dict)
|
assert isinstance(deps, dict)
|
||||||
assert deps["words"] == [
|
assert deps["words"] == [
|
||||||
{"text": "This", "tag": "DET"},
|
{"lemma": None, "text": "This", "tag": "DET"},
|
||||||
{"text": "is", "tag": "AUX"},
|
{"lemma": None, "text": "is", "tag": "AUX"},
|
||||||
{"text": "a", "tag": "DET"},
|
{"lemma": None, "text": "a", "tag": "DET"},
|
||||||
{"text": "sentence", "tag": "NOUN"},
|
{"lemma": None, "text": "sentence", "tag": "NOUN"},
|
||||||
]
|
]
|
||||||
assert deps["arcs"] == [
|
assert deps["arcs"] == [
|
||||||
{"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
|
{"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
|
||||||
|
|
|
@ -239,6 +239,7 @@ If a setting is not present in the options, the default value will be used.
|
||||||
| Name | Type | Description | Default |
|
| Name | Type | Description | Default |
|
||||||
| ------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- |
|
| ------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- |
|
||||||
| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` |
|
| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` |
|
||||||
|
| `add_lemma` | bool | Print the lemmas in a separate row below the token texts in the `dep` visualisation. | `False` |
|
||||||
| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` |
|
| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` |
|
||||||
| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` |
|
| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` |
|
||||||
| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |
|
| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user