mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 10:16:27 +03:00)

commit 1d7c18e58a
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
@@ -16,7 +16,7 @@ def load(name, **overrides):
     meta = util.parse_package_meta(model_path)
     if 'lang' not in meta:
         raise IOError('No language setting found in model meta.')
-    cls = util.load_lang_class(meta['lang'])
+    cls = util.get_lang_class(meta['lang'])
     overrides['meta'] = meta
     overrides['path'] = model_path
     return cls(**overrides)

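A minimal sketch of the renamed call above, assuming a spaCy install that includes this change; the meta dict here is hypothetical and stands in for the parsed meta.json:

    from spacy import util

    meta = {'lang': 'en'}                    # hypothetical meta, normally read from meta.json
    cls = util.get_lang_class(meta['lang'])  # replaces the old util.load_lang_class()
    nlp = cls()                              # instantiate the Language subclass
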
@@ -10,9 +10,9 @@ PRON_LEMMA = "-PRON-"


 def depr_model_download(lang):
-    """
-    Replace download modules within en and de with deprecation warning and
-    download default language model (using shortcut).
+    """Replace en/de download modules within, warn and download default models.
+
+    lang (unicode): Language shortcut, 'en' or 'de'.
     """
     prints("The spacy.%s.download command is now deprecated. Please use "
            "python -m spacy download [model name or shortcut] instead. For "

@@ -24,6 +24,12 @@ def depr_model_download(lang):


 def resolve_load_name(name, **overrides):
+    """Resolve model loading if deprecated path kwarg is specified in overrides.
+
+    name (unicode): Name of model to load.
+    **overrides: Overrides specified in spacy.load().
+    RETURNS: Model name or value of path kwarg.
+    """
     if overrides.get('path') not in (None, False, True):
         name = overrides.get('path')
         prints("To load a model from a path, you can now use the first argument. "

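A standalone re-statement of the logic added above, for illustration only; the real helper also prints a deprecation message before returning:

    def resolve_load_name(name, **overrides):
        # a deprecated `path` override wins over the model name
        if overrides.get('path') not in (None, False, True):
            return overrides.get('path')
        return name

    assert resolve_load_name('en') == 'en'
    assert resolve_load_name('en', path='/tmp/my_model') == '/tmp/my_model'
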
new file: spacy/displacy/__init__.py (103 lines)
@@ -0,0 +1,103 @@
# coding: utf8
from __future__ import unicode_literals

from .render import DependencyRenderer, EntityRenderer
from ..tokens import Doc
from ..util import prints


_html = {}


def render(docs, style='dep', page=False, minify=False, jupyter=False, options={}):
    """Render displaCy visualisation.

    docs (list or Doc): Document(s) to visualise.
    style (unicode): Visualisation style, 'dep' or 'ent'.
    page (bool): Render markup as full HTML page.
    minify (bool): Minify HTML markup.
    jupyter (bool): Experimental, use Jupyter's display() to output markup.
    options (dict): Visualiser-specific options, e.g. colors.
    RETURNS (unicode): Rendered HTML markup.
    """
    if isinstance(docs, Doc):
        docs = [docs]
    if style is 'dep':
        renderer = DependencyRenderer(options=options)
        parsed = [parse_deps(doc, options) for doc in docs]
    elif style is 'ent':
        renderer = EntityRenderer(options=options)
        parsed = [parse_ents(doc, options) for doc in docs]
    _html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip()
    html = _html['parsed']
    if jupyter:  # return HTML rendered by IPython display()
        from IPython.core.display import display, HTML
        return display(HTML(html))
    return html


def serve(docs, style='dep', page=True, minify=False, options={}, port=5000):
    """Serve displaCy visualisation.

    docs (list or Doc): Document(s) to visualise.
    style (unicode): Visualisation style, 'dep' or 'ent'.
    page (bool): Render markup as full HTML page.
    minify (bool): Minify HTML markup.
    options (dict): Visualiser-specific options, e.g. colors.
    port (int): Port to serve visualisation.
    """
    from wsgiref import simple_server
    render(docs, style=style, page=page, minify=minify, options=options)
    httpd = simple_server.make_server('0.0.0.0', port, app)
    prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port)
    httpd.serve_forever()


def app(environ, start_response):
    start_response('200 OK', [('Content-type', 'text/html; charset=utf-8')])
    res = _html['parsed'].encode(encoding='utf-8')
    return [res]


def parse_deps(doc, options={}):
    """Generate dependency parse in {'words': [], 'arcs': []} format.

    doc (Doc): Document to parse.
    RETURNS (dict): Generated dependency parse keyed by words and arcs.
    """
    if options.get('collapse_punct', True):
        spans = []
        for word in doc[:-1]:
            if word.is_punct or not word.nbor(1).is_punct:
                continue
            start = word.i
            end = word.i + 1
            while end < len(doc) and doc[end].is_punct:
                end += 1
            span = doc[start : end]
            spans.append((span.start_char, span.end_char, word.tag_,
                          word.lemma_, word.ent_type_))
        for span_props in spans:
            doc.merge(*span_props)
    words = [{'text': w.text, 'tag': w.tag_} for w in doc]
    arcs = []
    for word in doc:
        if word.i < word.head.i:
            arcs.append({'start': word.i, 'end': word.head.i,
                         'label': word.dep_, 'dir': 'left'})
        elif word.i > word.head.i:
            arcs.append({'start': word.head.i, 'end': word.i,
                         'label': word.dep_, 'dir': 'right'})
    return {'words': words, 'arcs': arcs}


def parse_ents(doc, options={}):
    """Generate named entities in [{start: i, end: i, label: 'label'}] format.

    doc (Doc): Document to parse.
    RETURNS (dict): Generated entities keyed by text (original text) and ents.
    """
    ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
            for ent in doc.ents]
    title = doc.user_data.get('title', None) if hasattr(doc, 'user_data') else None
    return {'text': doc.text, 'ents': ents, 'title': title}
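A short usage sketch of the module added above, assuming an 'en' model is installed; it shows the {'words': [...], 'arcs': [...]} structure that parse_deps() hands to the renderer:

    import spacy
    from spacy import displacy

    nlp = spacy.load('en')
    doc = nlp(u'This is a sentence.')

    parsed = displacy.parse_deps(doc)
    print(parsed['words'][0])   # e.g. {'text': 'This', 'tag': 'DT'}
    print(parsed['arcs'][0])    # e.g. {'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'}

    html = displacy.render(doc, style='dep', page=True)  # markup returned as a string
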
new file: spacy/displacy/render.py (217 lines)
@@ -0,0 +1,217 @@
# coding: utf8
from __future__ import unicode_literals

from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS
from .templates import TPL_ENT, TPL_ENTS, TPL_FIGURE, TPL_TITLE, TPL_PAGE
from ..util import minify_html


class DependencyRenderer(object):
    """Render dependency parses as SVGs."""
    style = 'dep'

    def __init__(self, options={}):
        """Initialise dependency renderer.

        options (dict): Visualiser-specific options (compact, word_spacing,
                        arrow_spacing, arrow_width, arrow_stroke, distance,
                        offset_x, color, bg, font)
        """
        self.compact = options.get('compact', False)
        distance, arrow_width = (85, 8) if self.compact else (175, 10)
        self.word_spacing = options.get('word_spacing', 45)
        self.arrow_spacing = options.get('arrow_spacing', 20)
        self.arrow_width = options.get('arrow_width', arrow_width)
        self.arrow_stroke = options.get('arrow_stroke', 2)
        self.distance = options.get('distance', distance)
        self.offset_x = options.get('offset_x', 50)
        self.color = options.get('color', '#000000')
        self.bg = options.get('bg', '#ffffff')
        self.font = options.get('font', 'Arial')

    def render(self, parsed, page=False, minify=False):
        """Render complete markup.

        parsed (list): Dependency parses to render.
        page (bool): Render parses wrapped as full HTML page.
        minify (bool): Minify HTML markup.
        RETURNS (unicode): Rendered SVG or HTML markup.
        """
        rendered = [self.render_svg(i, p['words'], p['arcs'])
                    for i, p in enumerate(parsed)]
        if page:
            content = ''.join([TPL_FIGURE.format(content=svg) for svg in rendered])
            markup = TPL_PAGE.format(content=content)
        else:
            markup = ''.join(rendered)
        if minify:
            return minify_html(markup)
        return markup

    def render_svg(self, render_id, words, arcs):
        """Render SVG.

        render_id (int): Unique ID, typically index of document.
        words (list): Individual words and their tags.
        arcs (list): Individual arcs and their start, end, direction and label.
        RETURNS (unicode): Rendered SVG markup.
        """
        self.levels = self.get_levels(arcs)
        self.highest_level = len(self.levels)
        self.offset_y = self.distance/2*self.highest_level+self.arrow_stroke
        self.width = self.offset_x+len(words)*self.distance
        self.height = self.offset_y+3*self.word_spacing
        self.id = render_id
        words = [self.render_word(w['text'], w['tag'], i)
                 for i, w in enumerate(words)]
        arcs = [self.render_arrow(a['label'], a['start'], a['end'], a['dir'], i)
                for i, a in enumerate(arcs)]
        content = ''.join(words) + ''.join(arcs)
        return TPL_DEP_SVG.format(id=self.id, width=self.width, height=self.height,
                                  color=self.color, bg=self.bg, font=self.font,
                                  content=content)

    def render_word(self, text, tag, i):
        """Render individual word.

        text (unicode): Word text.
        tag (unicode): Part-of-speech tag.
        i (int): Unique ID, typically word index.
        RETURNS (unicode): Rendered SVG markup.
        """
        y = self.offset_y+self.word_spacing
        x = self.offset_x+i*self.distance
        return TPL_DEP_WORDS.format(text=text, tag=tag, x=x, y=y)

    def render_arrow(self, label, start, end, direction, i):
        """Render individual arrow.

        label (unicode): Dependency label.
        start (int): Index of start word.
        end (int): Index of end word.
        direction (unicode): Arrow direction, 'left' or 'right'.
        i (int): Unique ID, typically arrow index.
        RETURNS (unicode): Rendered SVG markup.
        """
        level = self.levels.index(end-start)+1
        x_start = self.offset_x+start*self.distance+self.arrow_spacing
        y = self.offset_y
        x_end = (self.offset_x+(end-start)*self.distance+start*self.distance
                 -self.arrow_spacing*(self.highest_level-level)/4)
        y_curve = self.offset_y-level*self.distance/2
        if y_curve == 0 and len(self.levels) > 5:
            y_curve = -self.distance
        arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
        arc = self.get_arc(x_start, y, y_curve, x_end)
        return TPL_DEP_ARCS.format(id=self.id, i=i, stroke=self.arrow_stroke,
                                   head=arrowhead, label=label, arc=arc)

    def get_arc(self, x_start, y, y_curve, x_end):
        """Render individual arc.

        x_start (int): X-coordinate of arrow start point.
        y (int): Y-coordinate of arrow start and end point.
        y_curve (int): Y-coordinate of Cubic Bézier y_curve point.
        x_end (int): X-coordinate of arrow end point.
        RETURNS (unicode): Definition of the arc path ('d' attribute).
        """
        template = "M{x},{y} C{x},{c} {e},{c} {e},{y}"
        if self.compact:
            template = "M{x},{y} {x},{c} {e},{c} {e},{y}"
        return template.format(x=x_start, y=y, c=y_curve, e=x_end)

    def get_arrowhead(self, direction, x, y, end):
        """Render individual arrow head.

        direction (unicode): Arrow direction, 'left' or 'right'.
        x (int): X-coordinate of arrow start point.
        y (int): Y-coordinate of arrow start and end point.
        end (int): X-coordinate of arrow end point.
        RETURNS (unicode): Definition of the arrow head path ('d' attribute).
        """
        if direction is 'left':
            pos1, pos2, pos3 = (x, x-self.arrow_width+2, x+self.arrow_width-2)
        else:
            pos1, pos2, pos3 = (end, end+self.arrow_width-2, end-self.arrow_width+2)
        arrowhead = (pos1, y+2, pos2, y-self.arrow_width, pos3, y-self.arrow_width)
        return "M{},{} L{},{} {},{}".format(*arrowhead)

    def get_levels(self, arcs):
        """Calculate available arc height "levels".
        Used to calculate arrow heights dynamically and without wasting space.

        arcs (list): Individual arcs and their start, end, direction and label.
        RETURNS (list): Arc levels sorted from lowest to highest.
        """
        levels = set(map(lambda arc: arc['end'] - arc['start'], arcs))
        return sorted(list(levels))


class EntityRenderer(object):
    """Render named entities as HTML."""
    style = 'ent'

    def __init__(self, options={}):
        """Initialise entity renderer.

        options (dict): Visualiser-specific options (colors, ents)
        """
        colors = {'ORG': '#7aecec', 'PRODUCT': '#bfeeb7', 'GPE': '#feca74',
                  'LOC': '#ff9561', 'PERSON': '#aa9cfc', 'NORP': '#c887fb',
                  'FACILITY': '#9cc9cc', 'EVENT': '#ffeb80', 'LANGUAGE': '#ff8197',
                  'WORK_OF_ART': '#f0d0ff', 'DATE': '#bfe1d9', 'TIME': '#bfe1d9',
                  'MONEY': '#e4e7d2', 'QUANTITY': '#e4e7d2', 'ORDINAL': '#e4e7d2',
                  'CARDINAL': '#e4e7d2', 'PERCENT': '#e4e7d2'}
        colors.update(options.get('colors', {}))
        self.default_color = '#ddd'
        self.colors = colors
        self.ents = options.get('ents', None)

    def render(self, parsed, page=False, minify=False):
        """Render complete markup.

        parsed (list): Dependency parses to render.
        page (bool): Render parses wrapped as full HTML page.
        minify (bool): Minify HTML markup.
        RETURNS (unicode): Rendered HTML markup.
        """
        rendered = [self.render_ents(p['text'], p['ents'], p['title']) for p in parsed]
        if page:
            docs = ''.join([TPL_FIGURE.format(content=doc) for doc in rendered])
            markup = TPL_PAGE.format(content=docs)
        else:
            markup = ''.join(rendered)
        if minify:
            return minify_html(markup)
        return markup

    def render_ents(self, text, spans, title):
        """Render entities in text.

        text (unicode): Original text.
        spans (list): Individual entity spans and their start, end and label.
        title (unicode or None): Document title set in Doc.user_data['title'].
        """
        markup = ''
        offset = 0
        for span in spans:
            label = span['label']
            start = span['start']
            end = span['end']
            entity = text[start:end]
            fragments = text[offset:start].split('\n')
            for i, fragment in enumerate(fragments):
                markup += fragment
                if len(fragments) > 1 and i != len(fragments)-1:
                    markup += '</br>'
            if self.ents is None or label.upper() in self.ents:
                color = self.colors.get(label.upper(), self.default_color)
                markup += TPL_ENT.format(label=label, text=entity, bg=color)
            else:
                markup += entity
            offset = end
        markup += text[offset:]
        markup = TPL_ENTS.format(content=markup, colors=self.colors)
        if title:
            markup = TPL_TITLE.format(title=title) + markup
        return markup
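A minimal sketch of driving the renderers above directly with hand-built data in the formats produced by parse_deps() and parse_ents(); the sentence, tags and character offsets are made up for illustration:

    from spacy.displacy.render import DependencyRenderer, EntityRenderer

    parsed = [{'words': [{'text': 'Hello', 'tag': 'UH'},
                         {'text': 'world', 'tag': 'NN'}],
               'arcs': [{'start': 0, 'end': 1, 'label': 'intj', 'dir': 'left'}]}]
    svg = DependencyRenderer(options={'compact': True}).render(parsed, page=False)

    ents = [{'text': 'I work at Acme Corp.',
             'ents': [{'start': 10, 'end': 19, 'label': 'ORG'}],
             'title': None}]
    html = EntityRenderer().render(ents, page=True, minify=True)
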
new file: spacy/displacy/templates.py (63 lines)
@@ -0,0 +1,63 @@
# coding: utf8
from __future__ import unicode_literals


# setting explicit height and max-width: none on the SVG is required for
# Jupyter to render it properly in a cell

TPL_DEP_SVG = """
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" id="{id}" class="displacy" width="{width}" height="{height}" style="max-width: none; height: {height}px; color: {color}; background: {bg}; font-family: {font}">{content}</svg>
"""


TPL_DEP_WORDS = """
<text class="displacy-token" fill="currentColor" text-anchor="middle" y="{y}">
    <tspan class="displacy-word" fill="currentColor" x="{x}">{text}</tspan>
    <tspan class="displacy-tag" dy="2em" fill="currentColor" x="{x}">{tag}</tspan>
</text>
"""


TPL_DEP_ARCS = """
<g class="displacy-arrow">
    <path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="currentColor"/>
    <text dy="1.25em" style="font-size: 0.8em">
        <textPath xlink:href="#arrow-{id}-{i}" class="displacy-label" startOffset="50%" fill="currentColor" text-anchor="middle">{label}</textPath>
    </text>
    <path class="displacy-arrowhead" d="{head}" fill="currentColor"/>
</g>
"""


TPL_FIGURE = """
<figure style="margin-bottom: 6rem">{content}</figure>
"""

TPL_TITLE = """
<h2 style="margin: 0">{title}</h2>
"""


TPL_ENTS = """
<div class="entities" style="line-height: 2.5">{content}</div>
"""


TPL_ENT = """
<mark class="entity" style="background: {bg}; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone">
    {text}
    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">{label}</span>
</mark>
"""


TPL_PAGE = """
<!DOCTYPE html>
<html>
    <head>
        <title>displaCy</title>
    </head>

    <body style="font-size: 16px; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; padding: 4rem 2rem;">{content}</body>
</html>
"""

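The templates above are plain str.format() strings; a tiny sketch of how the renderers fill them in:

    from spacy.displacy.templates import TPL_ENT, TPL_ENTS

    mark = TPL_ENT.format(text='Google', label='ORG', bg='#7aecec')
    html = TPL_ENTS.format(content=mark)
    print(html)
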
@@ -18,67 +18,67 @@ _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',

 @pytest.fixture(params=_languages)
 def tokenizer(request):
-    lang = util.load_lang_class(request.param)
+    lang = util.get_lang_class(request.param)
     return lang.Defaults.create_tokenizer()


 @pytest.fixture
 def en_tokenizer():
-    return util.load_lang_class('en').Defaults.create_tokenizer()
+    return util.get_lang_class('en').Defaults.create_tokenizer()


 @pytest.fixture
 def en_vocab():
-    return util.load_lang_class('en').Defaults.create_vocab()
+    return util.get_lang_class('en').Defaults.create_vocab()


 @pytest.fixture
 def en_parser():
-    return util.load_lang_class('en').Defaults.create_parser()
+    return util.get_lang_class('en').Defaults.create_parser()


 @pytest.fixture
 def es_tokenizer():
-    return util.load_lang_class('es').Defaults.create_tokenizer()
+    return util.get_lang_class('es').Defaults.create_tokenizer()


 @pytest.fixture
 def de_tokenizer():
-    return util.load_lang_class('de').Defaults.create_tokenizer()
+    return util.get_lang_class('de').Defaults.create_tokenizer()


 @pytest.fixture(scope='module')
 def fr_tokenizer():
-    return util.load_lang_class('fr').Defaults.create_tokenizer()
+    return util.get_lang_class('fr').Defaults.create_tokenizer()


 @pytest.fixture
 def hu_tokenizer():
-    return util.load_lang_class('hu').Defaults.create_tokenizer()
+    return util.get_lang_class('hu').Defaults.create_tokenizer()


 @pytest.fixture
 def fi_tokenizer():
-    return util.load_lang_class('fi').Defaults.create_tokenizer()
+    return util.get_lang_class('fi').Defaults.create_tokenizer()


 @pytest.fixture
 def sv_tokenizer():
-    return util.load_lang_class('sv').Defaults.create_tokenizer()
+    return util.get_lang_class('sv').Defaults.create_tokenizer()


 @pytest.fixture
 def bn_tokenizer():
-    return util.load_lang_class('bn').Defaults.create_tokenizer()
+    return util.get_lang_class('bn').Defaults.create_tokenizer()


 @pytest.fixture
 def he_tokenizer():
-    return util.load_lang_class('he').Defaults.create_tokenizer()
+    return util.get_lang_class('he').Defaults.create_tokenizer()

 @pytest.fixture
 def nb_tokenizer():
-    return util.load_lang_class('nb').Defaults.create_tokenizer()
+    return util.get_lang_class('nb').Defaults.create_tokenizer()


 @pytest.fixture
@@ -88,12 +88,12 @@ def stringstore():

 @pytest.fixture
 def en_entityrecognizer():
-    return util.load_lang_class('en').Defaults.create_entity()
+    return util.get_lang_class('en').Defaults.create_entity()


 @pytest.fixture
 def lemmatizer():
-    return util.load_lang_class('en').Defaults.create_lemmatizer()
+    return util.get_lang_class('en').Defaults.create_lemmatizer()


 @pytest.fixture

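A minimal sketch of how these fixtures are consumed in a test module; the test name and assertions are hypothetical:

    # pytest injects the fixture by argument name, as defined in conftest.py above
    def test_en_tokenizer_example(en_tokenizer):
        tokens = en_tokenizer(u"This is a sentence.")
        assert tokens[0].text == u"This"
        assert len(tokens) == 5
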
spacy/util.py (145 changed lines)
@@ -25,39 +25,37 @@ try:
 except ImportError:
     cupy = None

-def set_lang_class(name, cls):
-    global LANGUAGES
-    LANGUAGES[name] = cls
-
-
-def get_lang_class(name):
-    if name in LANGUAGES:
-        return LANGUAGES[name]
-    lang = re.split('[^a-zA-Z0-9]', name, 1)[0]
-    if lang not in LANGUAGES:
-        raise RuntimeError('Language not supported: %s' % name)
-    return LANGUAGES[lang]
+def get_lang_class(lang):
+    """Import and load a Language class.
+
+    lang (unicode): Two-letter language code, e.g. 'en'.
+    RETURNS (Language): Language class.
+    """
+    global LANGUAGES
+    if not lang in LANGUAGES:
+        try:
+            module = importlib.import_module('.lang.%s' % lang, 'spacy')
+        except ImportError:
+            raise ImportError("Can't import language %s from spacy.lang." %lang)
+        LANGUAGES[lang] = getattr(module, module.__all__[0])
+    return LANGUAGES[lang]


-def load_lang_class(lang):
-    """Import and load a Language class.
-
-    Args:
-        lang (unicode): Two-letter language code, e.g. 'en'.
-    Returns:
-        Language: Language class.
-    """
-    module = importlib.import_module('.lang.%s' % lang, 'spacy')
-    return getattr(module, module.__all__[0])
+def set_lang_class(name, cls):
+    """Set a custom Language class name that can be loaded via get_lang_class.
+
+    name (unicode): Name of Language class.
+    cls (Language): Language class.
+    """
+    global LANGUAGES
+    LANGUAGES[name] = cls


 def get_data_path(require_exists=True):
     """Get path to spaCy data directory.

-    Args:
-        require_exists (bool): Only return path if it exists, otherwise None.
-    Returns:
-        Path or None: Data path or None.
+    require_exists (bool): Only return path if it exists, otherwise None.
+    RETURNS (Path or None): Data path or None.
     """
     if not require_exists:
         return _data_path

@@ -68,14 +66,18 @@ def get_data_path(require_exists=True):
 def set_data_path(path):
     """Set path to spaCy data directory.

-    Args:
-        path (unicode or Path): Path to new data directory.
+    path (unicode or Path): Path to new data directory.
     """
     global _data_path
     _data_path = ensure_path(path)


 def ensure_path(path):
+    """Ensure string is converted to a Path.
+
+    path: Anything. If string, it's converted to Path.
+    RETURNS: Path or original argument.
+    """
     if isinstance(path, basestring_):
         return Path(path)
     else:

@@ -85,10 +87,8 @@ def ensure_path(path):
 def resolve_model_path(name):
     """Resolve a model name or string to a model path.

-    Args:
-        name (unicode): Package name, shortcut link or model path.
-    Returns:
-        Path: Path to model data directory.
+    name (unicode): Package name, shortcut link or model path.
+    RETURNS (Path): Path to model data directory.
     """
     data_path = get_data_path()
     if not data_path or not data_path.exists():

@@ -108,11 +108,8 @@ def resolve_model_path(name):
 def is_package(name):
     """Check if string maps to a package installed via pip.

-    Args:
-        name (unicode): Name of package.
-    Returns:
-        bool: True if installed package, False if not.
-
+    name (unicode): Name of package.
+    RETURNS (bool): True if installed package, False if not.
     """
     packages = pip.get_installed_distributions()
     for package in packages:

@@ -124,10 +121,8 @@ def is_package(name):
 def get_model_package_path(package_name):
     """Get path to a model package installed via pip.

-    Args:
-        package_name (unicode): Name of installed package.
-    Returns:
-        Path: Path to model data directory.
+    package_name (unicode): Name of installed package.
+    RETURNS (Path): Path to model data directory.
     """
     # Here we're importing the module just to find it. This is worryingly
     # indirect, but it's otherwise very difficult to find the package.

@@ -142,11 +137,9 @@ def get_model_package_path(package_name):
 def parse_package_meta(package_path, require=True):
     """Check if a meta.json exists in a package and return its contents.

-    Args:
-        package_path (Path): Path to model package directory.
-        require (bool): If True, raise error if no meta.json is found.
-    Returns:
-        dict or None: Model meta.json data or None.
+    package_path (Path): Path to model package directory.
+    require (bool): If True, raise error if no meta.json is found.
+    RETURNS (dict or None): Model meta.json data or None.
     """
     location = package_path / 'meta.json'
     if location.is_file():

@@ -201,11 +194,9 @@ def compile_infix_regex(entries):
 def update_exc(base_exceptions, *addition_dicts):
     """Update and validate tokenizer exceptions. Will overwrite exceptions.

-    Args:
-        base_exceptions (dict): Base exceptions.
-        *addition_dicts (dict): Exceptions to add to the base dict, in order.
-    Returns:
-        dict: Combined tokenizer exceptions.
+    base_exceptions (dict): Base exceptions.
+    *addition_dicts (dict): Exceptions to add to the base dict, in order.
+    RETURNS (dict): Combined tokenizer exceptions.
     """
     exc = dict(base_exceptions)
     for additions in addition_dicts:

@@ -229,12 +220,10 @@ def expand_exc(excs, search, replace):
     """Find string in tokenizer exceptions, duplicate entry and replace string.
     For example, to add additional versions with typographic apostrophes.

-    Args:
-        excs (dict): Tokenizer exceptions.
-        search (unicode): String to find and replace.
-        replace (unicode): Replacement.
-    Returns:
-        dict:
+    excs (dict): Tokenizer exceptions.
+    search (unicode): String to find and replace.
+    replace (unicode): Replacement.
+    RETURNS (dict): Combined tokenizer exceptions.
     """
     def _fix_token(token, search, replace):
         fixed = dict(token)

@@ -278,10 +267,8 @@ def check_renamed_kwargs(renamed, kwargs):
 def read_json(location):
     """Open and load JSON from file.

-    Args:
-        location (Path): Path to JSON file.
-    Returns:
-        dict: Loaded JSON content.
+    location (Path): Path to JSON file.
+    RETURNS (dict): Loaded JSON content.
     """
     with location.open('r', encoding='utf8') as f:
         return ujson.load(f)

@@ -290,11 +277,9 @@ def read_json(location):
 def get_raw_input(description, default=False):
     """Get user input from the command line via raw_input / input.

-    Args:
-        description (unicode): Text to display before prompt.
-        default (unicode or False/None): Default value to display with prompt.
-    Returns:
-        unicode: User input.
+    description (unicode): Text to display before prompt.
+    default (unicode or False/None): Default value to display with prompt.
+    RETURNS (unicode): User input.
     """
     additional = ' (default: %s)' % default if default else ''
     prompt = '    %s%s: ' % (description, additional)

@@ -305,9 +290,8 @@ def get_raw_input(description, default=False):
 def print_table(data, title=None):
     """Print data in table format.

-    Args:
-        data (dict or list of tuples): Label/value pairs.
-        title (unicode or None): Title, will be printed above.
+    data (dict or list of tuples): Label/value pairs.
+    title (unicode or None): Title, will be printed above.
     """
     if isinstance(data, dict):
         data = list(data.items())

@@ -321,9 +305,8 @@ def print_table(data, title=None):
 def print_markdown(data, title=None):
     """Print data in GitHub-flavoured Markdown format for issues etc.

-    Args:
-        data (dict or list of tuples): Label/value pairs.
-        title (unicode or None): Title, will be rendered as headline 2.
+    data (dict or list of tuples): Label/value pairs.
+    title (unicode or None): Title, will be rendered as headline 2.
     """
     def excl_value(value):
         return Path(value).exists()  # contains path (personal info)

@@ -339,10 +322,8 @@ def print_markdown(data, title=None):
 def prints(*texts, **kwargs):
     """Print formatted message (manual ANSI escape sequences to avoid dependency)

-    Args:
-        *texts (unicode): Texts to print. Each argument is rendered as paragraph.
-        **kwargs: 'title' is rendered as coloured headline. 'exits'=True performs
-            system exit after printing.
+    *texts (unicode): Texts to print. Each argument is rendered as paragraph.
+    **kwargs: 'title' becomes coloured headline. 'exits'=True performs sys exit.
     """
     exits = kwargs.get('exits', False)
     title = kwargs.get('title', None)

@@ -356,12 +337,10 @@ def prints(*texts, **kwargs):
 def _wrap(text, wrap_max=80, indent=4):
     """Wrap text at given width using textwrap module.

-    Args:
-        text (unicode): Text to wrap. If it's a Path, it's converted to string.
-        wrap_max (int): Maximum line length (indent is deducted).
-        indent (int): Number of spaces for indentation.
-    Returns:
-        unicode: Wrapped text.
+    text (unicode): Text to wrap. If it's a Path, it's converted to string.
+    wrap_max (int): Maximum line length (indent is deducted).
+    indent (int): Number of spaces for indentation.
+    RETURNS (unicode): Wrapped text.
     """
     indent = indent * ' '
     wrap_width = wrap_max - len(indent)

@@ -370,3 +349,13 @@ def _wrap(text, wrap_max=80, indent=4):
     return textwrap.fill(text, width=wrap_width, initial_indent=indent,
                          subsequent_indent=indent, break_long_words=False,
                          break_on_hyphens=False)
+
+
+def minify_html(html):
+    """Perform a template-specific, rudimentary HTML minification for displaCy.
+    Disclaimer: NOT a general-purpose solution, only removes indentation/newlines.
+
+    html (unicode): Markup to minify.
+    RETURNS (unicode): "Minified" HTML.
+    """
+    return html.strip().replace('    ', '').replace('\n', '')

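A short sketch of the reworked helpers, assuming this version of spacy.util; the custom language code 'xx_custom' is hypothetical:

    from spacy import util
    from spacy.language import Language

    # get_lang_class() now imports spacy.lang.<code> lazily and caches it
    en_cls = util.get_lang_class('en')
    nlp = en_cls()

    # set_lang_class() registers a custom class under a chosen name
    class CustomLanguage(Language):
        lang = 'xx_custom'

    util.set_lang_class('xx_custom', CustomLanguage)
    assert util.get_lang_class('xx_custom') is CustomLanguage
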
@@ -3,7 +3,6 @@
 include _includes/_mixins
-
 doctype html

 html(lang="en")
     title
         if SECTION == "docs" && SUBSECTION && SUBSECTION != "index"

new binary file: website/assets/img/docs/displacy_jupyter.jpg (99 KiB, not shown)
@@ -21,6 +21,7 @@
         "GoldParse": "goldparse"
     },
     "Other": {
+        "displaCy": "displacy",
         "Utility Functions": "util",
         "Annotation Specs": "annotation",
         "Feature Scheme": "features"

@@ -111,6 +112,11 @@
         "tag": "class"
     },

+    "displacy": {
+        "title": "displaCy",
+        "tag": "module"
+    },
+
     "util": {
         "title": "Utility Functions"
     },

new file: website/docs/api/displacy.jade (229 lines)
@@ -0,0 +1,229 @@
//- 💫 DOCS > API > DISPLACY

include ../../_includes/_mixins

p
    | As of v2.0, spaCy comes with a built-in visualization suite. For more
    | info and examples, see the usage workflow on
    | #[+a("/docs/usage/visualizers") visualizing spaCy].


+h(2, "serve") serve
    +tag method

p
    | Serve a dependency parse tree or named entity visualization to view it
    | in your browser. Will run a simple web server.

+aside-code("Example").
    import spacy
    from spacy import displacy
    nlp = spacy.load('en')
    doc1 = nlp(u'This is a sentence.')
    doc2 = nlp(u'This is another sentence.')
    displacy.serve([doc1, doc2], style='dep')

+table(["Name", "Type", "Description", "Default"])
    +row
        +cell #[code docs]
        +cell list or #[code Doc]
        +cell Document(s) to visualize.
        +cell

    +row
        +cell #[code style]
        +cell unicode
        +cell Visualization style, #[code 'dep'] or #[code 'ent'].
        +cell #[code 'dep']

    +row
        +cell #[code page]
        +cell bool
        +cell Render markup as full HTML page.
        +cell #[code True]

    +row
        +cell #[code minify]
        +cell bool
        +cell Minify HTML markup.
        +cell #[code False]

    +row
        +cell #[code options]
        +cell dict
        +cell #[+a("#options") Visualizer-specific options], e.g. colors.
        +cell #[code {}]

    +row
        +cell #[code port]
        +cell int
        +cell Port to serve visualization.
        +cell #[code 5000]

+h(2, "render") render
    +tag method

p Render a dependency parse tree or named entity visualization.

+aside-code("Example").
    import spacy
    from spacy import displacy
    nlp = spacy.load('en')
    doc = nlp(u'This is a sentence.')
    html = displacy.render(doc, style='dep')

+table(["Name", "Type", "Description", "Default"])
    +row
        +cell #[code docs]
        +cell list or #[code Doc]
        +cell Document(s) to visualize.
        +cell

    +row
        +cell #[code style]
        +cell unicode
        +cell Visualization style, #[code 'dep'] or #[code 'ent'].
        +cell #[code 'dep']

    +row
        +cell #[code page]
        +cell bool
        +cell Render markup as full HTML page.
        +cell #[code False]

    +row
        +cell #[code minify]
        +cell bool
        +cell Minify HTML markup.
        +cell #[code False]

    +row
        +cell #[code jupyter]
        +cell bool
        +cell
            | Returns markup using #[+a("http://jupyter.org/") Jupyter]'s
            | internal methods, ready to be rendered in a notebook.
        +cell #[code False]

    +row
        +cell #[code options]
        +cell dict
        +cell #[+a("#options") Visualizer-specific options], e.g. colors.
        +cell #[code {}]

    +footrow
        +cell return
        +cell unicode
        +cell Rendered HTML markup.
        +cell

+h(2, "options") Visualizer options

p
    | The #[code options] argument lets you specify additional settings for
    | each visualizer. If a setting is not present in the options, the default
    | value will be used.

+h(3, "options-dep") Dependency Visualizer options

+aside-code("Example").
    options = {'compact': True, 'color': 'blue'}
    displacy.serve(doc, style='dep', options=options)

+table(["Name", "Type", "Description", "Default"])
    +row
        +cell #[code collapse_punct]
        +cell bool
        +cell
            | Attach punctuation to tokens. Can make the parse more readable,
            | as it prevents long arcs to attach punctuation.
        +cell #[code True]

    +row
        +cell #[code compact]
        +cell bool
        +cell "Compact mode" with square arrows that takes up less space.
        +cell #[code False]

    +row
        +cell #[code color]
        +cell unicode
        +cell Text color (HEX, RGB or color names).
        +cell #[code '#000000']

    +row
        +cell #[code bg]
        +cell unicode
        +cell Background color (HEX, RGB or color names).
        +cell #[code '#ffffff']

    +row
        +cell #[code font]
        +cell unicode
        +cell Font name or font family for all text.
        +cell #[code 'Arial']

    +row
        +cell #[code offset_x]
        +cell int
        +cell Spacing on left side of the SVG in px.
        +cell #[code 50]

    +row
        +cell #[code arrow_stroke]
        +cell int
        +cell Width of arrow path in px.
        +cell #[code 2]

    +row
        +cell #[code arrow_width]
        +cell int
        +cell Width of arrow head in px.
        +cell #[code 10] / #[code 8] (compact)

    +row
        +cell #[code arrow_spacing]
        +cell int
        +cell Spacing between arrows in px to avoid overlaps.
        +cell #[code 20]

    +row
        +cell #[code word_spacing]
        +cell int
        +cell Horizontal spacing between words and arcs in px.
        +cell #[code 45]

    +row
        +cell #[code distance]
        +cell int
        +cell Distance between words in px.
        +cell #[code 175] / #[code 85] (compact)

+h(3, "options-ent") Named Entity Visualizer options

+aside-code("Example").
    options = {'ents': ['PERSON', 'ORG', 'PRODUCT'],
               'colors': {'ORG': 'yellow'}}
    displacy.serve(doc, style='ent', options=options)

+table(["Name", "Type", "Description", "Default"])
    +row
        +cell #[code ents]
        +cell list
        +cell
            | Entity types to highlight (#[code None] for all types).
        +cell #[code None]

    +row
        +cell #[code colors]
        +cell dict
        +cell
            | Color overrides. Entity types in lowercase should be mapped to
            | color names or values.
        +cell #[code {}]

p
    | By default, displaCy comes with colours for all
    | #[+a("/docs/api/annotation#named-entities") entity types supported by spaCy].
    | If you're using custom entity types, you can use the #[code colors]
    | setting to add your own colours for them.
@@ -49,7 +49,7 @@ p
         +cell unicode or #[code Path]
         +cell Path to new data directory.

-+h(2, "load_lang_class") load_lang_class
++h(2, "get_lang_class") get_lang_class
     +tag function

 p
@@ -59,7 +59,7 @@ p
 +aside-code("Example").
     for lang_id in ['en', 'de']:
-        lang_class = util.load_lang_class(lang_id)
+        lang_class = util.get_lang_class(lang_id)
         lang = lang_class()
         tokenizer = lang.Defaults.create_tokenizer()

@@ -4,9 +4,9 @@
         "Installation": "./",
         "Models": "models",
         "Lightning tour": "lightning-tour",
+        "Visualizers": "visualizers",
         "Command line": "cli",
-        "Troubleshooting": "troubleshooting",
-        "Resources": "resources"
+        "Troubleshooting": "troubleshooting"
     },
     "Workflows": {
         "Loading the pipeline": "language-processing-pipeline",

@@ -43,6 +43,11 @@

     "lightning-tour": {
         "title": "Lightning tour",
+        "next": "visualizers"
+    },
+
+    "visualizers": {
+        "title": "Visualizers",
         "next": "cli"
     },

@@ -80,7 +80,7 @@ p
     | compute. As of spaCy v2.0, #[code Language] classes are not imported on
     | initialisation and are only loaded when you import them directly, or load
     | a model that requires a language to be loaded. To lazy-load languages in
-    | your application, you can use the #[code util.load_lang_class()] helper
+    | your application, you can use the #[code util.get_lang_class()] helper
     | function with the two-letter language code as its argument.

 +h(2, "language-data") Adding language data
@@ -486,7 +486,7 @@ p
     | #[+src(gh("spaCy", "spacy/tests/lang")) tests/lang] in a directory named
     | after the language ID. You'll also need to create a fixture for your
     | tokenizer in the #[+src(gh("spaCy", "spacy/tests/conftest.py")) conftest.py].
-    | Always use the #[code load_lang_class()] helper function within the fixture,
+    | Always use the #[code get_lang_class()] helper function within the fixture,
     | instead of importing the class at the top of the file. This will load the
     | language data only when it's needed. (Otherwise, #[em all data] would be
     | loaded every time you run a test.)
@@ -494,7 +494,7 @@ p
 +code.
     @pytest.fixture
     def en_tokenizer():
-        return util.load_lang_class('en').Defaults.create_tokenizer()
+        return util.get_lang_class('en').Defaults.create_tokenizer()

 p
     | When adding test cases, always

@@ -105,17 +105,36 @@ p
     | consistent state.


-+h(2, "displacy") The displaCy #[sup ENT] visualizer
++h(2, "displacy") Visualizing named entities

 p
     | The #[+a(DEMOS_URL + "/displacy-ent/") displaCy #[sup ENT] visualizer]
     | lets you explore an entity recognition model's behaviour interactively.
     | If you're training a model, it's very useful to run the visualization
-    | server yourself. To help you do that, we've open-sourced both the
-    | #[+a(gh("spacy-services")) back-end service] and the
-    | #[+a(gh("displacy-ent")) front-end client].
+    | yourself. To help you do that, spaCy v2.0+ comes with a visualization
+    | module. Simply pass a #[code Doc] or a list of #[code Doc] objects to
+    | displaCy and run #[+api("displacy#serve") #[code displacy.serve]] to
+    | run the web server, or #[+api("displacy#render") #[code displacy.render]]
+    | to generate the raw markup.

-+codepen("ALxpQO", 450)
+p
+    | For more details and examples, see the
+    | #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy].
+
++code("Named Entity example").
+    import spacy
+    from spacy import displacy
+
+    text = """But Google is starting from behind. The company made a late push
+    into hardware, and Apple’s Siri, available on iPhones, and Amazon’s Alexa
+    software, which runs on its Echo and Dot devices, have clear leads in
+    consumer adoption."""
+
+    nlp = spacy.load('custom_ner_model')
+    doc = nlp(text)
+    displacy.serve(doc, style='ent')
+
++codepen("a73f8b68f9af3157855962b283b364e4", 345)

 +h(2, "entity-types") Built-in entity types

new file: website/docs/usage/visualizers.jade (278 lines, excerpt below)
@@ -0,0 +1,278 @@
|
||||||
|
//- 💫 DOCS > USAGE > VISUALIZERS
|
||||||
|
|
||||||
|
include ../../_includes/_mixins
|
||||||
|
|
||||||
|
p
|
||||||
|
| As of v2.0, our popular visualizers, #[+a(DEMOS_URL + "/displacy") displaCy]
|
||||||
|
| and #[+a(DEMOS_URL + "displacy-ent") displaCy #[sup ENT]] are finally an
|
||||||
|
| official part of the library. Visualizing a dependency parse or named
|
||||||
|
| entities in a text is not only a fun NLP demo – it can also be incredibly
|
||||||
|
| helpful in speeding up development and debugging your code and training
|
||||||
|
| process. Instead of printing a list of dependency labels or entity spans,
|
||||||
|
| you can simply pass your #[code Doc] objects to #[code displacy] and view
|
||||||
|
| the visualizations in your browser, or export them as HTML files or
|
||||||
|
| vector graphics. displaCy also comes with a #[+a("#jupyter") Jupyter hook]
|
||||||
|
| that returns the markup in a format ready to be rendered in a notebook.
|
||||||
|
|
||||||
|
+aside("What about the old visualizers?")
|
||||||
|
| Our JavaScript-based visualizers #[+src(gh("displacy")) displacy.js] and
|
||||||
|
| #[+src(gh("displacy-ent")) displacy-ent.js] will still be available on
|
||||||
|
| GitHub. If you're looking to implement web-based visualizations, we
|
||||||
|
| generally recommend using those instead of spaCy's built-in
|
||||||
|
| #[code displacy] module. It'll allow your application to perform all
|
||||||
|
| rendering on the client and only rely on the server for the text
|
||||||
|
| processing. The generated markup is also more compatible with modern web
|
||||||
|
| standards.
|
||||||
|
|
||||||
|
+h(2, "getting-started") Getting started
|
||||||
|
|
||||||
|
p
|
||||||
|
| The quickest way visualize #[code Doc] is to use
|
||||||
|
| #[+api("displacy#serve") #[code displacy.serve]]. This will spin up a
|
||||||
|
| simple web server and let you view the result straight from your browser.
|
||||||
|
| displaCy can either take a single #[code Doc] or a list of #[code Doc]
|
||||||
|
| objects as its first argument. This lets you construct them however you
|
||||||
|
| like – using any model or modifications you like.
|
||||||
|
|
||||||
|
+h(3, "dep") Visualizing the dependency parse
|
||||||
|
|
||||||
|
p
|
||||||
|
| The dependency visualizer, #[code dep], shows part-of-speech tags
|
||||||
|
| and syntactic dependencies.
|
||||||
|
|
||||||
|
+code("Dependency example").
|
||||||
|
import spacy
|
||||||
|
from spacy import displacy
|
||||||
|
|
||||||
|
nlp = spacy.load('en')
|
||||||
|
doc = nlp(u'This is a sentence.')
|
||||||
|
displacy.serve(doc, style='dep')
|
||||||
|
|
||||||
|
+codepen("f0e85b64d469d6617251d8241716d55f", 370)
|
||||||
|
|
||||||
|
p
|
||||||
|
| The argument #[code options] lets you specify a dictionary of settings
|
||||||
|
| to customise the layout, for example:
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description", "Default"])
|
||||||
|
+row
|
||||||
|
+cell #[code compact]
|
||||||
|
+cell bool
|
||||||
|
+cell "Compact mode" with square arrows that takes up less space.
|
||||||
|
+cell #[code False]
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code color]
|
||||||
|
+cell unicode
|
||||||
|
+cell Text color (HEX, RGB or color names).
|
||||||
|
+cell #[code '#000000']
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code bg]
|
||||||
|
+cell unicode
|
||||||
|
+cell Background color (HEX, RGB or color names).
|
||||||
|
+cell #[code '#ffffff']
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code font]
|
||||||
|
+cell unicode
|
||||||
|
+cell Font name or font family for all text.
|
||||||
|
+cell #[code 'Arial']
|
||||||
|
|
||||||
|
p
|
||||||
|
| For a list of all available options, see the
|
||||||
|
| #[+api("displacy#options") #[code displacy] API documentation].
|
||||||
|
|
||||||
|
+aside-code("Options example").
|
||||||
|
options = {'compact': True, 'bg': '#09a3d5',
|
||||||
|
'color': 'white', 'font': 'Source Sans Pro'}
|
||||||
|
displacy.serve(doc, style='dep', options=options)
|
||||||
|
|
||||||
|
+codepen("39c02c893a84794353de77a605d817fd", 360)
|
||||||
|
|
||||||
|
+h(3, "ent") Visualizing the entity recognizer
|
||||||
|
|
||||||
|
p
|
||||||
|
| The entity visualizer, #[code ent], highlights named entities and
|
||||||
|
| their labels in a text.
|
||||||
|
|
||||||
|
+code("Named Entity example").
|
||||||
|
import spacy
|
||||||
|
from spacy import displacy
|
||||||
|
|
||||||
|
text = """But Google is starting from behind. The company made a late push
|
||||||
|
into hardware, and Apple’s Siri, available on iPhones, and Amazon’s Alexa
|
||||||
|
software, which runs on its Echo and Dot devices, have clear leads in
|
||||||
|
consumer adoption."""
|
||||||
|
|
||||||
|
nlp = spacy.load('custom_ner_model')
|
||||||
|
doc = nlp(text)
|
||||||
|
displacy.serve(doc, style='ent')
|
||||||
|
|
||||||
|
+codepen("a73f8b68f9af3157855962b283b364e4", 345)
|
||||||
|
|
||||||
|
p The entity visualizer lets you customise the following #[code options]:

+table(["Name", "Type", "Description", "Default"])
    +row
        +cell #[code ents]
        +cell list
        +cell
            | Entity types to highlight (#[code None] for all types).
        +cell #[code None]

    +row
        +cell #[code colors]
        +cell dict
        +cell
            | Color overrides. Entity types in lowercase should be mapped to
            | color names or values.
        +cell #[code {}]

p
    | If you specify a list of #[code ents], only those entity types will be
    | rendered – for example, you can choose to display only #[code PERSON]
    | entities. Internally, the visualizer knows nothing about available entity
    | types and will render whichever spans and labels it receives. This makes
    | it especially easy to work with custom entity types. By default, displaCy
    | comes with colours for all
    | #[+a("/docs/api/annotation#named-entities") entity types supported by spaCy].
    | If you're using custom entity types, you can use the #[code colors]
    | setting to add your own colours for them.

+aside-code("Options example").
    colors = {'ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)'}
    options = {'ents': ['ORG'], 'colors': colors}
    displacy.serve(doc, style='ent', options=options)

+codepen("f42ec690762b6f007022a7acd6d0c7d4", 300)

p
    | The above example uses a little trick: Since the background colour values
    | are added as the #[code background] style attribute, you can use any
    | #[+a("https://tympanus.net/codrops/css_reference/background/") valid background value]
    | or shorthand — including gradients and even images!

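p
    | For instance, the sketch below swaps the gradient for an image
    | background. The image path is just a placeholder:

+code("Image background (sketch)").
    # placeholder URL; any valid CSS background value works here
    colors = {'ORG': 'url(/assets/img/pattern.png) #7aecec'}
    options = {'ents': ['ORG'], 'colors': colors}
    displacy.serve(doc, style='ent', options=options)
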
+h(2, "render") Rendering visualizations
|
||||||
|
|
||||||
|
p
|
||||||
|
| If you don't need the web server and just want to generate the markup
|
||||||
|
| – for example, to export it to a file or serve it in a custom
|
||||||
|
| way – you can use #[+api("displacy#render") #[code displacy.render]]
|
||||||
|
| instead. It works the same, but returns a string containing the markup.
|
||||||
|
|
||||||
|
+code("Example").
|
||||||
|
import spacy
|
||||||
|
from spacy import displacy
|
||||||
|
|
||||||
|
nlp = spacy.load('en')
|
||||||
|
doc1 = nlp(u'This is a sentence.')
|
||||||
|
doc2 = nlp(u'This is another sentence.')
|
||||||
|
html = displacy.render([doc1, doc2], style='dep', page=True)
|
||||||
|
|
||||||
|
p
    | #[code page=True] renders the markup wrapped as a full HTML page.
    | For minified and more compact HTML markup, you can set #[code minify=True].
    | If you're rendering a dependency parse, you can also export it as an
    | #[code .svg] file.

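p
    | For example, a minified full page could be generated like this (a small
    | sketch reusing the #[code doc] from the example above):

+code("Minified markup (sketch)").
    # page=True wraps the markup in a full HTML page, minify=True compacts it
    html = displacy.render(doc, style='dep', page=True, minify=True)
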
+aside("What's SVG?")
|
||||||
|
| Unlike other image formats, the SVG (Scalable Vector Graphics) uses XML
|
||||||
|
| markup that's easy to manipulate
|
||||||
|
| #[+a("https://www.smashingmagazine.com/2014/11/styling-and-animating-svgs-with-css/") using CSS] or
|
||||||
|
| #[+a("https://css-tricks.com/smil-is-dead-long-live-smil-a-guide-to-alternatives-to-smil-features/") JavaScript].
|
||||||
|
| Essentially, SVG lets you design with code, which makes it a perfect fit
|
||||||
|
| for visualizing dependency trees. SVGs can be embedded online in an
|
||||||
|
| #[code <img>] tag, or inlined in an HTML document. They're also
|
||||||
|
| pretty easy to #[+a("https://convertio.co/image-converter/") convert].
|
||||||
|
|
||||||
|
+code.
    from pathlib import Path

    # render a single Doc and write the markup to an .svg file
    svg = displacy.render(doc, style='dep')
    output_path = Path('/images/sentence.svg')
    output_path.open('w', encoding='utf-8').write(svg)

+infobox("Important note")
|
||||||
|
| Since each visualization is generated as a separate SVG, exporting
|
||||||
|
| #[code .svg] files only works if you're rendering #[strong one single doc]
|
||||||
|
| at a time. (This makes sense – after all, each visualization should be
|
||||||
|
| a standalone graphic.) So instead of rendering all #[code Doc]s at one,
|
||||||
|
| loop over them and export them separately.
|
||||||
|
|
||||||
|
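p
    | Here's a rough sketch of that loop. The model name, sentences and output
    | directory are just placeholders:

+code("Exporting one SVG per Doc (sketch)").
    import spacy
    from spacy import displacy
    from pathlib import Path

    nlp = spacy.load('en')
    sentences = [u'This is a sentence.', u'This is another sentence.']
    for i, sentence in enumerate(sentences):
        doc = nlp(sentence)
        svg = displacy.render(doc, style='dep')
        # one standalone .svg file per visualization
        output_path = Path('/images/sentence{}.svg'.format(i))
        output_path.open('w', encoding='utf-8').write(svg)
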
+h(2, "jupyter") Using displaCy in Jupyter notebooks
|
||||||
|
|
||||||
|
p
|
||||||
|
| If you're working with a #[+a("https://jupyter.org") Jupyter] notebook,
|
||||||
|
| you can use displaCy's "Jupyter mode" to return markup that can be
|
||||||
|
| rendered in a cell straight away. When you export your notebook, the
|
||||||
|
| visualizations will be included as HTML.
|
||||||
|
|
||||||
|
+code("Jupyter Example").
|
||||||
|
# don't forget to install a model, e.g.: python -m spacy download en
|
||||||
|
import spacy
|
||||||
|
from spacy import displacy
|
||||||
|
|
||||||
|
doc = nlp(u'Rats are various medium-sized, long-tailed rodents.')
|
||||||
|
displacy.render(doc, style='dep', jupyter=True)
|
||||||
|
|
||||||
|
doc2 = nlp(LONG_NEWS_ARTICLE)
|
||||||
|
displacy.render(doc2, style='ent', jupyter=True)
|
||||||
|
|
||||||
|
+image("/assets/img/docs/displacy_jupyter.jpg", 700, false, "Example of using the displaCy dependency and named entity visualizer in a Jupyter notebook")
|
||||||
|
|
||||||
|
p
    | Internally, displaCy imports #[code display] and #[code HTML] from
    | #[code IPython.core.display] and returns a Jupyter HTML object. If you
    | were doing it manually, it'd look like this:

+code.
    from IPython.core.display import display, HTML

    html = displacy.render(doc, style='dep')
    display(HTML(html))

+h(2, "examples") Usage examples
|
||||||
|
|
||||||
|
|
||||||
|
+h(2, "manual-usage") Rendering data manually
|
||||||
|
|
||||||
|
p
    | You can also use displaCy to manually render data. This can be useful if
    | you want to visualize output from other libraries, like
    | #[+a("http://www.nltk.org") NLTK] or
    | #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet].
    | Simply convert the dependency parse or recognised entities to displaCy's
    | format and import #[code DependencyRenderer] or #[code EntityRenderer]
    | from #[code spacy.displacy.render]. A renderer class is initialised
    | with a dictionary of options. To generate the visualization markup, call
    | the renderer's #[code render()] method on a list of dictionaries (one
    | per visualization).

+aside-code("Example").
|
||||||
|
from spacy.displacy.render import EntityRenderer
|
||||||
|
|
||||||
|
ex = [{'text': 'But Google is starting from behind.',
|
||||||
|
'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
|
||||||
|
'title': None}]
|
||||||
|
renderer = EntityRenderer()
|
||||||
|
html = renderer.render(ex)
|
||||||
|
|
||||||
|
+code("DependencyRenderer input").
|
||||||
|
[{
|
||||||
|
'words': [
|
||||||
|
{'text': 'This', 'tag': 'DT'},
|
||||||
|
{'text': 'is', 'tag': 'VBZ'},
|
||||||
|
{'text': 'a', 'tag': 'DT'},
|
||||||
|
{'text': 'sentence', 'tag': 'NN'}],
|
||||||
|
'arcs': [
|
||||||
|
{'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'},
|
||||||
|
{'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'},
|
||||||
|
{'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}]
|
||||||
|
}]
|
||||||
|
|
||||||
|
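p
    | For example, the input above could be fed to a #[code DependencyRenderer]
    | like this. This is a sketch based on the usage described above, with a
    | hypothetical options dictionary:

+code("DependencyRenderer usage (sketch)").
    from spacy.displacy.render import DependencyRenderer

    parsed = [{'words': [{'text': 'This', 'tag': 'DT'},
                         {'text': 'is', 'tag': 'VBZ'},
                         {'text': 'a', 'tag': 'DT'},
                         {'text': 'sentence', 'tag': 'NN'}],
               'arcs': [{'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'},
                        {'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'},
                        {'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}]}]
    # initialise with a dictionary of options, then render a list of dicts
    renderer = DependencyRenderer(options={'compact': True})
    html = renderer.render(parsed)
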
+code("EntityRenderer input").
|
||||||
|
[{
|
||||||
|
'text': 'But Google is starting from behind.',
|
||||||
|
'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
|
||||||
|
'title': None
|
||||||
|
}]
|