Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-05-15 21:53:47 +02:00
commit 1d7c18e58a
16 changed files with 1024 additions and 110 deletions

View File

@ -16,7 +16,7 @@ def load(name, **overrides):
meta = util.parse_package_meta(model_path) meta = util.parse_package_meta(model_path)
if 'lang' not in meta: if 'lang' not in meta:
raise IOError('No language setting found in model meta.') raise IOError('No language setting found in model meta.')
cls = util.load_lang_class(meta['lang']) cls = util.get_lang_class(meta['lang'])
overrides['meta'] = meta overrides['meta'] = meta
overrides['path'] = model_path overrides['path'] = model_path
return cls(**overrides) return cls(**overrides)

View File

@ -10,9 +10,9 @@ PRON_LEMMA = "-PRON-"
def depr_model_download(lang): def depr_model_download(lang):
""" """Replace en/de download modules within, warn and ownload default models.
Replace download modules within en and de with deprecation warning and
download default language model (using shortcut). lang (unicode): Language shortcut, 'en' or 'de'.
""" """
prints("The spacy.%s.download command is now deprecated. Please use " prints("The spacy.%s.download command is now deprecated. Please use "
"python -m spacy download [model name or shortcut] instead. For " "python -m spacy download [model name or shortcut] instead. For "
@ -24,6 +24,12 @@ def depr_model_download(lang):
def resolve_load_name(name, **overrides): def resolve_load_name(name, **overrides):
"""Resolve model loading if deprecated path kwarg is specified in overrides.
name (unicode): Name of model to load.
**overrides: Overrides specified in spacy.load().
RETURNS: Model name or value of path kwarg.
"""
if overrides.get('path') not in (None, False, True): if overrides.get('path') not in (None, False, True):
name = overrides.get('path') name = overrides.get('path')
prints("To load a model from a path, you can now use the first argument. " prints("To load a model from a path, you can now use the first argument. "

103
spacy/displacy/__init__.py Normal file
View File

@ -0,0 +1,103 @@
# coding: utf8
from __future__ import unicode_literals
from .render import DependencyRenderer, EntityRenderer
from ..tokens import Doc
from ..util import prints
_html = {}
def render(docs, style='dep', page=False, minify=False, jupyter=False, options=None):
    """Render displaCy visualisation.

    docs (list or Doc): Document(s) to visualise.
    style (unicode): Visualisation style, 'dep' or 'ent'.
    page (bool): Render markup as full HTML page.
    minify (bool): Minify HTML markup.
    jupyter (bool): Experimental, use Jupyter's display() to output markup.
    options (dict): Visualiser-specific options, e.g. colors. None (the
        default) is treated as an empty dict.
    RETURNS (unicode): Rendered HTML markup. If jupyter is True, the result of
        IPython's display() call is returned instead.
    RAISES (ValueError): If style is neither 'dep' nor 'ent'.
    """
    # Validate up front and compare by value: `style is 'dep'` only succeeds
    # when both strings happen to be the very same object, and an unknown
    # style used to fall through to an unbound `renderer`.
    if style not in ('dep', 'ent'):
        raise ValueError("Unknown visualisation style: %s. Please use 'dep' "
                         "or 'ent'." % (style,))
    if options is None:
        options = {}
    if isinstance(docs, Doc):
        docs = [docs]
    if style == 'dep':
        renderer = DependencyRenderer(options=options)
        parsed = [parse_deps(doc, options) for doc in docs]
    else:
        renderer = EntityRenderer(options=options)
        parsed = [parse_ents(doc, options) for doc in docs]
    # The markup is cached in the module-level dict so that app() can serve it.
    _html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip()
    html = _html['parsed']
    if jupyter:  # return HTML rendered by IPython display()
        from IPython.core.display import display, HTML
        return display(HTML(html))
    return html
def serve(docs, style='dep', page=True, minify=False, options={}, port=5000):
    """Serve displaCy visualisation via a simple WSGI web server.

    docs (list or Doc): Document(s) to visualise.
    style (unicode): Visualisation style, 'dep' or 'ent'.
    page (bool): Render markup as full HTML page.
    minify (bool): Minify HTML markup.
    options (dict): Visualiser-specific options, e.g. colors.
    port (int): Port to serve visualisation.
    """
    from wsgiref import simple_server

    # The return value is not needed here: render() stores the markup in the
    # module-level _html dict, which is what app() reads on each request.
    render(docs, style=style, page=page, minify=minify, options=options)
    server = simple_server.make_server('0.0.0.0', port, app)
    message = "Using the '%s' visualizer" % style
    headline = "Serving on port %d..." % port
    prints(message, title=headline)
    # Blocks until the process is interrupted.
    server.serve_forever()
def app(environ, start_response):
    """WSGI application that responds with the cached displaCy markup.

    environ (dict): WSGI environment (not inspected).
    start_response (callable): WSGI callback receiving status and headers.
    RETURNS (list): Single-item list holding the UTF-8 encoded markup stored
        under _html['parsed'].
    """
    headers = [('Content-type', 'text/html; charset=utf-8')]
    start_response('200 OK', headers)
    body = _html['parsed'].encode(encoding='utf-8')
    return [body]
def parse_deps(doc, options={}):
    """Generate dependency parse in {'words': [], 'arcs': []} format.

    doc (Doc): Document to parse.
    options (dict): Visualiser options. Only 'collapse_punct' is read here;
        it defaults to True.
    RETURNS (dict): Generated dependency parse keyed by words and arcs.
    """
    if options.get('collapse_punct', True):
        # First pass: record each non-punctuation token that is followed by
        # punctuation, together with the character span covering the token
        # and the run of punctuation after it.
        pending = []
        for token in doc[:-1]:
            if not token.is_punct and token.nbor(1).is_punct:
                first = token.i
                last = token.i + 1
                while last < len(doc) and doc[last].is_punct:
                    last += 1
                chunk = doc[first:last]
                pending.append((chunk.start_char, chunk.end_char, token.tag_,
                                token.lemma_, token.ent_type_))
        # Second pass: merge the recorded character spans into single tokens.
        for merge_args in pending:
            doc.merge(*merge_args)
    words = []
    for token in doc:
        words.append({'text': token.text, 'tag': token.tag_})
    arcs = []
    for token in doc:
        # An arc always runs from the lower to the higher index; 'dir' records
        # on which side the dependent sits. Tokens that are their own head
        # produce no arc.
        if token.i < token.head.i:
            arcs.append({'start': token.i, 'end': token.head.i,
                         'label': token.dep_, 'dir': 'left'})
        elif token.i > token.head.i:
            arcs.append({'start': token.head.i, 'end': token.i,
                         'label': token.dep_, 'dir': 'right'})
    return {'words': words, 'arcs': arcs}
def parse_ents(doc, options={}):
    """Generate named entities in [{start: i, end: i, label: 'label'}] format.

    doc (Doc): Document to parse.
    options (dict): Visualiser options. Not read by this function.
    RETURNS (dict): Generated entities keyed by text (original text), ents and
        title (taken from doc.user_data['title'] if available, else None).
    """
    ents = []
    for ent in doc.ents:
        ents.append({'start': ent.start_char, 'end': ent.end_char,
                     'label': ent.label_})
    title = None
    if hasattr(doc, 'user_data'):
        title = doc.user_data.get('title', None)
    return {'text': doc.text, 'ents': ents, 'title': title}

217
spacy/displacy/render.py Normal file
View File

@ -0,0 +1,217 @@
# coding: utf8
from __future__ import unicode_literals
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS
from .templates import TPL_ENT, TPL_ENTS, TPL_FIGURE, TPL_TITLE, TPL_PAGE
from ..util import minify_html
class DependencyRenderer(object):
    """Render dependency parses as SVGs."""
    style = 'dep'

    def __init__(self, options=None):
        """Initialise dependency renderer.

        options (dict): Visualiser-specific options (compact, word_spacing,
            arrow_spacing, arrow_width, arrow_stroke, distance,
            offset_x, color, bg, font). None (the default) is treated as an
            empty dict.
        """
        if options is None:
            options = {}
        self.compact = options.get('compact', False)
        # Compact mode uses a smaller word distance and arrow head by default.
        distance, arrow_width = (85, 8) if self.compact else (175, 10)
        self.word_spacing = options.get('word_spacing', 45)
        self.arrow_spacing = options.get('arrow_spacing', 20)
        self.arrow_width = options.get('arrow_width', arrow_width)
        self.arrow_stroke = options.get('arrow_stroke', 2)
        self.distance = options.get('distance', distance)
        self.offset_x = options.get('offset_x', 50)
        self.color = options.get('color', '#000000')
        self.bg = options.get('bg', '#ffffff')
        self.font = options.get('font', 'Arial')

    def render(self, parsed, page=False, minify=False):
        """Render complete markup.

        parsed (list): Dependency parses to render.
        page (bool): Render parses wrapped as full HTML page.
        minify (bool): Minify HTML markup.
        RETURNS (unicode): Rendered SVG or HTML markup.
        """
        rendered = [self.render_svg(i, p['words'], p['arcs'])
                    for i, p in enumerate(parsed)]
        if page:
            content = ''.join([TPL_FIGURE.format(content=svg) for svg in rendered])
            markup = TPL_PAGE.format(content=content)
        else:
            markup = ''.join(rendered)
        if minify:
            return minify_html(markup)
        return markup

    def render_svg(self, render_id, words, arcs):
        """Render SVG.

        Stores the layout values for the current parse (levels, offsets,
        width, height, id) on the instance; render_word and render_arrow
        read them.

        render_id (int): Unique ID, typically index of document.
        words (list): Individual words and their tags.
        arcs (list): Individual arcs and their start, end, direction and label.
        RETURNS (unicode): Rendered SVG markup.
        """
        self.levels = self.get_levels(arcs)
        self.highest_level = len(self.levels)
        self.offset_y = self.distance/2*self.highest_level + self.arrow_stroke
        self.width = self.offset_x + len(words)*self.distance
        self.height = self.offset_y + 3*self.word_spacing
        self.id = render_id
        words = [self.render_word(w['text'], w['tag'], i)
                 for i, w in enumerate(words)]
        arcs = [self.render_arrow(a['label'], a['start'], a['end'], a['dir'], i)
                for i, a in enumerate(arcs)]
        content = ''.join(words) + ''.join(arcs)
        return TPL_DEP_SVG.format(id=self.id, width=self.width, height=self.height,
                                  color=self.color, bg=self.bg, font=self.font,
                                  content=content)

    def render_word(self, text, tag, i):
        """Render individual word.

        text (unicode): Word text.
        tag (unicode): Part-of-speech tag.
        i (int): Unique ID, typically word index.
        RETURNS (unicode): Rendered SVG markup.
        """
        y = self.offset_y + self.word_spacing
        x = self.offset_x + i*self.distance
        return TPL_DEP_WORDS.format(text=text, tag=tag, x=x, y=y)

    def render_arrow(self, label, start, end, direction, i):
        """Render individual arrow.

        label (unicode): Dependency label.
        start (int): Index of start word.
        end (int): Index of end word.
        direction (unicode): Arrow direction, 'left' or 'right'.
        i (int): Unique ID, typically arrow index.
        RETURNS (unicode): Rendered SVG markup.
        """
        # The level is the 1-based rank of this arc's length among all arc
        # lengths of the current parse; longer arcs are drawn higher.
        level = self.levels.index(end-start) + 1
        x_start = self.offset_x + start*self.distance + self.arrow_spacing
        y = self.offset_y
        x_end = (self.offset_x + (end-start)*self.distance + start*self.distance
                 - self.arrow_spacing*(self.highest_level-level)/4)
        y_curve = self.offset_y - level*self.distance/2
        if y_curve == 0 and len(self.levels) > 5:
            y_curve = -self.distance
        arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
        arc = self.get_arc(x_start, y, y_curve, x_end)
        return TPL_DEP_ARCS.format(id=self.id, i=i, stroke=self.arrow_stroke,
                                   head=arrowhead, label=label, arc=arc)

    def get_arc(self, x_start, y, y_curve, x_end):
        """Render individual arc.

        x_start (int): X-coordinate of arrow start point.
        y (int): Y-coordinate of arrow start and end point.
        y_curve (int): Y-coordinate of Cubic Bézier y_curve point.
        x_end (int): X-coordinate of arrow end point.
        RETURNS (unicode): Definition of the arc path ('d' attribute).
        """
        template = "M{x},{y} C{x},{c} {e},{c} {e},{y}"
        if self.compact:
            # Compact mode omits the curve command, giving square arcs.
            template = "M{x},{y} {x},{c} {e},{c} {e},{y}"
        return template.format(x=x_start, y=y, c=y_curve, e=x_end)

    def get_arrowhead(self, direction, x, y, end):
        """Render individual arrow head.

        direction (unicode): Arrow direction, 'left' or 'right'.
        x (int): X-coordinate of arrow start point.
        y (int): Y-coordinate of arrow start and end point.
        end (int): X-coordinate of arrow end point.
        RETURNS (unicode): Definition of the arrow head path ('d' attribute).
        """
        # Compare by value: an identity check (`is`) fails for equal strings
        # that are not the same object and would put the head on the wrong end.
        if direction == 'left':
            pos1, pos2, pos3 = (x, x-self.arrow_width+2, x+self.arrow_width-2)
        else:
            pos1, pos2, pos3 = (end, end+self.arrow_width-2, end-self.arrow_width+2)
        arrowhead = (pos1, y+2, pos2, y-self.arrow_width, pos3, y-self.arrow_width)
        return "M{},{} L{},{} {},{}".format(*arrowhead)

    def get_levels(self, arcs):
        """Calculate available arc height "levels".
        Used to calculate arrow heights dynamically and without wasting space.

        arcs (list): Individual arcs and their start, end, direction and label.
        RETURNS (list): Arc levels sorted from lowest to highest.
        """
        # One level per distinct arc length (end - start).
        return sorted({arc['end'] - arc['start'] for arc in arcs})
class EntityRenderer(object):
    """Render named entities as HTML."""
    style = 'ent'

    def __init__(self, options={}):
        """Initialise entity renderer.

        options (dict): Visualiser-specific options (colors, ents)
        """
        palette = {'ORG': '#7aecec', 'PRODUCT': '#bfeeb7', 'GPE': '#feca74',
                   'LOC': '#ff9561', 'PERSON': '#aa9cfc', 'NORP': '#c887fb',
                   'FACILITY': '#9cc9cc', 'EVENT': '#ffeb80', 'LANGUAGE': '#ff8197',
                   'WORK_OF_ART': '#f0d0ff', 'DATE': '#bfe1d9', 'TIME': '#bfe1d9',
                   'MONEY': '#e4e7d2', 'QUANTITY': '#e4e7d2', 'ORDINAL': '#e4e7d2',
                   'CARDINAL': '#e4e7d2', 'PERCENT': '#e4e7d2'}
        # User-supplied colors override or extend the built-in palette.
        palette.update(options.get('colors', {}))
        self.default_color = '#ddd'
        self.colors = palette
        # None means "highlight every entity label".
        self.ents = options.get('ents', None)

    def render(self, parsed, page=False, minify=False):
        """Render complete markup.

        parsed (list): Parsed entity data (dicts with text, ents and title).
        page (bool): Render documents wrapped as full HTML page.
        minify (bool): Minify HTML markup.
        RETURNS (unicode): Rendered HTML markup.
        """
        rendered = []
        for item in parsed:
            rendered.append(self.render_ents(item['text'], item['ents'],
                                             item['title']))
        if page:
            figures = [TPL_FIGURE.format(content=doc) for doc in rendered]
            markup = TPL_PAGE.format(content=''.join(figures))
        else:
            markup = ''.join(rendered)
        return minify_html(markup) if minify else markup

    def render_ents(self, text, spans, title):
        """Render entities in text.

        text (unicode): Original text.
        spans (list): Individual entity spans and their start, end and label.
        title (unicode or None): Document title set in Doc.user_data['title'].
        RETURNS (unicode): Rendered HTML markup.
        """
        pieces = []
        cursor = 0
        for span in spans:
            label, start, end = span['label'], span['start'], span['end']
            entity = text[start:end]
            # Text between the previous entity and this one, with newlines
            # turned into line break markup.
            pieces.append('</br>'.join(text[cursor:start].split('\n')))
            if self.ents is None or label.upper() in self.ents:
                color = self.colors.get(label.upper(), self.default_color)
                pieces.append(TPL_ENT.format(label=label, text=entity, bg=color))
            else:
                # Label is filtered out: keep the entity text unhighlighted.
                pieces.append(entity)
            cursor = end
        # Remaining text after the last entity is appended as-is.
        pieces.append(text[cursor:])
        markup = TPL_ENTS.format(content=''.join(pieces), colors=self.colors)
        if title:
            markup = TPL_TITLE.format(title=title) + markup
        return markup

View File

@ -0,0 +1,63 @@
# coding: utf8
from __future__ import unicode_literals
# setting explicit height and max-width: none on the SVG is required for
# Jupyter to render it properly in a cell
# Outer <svg> element of one dependency parse.
# Placeholders: id, width, height, color, bg, font, content.
TPL_DEP_SVG = """
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" id="{id}" class="displacy" width="{width}" height="{height}" style="max-width: none; height: {height}px; color: {color}; background: {bg}; font-family: {font}">{content}</svg>
"""
# One token: the word and, below it, its tag.
# Placeholders: x, y, text, tag.
TPL_DEP_WORDS = """
<text class="displacy-token" fill="currentColor" text-anchor="middle" y="{y}">
<tspan class="displacy-word" fill="currentColor" x="{x}">{text}</tspan>
<tspan class="displacy-tag" dy="2em" fill="currentColor" x="{x}">{tag}</tspan>
</text>
"""
# One dependency arrow: arc path, label placed along the arc, and arrow head.
# Placeholders: id, i, stroke, arc, label, head.
TPL_DEP_ARCS = """
<g class="displacy-arrow">
<path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="currentColor"/>
<text dy="1.25em" style="font-size: 0.8em">
<textPath xlink:href="#arrow-{id}-{i}" class="displacy-label" startOffset="50%" fill="currentColor" text-anchor="middle">{label}</textPath>
</text>
<path class="displacy-arrowhead" d="{head}" fill="currentColor"/>
</g>
"""
# Wrapper around a single rendered document. Placeholder: content.
TPL_FIGURE = """
<figure style="margin-bottom: 6rem">{content}</figure>
"""
# Optional document headline. Placeholder: title.
TPL_TITLE = """
<h2 style="margin: 0">{title}</h2>
"""
# Container for a text with highlighted entities. Placeholder: content.
TPL_ENTS = """
<div class="entities" style="line-height: 2.5">{content}</div>
"""
# One highlighted entity with its label. Placeholders: bg, text, label.
TPL_ENT = """
<mark class="entity" style="background: {bg}; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone">
{text}
<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">{label}</span>
</mark>
"""
# Full standalone HTML page. Placeholder: content.
TPL_PAGE = """
<!DOCTYPE html>
<html>
<head>
<title>displaCy</title>
</head>
<body style="font-size: 16px; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; padding: 4rem 2rem;">{content}</body>
</html>
"""

View File

@ -18,67 +18,67 @@ _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
@pytest.fixture(params=_languages) @pytest.fixture(params=_languages)
def tokenizer(request): def tokenizer(request):
lang = util.load_lang_class(request.param) lang = util.get_lang_class(request.param)
return lang.Defaults.create_tokenizer() return lang.Defaults.create_tokenizer()
@pytest.fixture @pytest.fixture
def en_tokenizer(): def en_tokenizer():
return util.load_lang_class('en').Defaults.create_tokenizer() return util.get_lang_class('en').Defaults.create_tokenizer()
@pytest.fixture @pytest.fixture
def en_vocab(): def en_vocab():
return util.load_lang_class('en').Defaults.create_vocab() return util.get_lang_class('en').Defaults.create_vocab()
@pytest.fixture @pytest.fixture
def en_parser(): def en_parser():
return util.load_lang_class('en').Defaults.create_parser() return util.get_lang_class('en').Defaults.create_parser()
@pytest.fixture @pytest.fixture
def es_tokenizer(): def es_tokenizer():
return util.load_lang_class('es').Defaults.create_tokenizer() return util.get_lang_class('es').Defaults.create_tokenizer()
@pytest.fixture @pytest.fixture
def de_tokenizer(): def de_tokenizer():
return util.load_lang_class('de').Defaults.create_tokenizer() return util.get_lang_class('de').Defaults.create_tokenizer()
@pytest.fixture(scope='module') @pytest.fixture(scope='module')
def fr_tokenizer(): def fr_tokenizer():
return util.load_lang_class('fr').Defaults.create_tokenizer() return util.get_lang_class('fr').Defaults.create_tokenizer()
@pytest.fixture @pytest.fixture
def hu_tokenizer(): def hu_tokenizer():
return util.load_lang_class('hu').Defaults.create_tokenizer() return util.get_lang_class('hu').Defaults.create_tokenizer()
@pytest.fixture @pytest.fixture
def fi_tokenizer(): def fi_tokenizer():
return util.load_lang_class('fi').Defaults.create_tokenizer() return util.get_lang_class('fi').Defaults.create_tokenizer()
@pytest.fixture @pytest.fixture
def sv_tokenizer(): def sv_tokenizer():
return util.load_lang_class('sv').Defaults.create_tokenizer() return util.get_lang_class('sv').Defaults.create_tokenizer()
@pytest.fixture @pytest.fixture
def bn_tokenizer(): def bn_tokenizer():
return util.load_lang_class('bn').Defaults.create_tokenizer() return util.get_lang_class('bn').Defaults.create_tokenizer()
@pytest.fixture @pytest.fixture
def he_tokenizer(): def he_tokenizer():
return util.load_lang_class('he').Defaults.create_tokenizer() return util.get_lang_class('he').Defaults.create_tokenizer()
@pytest.fixture @pytest.fixture
def nb_tokenizer(): def nb_tokenizer():
return util.load_lang_class('nb').Defaults.create_tokenizer() return util.get_lang_class('nb').Defaults.create_tokenizer()
@pytest.fixture @pytest.fixture
@ -88,12 +88,12 @@ def stringstore():
@pytest.fixture @pytest.fixture
def en_entityrecognizer(): def en_entityrecognizer():
return util.load_lang_class('en').Defaults.create_entity() return util.get_lang_class('en').Defaults.create_entity()
@pytest.fixture @pytest.fixture
def lemmatizer(): def lemmatizer():
return util.load_lang_class('en').Defaults.create_lemmatizer() return util.get_lang_class('en').Defaults.create_lemmatizer()
@pytest.fixture @pytest.fixture

View File

@ -25,39 +25,37 @@ try:
except ImportError: except ImportError:
cupy = None cupy = None
def set_lang_class(name, cls): def get_lang_class(lang):
"""Import and load a Language class.
lang (unicode): Two-letter language code, e.g. 'en'.
RETURNS (Language): Language class.
"""
global LANGUAGES global LANGUAGES
LANGUAGES[name] = cls if not lang in LANGUAGES:
try:
module = importlib.import_module('.lang.%s' % lang, 'spacy')
def get_lang_class(name): except ImportError:
if name in LANGUAGES: raise ImportError("Can't import language %s from spacy.lang." %lang)
return LANGUAGES[name] LANGUAGES[lang] = getattr(module, module.__all__[0])
lang = re.split('[^a-zA-Z0-9]', name, 1)[0]
if lang not in LANGUAGES:
raise RuntimeError('Language not supported: %s' % name)
return LANGUAGES[lang] return LANGUAGES[lang]
def load_lang_class(lang): def set_lang_class(name, cls):
"""Import and load a Language class. """Set a custom Language class name that can be loaded via get_lang_class.
Args: name (unicode): Name of Language class.
lang (unicode): Two-letter language code, e.g. 'en'. cls (Language): Language class.
Returns:
Language: Language class.
""" """
module = importlib.import_module('.lang.%s' % lang, 'spacy') global LANGUAGES
return getattr(module, module.__all__[0]) LANGUAGES[name] = cls
def get_data_path(require_exists=True): def get_data_path(require_exists=True):
"""Get path to spaCy data directory. """Get path to spaCy data directory.
Args: require_exists (bool): Only return path if it exists, otherwise None.
require_exists (bool): Only return path if it exists, otherwise None. RETURNS (Path or None): Data path or None.
Returns:
Path or None: Data path or None.
""" """
if not require_exists: if not require_exists:
return _data_path return _data_path
@ -68,14 +66,18 @@ def get_data_path(require_exists=True):
def set_data_path(path): def set_data_path(path):
"""Set path to spaCy data directory. """Set path to spaCy data directory.
Args: path (unicode or Path): Path to new data directory.
path (unicode or Path): Path to new data directory.
""" """
global _data_path global _data_path
_data_path = ensure_path(path) _data_path = ensure_path(path)
def ensure_path(path): def ensure_path(path):
"""Ensure string is converted to a Path.
path: Anything. If string, it's converted to Path.
RETURNS: Path or original argument.
"""
if isinstance(path, basestring_): if isinstance(path, basestring_):
return Path(path) return Path(path)
else: else:
@ -85,10 +87,8 @@ def ensure_path(path):
def resolve_model_path(name): def resolve_model_path(name):
"""Resolve a model name or string to a model path. """Resolve a model name or string to a model path.
Args: name (unicode): Package name, shortcut link or model path.
name (unicode): Package name, shortcut link or model path. RETURNS (Path): Path to model data directory.
Returns:
Path: Path to model data directory.
""" """
data_path = get_data_path() data_path = get_data_path()
if not data_path or not data_path.exists(): if not data_path or not data_path.exists():
@ -108,11 +108,8 @@ def resolve_model_path(name):
def is_package(name): def is_package(name):
"""Check if string maps to a package installed via pip. """Check if string maps to a package installed via pip.
Args: name (unicode): Name of package.
name (unicode): Name of package. RETURNS (bool): True if installed package, False if not.
Returns:
bool: True if installed package, False if not.
""" """
packages = pip.get_installed_distributions() packages = pip.get_installed_distributions()
for package in packages: for package in packages:
@ -124,10 +121,8 @@ def is_package(name):
def get_model_package_path(package_name): def get_model_package_path(package_name):
"""Get path to a model package installed via pip. """Get path to a model package installed via pip.
Args: package_name (unicode): Name of installed package.
package_name (unicode): Name of installed package. RETURNS (Path): Path to model data directory.
Returns:
Path: Path to model data directory.
""" """
# Here we're importing the module just to find it. This is worryingly # Here we're importing the module just to find it. This is worryingly
# indirect, but it's otherwise very difficult to find the package. # indirect, but it's otherwise very difficult to find the package.
@ -142,11 +137,9 @@ def get_model_package_path(package_name):
def parse_package_meta(package_path, require=True): def parse_package_meta(package_path, require=True):
"""Check if a meta.json exists in a package and return its contents. """Check if a meta.json exists in a package and return its contents.
Args: package_path (Path): Path to model package directory.
package_path (Path): Path to model package directory. require (bool): If True, raise error if no meta.json is found.
require (bool): If True, raise error if no meta.json is found. RETURNS (dict or None): Model meta.json data or None.
Returns:
dict or None: Model meta.json data or None.
""" """
location = package_path / 'meta.json' location = package_path / 'meta.json'
if location.is_file(): if location.is_file():
@ -201,11 +194,9 @@ def compile_infix_regex(entries):
def update_exc(base_exceptions, *addition_dicts): def update_exc(base_exceptions, *addition_dicts):
"""Update and validate tokenizer exceptions. Will overwrite exceptions. """Update and validate tokenizer exceptions. Will overwrite exceptions.
Args: base_exceptions (dict): Base exceptions.
base_exceptions (dict): Base exceptions. *addition_dicts (dict): Exceptions to add to the base dict, in order.
*addition_dicts (dict): Exceptions to add to the base dict, in order. RETURNS (dict): Combined tokenizer exceptions.
Returns:
dict: Combined tokenizer exceptions.
""" """
exc = dict(base_exceptions) exc = dict(base_exceptions)
for additions in addition_dicts: for additions in addition_dicts:
@ -229,12 +220,10 @@ def expand_exc(excs, search, replace):
"""Find string in tokenizer exceptions, duplicate entry and replace string. """Find string in tokenizer exceptions, duplicate entry and replace string.
For example, to add additional versions with typographic apostrophes. For example, to add additional versions with typographic apostrophes.
Args: excs (dict): Tokenizer exceptions.
excs (dict): Tokenizer exceptions. search (unicode): String to find and replace.
search (unicode): String to find and replace. replace (unicode): Replacement.
replace (unicode): Replacement. RETURNS (dict): Combined tokenizer exceptions.
Returns:
dict:
""" """
def _fix_token(token, search, replace): def _fix_token(token, search, replace):
fixed = dict(token) fixed = dict(token)
@ -278,10 +267,8 @@ def check_renamed_kwargs(renamed, kwargs):
def read_json(location): def read_json(location):
"""Open and load JSON from file. """Open and load JSON from file.
Args: location (Path): Path to JSON file.
location (Path): Path to JSON file. RETURNS (dict): Loaded JSON content.
Returns:
dict: Loaded JSON content.
""" """
with location.open('r', encoding='utf8') as f: with location.open('r', encoding='utf8') as f:
return ujson.load(f) return ujson.load(f)
@ -290,11 +277,9 @@ def read_json(location):
def get_raw_input(description, default=False): def get_raw_input(description, default=False):
"""Get user input from the command line via raw_input / input. """Get user input from the command line via raw_input / input.
Args: description (unicode): Text to display before prompt.
description (unicode): Text to display before prompt. default (unicode or False/None): Default value to display with prompt.
default (unicode or False/None): Default value to display with prompt. RETURNS (unicode): User input.
Returns:
unicode: User input.
""" """
additional = ' (default: %s)' % default if default else '' additional = ' (default: %s)' % default if default else ''
prompt = ' %s%s: ' % (description, additional) prompt = ' %s%s: ' % (description, additional)
@ -305,9 +290,8 @@ def get_raw_input(description, default=False):
def print_table(data, title=None): def print_table(data, title=None):
"""Print data in table format. """Print data in table format.
Args: data (dict or list of tuples): Label/value pairs.
data (dict or list of tuples): Label/value pairs. title (unicode or None): Title, will be printed above.
title (unicode or None): Title, will be printed above.
""" """
if isinstance(data, dict): if isinstance(data, dict):
data = list(data.items()) data = list(data.items())
@ -321,9 +305,8 @@ def print_table(data, title=None):
def print_markdown(data, title=None): def print_markdown(data, title=None):
"""Print data in GitHub-flavoured Markdown format for issues etc. """Print data in GitHub-flavoured Markdown format for issues etc.
Args: data (dict or list of tuples): Label/value pairs.
data (dict or list of tuples): Label/value pairs. title (unicode or None): Title, will be rendered as headline 2.
title (unicode or None): Title, will be rendered as headline 2.
""" """
def excl_value(value): def excl_value(value):
return Path(value).exists() # contains path (personal info) return Path(value).exists() # contains path (personal info)
@ -339,10 +322,8 @@ def print_markdown(data, title=None):
def prints(*texts, **kwargs): def prints(*texts, **kwargs):
"""Print formatted message (manual ANSI escape sequences to avoid dependency) """Print formatted message (manual ANSI escape sequences to avoid dependency)
Args: *texts (unicode): Texts to print. Each argument is rendered as paragraph.
*texts (unicode): Texts to print. Each argument is rendered as paragraph. **kwargs: 'title' becomes coloured headline. 'exits'=True performs sys exit.
**kwargs: 'title' is rendered as coloured headline. 'exits'=True performs
system exit after printing.
""" """
exits = kwargs.get('exits', False) exits = kwargs.get('exits', False)
title = kwargs.get('title', None) title = kwargs.get('title', None)
@ -356,12 +337,10 @@ def prints(*texts, **kwargs):
def _wrap(text, wrap_max=80, indent=4): def _wrap(text, wrap_max=80, indent=4):
"""Wrap text at given width using textwrap module. """Wrap text at given width using textwrap module.
Args: text (unicode): Text to wrap. If it's a Path, it's converted to string.
text (unicode): Text to wrap. If it's a Path, it's converted to string. wrap_max (int): Maximum line length (indent is deducted).
wrap_max (int): Maximum line length (indent is deducted). indent (int): Number of spaces for indentation.
indent (int): Number of spaces for indentation. RETURNS (unicode): Wrapped text.
Returns:
unicode: Wrapped text.
""" """
indent = indent * ' ' indent = indent * ' '
wrap_width = wrap_max - len(indent) wrap_width = wrap_max - len(indent)
@ -370,3 +349,13 @@ def _wrap(text, wrap_max=80, indent=4):
return textwrap.fill(text, width=wrap_width, initial_indent=indent, return textwrap.fill(text, width=wrap_width, initial_indent=indent,
subsequent_indent=indent, break_long_words=False, subsequent_indent=indent, break_long_words=False,
break_on_hyphens=False) break_on_hyphens=False)
def minify_html(html):
    """Perform a template-specific, rudimentary HTML minification for displaCy.
    Disclaimer: NOT a general-purpose solution, only removes indentation/newlines.

    html (unicode): Markup to minify.
    RETURNS (unicode): "Minified" HTML.
    """
    # Only strip each line's leading whitespace and drop the newlines. Spaces
    # inside a line must survive: removing them would fuse tag names with
    # their attributes and run the words of the text together.
    return ''.join(line.lstrip() for line in html.strip().split('\n'))

View File

@ -3,7 +3,6 @@
include _includes/_mixins include _includes/_mixins
doctype html doctype html
html(lang="en") html(lang="en")
title title
if SECTION == "docs" && SUBSECTION && SUBSECTION != "index" if SECTION == "docs" && SUBSECTION && SUBSECTION != "index"

Binary file not shown.

After

Width:  |  Height:  |  Size: 99 KiB

View File

@ -21,6 +21,7 @@
"GoldParse": "goldparse" "GoldParse": "goldparse"
}, },
"Other": { "Other": {
"displaCy": "displacy",
"Utility Functions": "util", "Utility Functions": "util",
"Annotation Specs": "annotation", "Annotation Specs": "annotation",
"Feature Scheme": "features" "Feature Scheme": "features"
@ -111,6 +112,11 @@
"tag": "class" "tag": "class"
}, },
"displacy": {
"title": "displaCy",
"tag": "module"
},
"util": { "util": {
"title": "Utility Functions" "title": "Utility Functions"
}, },

View File

@ -0,0 +1,229 @@
//- 💫 DOCS > API > DISPLACY
include ../../_includes/_mixins
p
| As of v2.0, spaCy comes with a built-in visualization suite. For more
| info and examples, see the usage workflow on
| #[+a("/docs/usage/visualizers") visualizing spaCy].
+h(2, "serve") serve
+tag method
p
| Serve a dependency parse tree or named entity visualization to view it
| in your browser. Will run a simple web server.
+aside-code("Example").
import spacy
from spacy import displacy
nlp = spacy.load('en')
doc1 = nlp(u'This is a sentence.')
doc2 = nlp(u'This is another sentence.')
displacy.serve([doc1, doc2], style='dep')
+table(["Name", "Type", "Description", "Default"])
+row
+cell #[code docs]
+cell list or #[code Doc]
+cell Document(s) to visualize.
+cell
+row
+cell #[code style]
+cell unicode
+cell Visualization style, #[code 'dep'] or #[code 'ent'].
+cell #[code 'dep']
+row
+cell #[code page]
+cell bool
+cell Render markup as full HTML page.
+cell #[code True]
+row
+cell #[code minify]
+cell bool
+cell Minify HTML markup.
+cell #[code False]
+row
+cell #[code options]
+cell dict
+cell #[+a("#options") Visualizer-specific options], e.g. colors.
+cell #[code {}]
+row
+cell #[code port]
+cell int
+cell Port to serve visualization.
+cell #[code 5000]
+h(2, "render") render
+tag method
p Render a dependency parse tree or named entity visualization.
+aside-code("Example").
import spacy
from spacy import displacy
nlp = spacy.load('en')
doc = nlp(u'This is a sentence.')
html = displacy.render(doc, style='dep')
+table(["Name", "Type", "Description", "Default"])
+row
+cell #[code docs]
+cell list or #[code Doc]
+cell Document(s) to visualize.
+cell
+row
+cell #[code style]
+cell unicode
+cell Visualization style, #[code 'dep'] or #[code 'ent'].
+cell #[code 'dep']
+row
+cell #[code page]
+cell bool
+cell Render markup as full HTML page.
+cell #[code False]
+row
+cell #[code minify]
+cell bool
+cell Minify HTML markup.
+cell #[code False]
+row
+cell #[code jupyter]
+cell bool
+cell
| Returns markup using #[+a("http://jupyter.org/") Jupyter]'s
| internal methods, ready to be rendered in a notebook.
+cell #[code False]
+row
+cell #[code options]
+cell dict
+cell #[+a("#options") Visualizer-specific options], e.g. colors.
+cell #[code {}]
+footrow
+cell return
+cell unicode
+cell Rendered HTML markup.
+cell
+h(2, "options") Visualizer options
p
| The #[code options] argument lets you specify additional settings for
| each visualizer. If a setting is not present in the options, the default
| value will be used.
+h(3, "options-dep") Dependency Visualizer options
+aside-code("Example").
options = {'compact': True, 'color': 'blue'}
displacy.serve(doc, style='dep', options=options)
+table(["Name", "Type", "Description", "Default"])
+row
+cell #[code collapse_punct]
+cell bool
+cell
| Attach punctuation to tokens. Can make the parse more readable,
| as it prevents long arcs to attach punctuation.
+cell #[code True]
+row
+cell #[code compact]
+cell bool
+cell "Compact mode" with square arrows that takes up less space.
+cell #[code False]
+row
+cell #[code color]
+cell unicode
+cell Text color (HEX, RGB or color names).
+cell #[code '#000000']
+row
+cell #[code bg]
+cell unicode
+cell Background color (HEX, RGB or color names).
+cell #[code '#ffffff']
+row
+cell #[code font]
+cell unicode
+cell Font name or font family for all text.
+cell #[code 'Arial']
+row
+cell #[code offset_x]
+cell int
+cell Spacing on left side of the SVG in px.
+cell #[code 50]
+row
+cell #[code arrow_stroke]
+cell int
+cell Width of arrow path in px.
+cell #[code 2]
+row
+cell #[code arrow_width]
+cell int
+cell Width of arrow head in px.
+cell #[code 10] / #[code 8] (compact)
+row
+cell #[code arrow_spacing]
+cell int
+cell Spacing between arrows in px to avoid overlaps.
+cell #[code 20]
+row
+cell #[code word_spacing]
+cell int
+cell Horizontal spacing between words and arcs in px.
+cell #[code 45]
+row
+cell #[code distance]
+cell int
+cell Distance between words in px.
+cell #[code 175] / #[code 85] (compact)
+h(3, "options-ent") Named Entity Visualizer options
+aside-code("Example").
options = {'ents': ['PERSON', 'ORG', 'PRODUCT'],
'colors': {'ORG': 'yellow'}}
displacy.serve(doc, style='ent', options=options)
+table(["Name", "Type", "Description", "Default"])
+row
+cell #[code ents]
+cell list
+cell
| Entity types to highlight (#[code None] for all types).
+cell #[code None]
+row
+cell #[code colors]
+cell dict
+cell
| Color overrides. Entity types in lowercase should be mapped to
| color names or values.
+cell #[code {}]
p
| By default, displaCy comes with colours for all
| #[+a("/docs/api/annotation#named-entities") entity types supported by spaCy].
| If you're using custom entity types, you can use the #[code colors]
| setting to add your own colours for them.

View File

@ -49,7 +49,7 @@ p
+cell unicode or #[code Path] +cell unicode or #[code Path]
+cell Path to new data directory. +cell Path to new data directory.
+h(2, "load_lang_class") load_lang_class +h(2, "get_lang_class") get_lang_class
+tag function +tag function
p p
@ -59,7 +59,7 @@ p
+aside-code("Example"). +aside-code("Example").
for lang_id in ['en', 'de']: for lang_id in ['en', 'de']:
lang_class = util.load_lang_class(lang_id) lang_class = util.get_lang_class(lang_id)
lang = lang_class() lang = lang_class()
tokenizer = lang.Defaults.create_tokenizer() tokenizer = lang.Defaults.create_tokenizer()

View File

@ -4,9 +4,9 @@
"Installation": "./", "Installation": "./",
"Models": "models", "Models": "models",
"Lightning tour": "lightning-tour", "Lightning tour": "lightning-tour",
"Visualizers": "visualizers",
"Command line": "cli", "Command line": "cli",
"Troubleshooting": "troubleshooting", "Troubleshooting": "troubleshooting"
"Resources": "resources"
}, },
"Workflows": { "Workflows": {
"Loading the pipeline": "language-processing-pipeline", "Loading the pipeline": "language-processing-pipeline",
@ -43,6 +43,11 @@
"lightning-tour": { "lightning-tour": {
"title": "Lightning tour", "title": "Lightning tour",
"next": "visualizers"
},
"visualizers": {
"title": "Visualizers",
"next": "cli" "next": "cli"
}, },

View File

@ -80,7 +80,7 @@ p
| compute. As of spaCy v2.0, #[code Language] classes are not imported on | compute. As of spaCy v2.0, #[code Language] classes are not imported on
| initialisation and are only loaded when you import them directly, or load | initialisation and are only loaded when you import them directly, or load
| a model that requires a language to be loaded. To lazy-load languages in | a model that requires a language to be loaded. To lazy-load languages in
| your application, you can use the #[code util.load_lang_class()] helper | your application, you can use the #[code util.get_lang_class()] helper
| function with the two-letter language code as its argument. | function with the two-letter language code as its argument.
+h(2, "language-data") Adding language data +h(2, "language-data") Adding language data
@ -486,7 +486,7 @@ p
| #[+src(gh("spaCy", "spacy/tests/lang")) tests/lang] in a directory named | #[+src(gh("spaCy", "spacy/tests/lang")) tests/lang] in a directory named
| after the language ID. You'll also need to create a fixture for your | after the language ID. You'll also need to create a fixture for your
| tokenizer in the #[+src(gh("spaCy", "spacy/tests/conftest.py")) conftest.py]. | tokenizer in the #[+src(gh("spaCy", "spacy/tests/conftest.py")) conftest.py].
| Always use the #[code load_lang_class()] helper function within the fixture, | Always use the #[code get_lang_class()] helper function within the fixture,
| instead of importing the class at the top of the file. This will load the | instead of importing the class at the top of the file. This will load the
| language data only when it's needed. (Otherwise, #[em all data] would be | language data only when it's needed. (Otherwise, #[em all data] would be
| loaded every time you run a test.) | loaded every time you run a test.)
@ -494,7 +494,7 @@ p
+code. +code.
@pytest.fixture @pytest.fixture
def en_tokenizer(): def en_tokenizer():
return util.load_lang_class('en').Defaults.create_tokenizer() return util.get_lang_class('en').Defaults.create_tokenizer()
p p
| When adding test cases, always | When adding test cases, always

View File

@ -105,17 +105,36 @@ p
| consistent state. | consistent state.
+h(2, "displacy") The displaCy #[sup ENT] visualizer +h(2, "displacy") Visualizing named entities
p p
| The #[+a(DEMOS_URL + "/displacy-ent/") displaCy #[sup ENT] visualizer] | The #[+a(DEMOS_URL + "/displacy-ent/") displaCy #[sup ENT] visualizer]
| lets you explore an entity recognition model's behaviour interactively. | lets you explore an entity recognition model's behaviour interactively.
| If you're training a model, it's very useful to run the visualization | If you're training a model, it's very useful to run the visualization
| server yourself. To help you do that, we've open-sourced both the | yourself. To help you do that, spaCy v2.0+ comes with a visualization
| #[+a(gh("spacy-services")) back-end service] and the | module. Simply pass a #[code Doc] or a list of #[code Doc] objects to
| #[+a(gh("displacy-ent")) front-end client]. | displaCy and run #[+api("displacy#serve") #[code displacy.serve]] to
| run the web server, or #[+api("displacy#render") #[code displacy.render]]
| to generate the raw markup.
+codepen("ALxpQO", 450) p
| For more details and examples, see the
| #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy].
+code("Named Entity example").
import spacy
from spacy import displacy
text = """But Google is starting from behind. The company made a late push
into hardware, and Apple's Siri, available on iPhones, and Amazon's Alexa
software, which runs on its Echo and Dot devices, have clear leads in
consumer adoption."""
nlp = spacy.load('custom_ner_model')
doc = nlp(text)
displacy.serve(doc, style='ent')
+codepen("a73f8b68f9af3157855962b283b364e4", 345)
+h(2, "entity-types") Built-in entity types +h(2, "entity-types") Built-in entity types

View File

@ -0,0 +1,278 @@
//- 💫 DOCS > USAGE > VISUALIZERS
include ../../_includes/_mixins
p
| As of v2.0, our popular visualizers, #[+a(DEMOS_URL + "/displacy") displaCy]
| and #[+a(DEMOS_URL + "/displacy-ent") displaCy #[sup ENT]] are finally an
| official part of the library. Visualizing a dependency parse or named
| entities in a text is not only a fun NLP demo — it can also be incredibly
| helpful in speeding up development and debugging your code and training
| process. Instead of printing a list of dependency labels or entity spans,
| you can simply pass your #[code Doc] objects to #[code displacy] and view
| the visualizations in your browser, or export them as HTML files or
| vector graphics. displaCy also comes with a #[+a("#jupyter") Jupyter hook]
| that returns the markup in a format ready to be rendered in a notebook.
+aside("What about the old visualizers?")
| Our JavaScript-based visualizers #[+src(gh("displacy")) displacy.js] and
| #[+src(gh("displacy-ent")) displacy-ent.js] will still be available on
| GitHub. If you're looking to implement web-based visualizations, we
| generally recommend using those instead of spaCy's built-in
| #[code displacy] module. It'll allow your application to perform all
| rendering on the client and only rely on the server for the text
| processing. The generated markup is also more compatible with modern web
| standards.
+h(2, "getting-started") Getting started
p
| The quickest way to visualize a #[code Doc] is to use
| #[+api("displacy#serve") #[code displacy.serve]]. This will spin up a
| simple web server and let you view the result straight from your browser.
| displaCy can either take a single #[code Doc] or a list of #[code Doc]
| objects as its first argument. This lets you construct them however you
| like using any model or modifications you like.
+h(3, "dep") Visualizing the dependency parse
p
| The dependency visualizer, #[code dep], shows part-of-speech tags
| and syntactic dependencies.
+code("Dependency example").
import spacy
from spacy import displacy
nlp = spacy.load('en')
doc = nlp(u'This is a sentence.')
displacy.serve(doc, style='dep')
+codepen("f0e85b64d469d6617251d8241716d55f", 370)
p
| The argument #[code options] lets you specify a dictionary of settings
| to customise the layout, for example:
+table(["Name", "Type", "Description", "Default"])
+row
+cell #[code compact]
+cell bool
+cell "Compact mode" with square arrows that takes up less space.
+cell #[code False]
+row
+cell #[code color]
+cell unicode
+cell Text color (HEX, RGB or color names).
+cell #[code '#000000']
+row
+cell #[code bg]
+cell unicode
+cell Background color (HEX, RGB or color names).
+cell #[code '#ffffff']
+row
+cell #[code font]
+cell unicode
+cell Font name or font family for all text.
+cell #[code 'Arial']
p
| For a list of all available options, see the
| #[+api("displacy#options") #[code displacy] API documentation].
+aside-code("Options example").
options = {'compact': True, 'bg': '&#35;09a3d5',
'color': 'white', 'font': 'Source Sans Pro'}
displacy.serve(doc, style='dep', options=options)
+codepen("39c02c893a84794353de77a605d817fd", 360)
+h(3, "ent") Visualizing the entity recognizer
p
| The entity visualizer, #[code ent], highlights named entities and
| their labels in a text.
+code("Named Entity example").
import spacy
from spacy import displacy
text = """But Google is starting from behind. The company made a late push
into hardware, and Apple's Siri, available on iPhones, and Amazon's Alexa
software, which runs on its Echo and Dot devices, have clear leads in
consumer adoption."""
nlp = spacy.load('custom_ner_model')
doc = nlp(text)
displacy.serve(doc, style='ent')
+codepen("a73f8b68f9af3157855962b283b364e4", 345)
p The entity visualizer lets you customise the following #[code options]:
+table(["Name", "Type", "Description", "Default"])
+row
+cell #[code ents]
+cell list
+cell
| Entity types to highlight (#[code None] for all types).
+cell #[code None]
+row
+cell #[code colors]
+cell dict
+cell
| Color overrides. Entity types in lowercase should be mapped to
| color names or values.
+cell #[code {}]
p
| If you specify a list of #[code ents], only those entity types will be
| rendered — for example, you can choose to display only #[code PERSON] entities.
| Internally, the visualizer knows nothing about available entity types and
| will render whichever spans and labels it receives. This makes it
| especially easy to work with custom entity types. By default, displaCy
| comes with colours for all
| #[+a("/docs/api/annotation#named-entities") entity types supported by spaCy].
| If you're using custom entity types, you can use the #[code colors]
| setting to add your own colours for them.
+aside-code("Options example").
colors = {'ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)'}
options = {'ents': ['ORG'], 'colors': colors}
displacy.serve(doc, style='ent', options=options)
+codepen("f42ec690762b6f007022a7acd6d0c7d4", 300)
p
| The above example uses a little trick: Since the background colour values
| are added as the #[code background] style attribute, you can use any
| #[+a("https://tympanus.net/codrops/css_reference/background/") valid background value]
| or shorthand — including gradients and even images!
+h(2, "render") Rendering visualizations
p
| If you don't need the web server and just want to generate the markup
| — for example, to export it to a file or serve it in a custom
| way — you can use #[+api("displacy#render") #[code displacy.render]]
| instead. It works the same, but returns a string containing the markup.
+code("Example").
import spacy
from spacy import displacy
nlp = spacy.load('en')
doc1 = nlp(u'This is a sentence.')
doc2 = nlp(u'This is another sentence.')
html = displacy.render([doc1, doc2], style='dep', page=True)
p
| #[code page=True] renders the markup wrapped as a full HTML page.
| For minified and more compact HTML markup, you can set #[code minify=True].
| If you're rendering a dependency parse, you can also export it as an
| #[code .svg] file.
+aside("What's SVG?")
| Unlike other image formats, the SVG (Scalable Vector Graphics) uses XML
| markup that's easy to manipulate
| #[+a("https://www.smashingmagazine.com/2014/11/styling-and-animating-svgs-with-css/") using CSS] or
| #[+a("https://css-tricks.com/smil-is-dead-long-live-smil-a-guide-to-alternatives-to-smil-features/") JavaScript].
| Essentially, SVG lets you design with code, which makes it a perfect fit
| for visualizing dependency trees. SVGs can be embedded online in an
| #[code &lt;img&gt;] tag, or inlined in an HTML document. They're also
| pretty easy to #[+a("https://convertio.co/image-converter/") convert].
+code.
svg = displacy.render(doc, style='dep')
output_path = Path('/images/sentence.svg')
output_path.open('w', encoding='utf-8').write(svg)
+infobox("Important note")
| Since each visualization is generated as a separate SVG, exporting
| #[code .svg] files only works if you're rendering #[strong one single doc]
| at a time. (This makes sense — after all, each visualization should be
| a standalone graphic.) So instead of rendering all #[code Doc]s at once,
| loop over them and export them separately.
+h(2, "jupyter") Using displaCy in Jupyter notebooks
p
| If you're working with a #[+a("https://jupyter.org") Jupyter] notebook,
| you can use displaCy's "Jupyter mode" to return markup that can be
| rendered in a cell straight away. When you export your notebook, the
| visualizations will be included as HTML.
+code("Jupyter Example").
# don't forget to install a model, e.g.: python -m spacy download en
import spacy
from spacy import displacy
nlp = spacy.load('en')
doc = nlp(u'Rats are various medium-sized, long-tailed rodents.')
displacy.render(doc, style='dep', jupyter=True)
doc2 = nlp(LONG_NEWS_ARTICLE)
displacy.render(doc2, style='ent', jupyter=True)
+image("/assets/img/docs/displacy_jupyter.jpg", 700, false, "Example of using the displaCy dependency and named entity visualizer in a Jupyter notebook")
p
| Internally, displaCy imports #[code display] and #[code HTML] from
| #[code IPython.core.display] and returns a Jupyter HTML object. If you
| were doing it manually, it'd look like this:
+code.
from IPython.core.display import display, HTML
html = displacy.render(doc, style='dep')
return display(HTML(html))
+h(2, "examples") Usage examples
+h(2, "manual-usage") Rendering data manually
p
| You can also use displaCy to manually render data. This can be useful if
| you want to visualize output from other libraries, like
| #[+a("http://www.nltk.org") NLTK] or
| #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet].
| Simply convert the dependency parse or recognised entities to displaCy's
| format and import #[code DependencyRenderer] or #[code EntityRenderer]
| from #[code spacy.displacy.render]. A renderer class can be initialised
| with a dictionary of options. To generate the visualization markup, call
| the renderer's #[code render()] method on a list of dictionaries (one
| per visualization).
+aside-code("Example").
from spacy.displacy.render import EntityRenderer
ex = [{'text': 'But Google is starting from behind.',
'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
'title': None}]
renderer = EntityRenderer()
html = renderer.render(ex)
+code("DependencyRenderer input").
[{
'words': [
{'text': 'This', 'tag': 'DT'},
{'text': 'is', 'tag': 'VBZ'},
{'text': 'a', 'tag': 'DT'},
{'text': 'sentence', 'tag': 'NN'}],
'arcs': [
{'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'},
{'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'},
{'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}]
}]
+code("EntityRenderer input").
[{
'text': 'But Google is starting from behind.',
'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
'title': None
}]