spaCy/spacy/displacy/__init__.py

# coding: utf8
from __future__ import unicode_literals

from .render import DependencyRenderer, EntityRenderer
from ..tokens import Doc
from ..compat import b_to_str
from ..util import prints, is_in_jupyter


# Holds the most recently rendered markup under the 'parsed' key: written by
# render() and read back by the WSGI app() when serving.
_html = {}
# Evaluated once at import time and used as the default for the `jupyter`
# argument of render().
IS_JUPYTER = is_in_jupyter()


def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
           options={}, manual=False):
    """Render one or more documents as displaCy markup.

    docs (list or Doc): Document(s) to visualise. A single `Doc` or dict is
        wrapped in a list before rendering.
    style (unicode): Visualisation style, 'dep' or 'ent'.
    page (bool): Render markup as full HTML page.
    minify (bool): Minify HTML markup.
    jupyter (bool): Experimental, use Jupyter's `display()` to output markup.
    options (dict): Visualiser-specific options, e.g. colors.
    manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
    RETURNS (unicode): Rendered HTML markup. If `jupyter` is set, the result
        of IPython's `display()` is returned instead.
    """
    # Each style maps to a (renderer class, Doc-to-dict converter) pair.
    visualisers = {'dep': (DependencyRenderer, parse_deps),
                   'ent': (EntityRenderer, parse_ents)}
    if style not in visualisers:
        raise ValueError("Unknown style: %s" % style)
    if isinstance(docs, (Doc, dict)):
        docs = [docs]
    renderer_cls, to_parsed = visualisers[style]
    renderer = renderer_cls(options=options)
    if manual:
        # Input is already in the renderer's dict format.
        parsed = docs
    else:
        parsed = [to_parsed(doc, options) for doc in docs]
    markup = renderer.render(parsed, page=page, minify=minify).strip()
    # Keep the latest markup around so the WSGI app can serve it.
    _html['parsed'] = markup
    if not jupyter:
        return markup
    # Imported lazily so IPython is only needed when actually used.
    from IPython.core.display import display, HTML
    return display(HTML(markup))


def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
          port=5000, host='0.0.0.0'):
    """Serve displaCy visualisation.

    Renders the document(s) once and then serves the resulting markup with a
    simple WSGI server until interrupted via KeyboardInterrupt.

    docs (list or Doc): Document(s) to visualise.
    style (unicode): Visualisation style, 'dep' or 'ent'.
    page (bool): Render markup as full HTML page.
    minify (bool): Minify HTML markup.
    options (dict): Visualiser-specific options, e.g. colors.
    manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
    port (int): Port to serve visualisation.
    host (unicode): Host name or IP address to bind the server to. The
        default '0.0.0.0' is the value that was previously hard-coded; pass
        e.g. '127.0.0.1' to only accept local connections.
    """
    from wsgiref import simple_server
    # The return value is not needed here: render() stores the markup in
    # _html['parsed'], which is what app() sends to clients.
    render(docs, style=style, page=page, minify=minify, options=options,
           manual=manual)
    httpd = simple_server.make_server(host, port, app)
    prints("Using the '%s' visualizer" % style,
           title="Serving on port %d..." % port)
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        prints("Shutting down server on port %d." % port)
    finally:
        # Always release the socket, however the serve loop ended.
        httpd.server_close()


def app(environ, start_response):
    """WSGI application that responds with the most recently rendered markup.

    environ (dict): WSGI environment (not inspected).
    start_response (callable): WSGI callback taking status and headers.
    RETURNS (list): Single-item list containing the UTF-8 encoded markup.
    """
    # headers and status need to be bytes in Python 2, see #1227
    content_type = (b_to_str(b'Content-type'),
                    b_to_str(b'text/html; charset=utf-8'))
    status = b_to_str(b'200 OK')
    start_response(status, [content_type])
    return [_html['parsed'].encode(encoding='utf-8')]


def parse_deps(orig_doc, options={}):
    """Convert a document to displaCy's {'words': [], 'arcs': []} format.

    orig_doc (Doc): Document to parse.
    options (dict): Reads 'collapse_punct' (defaults to True) and
        'fine_grained'.
    RETURNS (dict): Generated dependency parse keyed by words and arcs.
    """
    # Rebuild the Doc from its serialized bytes so the merges below act on a
    # separate object rather than on the Doc that was passed in.
    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
    if options.get('collapse_punct', True):
        merges = []
        for token in doc[:-1]:
            # A merge starts at a non-punctuation token that is directly
            # followed by punctuation and extends over the punctuation run.
            if not token.is_punct and token.nbor(1).is_punct:
                first = token.i
                stop = token.i + 1
                while stop < len(doc) and doc[stop].is_punct:
                    stop += 1
                merged = doc[first:stop]
                merges.append((merged.start_char, merged.end_char,
                               token.tag_, token.lemma_, token.ent_type_))
        # Merge only after all spans are collected, using character offsets.
        for start_char, end_char, tag, lemma, ent_type in merges:
            doc.merge(start_char, end_char, tag, lemma, ent_type)
    # Fine-grained mode shows token.tag_, otherwise token.pos_.
    tag_attr = 'tag_' if options.get('fine_grained') else 'pos_'
    words = [{'text': token.text, 'tag': getattr(token, tag_attr)}
             for token in doc]
    arcs = []
    for token in doc:
        child_i = token.i
        head_i = token.head.i
        if child_i == head_i:
            # Token is its own head: no arc to draw.
            continue
        direction = 'left' if child_i < head_i else 'right'
        arcs.append({'start': min(child_i, head_i),
                     'end': max(child_i, head_i),
                     'label': token.dep_, 'dir': direction})
    return {'words': words, 'arcs': arcs}


def parse_ents(doc, options={}):
    """Collect named entities as [{start: i, end: i, label: 'label'}] dicts.

    doc (Doc): Document to parse.
    options (dict): Accepted for parity with the other converters; not used.
    RETURNS (dict): Generated entities keyed by text (original text), ents
        and title (taken from `doc.user_data`, if available).
    """
    ents = []
    for ent in doc.ents:
        ents.append({'start': ent.start_char,
                     'end': ent.end_char,
                     'label': ent.label_})
    # The title is optional and only looked up if the object has user data.
    title = None
    if hasattr(doc, 'user_data'):
        title = doc.user_data.get('title', None)
    return {'text': doc.text, 'ents': ents, 'title': title}
Add displaCy visualisers (see #1058) 2017-05-14 18:50:23 +03:00			`# coding: utf8`
			`from __future__ import unicode_literals`

			`from .render import DependencyRenderer, EntityRenderer`
			`from ..tokens import Doc`
Add workaround for displaCy server on Python 2/3 (resolves #1227) Make sure status and headers are bytes on Python 2 and strings on Python 3 2017-08-01 02:11:35 +03:00			`from ..compat import b_to_str`
Add is_in_jupyter() helper for displaCy (see #1058) 2017-05-18 15:13:14 +03:00			`from ..util import prints, is_in_jupyter`
Add displaCy visualisers (see #1058) 2017-05-14 18:50:23 +03:00

			`_html = {}`
Add is_in_jupyter() helper for displaCy (see #1058) 2017-05-18 15:13:14 +03:00			`IS_JUPYTER = is_in_jupyter()`
Add displaCy visualisers (see #1058) 2017-05-14 18:50:23 +03:00

Tidy up displaCy and add "manual" option Also don't require title in EntityRenderer 2017-05-22 19:48:20 +03:00			`def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,`
Tidy up displaCy 2017-10-27 15:39:19 +03:00			`options={}, manual=False):`
Add displaCy visualisers (see #1058) 2017-05-14 18:50:23 +03:00			`"""Render displaCy visualisation.`

			`docs (list or Doc): Document(s) to visualise.`
			`style (unicode): Visualisation style, 'dep' or 'ent'.`
			`page (bool): Render markup as full HTML page.`
			`minify (bool): Minify HTML markup.`
Tidy up displaCy and add "manual" option Also don't require title in EntityRenderer 2017-05-22 19:48:20 +03:00			jupyter (bool): Experimental, use Jupyter's `display()` to output markup.
Add displaCy visualisers (see #1058) 2017-05-14 18:50:23 +03:00			`options (dict): Visualiser-specific options, e.g. colors.`
Tidy up displaCy 2017-10-27 15:39:19 +03:00			manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
Update docstrings 2017-05-14 20:30:47 +03:00			`RETURNS (unicode): Rendered HTML markup.`
Add displaCy visualisers (see #1058) 2017-05-14 18:50:23 +03:00			`"""`
Tidy up displaCy and add "manual" option Also don't require title in EntityRenderer 2017-05-22 19:48:20 +03:00			`factories = {'dep': (DependencyRenderer, parse_deps),`
			`'ent': (EntityRenderer, parse_ents)}`
			`if style not in factories:`
Fix style check in displacy 2017-05-17 15:57:24 +03:00			`raise ValueError("Unknown style: %s" % style)`
Tidy up displaCy and add "manual" option Also don't require title in EntityRenderer 2017-05-22 19:48:20 +03:00			`if isinstance(docs, Doc) or isinstance(docs, dict):`
			`docs = [docs]`
			`renderer, converter = factories[style]`
			`renderer = renderer(options=options)`
			`parsed = [converter(doc, options) for doc in docs] if not manual else docs`
Add Jupyter notebook support (see #1058) 2017-05-14 19:39:01 +03:00			`_html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip()`
			`html = _html['parsed']`
Tidy up displaCy 2017-10-27 15:39:19 +03:00			`if jupyter: # return HTML rendered by IPython display()`
Add Jupyter notebook support (see #1058) 2017-05-14 19:39:01 +03:00			`from IPython.core.display import display, HTML`
			`return display(HTML(html))`
			`return html`
Add displaCy visualisers (see #1058) 2017-05-14 18:50:23 +03:00

Tidy up displaCy and add "manual" option Also don't require title in EntityRenderer 2017-05-22 19:48:20 +03:00			`def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,`
			`port=5000):`
Add displaCy visualisers (see #1058) 2017-05-14 18:50:23 +03:00			`"""Serve displaCy visualisation.`

			`docs (list or Doc): Document(s) to visualise.`
			`style (unicode): Visualisation style, 'dep' or 'ent'.`
			`page (bool): Render markup as full HTML page.`
			`minify (bool): Minify HTML markup.`
			`options (dict): Visualiser-specific options, e.g. colors.`
Tidy up displaCy 2017-10-27 15:39:19 +03:00			manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
Add displaCy visualisers (see #1058) 2017-05-14 18:50:23 +03:00			`port (int): Port to serve visualisation.`
			`"""`
			`from wsgiref import simple_server`
Tidy up displaCy 2017-10-27 15:39:19 +03:00			`render(docs, style=style, page=page, minify=minify, options=options,`
			`manual=manual)`
Add displaCy visualisers (see #1058) 2017-05-14 18:50:23 +03:00			`httpd = simple_server.make_server('0.0.0.0', port, app)`
Tidy up displaCy 2017-10-27 15:39:19 +03:00			`prints("Using the '%s' visualizer" % style,`
			`title="Serving on port %d..." % port)`
Shut down displaCy server on KeyboardInterrupt 2017-06-03 14:24:56 +03:00			`try:`
			`httpd.serve_forever()`
			`except KeyboardInterrupt:`
			`prints("Shutting down server on port %d." % port)`
			`finally:`
			`httpd.server_close()`
Add displaCy visualisers (see #1058) 2017-05-14 18:50:23 +03:00

			`def app(environ, start_response):`
Add workaround for displaCy server on Python 2/3 (resolves #1227) Make sure status and headers are bytes on Python 2 and strings on Python 3 2017-08-01 02:11:35 +03:00			`# headers and status need to be bytes in Python 2, see #1227`
Tidy up displaCy 2017-10-27 15:39:19 +03:00			`headers = [(b_to_str(b'Content-type'),`
			`b_to_str(b'text/html; charset=utf-8'))]`
Add workaround for displaCy server on Python 2/3 (resolves #1227) Make sure status and headers are bytes on Python 2 and strings on Python 3 2017-08-01 02:11:35 +03:00			`start_response(b_to_str(b'200 OK'), headers)`
Add displaCy visualisers (see #1058) 2017-05-14 18:50:23 +03:00			`res = _html['parsed'].encode(encoding='utf-8')`
			`return [res]`


Clone Doc to never merge punctuation on original Doc 2017-06-03 14:24:43 +03:00			`def parse_deps(orig_doc, options={}):`
Add displaCy visualisers (see #1058) 2017-05-14 18:50:23 +03:00			`"""Generate dependency parse in {'words': [], 'arcs': []} format.`

			`doc (Doc): Document do parse.`
			`RETURNS (dict): Generated dependency parse keyed by words and arcs.`
			`"""`
Clone Doc to never merge punctuation on original Doc 2017-06-03 14:24:43 +03:00			`doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())`
Add displaCy visualisers (see #1058) 2017-05-14 18:50:23 +03:00			`if options.get('collapse_punct', True):`
			`spans = []`
			`for word in doc[:-1]:`
			`if word.is_punct or not word.nbor(1).is_punct:`
			`continue`
			`start = word.i`
			`end = word.i + 1`
			`while end < len(doc) and doc[end].is_punct:`
			`end += 1`
Tidy up displaCy 2017-10-27 15:39:19 +03:00			`span = doc[start:end]`
Add displaCy visualisers (see #1058) 2017-05-14 18:50:23 +03:00			`spans.append((span.start_char, span.end_char, word.tag_,`
Tidy up displaCy 2017-10-27 15:39:19 +03:00			`word.lemma_, word.ent_type_))`
Add displaCy visualisers (see #1058) 2017-05-14 18:50:23 +03:00			`for span_props in spans:`
			`doc.merge(*span_props)`
Allow 'fine_grained' option in displaCy (see #1703) Shows token.tag_ instead of token.pos_. Disabled by default, to not cause rendering issues for models with long fine-grained tags (e.g. merged morphological features). 2017-12-09 17:11:12 +03:00			`if options.get('fine_grained'):`
			`words = [{'text': w.text, 'tag': w.tag_} for w in doc]`
			`else:`
			`words = [{'text': w.text, 'tag': w.pos_} for w in doc]`
Add displaCy visualisers (see #1058) 2017-05-14 18:50:23 +03:00			`arcs = []`
			`for word in doc:`
			`if word.i < word.head.i:`
			`arcs.append({'start': word.i, 'end': word.head.i,`
			`'label': word.dep_, 'dir': 'left'})`
			`elif word.i > word.head.i:`
			`arcs.append({'start': word.head.i, 'end': word.i,`
			`'label': word.dep_, 'dir': 'right'})`
			`return {'words': words, 'arcs': arcs}`


			`def parse_ents(doc, options={}):`
			`"""Generate named entities in [{start: i, end: i, label: 'label'}] format.`

			`doc (Doc): Document do parse.`
			`RETURNS (dict): Generated entities keyed by text (original text) and ents.`
			`"""`
			`ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}`
Tidy up displaCy 2017-10-27 15:39:19 +03:00			`for ent in doc.ents]`
			`title = (doc.user_data.get('title', None)`
			`if hasattr(doc, 'user_data') else None)`
Add displaCy visualisers (see #1058) 2017-05-14 18:50:23 +03:00			`return {'text': doc.text, 'ents': ents, 'title': title}`