spaCy/spacy/displacy/__init__.py

156 lines
5.4 KiB
Python
Raw Normal View History

2017-05-14 18:50:23 +03:00
# coding: utf8
from __future__ import unicode_literals
from .render import DependencyRenderer, EntityRenderer
from ..tokens import Doc, Span
from ..compat import b_to_str
from ..errors import Errors, Warnings, user_warning
💫 New JSON helpers, training data internals & CLI rewrite (#2932) * Support nowrap setting in util.prints * Tidy up and fix whitespace * Simplify script and use read_jsonl helper * Add JSON schemas (see #2928) * Deprecate Doc.print_tree Will be replaced with Doc.to_json, which will produce a unified format * Add Doc.to_json() method (see #2928) Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space. * Remove outdated test * Add write_json and write_jsonl helpers * WIP: Update spacy train * Tidy up spacy train * WIP: Use wasabi for formatting * Add GoldParse helpers for JSON format * WIP: add debug-data command * Fix typo * Add missing import * Update wasabi pin * Add missing import * 💫 Refactor CLI (#2943) To be merged into #2932. ## Description - [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi) - [x] use [`black`](https://github.com/ambv/black) for auto-formatting - [x] add `flake8` config - [x] move all messy UD-related scripts to `cli.ud` - [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO) ### Types of change enhancement ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. * Update wasabi pin * Delete old test * Update errors * Fix typo * Tidy up and format remaining code * Fix formatting * Improve formatting of messages * Auto-format remaining code * Add tok2vec stuff to spacy.train * Fix typo * Update wasabi pin * Fix path checks for when train() is called as function * Reformat and tidy up pretrain script * Update argument annotations * Raise error if model language doesn't match lang * Document new train command
2018-11-30 22:16:14 +03:00
from ..util import is_in_jupyter
2017-05-14 18:50:23 +03:00
_html = {}
IS_JUPYTER = is_in_jupyter()
2017-05-14 18:50:23 +03:00
def render(
docs,
style="dep",
page=False,
minify=False,
jupyter=IS_JUPYTER,
options={},
manual=False,
):
2017-05-14 18:50:23 +03:00
"""Render displaCy visualisation.
docs (list or Doc): Document(s) to visualise.
style (unicode): Visualisation style, 'dep' or 'ent'.
page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup.
jupyter (bool): Experimental, use Jupyter's `display()` to output markup.
2017-05-14 18:50:23 +03:00
options (dict): Visualiser-specific options, e.g. colors.
2017-10-27 15:39:19 +03:00
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
2017-05-14 20:30:47 +03:00
RETURNS (unicode): Rendered HTML markup.
2017-05-14 18:50:23 +03:00
"""
factories = {
"dep": (DependencyRenderer, parse_deps),
"ent": (EntityRenderer, parse_ents),
}
if style not in factories:
raise ValueError(Errors.E087.format(style=style))
if isinstance(docs, (Doc, Span, dict)):
docs = [docs]
docs = [obj if not isinstance(obj, Span) else obj.as_doc() for obj in docs]
if not all(isinstance(obj, (Doc, Span, dict)) for obj in docs):
raise ValueError(Errors.E096)
renderer, converter = factories[style]
renderer = renderer(options=options)
parsed = [converter(doc, options) for doc in docs] if not manual else docs
_html["parsed"] = renderer.render(parsed, page=page, minify=minify).strip()
html = _html["parsed"]
2017-10-27 15:39:19 +03:00
if jupyter: # return HTML rendered by IPython display()
from IPython.core.display import display, HTML
return display(HTML(html))
return html
2017-05-14 18:50:23 +03:00
def serve(
docs, style="dep", page=True, minify=False, options={}, manual=False, port=5000
):
2017-05-14 18:50:23 +03:00
"""Serve displaCy visualisation.
docs (list or Doc): Document(s) to visualise.
style (unicode): Visualisation style, 'dep' or 'ent'.
page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup.
options (dict): Visualiser-specific options, e.g. colors.
2017-10-27 15:39:19 +03:00
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
2017-05-14 18:50:23 +03:00
port (int): Port to serve visualisation.
"""
from wsgiref import simple_server
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
httpd = simple_server.make_server("0.0.0.0", port, app)
💫 New JSON helpers, training data internals & CLI rewrite (#2932) * Support nowrap setting in util.prints * Tidy up and fix whitespace * Simplify script and use read_jsonl helper * Add JSON schemas (see #2928) * Deprecate Doc.print_tree Will be replaced with Doc.to_json, which will produce a unified format * Add Doc.to_json() method (see #2928) Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space. * Remove outdated test * Add write_json and write_jsonl helpers * WIP: Update spacy train * Tidy up spacy train * WIP: Use wasabi for formatting * Add GoldParse helpers for JSON format * WIP: add debug-data command * Fix typo * Add missing import * Update wasabi pin * Add missing import * 💫 Refactor CLI (#2943) To be merged into #2932. ## Description - [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi) - [x] use [`black`](https://github.com/ambv/black) for auto-formatting - [x] add `flake8` config - [x] move all messy UD-related scripts to `cli.ud` - [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO) ### Types of change enhancement ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. * Update wasabi pin * Delete old test * Update errors * Fix typo * Tidy up and format remaining code * Fix formatting * Improve formatting of messages * Auto-format remaining code * Add tok2vec stuff to spacy.train * Fix typo * Update wasabi pin * Fix path checks for when train() is called as function * Reformat and tidy up pretrain script * Update argument annotations * Raise error if model language doesn't match lang * Document new train command
2018-11-30 22:16:14 +03:00
print("\nUsing the '{}' visualizer".format(style))
print("Serving on port {}...\n".format(port))
try:
httpd.serve_forever()
except KeyboardInterrupt:
💫 New JSON helpers, training data internals & CLI rewrite (#2932) * Support nowrap setting in util.prints * Tidy up and fix whitespace * Simplify script and use read_jsonl helper * Add JSON schemas (see #2928) * Deprecate Doc.print_tree Will be replaced with Doc.to_json, which will produce a unified format * Add Doc.to_json() method (see #2928) Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space. * Remove outdated test * Add write_json and write_jsonl helpers * WIP: Update spacy train * Tidy up spacy train * WIP: Use wasabi for formatting * Add GoldParse helpers for JSON format * WIP: add debug-data command * Fix typo * Add missing import * Update wasabi pin * Add missing import * 💫 Refactor CLI (#2943) To be merged into #2932. ## Description - [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi) - [x] use [`black`](https://github.com/ambv/black) for auto-formatting - [x] add `flake8` config - [x] move all messy UD-related scripts to `cli.ud` - [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO) ### Types of change enhancement ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. * Update wasabi pin * Delete old test * Update errors * Fix typo * Tidy up and format remaining code * Fix formatting * Improve formatting of messages * Auto-format remaining code * Add tok2vec stuff to spacy.train * Fix typo * Update wasabi pin * Fix path checks for when train() is called as function * Reformat and tidy up pretrain script * Update argument annotations * Raise error if model language doesn't match lang * Document new train command
2018-11-30 22:16:14 +03:00
print("Shutting down server on port {}.".format(port))
finally:
httpd.server_close()
2017-05-14 18:50:23 +03:00
def app(environ, start_response):
# Headers and status need to be bytes in Python 2, see #1227
headers = [(b_to_str(b"Content-type"), b_to_str(b"text/html; charset=utf-8"))]
start_response(b_to_str(b"200 OK"), headers)
res = _html["parsed"].encode(encoding="utf-8")
2017-05-14 18:50:23 +03:00
return [res]
def parse_deps(orig_doc, options={}):
2017-05-14 18:50:23 +03:00
"""Generate dependency parse in {'words': [], 'arcs': []} format.
doc (Doc): Document do parse.
RETURNS (dict): Generated dependency parse keyed by words and arcs.
"""
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
if not doc.is_parsed:
user_warning(Warnings.W005)
if options.get("collapse_phrases", False):
for np in list(doc.noun_chunks):
np.merge(tag=np.root.tag_, lemma=np.root.lemma_, ent_type=np.root.ent_type_)
if options.get("collapse_punct", True):
2017-05-14 18:50:23 +03:00
spans = []
for word in doc[:-1]:
if word.is_punct or not word.nbor(1).is_punct:
continue
start = word.i
end = word.i + 1
while end < len(doc) and doc[end].is_punct:
end += 1
2017-10-27 15:39:19 +03:00
span = doc[start:end]
spans.append(
(span.start_char, span.end_char, word.tag_, word.lemma_, word.ent_type_)
)
for start, end, tag, lemma, ent_type in spans:
doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type)
if options.get("fine_grained"):
words = [{"text": w.text, "tag": w.tag_} for w in doc]
else:
words = [{"text": w.text, "tag": w.pos_} for w in doc]
2017-05-14 18:50:23 +03:00
arcs = []
for word in doc:
if word.i < word.head.i:
arcs.append(
{"start": word.i, "end": word.head.i, "label": word.dep_, "dir": "left"}
)
2017-05-14 18:50:23 +03:00
elif word.i > word.head.i:
arcs.append(
{
"start": word.head.i,
"end": word.i,
"label": word.dep_,
"dir": "right",
}
)
return {"words": words, "arcs": arcs}
2017-05-14 18:50:23 +03:00
def parse_ents(doc, options={}):
"""Generate named entities in [{start: i, end: i, label: 'label'}] format.
doc (Doc): Document do parse.
RETURNS (dict): Generated entities keyed by text (original text) and ents.
"""
ents = [
{"start": ent.start_char, "end": ent.end_char, "label": ent.label_}
for ent in doc.ents
]
if not ents:
user_warning(Warnings.W006)
title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
return {"text": doc.text, "ents": ents, "title": title}