diff --git a/spacy/__init__.py b/spacy/__init__.py index 344fc427e..8dc0937f5 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -16,7 +16,7 @@ def load(name, **overrides): meta = util.parse_package_meta(model_path) if 'lang' not in meta: raise IOError('No language setting found in model meta.') - cls = util.load_lang_class(meta['lang']) + cls = util.get_lang_class(meta['lang']) overrides['meta'] = meta overrides['path'] = model_path return cls(**overrides) diff --git a/spacy/deprecated.py b/spacy/deprecated.py index c3b50cd85..77273d193 100644 --- a/spacy/deprecated.py +++ b/spacy/deprecated.py @@ -10,9 +10,9 @@ PRON_LEMMA = "-PRON-" def depr_model_download(lang): - """ - Replace download modules within en and de with deprecation warning and - download default language model (using shortcut). + """Replace download modules within en/de, warn and download default models. + + lang (unicode): Language shortcut, 'en' or 'de'. """ prints("The spacy.%s.download command is now deprecated. Please use " "python -m spacy download [model name or shortcut] instead. For " @@ -24,6 +24,12 @@ def depr_model_download(lang): def resolve_load_name(name, **overrides): + """Resolve model loading if deprecated path kwarg is specified in overrides. + + name (unicode): Name of model to load. + **overrides: Overrides specified in spacy.load(). + RETURNS: Model name or value of path kwarg. + """ if overrides.get('path') not in (None, False, True): name = overrides.get('path') prints("To load a model from a path, you can now use the first argument. 
" diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py new file mode 100644 index 000000000..45671612f --- /dev/null +++ b/spacy/displacy/__init__.py @@ -0,0 +1,103 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .render import DependencyRenderer, EntityRenderer +from ..tokens import Doc +from ..util import prints + + +_html = {} + + +def render(docs, style='dep', page=False, minify=False, jupyter=False, options={}): + """Render displaCy visualisation. + + docs (list or Doc): Document(s) to visualise. + style (unicode): Visualisation style, 'dep' or 'ent'. + page (bool): Render markup as full HTML page. + minify (bool): Minify HTML markup. + jupyter (bool): Experimental, use Jupyter's display() to output markup. + options (dict): Visualiser-specific options, e.g. colors. + RETURNS (unicode): Rendered HTML markup. + """ + if isinstance(docs, Doc): + docs = [docs] + if style is 'dep': + renderer = DependencyRenderer(options=options) + parsed = [parse_deps(doc, options) for doc in docs] + elif style is 'ent': + renderer = EntityRenderer(options=options) + parsed = [parse_ents(doc, options) for doc in docs] + _html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip() + html = _html['parsed'] + if jupyter: # return HTML rendered by IPython display() + from IPython.core.display import display, HTML + return display(HTML(html)) + return html + + +def serve(docs, style='dep', page=True, minify=False, options={}, port=5000): + """Serve displaCy visualisation. + + docs (list or Doc): Document(s) to visualise. + style (unicode): Visualisation style, 'dep' or 'ent'. + page (bool): Render markup as full HTML page. + minify (bool): Minify HTML markup. + options (dict): Visualiser-specific options, e.g. colors. + port (int): Port to serve visualisation. 
+ """ + from wsgiref import simple_server + render(docs, style=style, page=page, minify=minify, options=options) + httpd = simple_server.make_server('0.0.0.0', port, app) + prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port) + httpd.serve_forever() + + +def app(environ, start_response): + start_response('200 OK', [('Content-type', 'text/html; charset=utf-8')]) + res = _html['parsed'].encode(encoding='utf-8') + return [res] + + +def parse_deps(doc, options={}): + """Generate dependency parse in {'words': [], 'arcs': []} format. + + doc (Doc): Document to parse. + RETURNS (dict): Generated dependency parse keyed by words and arcs. + """ + if options.get('collapse_punct', True): + spans = [] + for word in doc[:-1]: + if word.is_punct or not word.nbor(1).is_punct: + continue + start = word.i + end = word.i + 1 + while end < len(doc) and doc[end].is_punct: + end += 1 + span = doc[start : end] + spans.append((span.start_char, span.end_char, word.tag_, + word.lemma_, word.ent_type_)) + for span_props in spans: + doc.merge(*span_props) + words = [{'text': w.text, 'tag': w.tag_} for w in doc] + arcs = [] + for word in doc: + if word.i < word.head.i: + arcs.append({'start': word.i, 'end': word.head.i, + 'label': word.dep_, 'dir': 'left'}) + elif word.i > word.head.i: + arcs.append({'start': word.head.i, 'end': word.i, + 'label': word.dep_, 'dir': 'right'}) + return {'words': words, 'arcs': arcs} + + +def parse_ents(doc, options={}): + """Generate named entities in [{start: i, end: i, label: 'label'}] format. + + doc (Doc): Document to parse. + RETURNS (dict): Generated entities keyed by text (original text) and ents. 
+ """ + ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_} + for ent in doc.ents] + title = doc.user_data.get('title', None) if hasattr(doc, 'user_data') else None + return {'text': doc.text, 'ents': ents, 'title': title} diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py new file mode 100644 index 000000000..6a786437a --- /dev/null +++ b/spacy/displacy/render.py @@ -0,0 +1,217 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS +from .templates import TPL_ENT, TPL_ENTS, TPL_FIGURE, TPL_TITLE, TPL_PAGE +from ..util import minify_html + + +class DependencyRenderer(object): + """Render dependency parses as SVGs.""" + style = 'dep' + + def __init__(self, options={}): + """Initialise dependency renderer. + + options (dict): Visualiser-specific options (compact, word_spacing, + arrow_spacing, arrow_width, arrow_stroke, distance, + offset_x, color, bg, font) + """ + self.compact = options.get('compact', False) + distance, arrow_width = (85, 8) if self.compact else (175, 10) + self.word_spacing = options.get('word_spacing', 45) + self.arrow_spacing = options.get('arrow_spacing', 20) + self.arrow_width = options.get('arrow_width', arrow_width) + self.arrow_stroke = options.get('arrow_stroke', 2) + self.distance = options.get('distance', distance) + self.offset_x = options.get('offset_x', 50) + self.color = options.get('color', '#000000') + self.bg = options.get('bg', '#ffffff') + self.font = options.get('font', 'Arial') + + def render(self, parsed, page=False, minify=False): + """Render complete markup. + + parsed (list): Dependency parses to render. + page (bool): Render parses wrapped as full HTML page. + minify (bool): Minify HTML markup. + RETURNS (unicode): Rendered SVG or HTML markup. 
+ """ + rendered = [self.render_svg(i, p['words'], p['arcs']) + for i, p in enumerate(parsed)] + if page: + content = ''.join([TPL_FIGURE.format(content=svg) for svg in rendered]) + markup = TPL_PAGE.format(content=content) + else: + markup = ''.join(rendered) + if minify: + return minify_html(markup) + return markup + + def render_svg(self, render_id, words, arcs): + """Render SVG. + + render_id (int): Unique ID, typically index of document. + words (list): Individual words and their tags. + arcs (list): Individual arcs and their start, end, direction and label. + RETURNS (unicode): Rendered SVG markup. + """ + self.levels = self.get_levels(arcs) + self.highest_level = len(self.levels) + self.offset_y = self.distance/2*self.highest_level+self.arrow_stroke + self.width = self.offset_x+len(words)*self.distance + self.height = self.offset_y+3*self.word_spacing + self.id = render_id + words = [self.render_word(w['text'], w['tag'], i) + for i, w in enumerate(words)] + arcs = [self.render_arrow(a['label'], a['start'], a['end'], a['dir'], i) + for i, a in enumerate(arcs)] + content = ''.join(words) + ''.join(arcs) + return TPL_DEP_SVG.format(id=self.id, width=self.width, height=self.height, + color=self.color, bg=self.bg, font=self.font, + content=content) + + def render_word(self, text, tag, i): + """Render individual word. + + text (unicode): Word text. + tag (unicode): Part-of-speech tag. + i (int): Unique ID, typically word index. + RETURNS (unicode): Rendered SVG markup. + """ + y = self.offset_y+self.word_spacing + x = self.offset_x+i*self.distance + return TPL_DEP_WORDS.format(text=text, tag=tag, x=x, y=y) + + def render_arrow(self, label, start, end, direction, i): + """Render individual arrow. + + label (unicode): Dependency label. + start (int): Index of start word. + end (int): Index of end word. + direction (unicode): Arrow direction, 'left' or 'right'. + i (int): Unique ID, typically arrow index. + RETURNS (unicode): Rendered SVG markup. 
+ """ + level = self.levels.index(end-start)+1 + x_start = self.offset_x+start*self.distance+self.arrow_spacing + y = self.offset_y + x_end = (self.offset_x+(end-start)*self.distance+start*self.distance + -self.arrow_spacing*(self.highest_level-level)/4) + y_curve = self.offset_y-level*self.distance/2 + if y_curve == 0 and len(self.levels) > 5: + y_curve = -self.distance + arrowhead = self.get_arrowhead(direction, x_start, y, x_end) + arc = self.get_arc(x_start, y, y_curve, x_end) + return TPL_DEP_ARCS.format(id=self.id, i=i, stroke=self.arrow_stroke, + head=arrowhead, label=label, arc=arc) + + def get_arc(self, x_start, y, y_curve, x_end): + """Render individual arc. + + x_start (int): X-coordinate of arrow start point. + y (int): Y-coordinate of arrow start and end point. + y_curve (int): Y-coordinate of Cubic Bézier y_curve point. + x_end (int): X-coordinate of arrow end point. + RETURNS (unicode): Definition of the arc path ('d' attribute). + """ + template = "M{x},{y} C{x},{c} {e},{c} {e},{y}" + if self.compact: + template = "M{x},{y} {x},{c} {e},{c} {e},{y}" + return template.format(x=x_start, y=y, c=y_curve, e=x_end) + + def get_arrowhead(self, direction, x, y, end): + """Render individual arrow head. + + direction (unicode): Arrow direction, 'left' or 'right'. + x (int): X-coordinate of arrow start point. + y (int): Y-coordinate of arrow start and end point. + end (int): X-coordinate of arrow end point. + RETURNS (unicode): Definition of the arrow head path ('d' attribute). + """ + if direction is 'left': + pos1, pos2, pos3 = (x, x-self.arrow_width+2, x+self.arrow_width-2) + else: + pos1, pos2, pos3 = (end, end+self.arrow_width-2, end-self.arrow_width+2) + arrowhead = (pos1, y+2, pos2, y-self.arrow_width, pos3, y-self.arrow_width) + return "M{},{} L{},{} {},{}".format(*arrowhead) + + def get_levels(self, arcs): + """Calculate available arc height "levels". + Used to calculate arrow heights dynamically and without wasting space. 
+ + arcs (list): Individual arcs and their start, end, direction and label. + RETURNS (list): Arc levels sorted from lowest to highest. + """ + levels = set(map(lambda arc: arc['end'] - arc['start'], arcs)) + return sorted(list(levels)) + + +class EntityRenderer(object): + """Render named entities as HTML.""" + style = 'ent' + + def __init__(self, options={}): + """Initialise entity renderer. + + options (dict): Visualiser-specific options (colors, ents) + """ + colors = {'ORG': '#7aecec', 'PRODUCT': '#bfeeb7', 'GPE': '#feca74', + 'LOC': '#ff9561', 'PERSON': '#aa9cfc', 'NORP': '#c887fb', + 'FACILITY': '#9cc9cc', 'EVENT': '#ffeb80', 'LANGUAGE': '#ff8197', + 'WORK_OF_ART': '#f0d0ff', 'DATE': '#bfe1d9', 'TIME': '#bfe1d9', + 'MONEY': '#e4e7d2', 'QUANTITY': '#e4e7d2', 'ORDINAL': '#e4e7d2', + 'CARDINAL': '#e4e7d2', 'PERCENT': '#e4e7d2'} + colors.update(options.get('colors', {})) + self.default_color = '#ddd' + self.colors = colors + self.ents = options.get('ents', None) + + def render(self, parsed, page=False, minify=False): + """Render complete markup. + + parsed (list): Parsed entities to render. + page (bool): Render parses wrapped as full HTML page. + minify (bool): Minify HTML markup. + RETURNS (unicode): Rendered HTML markup. + """ + rendered = [self.render_ents(p['text'], p['ents'], p['title']) for p in parsed] + if page: + docs = ''.join([TPL_FIGURE.format(content=doc) for doc in rendered]) + markup = TPL_PAGE.format(content=docs) + else: + markup = ''.join(rendered) + if minify: + return minify_html(markup) + return markup + + def render_ents(self, text, spans, title): + """Render entities in text. + + text (unicode): Original text. + spans (list): Individual entity spans and their start, end and label. + title (unicode or None): Document title set in Doc.user_data['title']. 
+ """ + markup = '' + offset = 0 + for span in spans: + label = span['label'] + start = span['start'] + end = span['end'] + entity = text[start:end] + fragments = text[offset:start].split('\n') + for i, fragment in enumerate(fragments): + markup += fragment + if len(fragments) > 1 and i != len(fragments)-1: + markup += '
' + if self.ents is None or label.upper() in self.ents: + color = self.colors.get(label.upper(), self.default_color) + markup += TPL_ENT.format(label=label, text=entity, bg=color) + else: + markup += entity + offset = end + markup += text[offset:] + markup = TPL_ENTS.format(content=markup, colors=self.colors) + if title: + markup = TPL_TITLE.format(title=title) + markup + return markup diff --git a/spacy/displacy/templates.py b/spacy/displacy/templates.py new file mode 100644 index 000000000..54df44489 --- /dev/null +++ b/spacy/displacy/templates.py @@ -0,0 +1,63 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# setting explicit height and max-width: none on the SVG is required for +# Jupyter to render it properly in a cell + +TPL_DEP_SVG = """ +{content} +""" + + +TPL_DEP_WORDS = """ + + {text} + {tag} + +""" + + +TPL_DEP_ARCS = """ + + + + {label} + + + +""" + + +TPL_FIGURE = """ +
{content}
+""" + +TPL_TITLE = """ +

{title}

+""" + + +TPL_ENTS = """ +
{content}
+""" + + +TPL_ENT = """ + + {text} + {label} + +""" + + +TPL_PAGE = """ + + + + displaCy + + + {content} + +""" diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 66ba94ea6..6b577be62 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -18,67 +18,67 @@ _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb', @pytest.fixture(params=_languages) def tokenizer(request): - lang = util.load_lang_class(request.param) + lang = util.get_lang_class(request.param) return lang.Defaults.create_tokenizer() @pytest.fixture def en_tokenizer(): - return util.load_lang_class('en').Defaults.create_tokenizer() + return util.get_lang_class('en').Defaults.create_tokenizer() @pytest.fixture def en_vocab(): - return util.load_lang_class('en').Defaults.create_vocab() + return util.get_lang_class('en').Defaults.create_vocab() @pytest.fixture def en_parser(): - return util.load_lang_class('en').Defaults.create_parser() + return util.get_lang_class('en').Defaults.create_parser() @pytest.fixture def es_tokenizer(): - return util.load_lang_class('es').Defaults.create_tokenizer() + return util.get_lang_class('es').Defaults.create_tokenizer() @pytest.fixture def de_tokenizer(): - return util.load_lang_class('de').Defaults.create_tokenizer() + return util.get_lang_class('de').Defaults.create_tokenizer() @pytest.fixture(scope='module') def fr_tokenizer(): - return util.load_lang_class('fr').Defaults.create_tokenizer() + return util.get_lang_class('fr').Defaults.create_tokenizer() @pytest.fixture def hu_tokenizer(): - return util.load_lang_class('hu').Defaults.create_tokenizer() + return util.get_lang_class('hu').Defaults.create_tokenizer() @pytest.fixture def fi_tokenizer(): - return util.load_lang_class('fi').Defaults.create_tokenizer() + return util.get_lang_class('fi').Defaults.create_tokenizer() @pytest.fixture def sv_tokenizer(): - return util.load_lang_class('sv').Defaults.create_tokenizer() + return 
util.get_lang_class('sv').Defaults.create_tokenizer() @pytest.fixture def bn_tokenizer(): - return util.load_lang_class('bn').Defaults.create_tokenizer() + return util.get_lang_class('bn').Defaults.create_tokenizer() @pytest.fixture def he_tokenizer(): - return util.load_lang_class('he').Defaults.create_tokenizer() + return util.get_lang_class('he').Defaults.create_tokenizer() @pytest.fixture def nb_tokenizer(): - return util.load_lang_class('nb').Defaults.create_tokenizer() + return util.get_lang_class('nb').Defaults.create_tokenizer() @pytest.fixture @@ -88,12 +88,12 @@ def stringstore(): @pytest.fixture def en_entityrecognizer(): - return util.load_lang_class('en').Defaults.create_entity() + return util.get_lang_class('en').Defaults.create_entity() @pytest.fixture def lemmatizer(): - return util.load_lang_class('en').Defaults.create_lemmatizer() + return util.get_lang_class('en').Defaults.create_lemmatizer() @pytest.fixture diff --git a/spacy/util.py b/spacy/util.py index 50d0ab73d..717e4f160 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -25,39 +25,37 @@ try: except ImportError: cupy = None -def set_lang_class(name, cls): +def get_lang_class(lang): + """Import and load a Language class. + + lang (unicode): Two-letter language code, e.g. 'en'. + RETURNS (Language): Language class. + """ global LANGUAGES - LANGUAGES[name] = cls - - -def get_lang_class(name): - if name in LANGUAGES: - return LANGUAGES[name] - lang = re.split('[^a-zA-Z0-9]', name, 1)[0] - if lang not in LANGUAGES: - raise RuntimeError('Language not supported: %s' % name) + if not lang in LANGUAGES: + try: + module = importlib.import_module('.lang.%s' % lang, 'spacy') + except ImportError: + raise ImportError("Can't import language %s from spacy.lang." %lang) + LANGUAGES[lang] = getattr(module, module.__all__[0]) return LANGUAGES[lang] -def load_lang_class(lang): - """Import and load a Language class. 
+def set_lang_class(name, cls): + """Set a custom Language class name that can be loaded via get_lang_class. - Args: - lang (unicode): Two-letter language code, e.g. 'en'. - Returns: - Language: Language class. + name (unicode): Name of Language class. + cls (Language): Language class. """ - module = importlib.import_module('.lang.%s' % lang, 'spacy') - return getattr(module, module.__all__[0]) + global LANGUAGES + LANGUAGES[name] = cls def get_data_path(require_exists=True): """Get path to spaCy data directory. - Args: - require_exists (bool): Only return path if it exists, otherwise None. - Returns: - Path or None: Data path or None. + require_exists (bool): Only return path if it exists, otherwise None. + RETURNS (Path or None): Data path or None. """ if not require_exists: return _data_path @@ -68,14 +66,18 @@ def get_data_path(require_exists=True): def set_data_path(path): """Set path to spaCy data directory. - Args: - path (unicode or Path): Path to new data directory. + path (unicode or Path): Path to new data directory. """ global _data_path _data_path = ensure_path(path) def ensure_path(path): + """Ensure string is converted to a Path. + + path: Anything. If string, it's converted to Path. + RETURNS: Path or original argument. + """ if isinstance(path, basestring_): return Path(path) else: @@ -85,10 +87,8 @@ def ensure_path(path): def resolve_model_path(name): """Resolve a model name or string to a model path. - Args: - name (unicode): Package name, shortcut link or model path. - Returns: - Path: Path to model data directory. + name (unicode): Package name, shortcut link or model path. + RETURNS (Path): Path to model data directory. """ data_path = get_data_path() if not data_path or not data_path.exists(): @@ -108,11 +108,8 @@ def resolve_model_path(name): def is_package(name): """Check if string maps to a package installed via pip. - Args: - name (unicode): Name of package. - Returns: - bool: True if installed package, False if not. 
- + name (unicode): Name of package. + RETURNS (bool): True if installed package, False if not. """ packages = pip.get_installed_distributions() for package in packages: @@ -124,10 +121,8 @@ def is_package(name): def get_model_package_path(package_name): """Get path to a model package installed via pip. - Args: - package_name (unicode): Name of installed package. - Returns: - Path: Path to model data directory. + package_name (unicode): Name of installed package. + RETURNS (Path): Path to model data directory. """ # Here we're importing the module just to find it. This is worryingly # indirect, but it's otherwise very difficult to find the package. @@ -142,11 +137,9 @@ def get_model_package_path(package_name): def parse_package_meta(package_path, require=True): """Check if a meta.json exists in a package and return its contents. - Args: - package_path (Path): Path to model package directory. - require (bool): If True, raise error if no meta.json is found. - Returns: - dict or None: Model meta.json data or None. + package_path (Path): Path to model package directory. + require (bool): If True, raise error if no meta.json is found. + RETURNS (dict or None): Model meta.json data or None. """ location = package_path / 'meta.json' if location.is_file(): @@ -201,11 +194,9 @@ def compile_infix_regex(entries): def update_exc(base_exceptions, *addition_dicts): """Update and validate tokenizer exceptions. Will overwrite exceptions. - Args: - base_exceptions (dict): Base exceptions. - *addition_dicts (dict): Exceptions to add to the base dict, in order. - Returns: - dict: Combined tokenizer exceptions. + base_exceptions (dict): Base exceptions. + *addition_dicts (dict): Exceptions to add to the base dict, in order. + RETURNS (dict): Combined tokenizer exceptions. """ exc = dict(base_exceptions) for additions in addition_dicts: @@ -229,12 +220,10 @@ def expand_exc(excs, search, replace): """Find string in tokenizer exceptions, duplicate entry and replace string. 
For example, to add additional versions with typographic apostrophes. - Args: - excs (dict): Tokenizer exceptions. - search (unicode): String to find and replace. - replace (unicode): Replacement. - Returns: - dict: + excs (dict): Tokenizer exceptions. + search (unicode): String to find and replace. + replace (unicode): Replacement. + RETURNS (dict): Combined tokenizer exceptions. """ def _fix_token(token, search, replace): fixed = dict(token) @@ -278,10 +267,8 @@ def check_renamed_kwargs(renamed, kwargs): def read_json(location): """Open and load JSON from file. - Args: - location (Path): Path to JSON file. - Returns: - dict: Loaded JSON content. + location (Path): Path to JSON file. + RETURNS (dict): Loaded JSON content. """ with location.open('r', encoding='utf8') as f: return ujson.load(f) @@ -290,11 +277,9 @@ def read_json(location): def get_raw_input(description, default=False): """Get user input from the command line via raw_input / input. - Args: - description (unicode): Text to display before prompt. - default (unicode or False/None): Default value to display with prompt. - Returns: - unicode: User input. + description (unicode): Text to display before prompt. + default (unicode or False/None): Default value to display with prompt. + RETURNS (unicode): User input. """ additional = ' (default: %s)' % default if default else '' prompt = ' %s%s: ' % (description, additional) @@ -305,9 +290,8 @@ def get_raw_input(description, default=False): def print_table(data, title=None): """Print data in table format. - Args: - data (dict or list of tuples): Label/value pairs. - title (unicode or None): Title, will be printed above. + data (dict or list of tuples): Label/value pairs. + title (unicode or None): Title, will be printed above. """ if isinstance(data, dict): data = list(data.items()) @@ -321,9 +305,8 @@ def print_table(data, title=None): def print_markdown(data, title=None): """Print data in GitHub-flavoured Markdown format for issues etc. 
- Args: - data (dict or list of tuples): Label/value pairs. - title (unicode or None): Title, will be rendered as headline 2. + data (dict or list of tuples): Label/value pairs. + title (unicode or None): Title, will be rendered as headline 2. """ def excl_value(value): return Path(value).exists() # contains path (personal info) @@ -339,10 +322,8 @@ def print_markdown(data, title=None): def prints(*texts, **kwargs): """Print formatted message (manual ANSI escape sequences to avoid dependency) - Args: - *texts (unicode): Texts to print. Each argument is rendered as paragraph. - **kwargs: 'title' is rendered as coloured headline. 'exits'=True performs - system exit after printing. + *texts (unicode): Texts to print. Each argument is rendered as paragraph. + **kwargs: 'title' becomes coloured headline. 'exits'=True performs sys exit. """ exits = kwargs.get('exits', False) title = kwargs.get('title', None) @@ -356,12 +337,10 @@ def prints(*texts, **kwargs): def _wrap(text, wrap_max=80, indent=4): """Wrap text at given width using textwrap module. - Args: - text (unicode): Text to wrap. If it's a Path, it's converted to string. - wrap_max (int): Maximum line length (indent is deducted). - indent (int): Number of spaces for indentation. - Returns: - unicode: Wrapped text. + text (unicode): Text to wrap. If it's a Path, it's converted to string. + wrap_max (int): Maximum line length (indent is deducted). + indent (int): Number of spaces for indentation. + RETURNS (unicode): Wrapped text. """ indent = indent * ' ' wrap_width = wrap_max - len(indent) @@ -370,3 +349,13 @@ def _wrap(text, wrap_max=80, indent=4): return textwrap.fill(text, width=wrap_width, initial_indent=indent, subsequent_indent=indent, break_long_words=False, break_on_hyphens=False) + + +def minify_html(html): + """Perform a template-specific, rudimentary HTML minification for displaCy. + Disclaimer: NOT a general-purpose solution, only removes indentation/newlines. + + html (unicode): Markup to minify. 
+ RETURNS (unicode): "Minified" HTML. + """ + return html.strip().replace(' ', '').replace('\n', '') diff --git a/website/_layout.jade b/website/_layout.jade index d5c52df3f..ccca2863f 100644 --- a/website/_layout.jade +++ b/website/_layout.jade @@ -3,7 +3,6 @@ include _includes/_mixins doctype html - html(lang="en") title if SECTION == "docs" && SUBSECTION && SUBSECTION != "index" diff --git a/website/assets/img/docs/displacy_jupyter.jpg b/website/assets/img/docs/displacy_jupyter.jpg new file mode 100644 index 000000000..4f815309a Binary files /dev/null and b/website/assets/img/docs/displacy_jupyter.jpg differ diff --git a/website/docs/api/_data.json b/website/docs/api/_data.json index 2e0d80ca1..d0dc498da 100644 --- a/website/docs/api/_data.json +++ b/website/docs/api/_data.json @@ -21,6 +21,7 @@ "GoldParse": "goldparse" }, "Other": { + "displaCy": "displacy", "Utility Functions": "util", "Annotation Specs": "annotation", "Feature Scheme": "features" @@ -111,6 +112,11 @@ "tag": "class" }, + "displacy": { + "title": "displaCy", + "tag": "module" + }, + "util": { "title": "Utility Functions" }, diff --git a/website/docs/api/displacy.jade b/website/docs/api/displacy.jade new file mode 100644 index 000000000..92b1543fd --- /dev/null +++ b/website/docs/api/displacy.jade @@ -0,0 +1,229 @@ +//- 💫 DOCS > API > DISPLACY + +include ../../_includes/_mixins + +p + | As of v2.0, spaCy comes with a built-in visualization suite. For more + | info and examples, see the usage workflow on + | #[+a("/docs/usage/visualizers") visualizing spaCy]. + + ++h(2, "serve") serve + +tag method + +p + | Serve a dependency parse tree or named entity visualization to view it + | in your browser. Will run a simple web server. + ++aside-code("Example"). 
+ import spacy + from spacy import displacy + nlp = spacy.load('en') + doc1 = nlp(u'This is a sentence.') + doc2 = nlp(u'This is another sentence.') + displacy.serve([doc1, doc2], style='dep') + ++table(["Name", "Type", "Description", "Default"]) + +row + +cell #[code docs] + +cell list or #[code Doc] + +cell Document(s) to visualize. + +cell + + +row + +cell #[code style] + +cell unicode + +cell Visualization style, #[code 'dep'] or #[code 'ent']. + +cell #[code 'dep'] + + +row + +cell #[code page] + +cell bool + +cell Render markup as full HTML page. + +cell #[code True] + + +row + +cell #[code minify] + +cell bool + +cell Minify HTML markup. + +cell #[code False] + + +row + +cell #[code options] + +cell dict + +cell #[+a("#options") Visualizer-specific options], e.g. colors. + +cell #[code {}] + + +row + +cell #[code port] + +cell int + +cell Port to serve visualization. + +cell #[code 5000] + ++h(2, "render") render + +tag method + +p Render a dependency parse tree or named entity visualization. + ++aside-code("Example"). + import spacy + from spacy import displacy + nlp = spacy.load('en') + doc = nlp(u'This is a sentence.') + html = displacy.render(doc, style='dep') + ++table(["Name", "Type", "Description", "Default"]) + +row + +cell #[code docs] + +cell list or #[code Doc] + +cell Document(s) to visualize. + +cell + + +row + +cell #[code style] + +cell unicode + +cell Visualization style, #[code 'dep'] or #[code 'ent']. + +cell #[code 'dep'] + + +row + +cell #[code page] + +cell bool + +cell Render markup as full HTML page. + +cell #[code False] + + +row + +cell #[code minify] + +cell bool + +cell Minify HTML markup. + +cell #[code False] + + +row + +cell #[code jupyter] + +cell bool + +cell + | Returns markup using #[+a("http://jupyter.org/") Jupyter]'s + | internal methods, ready to be rendered in a notebook. + +cell #[code False] + + +row + +cell #[code options] + +cell dict + +cell #[+a("#options") Visualizer-specific options], e.g. colors. 
+ +cell #[code {}] + + +footrow + +cell return + +cell unicode + +cell Rendered HTML markup. + +cell + ++h(2, "options") Visualizer options + +p + | The #[code options] argument lets you specify additional settings for + | each visualizer. If a setting is not present in the options, the default + | value will be used. + ++h(3, "options-dep") Dependency Visualizer options + ++aside-code("Example"). + options = {'compact': True, 'color': 'blue'} + displacy.serve(doc, style='dep', options=options) + ++table(["Name", "Type", "Description", "Default"]) + +row + +cell #[code collapse_punct] + +cell bool + +cell + | Attach punctuation to tokens. Can make the parse more readable, + | as it prevents long arcs to attach punctuation. + +cell #[code True] + + +row + +cell #[code compact] + +cell bool + +cell "Compact mode" with square arrows that takes up less space. + +cell #[code False] + + +row + +cell #[code color] + +cell unicode + +cell Text color (HEX, RGB or color names). + +cell #[code '#000000'] + + +row + +cell #[code bg] + +cell unicode + +cell Background color (HEX, RGB or color names). + +cell #[code '#ffffff'] + + +row + +cell #[code font] + +cell unicode + +cell Font name or font family for all text. + +cell #[code 'Arial'] + + +row + +cell #[code offset_x] + +cell int + +cell Spacing on left side of the SVG in px. + +cell #[code 50] + + +row + +cell #[code arrow_stroke] + +cell int + +cell Width of arrow path in px. + +cell #[code 2] + + +row + +cell #[code arrow_width] + +cell int + +cell Width of arrow head in px. + +cell #[code 10] / #[code 8] (compact) + + +row + +cell #[code arrow_spacing] + +cell int + +cell Spacing between arrows in px to avoid overlaps. + +cell #[code 20] + + +row + +cell #[code word_spacing] + +cell int + +cell Horizontal spacing between words and arcs in px. + +cell #[code 45] + + +row + +cell #[code distance] + +cell int + +cell Distance between words in px. 
+ +cell #[code 175] / #[code 85] (compact) + ++h(3, "options-ent") Named Entity Visualizer options + ++aside-code("Example"). + options = {'ents': ['PERSON', 'ORG', 'PRODUCT'], + 'colors': {'ORG': 'yellow'}} + displacy.serve(doc, style='ent', options=options) + ++table(["Name", "Type", "Description", "Default"]) + +row + +cell #[code ents] + +cell list + +cell + | Entity types to highlight (#[code None] for all types). + +cell #[code None] + + +row + +cell #[code colors] + +cell dict + +cell + | Color overrides. Entity types in uppercase should be mapped to + | color names or values. + +cell #[code {}] + +p + | By default, displaCy comes with colours for all + | #[+a("/docs/api/annotation#named-entities") entity types supported by spaCy]. + | If you're using custom entity types, you can use the #[code colors] + | setting to add your own colours for them. diff --git a/website/docs/api/util.jade b/website/docs/api/util.jade index 3b1f305a9..97ed7c6e0 100644 --- a/website/docs/api/util.jade +++ b/website/docs/api/util.jade @@ -49,7 +49,7 @@ p +cell unicode or #[code Path] +cell Path to new data directory. -+h(2, "load_lang_class") load_lang_class ++h(2, "get_lang_class") get_lang_class +tag function p @@ -59,7 +59,7 @@ p +aside-code("Example"). 
for lang_id in ['en', 'de']: - lang_class = util.load_lang_class(lang_id) + lang_class = util.get_lang_class(lang_id) lang = lang_class() tokenizer = lang.Defaults.create_tokenizer() diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 78e8b3e27..6e63016d4 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -4,9 +4,9 @@ "Installation": "./", "Models": "models", "Lightning tour": "lightning-tour", + "Visualizers": "visualizers", "Command line": "cli", - "Troubleshooting": "troubleshooting", - "Resources": "resources" + "Troubleshooting": "troubleshooting" }, "Workflows": { "Loading the pipeline": "language-processing-pipeline", @@ -43,6 +43,11 @@ "lightning-tour": { "title": "Lightning tour", + "next": "visualizers" + }, + + "visualizers": { + "title": "Visualizers", "next": "cli" }, diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index d8f4a9a06..2d90028f0 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -80,7 +80,7 @@ p | compute. As of spaCy v2.0, #[code Language] classes are not imported on | initialisation and are only loaded when you import them directly, or load | a model that requires a language to be loaded. To lazy-load languages in - | your application, you can use the #[code util.load_lang_class()] helper + | your application, you can use the #[code util.get_lang_class()] helper | function with the two-letter language code as its argument. +h(2, "language-data") Adding language data @@ -486,7 +486,7 @@ p | #[+src(gh("spaCy", "spacy/tests/lang")) tests/lang] in a directory named | after the language ID. You'll also need to create a fixture for your | tokenizer in the #[+src(gh("spaCy", "spacy/tests/conftest.py")) conftest.py]. 
- | Always use the #[code load_lang_class()] helper function within the fixture, + | Always use the #[code get_lang_class()] helper function within the fixture, | instead of importing the class at the top of the file. This will load the | language data only when it's needed. (Otherwise, #[em all data] would be | loaded every time you run a test.) @@ -494,7 +494,7 @@ p +code. @pytest.fixture def en_tokenizer(): - return util.load_lang_class('en').Defaults.create_tokenizer() + return util.get_lang_class('en').Defaults.create_tokenizer() p | When adding test cases, always diff --git a/website/docs/usage/entity-recognition.jade b/website/docs/usage/entity-recognition.jade index ab8ce22d0..5f0dfc581 100644 --- a/website/docs/usage/entity-recognition.jade +++ b/website/docs/usage/entity-recognition.jade @@ -105,17 +105,36 @@ p | consistent state. -+h(2, "displacy") The displaCy #[sup ENT] visualizer ++h(2, "displacy") Visualizing named entities p | The #[+a(DEMOS_URL + "/displacy-ent/") displaCy #[sup ENT] visualizer] | lets you explore an entity recognition model's behaviour interactively. | If you're training a model, it's very useful to run the visualization - | server yourself. To help you do that, we've open-sourced both the - | #[+a(gh("spacy-services")) back-end service] and the - | #[+a(gh("displacy-ent")) front-end client]. + | yourself. To help you do that, spaCy v2.0+ comes with a visualization + | module. Simply pass a #[code Doc] or a list of #[code Doc] objects to + | displaCy and run #[+api("displacy#serve") #[code displacy.serve]] to + | run the web server, or #[+api("displacy#render") #[code displacy.render]] + | to generate the raw markup. -+codepen("ALxpQO", 450) +p + | For more details and examples, see the + | #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy]. + ++code("Named Entity example"). + import spacy + from spacy import displacy + + text = """But Google is starting from behind. 
The company made a late push + into hardware, and Apple’s Siri, available on iPhones, and Amazon’s Alexa + software, which runs on its Echo and Dot devices, have clear leads in + consumer adoption.""" + + nlp = spacy.load('custom_ner_model') + doc = nlp(text) + displacy.serve(doc, style='ent') + ++codepen("a73f8b68f9af3157855962b283b364e4", 345) +h(2, "entity-types") Built-in entity types diff --git a/website/docs/usage/visualizers.jade b/website/docs/usage/visualizers.jade new file mode 100644 index 000000000..785b5b4fd --- /dev/null +++ b/website/docs/usage/visualizers.jade @@ -0,0 +1,278 @@ +//- 💫 DOCS > USAGE > VISUALIZERS + +include ../../_includes/_mixins + +p + | As of v2.0, our popular visualizers, #[+a(DEMOS_URL + "/displacy") displaCy] + | and #[+a(DEMOS_URL + "/displacy-ent") displaCy #[sup ENT]], are finally an + | official part of the library. Visualizing a dependency parse or named + | entities in a text is not only a fun NLP demo – it can also be incredibly + | helpful in speeding up development and debugging your code and training + | process. Instead of printing a list of dependency labels or entity spans, + | you can simply pass your #[code Doc] objects to #[code displacy] and view + | the visualizations in your browser, or export them as HTML files or + | vector graphics. displaCy also comes with a #[+a("#jupyter") Jupyter hook] + | that returns the markup in a format ready to be rendered in a notebook. + ++aside("What about the old visualizers?") + | Our JavaScript-based visualizers #[+src(gh("displacy")) displacy.js] and + | #[+src(gh("displacy-ent")) displacy-ent.js] will still be available on + | GitHub. If you're looking to implement web-based visualizations, we + | generally recommend using those instead of spaCy's built-in + | #[code displacy] module. It'll allow your application to perform all + | rendering on the client and only rely on the server for the text + | processing. 
The generated markup is also more compatible with modern web + | standards. + ++h(2, "getting-started") Getting started + +p + | The quickest way to visualize a #[code Doc] is to use + | #[+api("displacy#serve") #[code displacy.serve]]. This will spin up a + | simple web server and let you view the result straight from your browser. + | displaCy can either take a single #[code Doc] or a list of #[code Doc] + | objects as its first argument. This lets you construct them however you + | like – using any model or modifications you like. + ++h(3, "dep") Visualizing the dependency parse + +p + | The dependency visualizer, #[code dep], shows part-of-speech tags + | and syntactic dependencies. + ++code("Dependency example"). + import spacy + from spacy import displacy + + nlp = spacy.load('en') + doc = nlp(u'This is a sentence.') + displacy.serve(doc, style='dep') + ++codepen("f0e85b64d469d6617251d8241716d55f", 370) + +p + | The argument #[code options] lets you specify a dictionary of settings + | to customise the layout, for example: + ++table(["Name", "Type", "Description", "Default"]) + +row + +cell #[code compact] + +cell bool + +cell "Compact mode" with square arrows that takes up less space. + +cell #[code False] + + +row + +cell #[code color] + +cell unicode + +cell Text color (HEX, RGB or color names). + +cell #[code '#000000'] + + +row + +cell #[code bg] + +cell unicode + +cell Background color (HEX, RGB or color names). + +cell #[code '#ffffff'] + + +row + +cell #[code font] + +cell unicode + +cell Font name or font family for all text. + +cell #[code 'Arial'] + +p + | For a list of all available options, see the + | #[+api("displacy#options") #[code displacy] API documentation]. + ++aside-code("Options example"). 
+ options = {'compact': True, 'bg': '#09a3d5', + 'color': 'white', 'font': 'Source Sans Pro'} + displacy.serve(doc, style='dep', options=options) + ++codepen("39c02c893a84794353de77a605d817fd", 360) + ++h(3, "ent") Visualizing the entity recognizer + +p + | The entity visualizer, #[code ent], highlights named entities and + | their labels in a text. + ++code("Named Entity example"). + import spacy + from spacy import displacy + + text = """But Google is starting from behind. The company made a late push + into hardware, and Apple’s Siri, available on iPhones, and Amazon’s Alexa + software, which runs on its Echo and Dot devices, have clear leads in + consumer adoption.""" + + nlp = spacy.load('custom_ner_model') + doc = nlp(text) + displacy.serve(doc, style='ent') + ++codepen("a73f8b68f9af3157855962b283b364e4", 345) + +p The entity visualizer lets you customise the following #[code options]: + ++table(["Name", "Type", "Description", "Default"]) + +row + +cell #[code ents] + +cell list + +cell + | Entity types to highlight (#[code None] for all types). + +cell #[code None] + + +row + +cell #[code colors] + +cell dict + +cell + | Color overrides. Entity types in uppercase should be mapped to + | color names or values. + +cell #[code {}] + +p + | If you specify a list of #[code ents], only those entity types will be + | rendered – for example, you can choose to display only #[code PERSON] entities. + | Internally, the visualizer knows nothing about available entity types and + | will render whichever spans and labels it receives. This makes it + | especially easy to work with custom entity types. By default, displaCy + | comes with colours for all + | #[+a("/docs/api/annotation#named-entities") entity types supported by spaCy]. + | If you're using custom entity types, you can use the #[code colors] + | setting to add your own colours for them. + ++aside-code("Options example"). 
+ colors = {'ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)'} + options = {'ents': ['ORG'], 'colors': colors} + displacy.serve(doc, style='ent', options=options) + ++codepen("f42ec690762b6f007022a7acd6d0c7d4", 300) + +p + | The above example uses a little trick: Since the background colour values + | are added as the #[code background] style attribute, you can use any + | #[+a("https://tympanus.net/codrops/css_reference/background/") valid background value] + | or shorthand — including gradients and even images! + ++h(2, "render") Rendering visualizations + +p + | If you don't need the web server and just want to generate the markup + | – for example, to export it to a file or serve it in a custom + | way – you can use #[+api("displacy#render") #[code displacy.render]] + | instead. It works the same, but returns a string containing the markup. + ++code("Example"). + import spacy + from spacy import displacy + + nlp = spacy.load('en') + doc1 = nlp(u'This is a sentence.') + doc2 = nlp(u'This is another sentence.') + html = displacy.render([doc1, doc2], style='dep', page=True) + +p + | #[code page=True] renders the markup wrapped as a full HTML page. + | For minified and more compact HTML markup, you can set #[code minify=True]. + | If you're rendering a dependency parse, you can also export it as an + | #[code .svg] file. + ++aside("What's SVG?") + | Unlike other image formats, the SVG (Scalable Vector Graphics) uses XML + | markup that's easy to manipulate + | #[+a("https://www.smashingmagazine.com/2014/11/styling-and-animating-svgs-with-css/") using CSS] or + | #[+a("https://css-tricks.com/smil-is-dead-long-live-smil-a-guide-to-alternatives-to-smil-features/") JavaScript]. + | Essentially, SVG lets you design with code, which makes it a perfect fit + | for visualizing dependency trees. SVGs can be embedded online in an + | #[code <img>] tag, or inlined in an HTML document. They're also + | pretty easy to #[+a("https://convertio.co/image-converter/") convert]. 
+ ++code. + svg = displacy.render(doc, style='dep') + output_path = Path('/images/sentence.svg') + output_path.open('w', encoding='utf-8').write(svg) + ++infobox("Important note") + | Since each visualization is generated as a separate SVG, exporting + | #[code .svg] files only works if you're rendering #[strong one single doc] + | at a time. (This makes sense – after all, each visualization should be + | a standalone graphic.) So instead of rendering all #[code Doc]s at once, + | loop over them and export them separately. + ++h(2, "jupyter") Using displaCy in Jupyter notebooks + +p + | If you're working with a #[+a("https://jupyter.org") Jupyter] notebook, + | you can use displaCy's "Jupyter mode" to return markup that can be + | rendered in a cell straight away. When you export your notebook, the + | visualizations will be included as HTML. + ++code("Jupyter Example"). + # don't forget to install a model, e.g.: python -m spacy download en + import spacy + from spacy import displacy + + doc = nlp(u'Rats are various medium-sized, long-tailed rodents.') + displacy.render(doc, style='dep', jupyter=True) + + doc2 = nlp(LONG_NEWS_ARTICLE) + displacy.render(doc2, style='ent', jupyter=True) + ++image("/assets/img/docs/displacy_jupyter.jpg", 700, false, "Example of using the displaCy dependency and named entity visualizer in a Jupyter notebook") + +p + | Internally, displaCy imports #[code display] and #[code HTML] from + | #[code IPython.core.display] and returns a Jupyter HTML object. If you + | were doing it manually, it'd look like this: + ++code. + from IPython.core.display import display, HTML + + html = displacy.render(doc, style='dep') + return display(HTML(html)) + ++h(2, "examples") Usage examples + + ++h(2, "manual-usage") Rendering data manually + +p + | You can also use displaCy to manually render data. 
This can be useful if + | you want to visualize output from other libraries, like + | #[+a("http://www.nltk.org") NLTK] or + | #[+a("https://github.com/tensorflow/models/tree/master/syntaxnet") SyntaxNet]. + | Simply convert the dependency parse or recognised entities to displaCy's + | format and import #[code DependencyRenderer] or #[code EntityRenderer] + | from #[code spacy.displacy.render]. A renderer class can be initialised + | with a dictionary of options. To generate the visualization markup, call + | the renderer's #[code render()] method on a list of dictionaries (one + | per visualization). + + ++aside-code("Example"). + from spacy.displacy.render import EntityRenderer + + ex = [{'text': 'But Google is starting from behind.', + 'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}], + 'title': None}] + renderer = EntityRenderer() + html = renderer.render(ex) + ++code("DependencyRenderer input"). + [{ + 'words': [ + {'text': 'This', 'tag': 'DT'}, + {'text': 'is', 'tag': 'VBZ'}, + {'text': 'a', 'tag': 'DT'}, + {'text': 'sentence', 'tag': 'NN'}], + 'arcs': [ + {'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'}, + {'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'}, + {'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}] + }] + ++code("EntityRenderer input"). + [{ + 'text': 'But Google is starting from behind.', + 'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}], + 'title': None + }]