spaCy/spacy/util.py

# coding: utf8
from __future__ import unicode_literals, print_function
import os
import io
import json
import re
import os.path
import pathlib
import sys

import six
import textwrap

from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE


# Python 2/3 compatibility shim: merely evaluating the name ``basestring``
# raises NameError where the builtin does not exist (Python 3), in which
# case ``str`` is used in its place.
try:
    basestring
except NameError:
    basestring = str


# Registry mapping names to language classes; written by set_lang_class()
# and read by get_lang_class().
LANGUAGES = {}
# Data directory used by get_data_path(): by default a ``data`` folder next
# to this module. Replaced at runtime through set_data_path().
_data_path = pathlib.Path(__file__).parent / 'data'


def set_lang_class(name, cls):
    """Store `cls` in the module-level LANGUAGES registry under `name`.
    An entry already registered under the same name is overwritten."""
    # The registry is only mutated in place, never rebound, so no `global`
    # declaration is needed.
    LANGUAGES[name] = cls


def get_lang_class(name):
    """Look up the class registered in LANGUAGES for `name`. Only the part of
    `name` before the first non-alphanumeric character is used as the key, so
    'en', 'en_core' and 'en-1.1.0' all resolve to the entry for 'en'.

    Raises RuntimeError if no class is registered for that key."""
    # `maxsplit` is passed by keyword: passing it positionally to re.split is
    # deprecated since Python 3.13. The keyword form also works on Python 2.7.
    lang = re.split('[^a-zA-Z0-9]', name, maxsplit=1)[0]
    if lang not in LANGUAGES:
        raise RuntimeError('Language not supported: %s' % lang)
    return LANGUAGES[lang]


def get_data_path(require_exists=True):
    """Return the currently configured data path. With `require_exists` set
    (the default), None is returned instead when the path does not exist on
    disk; otherwise the path is returned without checking."""
    if require_exists and not _data_path.exists():
        return None
    return _data_path


def set_data_path(path):
    """Replace the module-level data path. A string is converted to a
    pathlib.Path first; any other object is stored as given."""
    global _data_path
    _data_path = pathlib.Path(path) if isinstance(path, basestring) else path


def or_(val1, val2):
    """Return `val1` unless it is None. Otherwise fall back to `val2`, calling
    it without arguments first if it is callable (lazy default)."""
    # Only None triggers the fallback; falsy values such as 0 or '' are kept.
    if val1 is not None:
        return val1
    return val2() if callable(val2) else val2


def match_best_version(target_name, target_version, path):
    """Find the highest-versioned entry in directory `path` whose name part
    equals `target_name` and whose version part satisfies the constraint
    string `target_version`. Entry names are split into name and version by
    split_data_name(). Returns the entry as a pathlib.Path, or None if `path`
    is None, does not exist, or contains no matching entry.

    Version parts are ranked by converting each dot-separated component with
    float(), so a matching entry with a non-numeric component raises
    ValueError."""
    if isinstance(path, basestring):
        path = pathlib.Path(path)
    if path is None or not path.exists():
        return None

    candidates = []
    for entry in path.iterdir():
        name, version = split_data_name(entry.parts[-1])
        if name != target_name:
            continue
        if not constraint_match(target_version, version):
            continue
        sort_key = tuple(float(part) for part in version.split('.'))
        candidates.append((sort_key, entry))

    if not candidates:
        return None
    best = max(candidates)
    return pathlib.Path(best[1])


def split_data_name(name):
    """Split `name` at its first hyphen into a name part and a version part.
    With a hyphen present the result is the two-item list from str.split;
    without one it is the tuple (name, '')."""
    if '-' not in name:
        return (name, '')
    return name.split('-', 1)


def _version_tuple(text):
    """Return the leading dotted-number part of `text` as a tuple of ints, or
    None when `text` does not start with a number."""
    found = re.match(r'\d+(?:\.\d+)*', text or '')
    if not found:
        return None
    return tuple(int(part) for part in found.group(0).split('.'))


def _compare_versions(op, left, right):
    """Apply comparison operator `op` to two version tuples, padding the
    shorter one with zeros so that (1, 0) and (1, 0, 0) compare equal."""
    width = max(len(left), len(right))
    left = left + (0,) * (width - len(left))
    right = right + (0,) * (width - len(right))
    if op == '>':
        return left > right
    if op == '<':
        return left < right
    if op == '>=':
        return left >= right
    if op == '<=':
        return left <= right
    # Remaining operators accepted by the validation regex: '=' and '=='.
    return left == right


def constraint_match(constraint_string, version):
    """Check whether `version` satisfies every constraint in
    `constraint_string`, a comma-separated list such as '>=1.0.0,<1.1.0'.
    Supported operators are >, <, >=, <=, = and ==. An empty or None
    constraint string matches any version.

    The comparison is implemented here with the standard library only: the
    previous body delegated to `semver.match`, but `semver` is not imported
    in this module, so any non-empty constraint raised NameError. Only the
    leading numeric components are compared (as integers, so 1.10 > 1.9);
    anything after them, e.g. a pre-release suffix, is ignored.

    Returns False when `version` has no leading numeric part (for example an
    empty string), since it cannot satisfy a constraint.
    Raises ValueError for a constraint that does not start with a supported
    operator followed by a dotted number."""
    # From http://github.com/spacy-io/sputnik
    if not constraint_string:
        return True

    constraints = [c.strip() for c in constraint_string.split(',') if c.strip()]

    # Validate and parse every constraint before evaluating any of them, so
    # that malformed input is always reported.
    parsed = []
    for c in constraints:
        found = re.match(r'([><=]=?)(\d+(?:\.\d+)*)', c)
        if not found:
            raise ValueError('invalid constraint: %s' % c)
        parsed.append((found.group(1), _version_tuple(found.group(2))))

    actual = _version_tuple(version)
    if actual is None:
        return False
    return all(_compare_versions(op, actual, wanted) for op, wanted in parsed)


def read_regex(path):
    """Build a compiled regex from the file at `path` (a string or a
    pathlib.Path). Every line that is not blank becomes one alternative that
    matches that line's text literally at the start of the input."""
    path = path if not isinstance(path, basestring) else pathlib.Path(path)
    # Read as UTF-8 explicitly instead of relying on the locale's default
    # encoding, in line with utf8open() and parse_package_meta().
    with path.open(encoding='utf8') as file_:
        entries = file_.read().split('\n')
    expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
    return re.compile(expression)


def compile_prefix_regex(entries):
    """Compile `entries` into one regex of alternatives, each anchored with
    '^'. Entries that are empty or whitespace-only are skipped. If '(' is
    itself one of the entries, the data is treated as the deprecated literal
    format and every entry is passed through re.escape; otherwise entries are
    used as regex source unchanged."""
    # Handle deprecated data
    is_legacy = '(' in entries
    anchored = []
    for piece in entries:
        if not piece.strip():
            continue
        anchored.append('^' + (re.escape(piece) if is_legacy else piece))
    return re.compile('|'.join(anchored))


def compile_suffix_regex(entries):
    """Compile `entries` into one regex of alternatives, each followed by
    '$'. Entries that are empty or whitespace-only are skipped; the rest are
    used as regex source unchanged."""
    anchored = []
    for piece in entries:
        if piece.strip():
            anchored.append(piece + '$')
    return re.compile('|'.join(anchored))


def compile_infix_regex(entries):
    """Compile `entries` into one regex of unanchored alternatives. Entries
    that are empty or whitespace-only are skipped; the rest are used as regex
    source unchanged."""
    alternatives = (piece for piece in entries if piece.strip())
    return re.compile('|'.join(alternatives))


def normalize_slice(length, start, stop, step=None):
    """Resolve slice bounds against a sequence of `length` items. None
    defaults to the respective end, negative indices count from the end, and
    both bounds are clamped so that 0 <= start <= stop <= length.

    Returns the (start, stop) tuple.
    Raises ValueError if `step` is anything other than None or 1."""
    if not (step is None or step == 1):
        # The first literal ends with a space so that the two concatenated
        # sentences are separated in the final message.
        raise ValueError("Stepped slices not supported in Span objects. "
                         "Try: list(tokens)[start:stop:step] instead.")
    if start is None:
        start = 0
    elif start < 0:
        start += length
    start = min(length, max(0, start))

    if stop is None:
        stop = length
    elif stop < 0:
        stop += length
    # Clamping against `start` turns an inverted range into an empty one.
    stop = min(length, max(start, stop))

    assert 0 <= start <= stop <= length
    return start, stop


def utf8open(loc, mode='r'):
    """Open the file at `loc` via io.open in the given `mode`, always using
    UTF-8 as the text encoding, and return the file object."""
    return io.open(loc, mode=mode, encoding='utf8')


def check_renamed_kwargs(renamed, kwargs):
    """Reject keyword arguments that are passed under an outdated name.
    `renamed` maps old names to new names; the first old name found in
    `kwargs` raises a TypeError naming its replacement."""
    for old, new in renamed.items():
        if old not in kwargs:
            continue
        raise TypeError("Keyword argument %s now renamed to %s" % (old, new))


def parse_package_meta(package_path, package, require=True):
    """Load <package_path>/<package>/meta.json and return the parsed JSON.
    If that file does not exist, raise IOError when `require` is true and
    return None otherwise."""
    location = os.path.join(str(package_path), package, 'meta.json')
    if not os.path.isfile(location):
        if require:
            raise IOError("Could not read meta.json from %s" % location)
        return None
    with io.open(location, encoding='utf8') as f:
        return json.load(f)


def print_table(data, **kwargs):
    """Print data in table format. Can either take an iterable of row tuples
    or a dictionary (including dict subclasses), which will be converted to a
    list of (key, value) tuples. Each cell is left-aligned in a 15-character
    column. The number of columns is taken from the first row, so rows may
    have any width as long as they all share it. Empty data prints an empty
    table instead of failing. If kwarg 'title' is given and truthy, it is
    printed above the table, highlighted with ANSI escape sequences."""

    if isinstance(data, dict):
        rows = list(data.items())
    else:
        rows = list(data)

    tpl_msg = '\n{msg}\n'
    tpl_title = '\n    \033[93m{msg}\033[0m'
    # One placeholder per column of the first row; no rows means no columns.
    tpl_row = ("    {:<15}" * len(rows[0])) if rows else ''
    # Rows are expanded positionally rather than unpacked into exactly two
    # names, so the template width and the row width always agree.
    table = '\n'.join([tpl_row.format(*row) for row in rows])

    if kwargs.get('title'):
        print(tpl_title.format(msg=kwargs['title']))

    print(tpl_msg.format(msg=table))


def print_markdown(data, **kwargs):
    """Print listed data in GitHub-flavoured Markdown format so it can be
    copy-pasted into issues. Can either take a list of (label, value) tuples
    or a dictionary (including dict subclasses), which will be converted to a
    list of tuples. Values containing the absolute path of this module's
    directory are left out. If kwarg 'title' is given and truthy, it is
    printed as a level-2 heading above the list."""

    # Computed once instead of once per value.
    module_dir = str(pathlib.Path(__file__).parent)

    def excl_value(value):
        # don't print value if it contains absolute path of directory
        # (i.e. personal info that shouldn't need to be shared)
        # other conditions can be included here if necessary
        try:
            return module_dir in value
        except TypeError:
            # Values that do not support a substring test (ints, bools,
            # None, ...) cannot contain the path, so they are kept.
            return False

    if isinstance(data, dict):
        data = list(data.items())

    tpl_msg = "\n{msg}\n"
    tpl_title = "\n## {msg}"
    tpl_row = "* **{l}:** {v}"
    markdown = '\n'.join([tpl_row.format(l=l, v=v) for l, v in data if not excl_value(v)])

    if kwargs.get('title'):
        print(tpl_title.format(msg=kwargs['title']))

    print(tpl_msg.format(msg=markdown))


def print_msg(*text, **kwargs):
    """Print a formatted message. Every positional argument is wrapped with
    _wrap_text() and rendered as its own paragraph, separated by a blank
    line. If kwarg 'title' is given and truthy, it is wrapped the same way
    and printed above the text, highlighted with manually written ANSI escape
    sequences (to avoid an extra dependency)."""

    paragraphs = [_wrap_text(t) for t in text]
    title = kwargs.get('title')
    if title:
        print('\n\033[93m{msg}\033[0m'.format(msg=_wrap_text(title)))
    print('\n{msg}\n'.format(msg='\n\n'.join(paragraphs)))


def _wrap_text(text):
    """Wrap text with the textwrap module, indenting every line by four
    spaces. The fill width is 80 minus the indent length, i.e. 76 characters
    including the indent. Long words and hyphenated words are never broken,
    so a single long word may exceed that width."""

    indent = ' ' * 4
    wrapper = textwrap.TextWrapper(width=80 - len(indent),
                                   initial_indent=indent,
                                   subsequent_indent=indent,
                                   break_long_words=False,
                                   break_on_hyphens=False)
    return wrapper.fill(text)


def sys_exit(*messages, **kwargs):
    """Exit the interpreter with status 0 via sys.exit, for modules used from
    the command line such as download and link. If any positional messages
    are given, they are printed first by forwarding all arguments unchanged
    to print_msg()."""

    exit_code = 0
    if len(messages) > 0:
        print_msg(*messages, **kwargs)
    sys.exit(exit_code)
Use consistent unicode declarations 2017-03-12 15:07:28 +03:00			`# coding: utf8`
Add util functions for printing and wrapping messages 2017-03-15 19:35:57 +03:00			`from __future__ import unicode_literals, print_function`
access model via sputnik 2015-12-07 08:01:28 +03:00			`import os`
changing deprecated codecs.open to io.open =) 2015-09-30 21:10:15 +03:00			`import io`
* Make PyPy work 2015-01-05 09:54:13 +03:00			`import json`
* Add util.py 2014-09-25 20:26:22 +04:00			`import re`
* Add MockPackage class, to see whether we can proxy for Sputnik in a lightweight way 2015-12-29 18:55:03 +03:00			`import os.path`
Finish refactoring data loading 2016-09-24 21:26:17 +03:00			`import pathlib`
Move sys_exit() function to util 2017-03-16 19:08:58 +03:00			`import sys`
Revert "Fix formatting and remove unused code" This reverts commit d7898d586f6186459f3e8b4c016a78872fd159bc. 2017-03-16 11:58:41 +03:00
distinct load() and from_package() methods 2016-01-16 12:00:57 +03:00			`import six`
Add util functions for printing and wrapping messages 2017-03-15 19:35:57 +03:00			`import textwrap`

integrate with sputnik 2016-01-13 21:46:17 +03:00			`from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE`
* Add MockPackage class, to see whether we can proxy for Sputnik in a lightweight way 2015-12-29 18:55:03 +03:00
Fix spacing 2017-03-21 00:48:21 +03:00
Python 3 compatible basestring 2016-09-24 23:08:43 +03:00			`try:`
Whitespace 2016-09-24 23:17:01 +03:00			`basestring`
Python 3 compatible basestring 2016-09-24 23:08:43 +03:00			`except NameError:`
Whitespace 2016-09-24 23:17:01 +03:00			`basestring = str`
* Add MockPackage class, to see whether we can proxy for Sputnik in a lightweight way 2015-12-29 18:55:03 +03:00
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 13:24:24 +03:00
add lang registration facility 2016-03-25 20:54:45 +03:00			`LANGUAGES = {}`
Finish refactoring data loading 2016-09-24 21:26:17 +03:00			`_data_path = pathlib.Path(__file__).parent / 'data'`
add lang registration facility 2016-03-25 20:54:45 +03:00

relative imports in __init__.py 2016-03-26 13:44:53 +03:00			`def set_lang_class(name, cls):`
add lang registration facility 2016-03-25 20:54:45 +03:00			`global LANGUAGES`
			`LANGUAGES[name] = cls`


relative imports in __init__.py 2016-03-26 13:44:53 +03:00			`def get_lang_class(name):`
Fix get_lang_class parsing (take 2) 2016-05-17 02:40:31 +03:00			`lang = re.split('[^a-zA-Z0-9]', name, 1)[0]`
add lang registration facility 2016-03-25 20:54:45 +03:00			`if lang not in LANGUAGES:`
			`raise RuntimeError('Language not supported: %s' % lang)`
			`return LANGUAGES[lang]`


Unbreak data download 2017-01-10 01:40:26 +03:00			`def get_data_path(require_exists=True):`
			`if not require_exists:`
			`return _data_path`
			`else:`
			`return _data_path if _data_path.exists() else None`
Finish refactoring data loading 2016-09-24 21:26:17 +03:00

			`def set_data_path(path):`
			`global _data_path`
			`if isinstance(path, basestring):`
			`path = pathlib.Path(path)`
			`_data_path = path`


Refactor so that the tokenizer data is read from Python data, rather than from disk 2016-09-25 15:49:53 +03:00			`def or_(val1, val2):`
			`if val1 is not None:`
			`return val1`
			`elif callable(val2):`
			`return val2()`
			`else:`
			`return val2`


Finish refactoring data loading 2016-09-24 21:26:17 +03:00			`def match_best_version(target_name, target_version, path):`
			`path = path if not isinstance(path, basestring) else pathlib.Path(path)`
Return None for data path if it doesn't exist 2017-01-09 16:10:05 +03:00			`if path is None or not path.exists():`
Return None in match_best_version if not path exists. 2016-10-15 15:47:29 +03:00			`return None`
Finish refactoring data loading 2016-09-24 21:26:17 +03:00			`matches = []`
			`for data_name in path.iterdir():`
			`name, version = split_data_name(data_name.parts[-1])`
			`if name == target_name and constraint_match(target_version, version):`
			`matches.append((tuple(float(v) for v in version.split('.')), data_name))`
			`if matches:`
			`return pathlib.Path(max(matches)[1])`
			`else:`
			`return None`


			`def split_data_name(name):`
			`return name.split('-', 1) if '-' in name else (name, '')`


Revert "Fix formatting and remove unused code" This reverts commit d7898d586f6186459f3e8b4c016a78872fd159bc. 2017-03-16 11:58:41 +03:00			`def constraint_match(constraint_string, version):`
			`# From http://github.com/spacy-io/sputnik`
			`if not constraint_string:`
			`return True`

			`constraints = [c.strip() for c in constraint_string.split(',') if c.strip()]`

			`for c in constraints:`
			`if not re.match(r'[><=][=]?\d+(\.\d+)*', c):`
			`raise ValueError('invalid constraint: %s' % c)`

			`return all(semver.match(version, c) for c in constraints)`


Finish refactoring data loading 2016-09-24 21:26:17 +03:00			`def read_regex(path):`
			`path = path if not isinstance(path, basestring) else pathlib.Path(path)`
			`with path.open() as file_:`
			`entries = file_.read().split('\n')`
			`expression = '\|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])`
			`return re.compile(expression)`


Refactor so that the tokenizer data is read from Python data, rather than from disk 2016-09-25 15:49:53 +03:00			`def compile_prefix_regex(entries):`
Handle deprecated tokenizer prefix data 2017-01-08 22:33:28 +03:00			`if '(' in entries:`
			`# Handle deprecated data`
			`expression = '\|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])`
			`return re.compile(expression)`
			`else:`
			`expression = '\|'.join(['^' + piece for piece in entries if piece.strip()])`
			`return re.compile(expression)`
Finish refactoring data loading 2016-09-24 21:26:17 +03:00

Refactor so that the tokenizer data is read from Python data, rather than from disk 2016-09-25 15:49:53 +03:00			`def compile_suffix_regex(entries):`
Finish refactoring data loading 2016-09-24 21:26:17 +03:00			`expression = '\|'.join([piece + '$' for piece in entries if piece.strip()])`
			`return re.compile(expression)`


Refactor so that the tokenizer data is read from Python data, rather than from disk 2016-09-25 15:49:53 +03:00			`def compile_infix_regex(entries):`
Finish refactoring data loading 2016-09-24 21:26:17 +03:00			`expression = '\|'.join([piece for piece in entries if piece.strip()])`
			`return re.compile(expression)`


Refactor to remove duplicate slicing logic 2015-10-07 11:25:35 +03:00			`def normalize_slice(length, start, stop, step=None):`
			`if not (step is None or step == 1):`
			`raise ValueError("Stepped slices not supported in Span objects."`
			`"Try: list(tokens)[start:stop:step] instead.")`
			`if start is None:`
			`start = 0`
			`elif start < 0:`
			`start += length`
			`start = min(length, max(0, start))`

			`if stop is None:`
			`stop = length`
			`elif stop < 0:`
			`stop += length`
			`stop = min(length, max(start, stop))`

			`assert 0 <= start <= stop <= length`
			`return start, stop`


* Add util.py 2014-09-25 20:26:22 +04:00			`def utf8open(loc, mode='r'):`
changing deprecated codecs.open to io.open =) 2015-09-30 21:10:15 +03:00			`return io.open(loc, mode, encoding='utf8')`
Refactor so that the tokenizer data is read from Python data, rather than from disk 2016-09-25 15:49:53 +03:00

			`def check_renamed_kwargs(renamed, kwargs):`
			`for old, new in renamed.items():`
			`if old in kwargs:`
			`raise TypeError("Keyword argument %s now renamed to %s" % (old, new))`
Add util functions for printing and wrapping messages 2017-03-15 19:35:57 +03:00

Fix loading when no package found 2017-03-17 02:30:02 +03:00			`def parse_package_meta(package_path, package, require=True):`
Add util function to load and parse package meta.json 2017-03-16 19:10:05 +03:00			`location = os.path.join(str(package_path), package, 'meta.json')`
Fix loading when no package found 2017-03-17 02:30:02 +03:00			`if os.path.isfile(location):`
Add util function to load and parse package meta.json 2017-03-16 19:10:05 +03:00			`with io.open(location, encoding='utf8') as f:`
			`meta = json.load(f)`
			`return meta`
Fix loading when no package found 2017-03-17 02:30:02 +03:00			`elif require:`
			`raise IOError("Could not read meta.json from %s" % location)`
			`else:`
			`return None`
Add util function to load and parse package meta.json 2017-03-16 19:10:05 +03:00

Add util functions to print data as table or markdown list 2017-03-18 15:00:14 +03:00			`def print_table(data, **kwargs):`
			`"""Print data in table format. Can either take a list of tuples or a`
			`dictionary, which will be converted to a list of tuples."""`

			`if type(data) == dict:`
			`data = list(data.items())`

			`tpl_msg = '\n{msg}\n'`
			`tpl_title = '\n \033[93m{msg}\033[0m'`
			`tpl_row =" {:<15}" * len(data[0])`
			`table = '\n'.join([tpl_row.format(l, v) for l, v in data])`

			`if 'title' in kwargs and kwargs['title']:`
			`print(tpl_title.format(msg=kwargs['title']))`

			`print(tpl_msg.format(msg=table))`


			`def print_markdown(data, **kwargs):`
			`"""Print listed data in GitHub-flavoured Markdown format so it can be`
			`copy-pasted into issues. Can either take a list of tuples or a dictionary,`
			`which will be converted to a list of tuples."""`

			`def excl_value(value):`
			`# don't print value if it contains absolute path of directory`
			`# (i.e. personal info that shouldn't need to be shared)`
			`# other conditions can be included here if necessary`
			`if str(pathlib.Path(__file__).parent) in value:`
			`return True`

			`if type(data) == dict:`
			`data = list(data.items())`

			`tpl_msg = "\n{msg}\n"`
			`tpl_title = "\n## {msg}"`
			`tpl_row = "* {l}: {v}"`
			`markdown = '\n'.join([tpl_row.format(l=l, v=v) for l, v in data if not excl_value(v)])`

			`if 'title' in kwargs and kwargs['title']:`
			`print(tpl_title.format(msg=kwargs['title']))`

			`print(tpl_msg.format(msg=markdown))`


Add util functions for printing and wrapping messages 2017-03-15 19:35:57 +03:00			`def print_msg(text, *kwargs):`
			`"""Print formatted message. Each positional argument is rendered as newline-`
			`separated paragraph. If kwarg 'title' exist, title is printed above the text`
			`and highlighted (using ANSI escape sequences manually to avoid unnecessary`
			`dependency)."""`

			`message = '\n\n'.join([_wrap_text(t) for t in text])`
			`tpl_msg = '\n{msg}\n'`
			`tpl_title = '\n\033[93m{msg}\033[0m'`

			`if 'title' in kwargs and kwargs['title']:`
			`title = _wrap_text(kwargs['title'])`
			`print(tpl_title.format(msg=title))`
			`print(tpl_msg.format(msg=message))`


			`def _wrap_text(text):`
			`"""Wrap text at given width using textwrap module. Indent should consist of`
			`spaces. Its length is deducted from wrap width to ensure exact wrapping."""`

			`wrap_max = 80`
			`indent = ' '`
			`wrap_width = wrap_max - len(indent)`
			`return textwrap.fill(text, width=wrap_width, initial_indent=indent,`
Don't break text in when rendering print_msg 2017-03-16 19:09:50 +03:00			`subsequent_indent=indent, break_long_words=False,`
			`break_on_hyphens=False)`
Move sys_exit() function to util 2017-03-16 19:08:58 +03:00

			`def sys_exit(messages, *kwargs):`
			`"""Performs SystemExit. For modules used from the command line, like`
			`download and link. To print message, use the same arguments as for`
			`print_msg()."""`

			`if messages:`
			`print_msg(messages, *kwargs)`
			`sys.exit(0)`