spaCy/spacy/util.py

# coding: utf8
from __future__ import unicode_literals, print_function

import functools
import importlib
import inspect
import itertools
import os
import random
import sys
import textwrap
import warnings
from collections import OrderedDict
from pathlib import Path

import cytoolz
import pkg_resources
import regex as re
import ujson
from thinc.neural._classes.model import Model

from .symbols import ORTH
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
from .compat import import_file

import msgpack
import msgpack_numpy
msgpack_numpy.patch()


LANGUAGES = {}
_data_path = Path(__file__).parent / 'data'
_PRINT_ENV = False


def set_env_log(value):
    """Switch logging of resolved `env_opt` settings on or off.

    value: Stored in the module-level `_PRINT_ENV` flag, which `env_opt`
        checks before printing where a setting's value came from.
    """
    global _PRINT_ENV
    _PRINT_ENV = value


def get_lang_class(lang):
    """Look up a Language class, importing it from spacy.lang on first use.

    Classes are cached in the module-level `LANGUAGES` dict, so each
    language module is only imported once.

    lang (unicode): Two-letter language code, e.g. 'en'.
    RETURNS (Language): Language class.
    """
    # LANGUAGES is only mutated, never rebound, so no `global` is required.
    if lang in LANGUAGES:
        return LANGUAGES[lang]
    try:
        lang_module = importlib.import_module('.lang.%s' % lang, 'spacy')
    except ImportError:
        raise ImportError("Can't import language %s from spacy.lang." % lang)
    # The class is taken to be the first name the module lists in __all__.
    LANGUAGES[lang] = getattr(lang_module, lang_module.__all__[0])
    return LANGUAGES[lang]


def set_lang_class(name, cls):
    """Register a custom Language class under a name, so that it can later
    be retrieved via get_lang_class.

    name (unicode): Name to register the Language class under.
    cls (Language): Language class.
    """
    # Item assignment mutates the shared registry in place; no rebinding of
    # the module-level name takes place.
    LANGUAGES[name] = cls


def get_data_path(require_exists=True):
    """Return the currently configured spaCy data directory.

    require_exists (bool): If True, return None instead of the path when the
        directory does not exist on disk.
    RETURNS (Path or None): Data path or None.
    """
    if require_exists and not _data_path.exists():
        return None
    return _data_path


def set_data_path(path):
    """Point spaCy at a different data directory.

    path (unicode or Path): Path to new data directory. Strings are
        converted to Path via ensure_path before being stored.
    """
    global _data_path
    _data_path = ensure_path(path)


def ensure_path(path):
    """Convert a string to a Path and pass every other value through.

    path: Anything. Only strings are converted; other objects (including
        existing Path instances) are returned unchanged.
    RETURNS: Path or original argument.
    """
    return Path(path) if isinstance(path, basestring_) else path


def load_model(name, **overrides):
    """Load a model from a shortcut link, package or data path.

    String names are resolved in this order: an entry in the spaCy data
    directory (shortcut link), an installed package, then a filesystem path.
    Objects with an `exists` attribute are treated as a path to model data.

    name (unicode): Package name, shortcut link or model path.
    **overrides: Specific overrides, like pipeline components to disable.
    RETURNS (Language): `Language` class with the loaded model.
    RAISES (IOError): If the data path or the model can't be found.
    """
    data_path = get_data_path()
    if not (data_path and data_path.exists()):
        raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
    if isinstance(name, basestring_):
        linked_names = set(entry.name for entry in data_path.iterdir())
        if name in linked_names:  # in data dir / shortcut
            return load_model_from_link(name, **overrides)
        if is_package(name):  # installed as package
            return load_model_from_package(name, **overrides)
        model_dir = Path(name)
        if model_dir.exists():  # path to model data directory
            return load_model_from_path(model_dir, **overrides)
    elif hasattr(name, 'exists'):  # Path or Path-like to model data
        return load_model_from_path(name, **overrides)
    raise IOError("Can't find model '%s'" % name)


def load_model_from_link(name, **overrides):
    """Load a model from a shortcut link, or directory in spaCy data path.

    name (unicode): Name of the link or directory inside the data path.
    **overrides: Specific overrides, passed on to the package's `load()`.
    RETURNS (Language): Whatever the linked package's `load()` returns.
    RAISES (IOError): If importing the link's __init__.py fails with an
        AttributeError, i.e. the link doesn't point to a valid package.
    """
    path = get_data_path() / name / '__init__.py'
    try:
        cls = import_file(name, path)
    except AttributeError:
        raise IOError(
            "Can't load '%s'. If you're using a shortcut link, make sure it "
            "points to a valid package (not just a data directory)." % name)
    return cls.load(**overrides)


def load_model_from_package(name, **overrides):
    """Load a model from an installed package.

    name (unicode): Importable name of the model package.
    **overrides: Specific overrides, passed on to the package's `load()`.
    RETURNS: Whatever the imported package's `load()` returns.
    """
    model_package = importlib.import_module(name)
    return model_package.load(**overrides)


def load_model_from_path(model_path, meta=False, **overrides):
    """Load a model from a data directory path. Creates Language class with
    pipeline from meta.json and then calls from_disk() with path.

    model_path (Path): Path to the model data directory.
    meta (dict): Model meta data. If falsy, it's read from `model_path`.
    **overrides: Passed to the Language class; a 'disable' entry lists
        pipeline component names that should not be added.
    RETURNS: The result of calling `from_disk(model_path)` on the new
        Language object.
    """
    if not meta:
        meta = get_model_meta(model_path)
    lang_cls = get_lang_class(meta['lang'])
    nlp = lang_cls(meta=meta, **overrides)
    pipeline = meta.get('pipeline', [])
    disable = overrides.get('disable', [])
    # `True` means "use the language defaults"; False/None mean "no pipeline".
    if pipeline is True:
        pipeline = nlp.Defaults.pipe_names
    elif pipeline in (False, None):
        pipeline = []
    for name in pipeline:
        if name in disable:
            continue
        config = meta.get('pipeline_args', {}).get(name, {})
        nlp.add_pipe(nlp.create_pipe(name, config=config), name=name)
    return nlp.from_disk(model_path)


def load_model_from_init_py(init_file, **overrides):
    """Helper function to use in the `load()` method of a model package's
    __init__.py.

    The model data is expected in a subdirectory next to the __init__.py,
    named '<lang>_<name>-<version>' after the values in meta.json.

    init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
    **overrides: Specific overrides, like pipeline components to disable.
    RETURNS (Language): `Language` class with loaded model.
    RAISES (ValueError): If the versioned data directory doesn't exist.
    """
    model_path = Path(init_file).parent
    meta = get_model_meta(model_path)
    data_dir = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
    data_path = model_path / data_dir
    # Check the data directory itself. `get_model_meta` has already raised
    # if `model_path` is missing, so testing `model_path` here could never
    # fail and a missing data directory would go unreported.
    if not data_path.exists():
        msg = "Can't find model directory: %s"
        raise ValueError(msg % path2str(data_path))
    return load_model_from_path(data_path, meta, **overrides)


def get_model_meta(path):
    """Read a model's meta.json from its directory and check that the
    required settings are present.

    path (unicode or Path): Path to model directory.
    RETURNS (dict): The model's meta data.
    RAISES (ValueError): If the directory doesn't exist, or if 'lang',
        'name' or 'version' is missing or empty.
    RAISES (IOError): If the directory contains no meta.json file.
    """
    model_path = ensure_path(path)
    if not model_path.exists():
        raise ValueError(
            "Can't find model directory: %s" % path2str(model_path))
    meta_path = model_path / 'meta.json'
    if not meta_path.is_file():
        raise IOError("Could not read meta.json from %s" % meta_path)
    meta = read_json(meta_path)
    for setting in ('lang', 'name', 'version'):
        has_value = setting in meta and meta[setting]
        if not has_value:
            raise ValueError(
                "No valid '%s' setting found in model meta.json" % setting)
    return meta


def is_package(name):
    """Check if string maps to a package installed via pip.

    The comparison is case-insensitive, and hyphens in the installed
    package names are treated as underscores.

    name (unicode): Name of package.
    RETURNS (bool): True if installed package, False if not.
    """
    wanted = name.lower()
    installed = pkg_resources.working_set.by_key.keys()
    return any(package.lower().replace('-', '_') == wanted
               for package in installed)


def get_package_path(name):
    """Get the directory an installed package lives in.

    name (unicode): Package name. It is lowercased before being imported.
    RETURNS (Path): Path to installed package.
    """
    # The package is imported purely to learn its location. This is
    # indirect, but locating a package any other way is very difficult.
    package = importlib.import_module(name.lower())
    package_file = Path(package.__file__)
    return package_file.parent


def is_in_jupyter():
    """Check if user is running spaCy from a Jupyter notebook by detecting the
    IPython kernel. Mainly used for the displaCy visualizer.

    RETURNS (bool): True if in Jupyter, False if not.
    """
    try:
        # `get_ipython` is not defined anywhere in this module, so outside
        # an environment that provides it this raises NameError.
        app_name = get_ipython().config['IPKernelApp']['parent_appname']
        return bool(app_name == 'ipython-notebook')
    except NameError:
        return False


def get_cuda_stream(require=False):
    """Create a CUDA stream if the `CudaStream` class is available.

    require (bool): Accepted but currently not used by this function.
    RETURNS: A new `CudaStream` instance, or None if `CudaStream` is None.
    """
    if CudaStream is None:
        return None
    return CudaStream()


def get_async(stream, numpy_array):
    """Copy an array into a newly allocated cupy array, if cupy is available.

    stream: Passed as `stream` to the new array's `set()` method.
    numpy_array: Source array; its shape and dtype are used for the copy.
    RETURNS: The new cupy array, or `numpy_array` itself if cupy is None.
    """
    if cupy is None:
        return numpy_array
    gpu_array = cupy.ndarray(numpy_array.shape, order='C',
                             dtype=numpy_array.dtype)
    gpu_array.set(numpy_array, stream=stream)
    return gpu_array


def env_opt(name, default=None):
    """Read a numeric setting from the environment, with a fallback.

    The variable `SPACY_<NAME>` (name uppercased) takes precedence over a
    variable called exactly `name`. If neither is set, `default` is returned
    unchanged. If `_PRINT_ENV` is set, the resolved value and its origin are
    printed.

    name (unicode): Name of the setting.
    default: Fallback value. Environment values are converted with `float`
        if the default is a float, and with `int` otherwise.
    RETURNS: The converted environment value, or `default`.
    """
    type_convert = float if type(default) is float else int
    for env_key in ('SPACY_' + name.upper(), name):
        if env_key not in os.environ:
            continue
        value = type_convert(os.environ[env_key])
        if _PRINT_ENV:
            print(name, "=", repr(value), "via", "$" + env_key)
        return value
    if _PRINT_ENV:
        print(name, '=', repr(default), "by default")
    return default


def read_regex(path):
    """Compile a regex from a file holding one literal string per line.

    Each line that isn't blank is escaped and anchored to the start of the
    string; the lines are then joined into a single alternation.

    path (unicode or Path): Path to the file.
    RETURNS: The compiled regular expression.
    """
    path = ensure_path(path)
    with path.open() as file_:
        lines = file_.read().split('\n')
    alternatives = ('^' + re.escape(line) for line in lines if line.strip())
    return re.compile('|'.join(alternatives))


def compile_prefix_regex(entries):
    """Compile a regex matching any of the given prefixes at string start.

    entries (iterable): Prefix patterns. Blank entries are skipped. If the
        bare string '(' is among the entries, the data is treated as the
        deprecated literal format and every entry is escaped first.
    RETURNS: The compiled regular expression.
    """
    is_deprecated_data = '(' in entries
    pieces = [piece for piece in entries if piece.strip()]
    if is_deprecated_data:
        # Handle deprecated data
        pieces = [re.escape(piece) for piece in pieces]
    expression = '|'.join('^' + piece for piece in pieces)
    return re.compile(expression)


def compile_suffix_regex(entries):
    """Compile a regex matching any of the given suffixes at string end.

    entries (iterable): Suffix patterns. Blank entries are skipped.
    RETURNS: The compiled regular expression.
    """
    anchored = (suffix + '$' for suffix in entries if suffix.strip())
    return re.compile('|'.join(anchored))


def compile_infix_regex(entries):
    """Compile a regex matching any of the given infixes anywhere.

    entries (iterable): Infix patterns. Blank entries are skipped.
    RETURNS: The compiled regular expression.
    """
    usable = (infix for infix in entries if infix.strip())
    return re.compile('|'.join(usable))


def add_lookups(default_func, *lookups):
    """Wrap an attribute function so that special cases are looked up first.
    If a word is found in one of the lookups, that value is returned.
    Otherwise the wrapped function is called.

    default_func (callable): The default function to execute.
    *lookups (dict): Lookup dictionary mapping string to attribute value.
    RETURNS (callable): Lexical attribute getter.
    """
    # A functools.partial over a module-level function is used instead of a
    # closure, to allow pickle to work.
    getter = functools.partial(_get_attr_unless_lookup, default_func, lookups)
    return getter


def _get_attr_unless_lookup(default_func, lookups, string):
    """Return the value for `string` from the first lookup table containing
    it, falling back to `default_func(string)` if none does.
    """
    for table in lookups:
        if string not in table:
            continue
        return table[string]
    return default_func(string)


def update_exc(base_exceptions, *addition_dicts):
    """Merge and validate tokenizer exceptions. Later dicts overwrite
    earlier entries. The merged result is finally expanded with variants
    that use a typographic apostrophe.

    base_exceptions (dict): Base exceptions.
    *addition_dicts (dict): Exceptions to add to the base dict, in order.
    RETURNS (dict): Combined tokenizer exceptions.
    RAISES (ValueError): If an ORTH value isn't a unicode string, or if the
        ORTH values of an entry don't join up to its key.
    """
    combined = dict(base_exceptions)
    for additions in addition_dicts:
        for orth, token_attrs in additions.items():
            for attr in token_attrs:
                if not isinstance(attr[ORTH], unicode_):
                    raise ValueError(
                        "Invalid ORTH value in exception: key='%s', "
                        "orths='%s'" % (orth, token_attrs))
            described_orth = ''.join(attr[ORTH] for attr in token_attrs)
            if orth != described_orth:
                raise ValueError(
                    "Invalid tokenizer exception: ORTH values combined "
                    "don't match original string. key='%s', orths='%s'"
                    % (orth, described_orth))
        combined.update(additions)
    return expand_exc(combined, "'", "’")


def expand_exc(excs, search, replace):
    """Duplicate every tokenizer exception whose key contains a string,
    with that string replaced in the key and in each token's ORTH value.
    For example, to add additional versions with typographic apostrophes.

    excs (dict): Tokenizer exceptions.
    search (unicode): String to find and replace.
    replace (unicode): Replacement.
    RETURNS (dict): Combined tokenizer exceptions. The input dict and its
        token dicts are copied, not modified.
    """
    expanded = dict(excs)
    for token_string, tokens in excs.items():
        if search not in token_string:
            continue
        replaced_tokens = []
        for token in tokens:
            fixed = dict(token)
            fixed[ORTH] = fixed[ORTH].replace(search, replace)
            replaced_tokens.append(fixed)
        expanded[token_string.replace(search, replace)] = replaced_tokens
    return expanded


def normalize_slice(length, start, stop, step=None):
    """Resolve slice bounds against a sequence length.

    Negative bounds count from the end, None means "from the start" / "to
    the end", and both bounds are clamped so that
    0 <= start <= stop <= length.

    length (int): Length of the sequence being sliced.
    start (int or None): Slice start.
    stop (int or None): Slice stop.
    step (int or None): Slice step. Only None and 1 are supported.
    RETURNS (tuple): The normalized `(start, stop)` pair.
    RAISES (ValueError): If a step other than None or 1 is given.
    """
    if not (step is None or step == 1):
        # The trailing space keeps the two sentences from running together
        # in the concatenated message.
        raise ValueError("Stepped slices not supported in Span objects. "
                         "Try: list(tokens)[start:stop:step] instead.")
    if start is None:
        start = 0
    elif start < 0:
        start += length
    start = min(length, max(0, start))
    if stop is None:
        stop = length
    elif stop < 0:
        stop += length
    # Clamping against `start` turns an inverted range into an empty one.
    stop = min(length, max(start, stop))
    assert 0 <= start <= stop <= length
    return start, stop


def minibatch(items, size=8):
    """Iterate over batches of items. `size` may be an iterator,
    so that batch-size can vary on each step.

    items (iterable): The items to batch up.
    size (int or iterable): Fixed batch size, or an iterable of numbers
        giving the size of each successive batch.
    YIELDS (list): The next batch. Iteration ends once the items are used
        up, or once a `size` iterable has no more sizes to offer.
    """
    if isinstance(size, int):
        sizes = itertools.repeat(size)
    else:
        sizes = iter(size)
    items = iter(items)
    while True:
        try:
            batch_size = next(sizes)
        except StopIteration:
            # A StopIteration escaping a generator body is turned into a
            # RuntimeError by PEP 479, so end the generator explicitly.
            return
        batch = list(itertools.islice(items, int(batch_size)))
        if not batch:
            return
        yield batch


def compounding(start, stop, compound):
    """Yield an infinite series of compounding values. The first value is
    `start`; every following value is the previous one multiplied by the
    compound rate. Values never pass `stop`: once the series reaches it,
    `stop` is yielded from then on.

    EXAMPLE:
      >>> sizes = compounding(1., 10., 1.5)
      >>> assert next(sizes) == 1.
      >>> assert next(sizes) == 1 * 1.5
      >>> assert next(sizes) == 1.5 * 1.5
    """
    # A series running downwards is bounded from below, otherwise from above.
    descending = start > stop
    value = float(start)
    while True:
        if descending:
            yield max(value, stop)
        else:
            yield min(value, stop)
        value *= compound


def decaying(start, stop, decay):
    """Yield an infinite series of decaying values. The value for update
    number t (starting at 1) is `start / (1 + decay * t)`, and values never
    pass `stop`.
    """
    descending = start > stop
    step = 1.
    while True:
        rate = start * 1. / (1. + decay * step)
        yield max(rate, stop) if descending else min(rate, stop)
        step += 1


def itershuffle(iterable, bufsize=1000):
    """Shuffle an iterator. This works by holding `bufsize` items back
    and yielding them sometime later. Obviously, this is not unbiased -
    but should be good enough for batching. Larger bufsize means less bias.
    From https://gist.github.com/andres-erbsen/1307752

    iterable (iterable): Iterator to shuffle.
    bufsize (int): Items to hold back.
    YIELDS (iterable): The shuffled iterator.
    """
    iterable = iter(iterable)
    buf = []
    try:
        while True:
            # Top the buffer up by a random amount, never beyond bufsize.
            for _ in range(random.randint(1, bufsize - len(buf))):
                # The builtin next() works on both Python 2 and 3, unlike
                # the Python 2-only `.next()` method.
                buf.append(next(iterable))
            random.shuffle(buf)
            # Release a random number of the buffered items.
            for _ in range(random.randint(1, bufsize)):
                if not buf:
                    break
                yield buf.pop()
    except StopIteration:
        # The source is exhausted: flush whatever is still buffered, then
        # let the generator end normally. Re-raising StopIteration here
        # would become a RuntimeError under PEP 479.
        random.shuffle(buf)
        while buf:
            yield buf.pop()


def read_json(location):
    """Load the contents of a UTF-8 encoded JSON file.

    location (Path): Path to JSON file. Strings are converted to Path.
    RETURNS (dict): Loaded JSON content.
    """
    json_path = ensure_path(location)
    with json_path.open('r', encoding='utf8') as file_:
        contents = ujson.load(file_)
    return contents


def get_raw_input(description, default=False):
    """Prompt the user on the command line via raw_input / input.

    description (unicode): Text to display before prompt.
    default (unicode or False/None): Default value to display with prompt.
        It is only shown; it is not substituted for empty input.
    RETURNS (unicode): User input.
    """
    if default:
        additional = ' (default: %s)' % default
    else:
        additional = ''
    return input_('    %s%s: ' % (description, additional))


def to_bytes(getters, exclude):
    """Serialize the values produced by a set of getters with msgpack.

    getters (dict): Maps keys to zero-argument callables returning the
        value to serialize. Iteration order is kept in the output.
    exclude: Container of keys to skip.
    RETURNS: The msgpack-encoded data.
    """
    serialized = OrderedDict(
        (key, getter()) for key, getter in getters.items()
        if key not in exclude)
    return msgpack.dumps(serialized, use_bin_type=True, encoding='utf8')


def from_bytes(bytes_data, setters, exclude):
    """Decode msgpack data and hand each value to its matching setter.

    bytes_data: The msgpack-encoded data.
    setters (dict): Maps keys to one-argument callables. A setter is only
        called if its key is present in the decoded message.
    exclude: Container of keys to skip.
    RETURNS: The decoded message.
    """
    msg = msgpack.loads(bytes_data, encoding='utf8')
    for key, setter in setters.items():
        if key in exclude or key not in msg:
            continue
        setter(msg[key])
    return msg


def to_disk(path, writers, exclude):
    """Write out data by calling one writer per key with `path / key`.

    path (unicode or Path): Target directory. Created if it doesn't exist.
    writers (dict): Maps keys to one-argument callables taking a path.
    exclude: Container of keys to skip.
    RETURNS (Path): The target directory.
    """
    path = ensure_path(path)
    if not path.exists():
        path.mkdir()
    for key, writer in writers.items():
        if key in exclude:
            continue
        writer(path / key)
    return path


def from_disk(path, readers, exclude):
    """Read data by calling one reader per key with `path / key`.

    path (unicode or Path): Source directory.
    readers (dict): Maps keys to one-argument callables taking a path.
    exclude: Container of keys to skip.
    RETURNS (Path): The source directory.
    """
    path = ensure_path(path)
    for key, reader in readers.items():
        if key in exclude:
            continue
        reader(path / key)
    return path


def deprecated(message, filter='always'):
    """Show a deprecation warning.

    The warning is attributed to the file name and line number of the last
    entry of `inspect.stack()`, and the filter only applies while the
    warning is being emitted.

    message (unicode): The message to display.
    filter (unicode): Filter value.
    """
    last_frame = inspect.stack()[-1]
    filename, lineno = last_frame[1], last_frame[2]
    with warnings.catch_warnings():
        warnings.simplefilter(filter, DeprecationWarning)
        warnings.warn_explicit(message, DeprecationWarning, filename, lineno)


def print_table(data, title=None):
    """Print label/value pairs as a table with left-aligned columns, each
    at least 15 characters wide.

    data (dict or list of tuples): Label/value pairs.
    title (unicode or None): Title, will be printed above.
    """
    if isinstance(data, dict):
        data = list(data.items())
    column_count = len(data[0])
    tpl_row = '    {:<15}' * column_count
    rows = [tpl_row.format(label, unicode_(value)) for label, value in data]
    if title:
        print('\n    \033[93m{}\033[0m'.format(title))
    print('\n{}\n'.format('\n'.join(rows)))


def print_markdown(data, title=None):
    """Print data in GitHub-flavoured Markdown format for issues etc.
    String values that are existing filesystem paths are left out.

    data (dict or list of tuples): Label/value pairs.
    title (unicode or None): Title, will be rendered as headline 2.
    """
    if isinstance(data, dict):
        data = list(data.items())
    markdown = []
    for label, value in data:
        # contains path, i.e. personal info
        is_path = isinstance(value, basestring_) and Path(value).exists()
        if is_path:
            continue
        markdown.append("* **{}:** {}".format(label, unicode_(value)))
    if title:
        print("\n## {}".format(title))
    print('\n{}\n'.format('\n'.join(markdown)))


def prints(*texts, **kwargs):
    """Print formatted message (manual ANSI escape sequences to avoid
    dependency)

    *texts (unicode): Texts to print. Each argument is rendered as paragraph.
    **kwargs: 'title' becomes coloured headline. If 'exits' is given and not
        None, it is passed to sys.exit() after printing.
    """
    exits = kwargs.get('exits', None)
    title = kwargs.get('title', None)
    if title:
        title = '\033[93m{}\033[0m\n'.format(_wrap(title))
    else:
        title = ''
    paragraphs = [_wrap(text) for text in texts]
    print('\n{}{}\n'.format(title, '\n\n'.join(paragraphs)))
    if exits is not None:
        sys.exit(exits)


def _wrap(text, wrap_max=80, indent=4):
    """Indent text and wrap it at a given width using the textwrap module.
    Long words and hyphenated words are never broken up.

    text (unicode): Text to wrap. If it's a Path, it's converted to string.
    wrap_max (int): Maximum line length (indent is deducted).
    indent (int): Number of spaces for indentation.
    RETURNS (unicode): Wrapped text.
    """
    padding = ' ' * indent
    if isinstance(text, Path):
        text = path2str(text)
    wrapper = textwrap.TextWrapper(width=wrap_max - len(padding),
                                   initial_indent=padding,
                                   subsequent_indent=padding,
                                   break_long_words=False,
                                   break_on_hyphens=False)
    return wrapper.fill(text)


def minify_html(html):
    """Perform a template-specific, rudimentary HTML minification for displaCy.
    Disclaimer: NOT a general-purpose solution. It only strips surrounding
    whitespace, then removes four-space indents and newlines.

    html (unicode): Markup to minify.
    RETURNS (unicode): "Minified" HTML.
    """
    minified = html.strip()
    for unwanted in ('    ', '\n'):
        minified = minified.replace(unwanted, '')
    return minified


def use_gpu(gpu_id):
    """Activate a GPU device and switch thinc's Model over to CupyOps.

    gpu_id: Device ID, passed to `cupy.cuda.device.Device`.
    RETURNS: The activated cupy device, or None if cupy can't be imported.
    """
    try:
        import cupy.cuda.device
    except ImportError:
        return None
    from thinc.neural.ops import CupyOps
    gpu = cupy.cuda.device.Device(gpu_id)
    gpu.use()
    # Both attributes are replaced: `ops` with a CupyOps instance, `Ops`
    # with the CupyOps class itself.
    Model.ops = CupyOps()
    Model.Ops = CupyOps
    return gpu
-												Use consistent unicode declarations

											
										
										
											2017-03-12 15:07:28 +03:00
+								# coding: utf8
-												Add util functions for printing and wrapping messages

											
										
										
											2017-03-15 19:35:57 +03:00
+								from __future__ import unicode_literals, print_function
-												Clean up imports, unused code, whitespace, docstrings

											
										
										
											2017-04-15 13:05:47 +03:00
-												Add util.env_opt support: Can set hyper params through environment variables.

											
										
										
											2017-05-18 12:36:53 +03:00
+								import os
-												Fix json imports and use ujson

											
										
										
											2017-04-15 13:13:34 +03:00
+								import ujson
-												Use pkg_resources instead of pip for is_package (resolves #1293)

											
										
										
											2017-09-16 21:27:59 +03:00
+								import pkg_resources
-												Move is_package and get_model_package_path to util

											
										
										
											2017-05-08 00:24:51 +03:00
+								import importlib
-												Use `regex` instead of `re`


											
										
										
											2017-04-20 02:22:52 +03:00
+								import regex as re
-												Clean up imports, unused code, whitespace, docstrings

											
										
										
											2017-04-15 13:05:47 +03:00
+								from pathlib import Path
-												Move sys_exit() function to util

											
										
										
											2017-03-16 19:08:58 +03:00
+								import sys
-												Add util functions for printing and wrapping messages

											
										
										
											2017-03-15 19:35:57 +03:00
+								import textwrap
-												Add itershuffle utility function. Maybe belongs in thinc

											
										
										
											2017-05-21 17:05:05 +03:00
+								import random
-												Fix to/from disk methods

											
										
										
											2017-05-31 14:42:39 +03:00
+								from collections import OrderedDict
-												Add deprecated helper

Uses warning to show DeprecationWarning and custom stack trace

											
										
										
											2017-11-01 18:32:36 +03:00
+								import inspect
 								import warnings
-												Add util function to enable GPU

											
										
										
											2017-09-21 03:16:35 +03:00
+								from thinc.neural._classes.model import Model
-												Convert closure to functools.partial, to promote pickling

											
										
										
											2017-10-17 19:20:52 +03:00
+								import functools
-												Fix missing import

											
										
										
											2017-11-07 15:20:12 +03:00
+								import cytoolz
-												Add util functions for printing and wrapping messages

											
										
										
											2017-03-15 19:35:57 +03:00
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
+								from .symbols import ORTH
 								from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
 								from .compat import import_file
-												Move serialization functions to util

											
										
										
											2017-05-29 11:13:42 +03:00
+								import msgpack
 								import msgpack_numpy
 								msgpack_numpy.patch()
-												Handle raw_input vs input in Python 2 and 3

											
										
										
											2017-03-21 00:48:32 +03:00
-												add lang registration facility

											
										
										
											2016-03-25 20:54:45 +03:00
+								LANGUAGES = {}
-												Clean up imports, unused code, whitespace, docstrings

											
										
										
											2017-04-15 13:05:47 +03:00
+								_data_path = Path(__file__).parent / 'data'
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
+								_PRINT_ENV = False
 								def set_env_log(value):
 								    global _PRINT_ENV
 								    _PRINT_ENV = value
-												add lang registration facility

											
										
										
											2016-03-25 20:54:45 +03:00
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-14 02:31:10 +03:00
+								def get_lang_class(lang):
 								    """Import and load a Language class.
-												add lang registration facility

											
										
										
											2016-03-25 20:54:45 +03:00
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-14 02:31:10 +03:00
+								    lang (unicode): Two-letter language code, e.g. 'en'.
 								    RETURNS (Language): Language class.
 								    """
 								    global LANGUAGES
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
+								    if lang not in LANGUAGES:
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-14 02:31:10 +03:00
+								        try:
 								            module = importlib.import_module('.lang.%s' % lang, 'spacy')
 								        except ImportError:
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
+								            msg = "Can't import language %s from spacy.lang."
 								            raise ImportError(msg % lang)
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-14 02:31:10 +03:00
+								        LANGUAGES[lang] = getattr(module, module.__all__[0])
-												add lang registration facility

											
										
										
											2016-03-25 20:54:45 +03:00
+								    return LANGUAGES[lang]
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-14 02:31:10 +03:00
+								def set_lang_class(name, cls):
 								    """Set a custom Language class name that can be loaded via get_lang_class.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-14 02:31:10 +03:00
+								    name (unicode): Name of Language class.
 								    cls (Language): Language class.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
+								    """
-												Merge load_lang_class and get_lang_class

											
										
										
											2017-05-14 02:31:10 +03:00
+								    global LANGUAGES
 								    LANGUAGES[name] = cls
-												Add load_lang_class() util function

											
										
										
											2017-05-09 00:50:45 +03:00
-												Unbreak data download

											
										
										
											2017-01-10 01:40:26 +03:00
+								def get_data_path(require_exists=True):
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
+								    """Get path to spaCy data directory.
-												Update docstrings

											
										
										
											2017-05-14 02:30:29 +03:00
+								    require_exists (bool): Only return path if it exists, otherwise None.
 								    RETURNS (Path or None): Data path or None.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
+								    """
-												Unbreak data download

											
										
										
											2017-01-10 01:40:26 +03:00
+								    if not require_exists:
 								        return _data_path
 								    else:
 								        return _data_path if _data_path.exists() else None
-												Finish refactoring data loading

											
										
										
											2016-09-24 21:26:17 +03:00
 								def set_data_path(path):
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
+								    """Set path to spaCy data directory.
-												Update docstrings

											
										
										
											2017-05-14 02:30:29 +03:00
+								    path (unicode or Path): Path to new data directory.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
+								    """
-												Finish refactoring data loading

											
										
										
											2016-09-24 21:26:17 +03:00
+								    global _data_path
-												Add compat functions and remove old workarounds

Add ensure_path util function to handle checking instance of path

											
										
										
											2017-04-15 13:11:16 +03:00
+								    _data_path = ensure_path(path)
 								def ensure_path(path):
-												Update docstrings

											
										
										
											2017-05-14 02:30:29 +03:00
+								    """Ensure string is converted to a Path.
 								    path: Anything. If string, it's converted to Path.
 								    RETURNS: Path or original argument.
 								    """
-												Add compat functions and remove old workarounds

Add ensure_path util function to handle checking instance of path

											
										
										
											2017-04-15 13:11:16 +03:00
+								    if isinstance(path, basestring_):
 								        return Path(path)
 								    else:
 								        return path
-												Finish refactoring data loading

											
										
										
											2016-09-24 21:26:17 +03:00
-												Fix and document model loading with pipeline and overrides

											
										
										
											2017-05-29 15:10:10 +03:00
def load_model(name, **overrides):
    """Load a model from a shortcut link, package or data path.

    name (unicode or Path): Package name, shortcut link or model path.
    **overrides: Specific overrides, like pipeline components to disable.
    RETURNS (Language): `Language` class with the loaded model.
    RAISES (IOError): If the spaCy data path or the model can't be found.
    """
    data_path = get_data_path()
    if not (data_path and data_path.exists()):
        raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
    if not isinstance(name, basestring_):
        # Not a string: accept anything Path-like pointing to model data.
        if hasattr(name, 'exists'):
            return load_model_from_path(name, **overrides)
    else:
        # Strings are resolved in order: entry in the data directory
        # (shortcut link), installed package, then filesystem path.
        linked_names = {entry.name for entry in data_path.iterdir()}
        if name in linked_names:
            return load_model_from_link(name, **overrides)
        if is_package(name):
            return load_model_from_package(name, **overrides)
        candidate = Path(name)
        if candidate.exists():
            return load_model_from_path(candidate, **overrides)
    raise IOError("Can't find model '%s'" % name)
-												Update spacy.load() helper functions

											
										
										
											2017-06-05 14:02:31 +03:00
def load_model_from_link(name, **overrides):
    """Load a model from a shortcut link, or directory in spaCy data path.

    name (unicode): Name of the entry inside the spaCy data directory.
    **overrides: Passed through to the imported object's `load()`.
    RETURNS: Whatever the imported object's `load()` returns.
    RAISES (IOError): If importing the entry's __init__.py raises an
        AttributeError.
    """
    init_path = get_data_path() / name / '__init__.py'
    try:
        pkg = import_file(name, init_path)
    except AttributeError:
        raise IOError(
            "Can't load '%s'. If you're using a shortcut link, make sure it "
            "points to a valid package (not just a data directory)." % name)
    return pkg.load(**overrides)
 								def load_model_from_package(name, **overrides):
 								    """Load a model from an installed package."""
 								    cls = importlib.import_module(name)
 								    return cls.load(**overrides)
 								def load_model_from_path(model_path, meta=False, **overrides):
 								    """Load a model from a data directory path. Creates Language class with
 								    pipeline from meta.json and then calls from_disk() with path."""
 								    if not meta:
 								        meta = get_model_meta(model_path)
 								    cls = get_lang_class(meta['lang'])
-												Implement new Language methods and pipeline API

											
										
										
											2017-10-07 01:25:54 +03:00
+								    nlp = cls(meta=meta, **overrides)
-												Add disable option and True/False/None values for pipeline

											
										
										
											2017-10-07 01:29:08 +03:00
+								    pipeline = meta.get('pipeline', [])
 								    disable = overrides.get('disable', [])
 								    if pipeline is True:
 								        pipeline = nlp.Defaults.pipe_names
 								    elif pipeline in (False, None):
 								        pipeline = []
 								    for name in pipeline:
 								        if name not in disable:
 								            config = meta.get('pipeline_args', {}).get(name, {})
 								            component = nlp.create_pipe(name, config=config)
 								            nlp.add_pipe(component, name=name)
-												Update spacy.load() helper functions

											
										
										
											2017-06-05 14:02:31 +03:00
+								    return nlp.from_disk(model_path)
-												Fix and document model loading with pipeline and overrides

											
										
										
											2017-05-29 15:10:10 +03:00
def load_model_from_init_py(init_file, **overrides):
    """Helper function to use in the `load()` method of a model package's
    __init__.py.

    init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
    **overrides: Specific overrides, like pipeline components to disable.
    RETURNS (Language): `Language` class with loaded model.
    RAISES (ValueError): If the model data directory doesn't exist.
    """
    model_path = Path(init_file).parent
    meta = get_model_meta(model_path)
    # The data lives in a '<lang>_<name>-<version>' directory next to
    # __init__.py.
    data_dir = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
    data_path = model_path / data_dir
    # get_model_meta() has already verified that model_path exists, so the
    # directory to check here is the data directory named in the message.
    if not data_path.exists():
        msg = "Can't find model directory: %s"
        raise ValueError(msg % path2str(data_path))
    return load_model_from_path(data_path, meta, **overrides)
-												Update util functions for model loading

											
										
										
											2017-05-28 01:22:00 +03:00
-												Fix and document model loading with pipeline and overrides

											
										
										
											2017-05-29 15:10:10 +03:00
def get_model_meta(path):
    """Get model meta.json from a directory path and validate its contents.

    path (unicode or Path): Path to model directory.
    RETURNS (dict): The model's meta data.
    RAISES (ValueError): If the directory doesn't exist, or if 'lang',
        'name' or 'version' is missing or empty.
    RAISES (IOError): If the directory has no meta.json file.
    """
    model_path = ensure_path(path)
    if not model_path.exists():
        raise ValueError("Can't find model directory: %s"
                         % path2str(model_path))
    meta_path = model_path / 'meta.json'
    if not meta_path.is_file():
        raise IOError("Could not read meta.json from %s" % meta_path)
    meta = read_json(meta_path)
    required = ('lang', 'name', 'version')
    invalid = [key for key in required if key not in meta or not meta[key]]
    if invalid:
        # Report the first offending setting only.
        raise ValueError("No valid '%s' setting found in model meta.json"
                         % invalid[0])
    return meta
-												Update util functions for model loading

											
										
										
											2017-05-28 01:22:00 +03:00
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
def is_package(name):
    """Check if string maps to a package installed via pip.

    name (unicode): Name of package.
    RETURNS (bool): True if installed package, False if not.
    """
    # Both sides are lowercased, and dashes in the distribution names are
    # replaced with underscores before comparing.
    wanted = name.lower()
    installed = pkg_resources.working_set.by_key.keys()
    return any(dist_name.lower().replace('-', '_') == wanted
               for dist_name in installed)
-												Update util functions for model loading

											
										
										
											2017-05-28 01:22:00 +03:00
def get_package_path(name):
    """Get the path to an installed package.

    name (unicode): Package name.
    RETURNS (Path): Path to installed package.
    """
    # The package is imported only to locate it: the directory containing
    # the module's __file__ is taken as the package path. The lowercase
    # name is used to be safe.
    module = importlib.import_module(name.lower())
    module_file = Path(module.__file__)
    return module_file.parent
-												Reorder util functions

											
										
										
											2017-05-09 00:51:15 +03:00
-												Add is_in_jupyter() helper for displaCy (see #1058)

											
										
										
											2017-05-18 15:13:14 +03:00
def is_in_jupyter():
    """Check if user is running spaCy from a Jupyter notebook by detecting the
    IPython kernel. Mainly used for the displaCy visualizer.

    RETURNS (bool): True if in Jupyter, False if not.
    """
    in_notebook = False
    try:
        kernel_app = get_ipython().config['IPKernelApp']
        if kernel_app['parent_appname'] == 'ipython-notebook':
            in_notebook = True
    except NameError:
        # get_ipython is not defined, so this is not an IPython session.
        pass
    return in_notebook
-												Remove cupy imports from parser, so it can work on CPU

											
										
										
											2017-05-14 01:37:53 +03:00
def get_cuda_stream(require=False):
    """Create a CUDA stream if the CudaStream class is available.

    require (bool): Not used by this function.
    RETURNS: A new `CudaStream()` instance, or None if CudaStream is None.
    """
    if CudaStream is None:
        return None
    return CudaStream()
 								def get_async(stream, numpy_array):
 								    if cupy is None:
 								        return numpy_array
 								    else:
-												Add util.env_opt support: Can set hyper params through environment variables.

											
										
										
											2017-05-18 12:36:53 +03:00
+								        array = cupy.ndarray(numpy_array.shape, order='C',
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
+								                             dtype=numpy_array.dtype)
-												Add util.env_opt support: Can set hyper params through environment variables.

											
										
										
											2017-05-18 12:36:53 +03:00
+								        array.set(numpy_array, stream=stream)
 								        return array
-												Fix formatting

											
										
										
											2017-05-26 13:37:45 +03:00
-												Add util.env_opt support: Can set hyper params through environment variables.

											
										
										
											2017-05-18 12:36:53 +03:00
def env_opt(name, default=None):
    """Read an option from the environment, falling back to a default.

    The variable `SPACY_<NAME>` (name uppercased) is checked first, then
    `<name>` as given. The value found is converted with float() if the
    default is exactly of type float, and with int() otherwise. If
    _PRINT_ENV is set, the resolved value and its origin are printed.

    name (unicode): Name of the option.
    default: Value returned if neither variable is set.
    RETURNS: The converted environment value, or the default.
    """
    convert = float if type(default) is float else int
    for env_key in ('SPACY_' + name.upper(), name):
        if env_key in os.environ:
            value = convert(os.environ[env_key])
            if _PRINT_ENV:
                print(name, "=", repr(value), "via", '$' + env_key)
            return value
    if _PRINT_ENV:
        print(name, '=', repr(default), "by default")
    return default
-												Remove cupy imports from parser, so it can work on CPU

											
										
										
											2017-05-14 01:37:53 +03:00
-												Finish refactoring data loading

											
										
										
											2016-09-24 21:26:17 +03:00
def read_regex(path):
    """Compile a regex from a file with one literal prefix per line.

    Every non-blank line is escaped, anchored to the start of the string
    with '^', and the results are joined as alternatives.

    path (unicode or Path): Path to the file.
    RETURNS: The compiled regular expression object.
    """
    path = ensure_path(path)
    with path.open() as file_:
        lines = file_.read().split('\n')
    alternatives = ['^' + re.escape(line) for line in lines if line.strip()]
    return re.compile('|'.join(alternatives))
-												Refactor so that the tokenizer data is read from Python data, rather than from disk

											
										
										
											2016-09-25 15:49:53 +03:00
def compile_prefix_regex(entries):
    """Compile prefix entries into one regex anchored at the string start.

    entries (iterable): Prefix patterns. Blank entries are skipped.
    RETURNS: The compiled regular expression object.
    """
    pieces = [piece for piece in entries if piece.strip()]
    if '(' in entries:
        # Handle deprecated data: a bare '(' entry means the entries are
        # literal strings, so they have to be escaped.
        pieces = [re.escape(piece) for piece in pieces]
    return re.compile('|'.join('^' + piece for piece in pieces))
-												Finish refactoring data loading

											
										
										
											2016-09-24 21:26:17 +03:00
-												Refactor so that the tokenizer data is read from Python data, rather than from disk

											
										
										
											2016-09-25 15:49:53 +03:00
def compile_suffix_regex(entries):
    """Compile suffix entries into one regex anchored at the string end.

    entries (iterable): Suffix patterns. Blank entries are skipped.
    RETURNS: The compiled regular expression object.
    """
    anchored = []
    for piece in entries:
        if piece.strip():
            anchored.append(piece + '$')
    return re.compile('|'.join(anchored))
-												Refactor so that the tokenizer data is read from Python data, rather than from disk

											
										
										
											2016-09-25 15:49:53 +03:00
def compile_infix_regex(entries):
    """Compile infix entries into one regex of alternatives.

    entries (iterable): Infix patterns. Blank entries are skipped.
    RETURNS: The compiled regular expression object.
    """
    alternatives = []
    for piece in entries:
        if piece.strip():
            alternatives.append(piece)
    return re.compile('|'.join(alternatives))
-												Add add_lookups util function

											
										
										
											2017-06-03 20:44:47 +03:00
def add_lookups(default_func, *lookups):
    """Extend an attribute function with special cases. If a word is in the
    lookups, the value is returned. Otherwise the previous function is used.

    default_func (callable): The default function to execute.
    *lookups (dict): Lookup dictionary mapping string to attribute value.
    RETURNS (callable): Lexical attribute getter.
    """
    # A functools.partial over a module-level function is used instead of a
    # closure, to allow pickle to work.
    getter = functools.partial(_get_attr_unless_lookup, default_func, lookups)
    return getter


def _get_attr_unless_lookup(default_func, lookups, string):
    """Return the value from the first lookup containing the string, or
    the result of default_func(string) if no lookup contains it.
    """
    matching = (table for table in lookups if string in table)
    for table in matching:
        return table[string]
    return default_func(string)
-												Add add_lookups util function

											
										
										
											2017-06-03 20:44:47 +03:00
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 16:42:12 +03:00
def update_exc(base_exceptions, *addition_dicts):
    """Update and validate tokenizer exceptions. Will overwrite exceptions.

    base_exceptions (dict): Base exceptions.
    *addition_dicts (dict): Exceptions to add to the base dict, in order.
    RETURNS (dict): Combined tokenizer exceptions.
    RAISES (ValueError): If an ORTH value is not a unicode string, or if
        the ORTH values of an entry don't join up to its key.
    """
    combined = dict(base_exceptions)
    for additions in addition_dicts:
        for orth, token_attrs in additions.items():
            for attr in token_attrs:
                if not isinstance(attr[ORTH], unicode_):
                    msg = ("Invalid ORTH value in exception: "
                           "key='%s', orths='%s'")
                    raise ValueError(msg % (orth, token_attrs))
            described_orth = ''.join(attr[ORTH] for attr in token_attrs)
            if orth != described_orth:
                msg = ("Invalid tokenizer exception: ORTH values combined "
                       "don't match original string. key='%s', orths='%s'")
                raise ValueError(msg % (orth, described_orth))
        combined.update(additions)
    # Also add variants of the exceptions with typographic apostrophes.
    return expand_exc(combined, "'", "’")
 								def expand_exc(excs, search, replace):
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
+								    """Find string in tokenizer exceptions, duplicate entry and replace string.
 								    For example, to add additional versions with typographic apostrophes.
-												Update docstrings

											
										
										
											2017-05-14 02:30:29 +03:00
+								    excs (dict): Tokenizer exceptions.
 								    search (unicode): String to find and replace.
 								    replace (unicode): Replacement.
 								    RETURNS (dict): Combined tokenizer exceptions.
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
+								    """
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 16:42:12 +03:00
+								    def _fix_token(token, search, replace):
 								        fixed = dict(token)
 								        fixed[ORTH] = fixed[ORTH].replace(search, replace)
 								        return fixed
-												Fix expand_exc to make sure it returns combined dict

											
										
										
											2017-05-13 22:22:25 +03:00
+								    new_excs = dict(excs)
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 16:42:12 +03:00
+								    for token_string, tokens in excs.items():
 								        if search in token_string:
 								            new_key = token_string.replace(search, replace)
 								            new_value = [_fix_token(t, search, replace) for t in tokens]
-												Fix expand_exc to make sure it returns combined dict

											
										
										
											2017-05-13 22:22:25 +03:00
+								            new_excs[new_key] = new_value
 								    return new_excs
-												Add update_exc and expand_exc to util

Doesn't require separate language data util anymore

											
										
										
											2017-05-08 16:42:12 +03:00
-												Refactor to remove duplicate slicing logic

											
										
										
											2015-10-07 11:25:35 +03:00
def normalize_slice(length, start, stop, step=None):
    """Resolve slice bounds against a sequence of the given length.

    Negative bounds count from the end, missing bounds default to the full
    range, and both are clamped so that 0 <= start <= stop <= length.

    length (int): Length of the sequence being sliced.
    start (int or None): Start of the slice.
    stop (int or None): End of the slice.
    step (int or None): Only None or 1 are accepted.
    RETURNS (tuple): The normalized (start, stop) pair.
    RAISES (ValueError): If a step other than None or 1 is given.
    """
    if not (step is None or step == 1):
        raise ValueError("Stepped slices not supported in Span objects. "
                         "Try: list(tokens)[start:stop:step] instead.")
    if start is None:
        start = 0
    elif start < 0:
        start += length
    start = min(length, max(0, start))
    if stop is None:
        stop = length
    elif stop < 0:
        stop += length
    # Clamping against start keeps the slice from having negative length.
    stop = min(length, max(start, stop))
    assert 0 <= start <= stop <= length
    return start, stop
-												Move minibatch function to util

											
										
										
											2017-11-07 01:45:36 +03:00
def minibatch(items, size=8):
    """Iterate over batches of items. `size` may be an iterator,
    so that batch-size can vary on each step.

    items (iterable): The items to batch up.
    size (int or iterable): The batch size, or a series of batch sizes.
        Each size is converted with int() before use.
    YIELDS (list): The batches. Iteration ends once the items are used up,
        or once a series of sizes is exhausted.
    """
    # itertools is not among this module's top-level imports, so import it
    # here to keep the function self-contained.
    import itertools

    if isinstance(size, int):
        sizes = itertools.repeat(size)
    else:
        sizes = iter(size)
    items = iter(items)
    while True:
        try:
            batch_size = next(sizes)
        except StopIteration:
            # End cleanly: a StopIteration escaping a generator body is
            # turned into a RuntimeError by newer Python versions.
            return
        batch = list(itertools.islice(items, int(batch_size)))
        if not batch:
            return
        yield batch
-												Add compounding and decaying functions

											
										
										
											2017-05-26 00:16:10 +03:00
def compounding(start, stop, compound):
    """Yield an infinite series of compounding values. Each time the
    generator is called, a value is produced by multiplying the previous
    value by the compound rate. Values are clipped at `stop`.

    EXAMPLE:
      >>> sizes = compounding(1., 10., 1.5)
      >>> assert next(sizes) == 1.
      >>> assert next(sizes) == 1 * 1.5
      >>> assert next(sizes) == 1.5 * 1.5
    """
    # A falling series is bounded from below by stop, a rising one from
    # above.
    bound = max if start > stop else min
    value = float(start)
    while True:
        yield bound(value, stop)
        value *= compound
 								def decaying(start, stop, decay):
-												Fix formatting and docstrings and remove deprecated function

											
										
										
											2017-05-28 01:04:04 +03:00
+								    """Yield an infinite series of linearly decaying values."""
-												Add compounding and decaying functions

											
										
										
											2017-05-26 00:16:10 +03:00
+								    def clip(value):
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
+								        return max(value, stop) if (start > stop) else min(value, stop)
-												Add compounding and decaying functions

											
										
										
											2017-05-26 00:16:10 +03:00
+								    nr_upd = 1.
 								    while True:
 								        yield clip(start * 1./(1. + decay * nr_upd))
 								        nr_upd += 1
-												Move minibatch function to util

											
										
										
											2017-11-07 01:45:36 +03:00
def itershuffle(iterable, bufsize=1000):
    """Shuffle an iterator. This works by holding `bufsize` items back
    and yielding them sometime later. Obviously, this is not unbiased -
    but should be good enough for batching. Larger bufsize means less bias.
    From https://gist.github.com/andres-erbsen/1307752

    iterable (iterable): Iterator to shuffle.
    bufsize (int): Items to hold back.
    YIELDS (iterable): The shuffled iterator.
    """
    iterable = iter(iterable)
    buf = []
    try:
        while True:
            # Top the buffer up with a random number of items. The drain loop
            # below always removes at least one item, so there is always room
            # for at least one more here.
            for _ in range(random.randint(1, bufsize - len(buf))):
                # Built-in next() works on both Python 2 and 3, whereas the
                # .next() method only exists on Python 2 iterators.
                buf.append(next(iterable))
            random.shuffle(buf)
            # Release a random number of the buffered items.
            for _ in range(random.randint(1, bufsize)):
                if buf:
                    yield buf.pop()
                else:
                    break
    except StopIteration:
        # Input exhausted: flush whatever is still buffered, then finish by
        # returning normally. Raising StopIteration inside a generator is
        # turned into a RuntimeError by PEP 479 (Python 3.7+).
        random.shuffle(buf)
        while buf:
            yield buf.pop()
-												Move read_json out to own util function

											
										
										
											2017-04-16 14:03:28 +03:00
def read_json(location):
    """Read a file and parse its content as JSON.

    location (Path): Path to the JSON file. It is passed through ensure_path
        before being opened.
    RETURNS (dict): Loaded JSON content.
    """
    json_path = ensure_path(location)
    with json_path.open('r', encoding='utf8') as file_:
        content = ujson.load(file_)
    return content
-												Add util function to get raw user input

											
										
										
											2017-03-21 00:48:56 +03:00
def get_raw_input(description, default=False):
    """Prompt on the command line and return what the user typed.

    description (unicode): Text to display before prompt.
    default (unicode or False/None): Default value to display with prompt.
        It is only shown in the prompt, not substituted for empty input.
    RETURNS (unicode): User input.
    """
    if default:
        additional = ' (default: %s)' % default
    else:
        additional = ''
    return input_('    %s%s: ' % (description, additional))
-												Move serialization functions to util

											
										
										
											2017-05-29 11:13:42 +03:00
def to_bytes(getters, exclude):
    """Serialize the values produced by a set of getters with msgpack.

    getters (dict): Maps each key to a zero-argument callable returning the
        value to serialize under that key.
    exclude (container): Keys whose getters are skipped.
    RETURNS (bytes): The msgpack-encoded mapping of keys to values.
    """
    serialized = OrderedDict(
        (name, get_value())
        for name, get_value in getters.items()
        if name not in exclude
    )
    return msgpack.dumps(serialized, use_bin_type=True, encoding='utf8')
-												Move serialization functions to util

											
										
										
											2017-05-29 11:13:42 +03:00
 								def from_bytes(bytes_data, setters, exclude):
-												Fiddle with msgpack bytes vs unicode

											
										
										
											2017-06-01 18:48:43 +03:00
+								    msg = msgpack.loads(bytes_data, encoding='utf8')
-												Move serialization functions to util

											
										
										
											2017-05-29 11:13:42 +03:00
+								    for key, setter in setters.items():
-												Fix serialization of optional elements

											
										
										
											2017-06-02 19:18:17 +03:00
+								        if key not in exclude and key in msg:
-												Move serialization functions to util

											
										
										
											2017-05-29 11:13:42 +03:00
+								            setter(msg[key])
 								    return msg
-												Fix to/from disk methods

											
										
										
											2017-05-31 14:42:39 +03:00
def to_disk(path, writers, exclude):
    """Write a set of components into a directory, one entry per key.

    path (unicode or Path): Target directory. Created if it doesn't exist.
    writers (dict): Maps each key to a callable that receives `path / key`
        and writes its data there.
    exclude (container): Keys whose writers are skipped.
    RETURNS (Path): The target directory.
    """
    target = ensure_path(path)
    if not target.exists():
        target.mkdir()
    for name, write in writers.items():
        if name in exclude:
            continue
        write(target / name)
    return target
 								def from_disk(path, readers, exclude):
 								    path = ensure_path(path)
 								    for key, reader in readers.items():
 								        if key not in exclude:
-												Fix deserialization of vectors

											
										
										
											2017-10-16 21:55:00 +03:00
+								            reader(path / key)
-												Fix to/from disk methods

											
										
										
											2017-05-31 14:42:39 +03:00
+								    return path
-												Add deprecated helper

Uses warning to show DeprecationWarning and custom stack trace

											
										
										
											2017-11-01 18:32:36 +03:00
def deprecated(message, filter='always'):
    """Issue a DeprecationWarning attributed to the outermost stack frame.

    message (unicode): The message to display.
    filter (unicode): Action passed to warnings.simplefilter for
        DeprecationWarning while the warning is issued.
    """
    # The last entry of the stack is the outermost frame. Its file name and
    # line number are reported as the location of the warning.
    outermost = inspect.stack()[-1]
    filename, lineno = outermost[1], outermost[2]
    with warnings.catch_warnings():
        warnings.simplefilter(filter, DeprecationWarning)
        warnings.warn_explicit(message, DeprecationWarning, filename, lineno)
-												Tidy up CLI and fix print functions

											
										
										
											2017-05-08 00:25:29 +03:00
def print_table(data, title=None):
    """Print label/value pairs as a table with left-aligned columns.

    data (dict or list of tuples): Label/value pairs.
    title (unicode or None): Title, will be printed above.
    """
    rows = list(data.items()) if isinstance(data, dict) else data
    # One left-aligned, 15-character column per element of the first row.
    row_template = '    {:<15}' * len(rows[0])
    lines = []
    for label, value in rows:
        lines.append(row_template.format(label, unicode_(value)))
    table = '\n'.join(lines)
    if title:
        print('\n    \033[93m{}\033[0m'.format(title))
    print('\n{}\n'.format(table))
-												Add util functions to print data as table or markdown list

											
										
										
											2017-03-18 15:00:14 +03:00
-												Tidy up CLI and fix print functions

											
										
										
											2017-05-08 00:25:29 +03:00
def print_markdown(data, title=None):
    """Print label/value pairs as a GitHub-flavoured Markdown list, e.g. for
    pasting into issues. Values that are existing paths are left out.

    data (dict or list of tuples): Label/value pairs.
    title (unicode or None): Title, will be rendered as headline 2.
    """
    def is_existing_path(value):
        # contains path, i.e. personal info
        if not isinstance(value, basestring_):
            return False
        return Path(value).exists()

    pairs = list(data.items()) if isinstance(data, dict) else data
    bullets = []
    for label, value in pairs:
        if is_existing_path(value):
            continue
        bullets.append("* **{}:** {}".format(label, unicode_(value)))
    if title:
        print("\n## {}".format(title))
    print('\n{}\n'.format('\n'.join(bullets)))
-												Add util functions to print data as table or markdown list

											
										
										
											2017-03-18 15:00:14 +03:00
-												Fix typo

											
										
										
											2017-05-08 03:00:37 +03:00
def prints(*texts, **kwargs):
    """Print a formatted message. ANSI escape sequences are written manually
    to avoid an extra dependency.

    *texts (unicode): Texts to print. Each argument is rendered as paragraph.
    **kwargs: 'title' becomes a coloured headline. If 'exits' is given and is
        not None, it is passed to sys.exit as the exit status after printing.
    """
    exits = kwargs.get('exits')
    title = kwargs.get('title')
    if title:
        headline = '\033[93m{}\033[0m\n'.format(_wrap(title))
    else:
        headline = ''
    paragraphs = [_wrap(text) for text in texts]
    print('\n{}{}\n'.format(headline, '\n\n'.join(paragraphs)))
    if exits is not None:
        sys.exit(exits)
-												Add util functions for printing and wrapping messages

											
										
										
											2017-03-15 19:35:57 +03:00
-												Tidy up CLI and fix print functions

											
										
										
											2017-05-08 00:25:29 +03:00
+								def _wrap(text, wrap_max=80, indent=4):
-												Add docstrings, error messages and fix consistency

											
										
										
											2017-05-13 22:22:49 +03:00
+								    """Wrap text at given width using textwrap module.
-												Update docstrings

											
										
										
											2017-05-14 02:30:29 +03:00
+								    text (unicode): Text to wrap. If it's a Path, it's converted to string.
 								    wrap_max (int): Maximum line length (indent is deducted).
 								    indent (int): Number of spaces for indentation.
 								    RETURNS (unicode): Wrapped text.
-												Fix formatting

											
										
										
											2017-04-16 14:42:34 +03:00
+								    """
-												Tidy up CLI and fix print functions

											
										
										
											2017-05-08 00:25:29 +03:00
+								    indent = indent * ' '
-												Add util functions for printing and wrapping messages

											
										
										
											2017-03-15 19:35:57 +03:00
+								    wrap_width = wrap_max - len(indent)
-												Tidy up CLI and fix print functions

											
										
										
											2017-05-08 00:25:29 +03:00
+								    if isinstance(text, Path):
 								        text = path2str(text)
-												Add util functions for printing and wrapping messages

											
										
										
											2017-03-15 19:35:57 +03:00
+								    return textwrap.fill(text, width=wrap_width, initial_indent=indent,
-												Tidy up CLI and fix print functions

											
										
										
											2017-05-08 00:25:29 +03:00
+								                         subsequent_indent=indent, break_long_words=False,
 								                         break_on_hyphens=False)
-												Add displaCy visualisers (see #1058)

											
										
										
											2017-05-14 18:50:23 +03:00
 								def minify_html(html):
 								    """Perform a template-specific, rudimentary HTML minification for displaCy.
-												Tidy up util and helpers

											
										
										
											2017-10-27 15:39:09 +03:00
+								    Disclaimer: NOT a general-purpose solution, only removes indentation and
 								    newlines.
-												Add displaCy visualisers (see #1058)

											
										
										
											2017-05-14 18:50:23 +03:00
 								    html (unicode): Markup to minify.
 								    RETURNS (unicode): "Minified" HTML.
 								    """
 								    return html.strip().replace('    ', '').replace('\n', '')
-												Add util function to enable GPU

											
										
										
											2017-09-21 03:16:35 +03:00
 								def use_gpu(gpu_id):
-												Fix evaluate for non-GPU

											
										
										
											2017-10-03 23:47:31 +03:00
+								    try:
 								        import cupy.cuda.device
 								    except ImportError:
 								        return None
-												Add util function to enable GPU

											
										
										
											2017-09-21 03:16:35 +03:00
+								    from thinc.neural.ops import CupyOps
 								    device = cupy.cuda.device.Device(gpu_id)
 								    device.use()
 								    Model.ops = CupyOps()
 								    Model.Ops = CupyOps
 								    return device