spaCy/spacy/deprecated.py

# coding: utf8
from __future__ import unicode_literals

from pathlib import Path

from . import about
from . import util
from .cli import download
from .cli import link


def read_lang_data(package):
    """
    Read the tokenizer resources stored under 'tokenizer' in `package`.

    package: Object providing `load_json(path_tuple)` and
        `open(path_tuple, default=None)`, the latter usable as a context
        manager.
    RETURNS: A 4-tuple `(specials, prefix, suffix, infix)`. `specials` is
        whatever `package.load_json` returns for 'specials.json'. Each of the
        other three is None when the opened handle is None, otherwise the
        result of the matching `read_*` helper.
    """
    # NOTE(review): read_prefix / read_suffix / read_infix are not defined or
    # imported in the part of this module that is visible here -- confirm they
    # still exist before relying on the non-None branches.
    specials = package.load_json(('tokenizer', 'specials.json'))

    with package.open(('tokenizer', 'prefix.txt'), default=None) as handle:
        if handle is None:
            prefix_rules = None
        else:
            prefix_rules = read_prefix(handle)

    with package.open(('tokenizer', 'suffix.txt'), default=None) as handle:
        if handle is None:
            suffix_rules = None
        else:
            suffix_rules = read_suffix(handle)

    with package.open(('tokenizer', 'infix.txt'), default=None) as handle:
        if handle is None:
            infix_rules = None
        else:
            infix_rules = read_infix(handle)

    return specials, prefix_rules, suffix_rules, infix_rules


def align_tokens(ref, indices): # Deprecated, surely?
    """
    Lazily pair every token of `ref` with the leading entries of `indices`
    that finish at or before that token's end offset.

    The end offset of a token is the summed length of all tokens seen so far
    (no separator is counted). An entry is handed to the current token when
    its element at position 1 is <= that offset; entries are consumed in
    order and never revisited.

    ref: Iterable of tokens supporting `len()`.
    indices: Iterable of indexable entries whose `[1]` item is compared with
        the offset (presumably (start, end) spans -- confirm with callers).
    YIELDS: `(token, entries)` pairs, where `entries` is a new list.
    RAISES: AssertionError once `ref` is exhausted if entries are left over.
    """
    pending = list(indices)
    total = len(pending)
    cursor = 0
    offset = 0
    for token in ref:
        offset += len(token)
        first = cursor
        # Advance past every pending entry that ends inside the text so far.
        while cursor < total and pending[cursor][1] <= offset:
            cursor += 1
        yield token, pending[first:cursor]
    assert cursor == total


def detokenize(token_rules, words): # Deprecated?
    """
    Group token indices into whitespace-delimited "chunks" so the output can
    be aligned with treebanks. Every chunk is a tuple of consecutive token
    indices, e.g.

    >>> detokenize(["ca<SEP>n't", '<SEP>!'], ["I", "ca", "n't", "!"])
    [(0,), (1, 2, 3)]

    token_rules: Patterns in which '<SEP>' marks a token boundary that has no
        whitespace in the real string.
    words: The tokens, in order.
    RETURNS: List of tuples of token indices, one tuple per chunk.
    """
    joined = ' '.join(words)
    # Plain string replacement, rule by rule: wherever the space-separated
    # form of a rule occurs, glue it back together with '<SEP>' markers.
    for rule in token_rules:
        spaced = rule.replace('<SEP>', ' ')
        joined = joined.replace(spaced, rule)
    chunks = []
    next_index = 0
    for piece in joined.split():
        # n markers inside a piece mean it holds n + 1 tokens.
        width = piece.count('<SEP>') + 1
        chunks.append(tuple(range(next_index, next_index + width)))
        next_index += width
    return chunks


def match_best_version(target_name, target_version, path):
    """
    Find the highest-versioned data directory called `target_name` in `path`.

    Each entry of `path` is split by `split_data_name` into a name and a
    version string. Entries whose name equals `target_name` are ranked by
    their version, read as a tuple of floats (one per dot-separated part).
    An entry without a version ranks below every versioned entry. An entry
    whose version has a non-numeric part cannot be ranked and is skipped.

    target_name: Exact name part to look for.
    target_version: Unused; kept so existing callers keep working.
    path: Directory to search; normalised with `util.ensure_path`.
    RETURNS: `Path` of the best match, or None if `path` is None, does not
        exist, or contains no rankable match.
    """
    path = util.ensure_path(path)
    if path is None or not path.exists():
        return None
    matches = []
    for data_name in path.iterdir():
        name, version = split_data_name(data_name.parts[-1])
        if name != target_name:
            continue
        if version:
            try:
                parsed = tuple(float(part) for part in version.split('.'))
            except ValueError:
                # e.g. '1.0.0-beta': not comparable, so leave it out rather
                # than abort the whole lookup.
                continue
        else:
            # split_data_name returns '' for a name without '-'; float('')
            # would raise, so rank such a directory lowest instead.
            parsed = ()
        matches.append((parsed, data_name))
    if not matches:
        return None
    return Path(max(matches)[1])


def split_data_name(name):
    """
    Split a data directory name at its first '-'.

    RETURNS: The list `[head, tail]` when `name` contains '-', otherwise the
        tuple `(name, '')`.
    """
    head, dash, tail = name.partition('-')
    if dash:
        return [head, tail]
    return (name, '')


def fix_glove_vectors_loading(overrides):
    """
    Special-case hack that wires up GloVe vector loading for deprecated
    <1.0 style usage. Phase this out once the data is fixed.

    overrides: Dict of loading options. The keys read here are 'data_dir',
        'path', 'vectors' and 'add_vectors'.
    RETURNS: The same `overrides` object. When a vectors directory is found
        and 'add_vectors' was not supplied, 'add_vectors' is set to a
        callable that invokes `vocab.load_vectors_from_bin_loc` on the
        located 'vocab/vec.bin' file.
    RAISES: ValueError if 'data_dir' is given without 'path'.
    """
    if 'path' not in overrides and 'data_dir' in overrides:
        raise ValueError("The argument 'data_dir' has been renamed to 'path'")
    requested = overrides.get('path')
    if requested is False:
        return overrides
    # Work out which directory holds the versioned data packages.
    if requested in (None, True):
        search_dir = util.get_data_path()
    else:
        search_dir = util.ensure_path(overrides['path']).parent
    if 'add_vectors' in overrides:
        # A loader was supplied explicitly; leave it alone.
        return overrides
    if 'vectors' in overrides:
        wanted = overrides['vectors']
    else:
        wanted = 'en_glove_cc_300_1m_vectors'
    found = match_best_version(wanted, None, search_dir)
    if found is None:
        return overrides
    bin_loc = found / 'vocab' / 'vec.bin'
    overrides['add_vectors'] = lambda vocab: vocab.load_vectors_from_bin_loc(bin_loc)
    return overrides


def resolve_model_name(name):
    """
    Handle old versioned model directories for the 'en' and 'de' shortcuts.

    For any other name, or when a shortcut named `name` already exists in
    the data path, nothing happens. Otherwise the versioned directories
    '<name>-1.0.0' and '<name>-1.1.0' are checked in that order; for the
    first one that exists, a German model gets a shortcut link created,
    while an English model triggers an error telling the user to install
    the new model.

    name: Model name or shortcut.
    RETURNS: `name`, unchanged.
    RAISES: ValueError if an old versioned English model is found and no
        'en' shortcut exists.
    """
    if name not in ('en', 'de'):
        return name
    data_dir = Path(util.get_data_path())
    if (data_dir / name).exists():
        # shortcut already in place
        return name
    for version in ('1.0.0', '1.1.0'):
        legacy_dir = data_dir / Path(name + '-' + version)
        if not legacy_dir.exists():
            continue
        # versioned model directory found
        if name == 'de':
            link(legacy_dir, name)
            return name
        raise ValueError(
            "Found English model at {p}. This model is not "
            "compatible with the current version. See "
            "https://spacy.io/docs/usage/models to download the "
            "new model.".format(p=legacy_dir))
    return name


class ModelDownload():
    """
    Replace download modules within en and de with deprecation warning and
    download default language model (using shortcut). Use classmethods to allow
    importing ModelDownload as download and calling download.en() etc.
    """

    @classmethod
    def load(cls, lang):
        """
        Print the deprecation warning for `spacy.<lang>.download`, then call
        `download(lang)`.

        lang: Language shortcut, interpolated into the message and passed
            on to `download`.
        """
        util.print_msg(
            "The spacy.{l}.download command is now deprecated. Please use "
            "python -m spacy download [model name or shortcut] instead. For more "
            "info and available models, see the documentation: {d}. "
            "Downloading default '{l}' model now...".format(d=about.__docs_models__, l=lang),
            title="Warning: deprecated command")
        download(lang)

    @classmethod
    def en(cls, *args, **kwargs):
        """
        Deprecated entry point for the 'en' model. Any arguments are accepted
        and ignored.
        """
        cls.load('en')

    @classmethod
    def de(cls, *args, **kwargs):
        """
        Deprecated entry point for the 'de' model. Any arguments are accepted
        and ignored.
        """
        cls.load('de')
Tidy up and fix formatting and imports 2017-04-15 14:05:15 +03:00			`# coding: utf8`
			`from __future__ import unicode_literals`

Move fix_deprecated_glove_vectors_loading to deprecated.py 2017-03-15 19:33:29 +03:00			`from pathlib import Path`
Clean up imports, unused code, whitespace, docstrings 2017-04-15 13:05:47 +03:00
Mostly finished loading refactoring. Design is in place, but doesn't work yet. 2016-09-24 16:42:01 +03:00			`from . import about`
Move fix_deprecated_glove_vectors_loading to deprecated.py 2017-03-15 19:33:29 +03:00			`from . import util`
Add directory cli and set up command line interface 2017-03-18 17:14:48 +03:00			`from .cli import download`
			`from .cli import link`
Mostly finished loading refactoring. Design is in place, but doesn't work yet. 2016-09-24 16:42:01 +03:00

			`def read_lang_data(package):`
			`tokenization = package.load_json(('tokenizer', 'specials.json'))`
			`with package.open(('tokenizer', 'prefix.txt'), default=None) as file_:`
			`prefix = read_prefix(file_) if file_ is not None else None`
			`with package.open(('tokenizer', 'suffix.txt'), default=None) as file_:`
			`suffix = read_suffix(file_) if file_ is not None else None`
			`with package.open(('tokenizer', 'infix.txt'), default=None) as file_:`
			`infix = read_infix(file_) if file_ is not None else None`
			`return tokenization, prefix, suffix, infix`


			`def align_tokens(ref, indices): # Deprecated, surely?`
			`start = 0`
			`queue = list(indices)`
			`for token in ref:`
			`end = start + len(token)`
			`emit = []`
			`while queue and queue[0][1] <= end:`
			`emit.append(queue.pop(0))`
			`yield token, emit`
			`start = end`
			`assert not queue`


			`def detokenize(token_rules, words): # Deprecated?`
Use consistent formatting for docstrings 2017-04-15 12:59:21 +03:00			`"""`
			`To align with treebanks, return a list of "chunks", where a chunk is a`
Mostly finished loading refactoring. Design is in place, but doesn't work yet. 2016-09-24 16:42:01 +03:00			`sequence of tokens that are separated by whitespace in actual strings. Each`
			`chunk should be a tuple of token indices, e.g.`

			`>>> detokenize(["ca<SEP>n't", '<SEP>!'], ["I", "ca", "n't", "!"])`
			`[(0,), (1, 2, 3)]`
			`"""`
			`string = ' '.join(words)`
			`for subtoks in token_rules:`
			`# Algorithmically this is dumb, but writing a little list-based match`
			`# machine? Ain't nobody got time for that.`
			`string = string.replace(subtoks.replace('<SEP>', ' '), subtoks)`
			`positions = []`
			`i = 0`
			`for chunk in string.split():`
			`subtoks = chunk.split('<SEP>')`
			`positions.append(tuple(range(i, i+len(subtoks))))`
			`i += len(subtoks)`
			`return positions`


Move functions to deprecated 2017-04-15 13:12:31 +03:00			`def match_best_version(target_name, target_version, path):`
			`path = util.ensure_path(path)`
			`if path is None or not path.exists():`
			`return None`
			`matches = []`
			`for data_name in path.iterdir():`
			`name, version = split_data_name(data_name.parts[-1])`
			`if name == target_name:`
			`matches.append((tuple(float(v) for v in version.split('.')), data_name))`
			`if matches:`
			`return Path(max(matches)[1])`
			`else:`
			`return None`


			`def split_data_name(name):`
			`return name.split('-', 1) if '-' in name else (name, '')`
Move fix_deprecated_glove_vectors_loading to deprecated.py 2017-03-15 19:33:29 +03:00
Use consistent formatting for docstrings 2017-04-15 12:59:21 +03:00
			`def fix_glove_vectors_loading(overrides):`
			`"""`
			`Special-case hack for loading the GloVe vectors, to support deprecated`
			`<1.0 stuff. Phase this out once the data is fixed.`
			`"""`
Move fix_deprecated_glove_vectors_loading to deprecated.py 2017-03-15 19:33:29 +03:00			`if 'data_dir' in overrides and 'path' not in overrides:`
			`raise ValueError("The argument 'data_dir' has been renamed to 'path'")`
			`if overrides.get('path') is False:`
			`return overrides`
			`if overrides.get('path') in (None, True):`
			`data_path = util.get_data_path()`
			`else:`
Add compat functions and remove old workarounds Add ensure_path util function to handle checking instance of path 2017-04-15 13:11:16 +03:00			`path = util.ensure_path(overrides['path'])`
Move fix_deprecated_glove_vectors_loading to deprecated.py 2017-03-15 19:33:29 +03:00			`data_path = path.parent`
			`vec_path = None`
			`if 'add_vectors' not in overrides:`
			`if 'vectors' in overrides:`
Move functions to deprecated 2017-04-15 13:12:31 +03:00			`vec_path = match_best_version(overrides['vectors'], None, data_path)`
Move fix_deprecated_glove_vectors_loading to deprecated.py 2017-03-15 19:33:29 +03:00			`if vec_path is None:`
			`return overrides`
			`else:`
Move functions to deprecated 2017-04-15 13:12:31 +03:00			`vec_path = match_best_version('en_glove_cc_300_1m_vectors', None, data_path)`
Move fix_deprecated_glove_vectors_loading to deprecated.py 2017-03-15 19:33:29 +03:00			`if vec_path is not None:`
			`vec_path = vec_path / 'vocab' / 'vec.bin'`
			`if vec_path is not None:`
			`overrides['add_vectors'] = lambda vocab: vocab.load_vectors_from_bin_loc(vec_path)`
			`return overrides`
Handle deprecated language-specific model downloading 2017-03-15 19:37:55 +03:00

Add function to resolve model names and link them 2017-03-17 20:47:05 +03:00			`def resolve_model_name(name):`
Use consistent formatting for docstrings 2017-04-15 12:59:21 +03:00			`"""`
			`If spaCy is loaded with 'de', check if symlink already exists. If`
Fix docstring 2017-04-16 21:34:37 +03:00			`not, user may have upgraded from older version and have old models installed.`
Add function to resolve model names and link them 2017-03-17 20:47:05 +03:00			`Check if old model directory exists and if so, return that instead and create`
Update resolve_model_name 2017-03-17 21:26:28 +03:00			`shortcut link. If English model is found and no shortcut exists, raise error`
			`and tell user to install new model.`
Add function to resolve model names and link them 2017-03-17 20:47:05 +03:00			`"""`
			`if name == 'en' or name == 'de':`
			`versions = ['1.0.0', '1.1.0']`
			`data_path = Path(util.get_data_path())`
			`model_path = data_path / name`
			`v_model_paths = [data_path / Path(name + '-' + v) for v in versions]`
Update resolve_model_name 2017-03-17 21:26:28 +03:00
			`if not model_path.exists(): # no shortcut found`
Add function to resolve model names and link them 2017-03-17 20:47:05 +03:00			`for v_path in v_model_paths:`
Update resolve_model_name 2017-03-17 21:26:28 +03:00			`if v_path.exists(): # versioned model directory found`
			`if name == 'de':`
			`link(v_path, name)`
			`return name`
			`else:`
			`raise ValueError(`
			`"Found English model at {p}. This model is not "`
			`"compatible with the current version. See "`
			`"https://spacy.io/docs/usage/models to download the "`
			`"new model.".format(p=v_path))`
Add function to resolve model names and link them 2017-03-17 20:47:05 +03:00			`return name`


Handle deprecated language-specific model downloading 2017-03-15 19:37:55 +03:00			`class ModelDownload():`
Use consistent formatting for docstrings 2017-04-15 12:59:21 +03:00			`"""`
			`Replace download modules within en and de with deprecation warning and`
Handle deprecated language-specific model downloading 2017-03-15 19:37:55 +03:00			`download default language model (using shortcut). Use classmethods to allow`
Use consistent formatting for docstrings 2017-04-15 12:59:21 +03:00			`importing ModelDownload as download and calling download.en() etc.`
			`"""`
Handle deprecated language-specific model downloading 2017-03-15 19:37:55 +03:00
			`@classmethod`
			`def load(self, lang):`
			`util.print_msg(`
			`"The spacy.{l}.download command is now deprecated. Please use "`
Use correct command in deprecated download command message 2017-03-18 19:01:01 +03:00			`"python -m spacy download [model name or shortcut] instead. For more "`
Handle deprecated language-specific model downloading 2017-03-15 19:37:55 +03:00			`"info and available models, see the documentation: {d}. "`
Rename about.__docs__ to about.__docs_models__ 2017-05-13 14:09:00 +03:00			`"Downloading default '{l}' model now...".format(d=about.__docs_models__, l=lang),`
Handle deprecated language-specific model downloading 2017-03-15 19:37:55 +03:00			`title="Warning: deprecated command")`
			`download(lang)`

			`@classmethod`
			`def en(cls, *args, **kwargs):`
			`cls.load('en')`

			`@classmethod`
			`def de(cls, *args, **kwargs):`
			`cls.load('de')`