mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
Merge remote-tracking branch 'origin/develop-downloads'
This commit is contained in:
commit
8843b84bd1
|
@ -8,5 +8,5 @@ murmurhash>=0.26,<0.27
|
||||||
plac<0.9.3
|
plac<0.9.3
|
||||||
six
|
six
|
||||||
ujson>=1.35
|
ujson>=1.35
|
||||||
sputnik>=0.9.2,<0.10.0
|
|
||||||
dill>=0.2,<0.3
|
dill>=0.2,<0.3
|
||||||
|
requests>=2.13.0,<3.0.0
|
||||||
|
|
4
setup.py
4
setup.py
|
@ -240,9 +240,9 @@ def setup_package():
|
||||||
'plac<0.9.3',
|
'plac<0.9.3',
|
||||||
'six',
|
'six',
|
||||||
'pathlib',
|
'pathlib',
|
||||||
'sputnik>=0.9.2,<0.10.0',
|
|
||||||
'ujson>=1.35',
|
'ujson>=1.35',
|
||||||
'dill>=0.2,<0.3'],
|
'dill>=0.2,<0.3',
|
||||||
|
'requests>=2.13.0,<3.0.0'],
|
||||||
classifiers=[
|
classifiers=[
|
||||||
'Development Status :: 5 - Production/Stable',
|
'Development Status :: 5 - Production/Stable',
|
||||||
'Environment :: Console',
|
'Environment :: Console',
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
import pathlib
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
from .util import set_lang_class, get_lang_class
|
import json
|
||||||
from .about import __version__
|
from pathlib import Path
|
||||||
|
from .util import set_lang_class, get_lang_class, parse_package_meta
|
||||||
|
|
||||||
from . import en
|
from . import en
|
||||||
from . import de
|
from . import de
|
||||||
|
@ -16,11 +18,6 @@ from . import sv
|
||||||
from . import fi
|
from . import fi
|
||||||
from . import bn
|
from . import bn
|
||||||
|
|
||||||
try:
|
|
||||||
basestring
|
|
||||||
except NameError:
|
|
||||||
basestring = str
|
|
||||||
|
|
||||||
|
|
||||||
set_lang_class(en.English.lang, en.English)
|
set_lang_class(en.English.lang, en.English)
|
||||||
set_lang_class(de.German.lang, de.German)
|
set_lang_class(de.German.lang, de.German)
|
||||||
|
@ -36,11 +33,16 @@ set_lang_class(fi.Finnish.lang, fi.Finnish)
|
||||||
set_lang_class(bn.Bengali.lang, bn.Bengali)
|
set_lang_class(bn.Bengali.lang, bn.Bengali)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def load(name, **overrides):
|
def load(name, **overrides):
|
||||||
target_name, target_version = util.split_data_name(name)
|
|
||||||
data_path = overrides.get('path', util.get_data_path())
|
data_path = overrides.get('path', util.get_data_path())
|
||||||
path = util.match_best_version(target_name, target_version, data_path)
|
meta = parse_package_meta(data_path, name)
|
||||||
cls = get_lang_class(target_name)
|
lang = meta['lang'] if meta and 'lang' in meta else 'en'
|
||||||
overrides['path'] = path
|
cls = get_lang_class(lang)
|
||||||
|
overrides['meta'] = meta
|
||||||
|
overrides['path'] = Path(data_path / name)
|
||||||
return cls(**overrides)
|
return cls(**overrides)
|
||||||
|
|
||||||
|
|
||||||
|
def info(name):
|
||||||
|
meta = parse_package_meta(util.get_data_path(), name)
|
||||||
|
print(json.dumps(meta, indent=2))
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
# inspired from:
|
# inspired from:
|
||||||
|
|
||||||
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
|
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
|
||||||
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
|
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
|
||||||
|
|
||||||
|
@ -10,7 +9,8 @@ __uri__ = 'https://spacy.io'
|
||||||
__author__ = 'Matthew Honnibal'
|
__author__ = 'Matthew Honnibal'
|
||||||
__email__ = 'matt@explosion.ai'
|
__email__ = 'matt@explosion.ai'
|
||||||
__license__ = 'MIT'
|
__license__ = 'MIT'
|
||||||
__models__ = {
|
|
||||||
'en': 'en>=1.1.0,<1.2.0',
|
__docs__ = 'https://spacy.io/docs/usage'
|
||||||
'de': 'de>=1.0.0,<1.1.0',
|
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
|
||||||
}
|
__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json?token=ANAt54fi5zcUtnwGhMLw2klWwcAyHkZGks5Y0nw1wA%3D%3D'
|
||||||
|
__shortcuts__ = {'en': 'en_core_web_sm', 'de': 'de_core_web_md', 'vectors': 'en_vectors_glove_md'}
|
||||||
|
|
|
@ -1,14 +1,5 @@
|
||||||
import plac
|
from ..deprecated import ModelDownload as download
|
||||||
from ..download import download
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
|
||||||
force=("Force overwrite", "flag", "f", bool),
|
|
||||||
data_path=("Path to download model", "option", "d", str)
|
|
||||||
)
|
|
||||||
def main(data_size='all', force=False, data_path=None):
|
|
||||||
download('de', force=force, data_path=data_path)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
plac.call(main)
|
download.de()
|
||||||
|
|
|
@ -1,35 +1,13 @@
|
||||||
from sputnik.dir_package import DirPackage
|
from pathlib import Path
|
||||||
from sputnik.package_list import (PackageNotFoundException,
|
|
||||||
CompatiblePackageNotFoundException)
|
|
||||||
|
|
||||||
import sputnik
|
|
||||||
from . import about
|
from . import about
|
||||||
|
from . import util
|
||||||
|
from .download import download
|
||||||
|
|
||||||
|
|
||||||
def get_package(data_dir):
|
try:
|
||||||
if not isinstance(data_dir, six.string_types):
|
basestring
|
||||||
raise RuntimeError('data_dir must be a string')
|
except NameError:
|
||||||
return DirPackage(data_dir)
|
basestring = str
|
||||||
|
|
||||||
|
|
||||||
def get_package_by_name(name=None, via=None):
|
|
||||||
if name is None:
|
|
||||||
return
|
|
||||||
lang = get_lang_class(name)
|
|
||||||
try:
|
|
||||||
return sputnik.package(about.__title__, about.__version__,
|
|
||||||
name, data_path=via)
|
|
||||||
except PackageNotFoundException as e:
|
|
||||||
raise RuntimeError("Model '%s' not installed. Please run 'python -m "
|
|
||||||
"%s.download' to install latest compatible "
|
|
||||||
"model." % (name, lang.__module__))
|
|
||||||
except CompatiblePackageNotFoundException as e:
|
|
||||||
raise RuntimeError("Installed model is not compatible with spaCy "
|
|
||||||
"version. Please run 'python -m %s.download "
|
|
||||||
"--force' to install latest compatible model." %
|
|
||||||
(lang.__module__))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def read_lang_data(package):
|
def read_lang_data(package):
|
||||||
|
@ -43,7 +21,6 @@ def read_lang_data(package):
|
||||||
return tokenization, prefix, suffix, infix
|
return tokenization, prefix, suffix, infix
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def align_tokens(ref, indices): # Deprecated, surely?
|
def align_tokens(ref, indices): # Deprecated, surely?
|
||||||
start = 0
|
start = 0
|
||||||
queue = list(indices)
|
queue = list(indices)
|
||||||
|
@ -79,4 +56,55 @@ def detokenize(token_rules, words): # Deprecated?
|
||||||
return positions
|
return positions
|
||||||
|
|
||||||
|
|
||||||
|
def fix_glove_vectors_loading(overrides):
|
||||||
|
"""Special-case hack for loading the GloVe vectors, to support deprecated
|
||||||
|
<1.0 stuff. Phase this out once the data is fixed."""
|
||||||
|
|
||||||
|
if 'data_dir' in overrides and 'path' not in overrides:
|
||||||
|
raise ValueError("The argument 'data_dir' has been renamed to 'path'")
|
||||||
|
if overrides.get('path') is False:
|
||||||
|
return overrides
|
||||||
|
if overrides.get('path') in (None, True):
|
||||||
|
data_path = util.get_data_path()
|
||||||
|
else:
|
||||||
|
path = overrides['path']
|
||||||
|
if isinstance(path, basestring):
|
||||||
|
path = Path(path)
|
||||||
|
data_path = path.parent
|
||||||
|
vec_path = None
|
||||||
|
if 'add_vectors' not in overrides:
|
||||||
|
if 'vectors' in overrides:
|
||||||
|
vec_path = util.match_best_version(overrides['vectors'], None, data_path)
|
||||||
|
if vec_path is None:
|
||||||
|
return overrides
|
||||||
|
else:
|
||||||
|
vec_path = util.match_best_version('en_glove_cc_300_1m_vectors', None, data_path)
|
||||||
|
if vec_path is not None:
|
||||||
|
vec_path = vec_path / 'vocab' / 'vec.bin'
|
||||||
|
if vec_path is not None:
|
||||||
|
overrides['add_vectors'] = lambda vocab: vocab.load_vectors_from_bin_loc(vec_path)
|
||||||
|
return overrides
|
||||||
|
|
||||||
|
|
||||||
|
class ModelDownload():
|
||||||
|
"""Replace download modules within en and de with deprecation warning and
|
||||||
|
download default language model (using shortcut). Use classmethods to allow
|
||||||
|
importing ModelDownload as download and calling download.en() etc."""
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def load(self, lang):
|
||||||
|
util.print_msg(
|
||||||
|
"The spacy.{l}.download command is now deprecated. Please use "
|
||||||
|
"spacy.download [model name or shortcut] instead. For more "
|
||||||
|
"info and available models, see the documentation: {d}. "
|
||||||
|
"Downloading default '{l}' model now...".format(d=about.__docs__, l=lang),
|
||||||
|
title="Warning: deprecated command")
|
||||||
|
download(lang)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def en(cls, *args, **kwargs):
|
||||||
|
cls.load('en')
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def de(cls, *args, **kwargs):
|
||||||
|
cls.load('de')
|
||||||
|
|
|
@ -1,47 +1,80 @@
|
||||||
from __future__ import print_function
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
import sys
|
|
||||||
import shutil
|
|
||||||
|
|
||||||
import sputnik
|
|
||||||
from sputnik.package_list import (PackageNotFoundException,
|
|
||||||
CompatiblePackageNotFoundException)
|
|
||||||
|
|
||||||
|
import pip
|
||||||
|
import plac
|
||||||
|
import requests
|
||||||
|
from os import path
|
||||||
from . import about
|
from . import about
|
||||||
from . import util
|
from . import util
|
||||||
|
|
||||||
|
|
||||||
def download(lang, force=False, fail_on_exist=True, data_path=None):
|
@plac.annotations(
|
||||||
if not data_path:
|
model=("Model to download", "positional", None, str),
|
||||||
data_path = util.get_data_path(require_exists=False)
|
direct=("Force direct download", "flag", "d", bool)
|
||||||
|
)
|
||||||
|
def download(model=None, direct=False):
|
||||||
|
"""Download compatible model from default download path using pip."""
|
||||||
|
|
||||||
# spaCy uses pathlib, and util.get_data_path returns a pathlib.Path object,
|
check_error_depr(model)
|
||||||
# but sputnik (which we're using below) doesn't use pathlib and requires
|
|
||||||
# its data_path parameters to be strings, so we coerce the data_path to a
|
|
||||||
# str here.
|
|
||||||
data_path = str(data_path)
|
|
||||||
|
|
||||||
try:
|
if direct:
|
||||||
pkg = sputnik.package(about.__title__, about.__version__,
|
download_model('{m}/{m}.tar.gz'.format(m=model))
|
||||||
about.__models__.get(lang, lang), data_path)
|
else:
|
||||||
if force:
|
model = about.__shortcuts__[model] if model in about.__shortcuts__ else model
|
||||||
shutil.rmtree(pkg.path)
|
compatibility = get_compatibility()
|
||||||
elif fail_on_exist:
|
version = get_version(model, compatibility)
|
||||||
print("Model already installed. Please run 'python -m "
|
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model, v=version))
|
||||||
"spacy.%s.download --force' to reinstall." % lang, file=sys.stderr)
|
|
||||||
sys.exit(0)
|
|
||||||
except (PackageNotFoundException, CompatiblePackageNotFoundException):
|
|
||||||
pass
|
|
||||||
|
|
||||||
package = sputnik.install(about.__title__, about.__version__,
|
|
||||||
about.__models__.get(lang, lang), data_path)
|
|
||||||
|
|
||||||
try:
|
def get_compatibility():
|
||||||
sputnik.package(about.__title__, about.__version__,
|
version = about.__version__
|
||||||
about.__models__.get(lang, lang), data_path)
|
r = requests.get(about.__compatibility__)
|
||||||
except (PackageNotFoundException, CompatiblePackageNotFoundException):
|
if r.status_code != 200:
|
||||||
print("Model failed to install. Please run 'python -m "
|
util.sys_exit(
|
||||||
"spacy.%s.download --force'." % lang, file=sys.stderr)
|
"Couldn't fetch compatibility table. Please find the right model for "
|
||||||
sys.exit(1)
|
"your spaCy installation (v{v}), and download it manually:".format(v=version),
|
||||||
|
"python -m spacy.download [full model name + version] --direct",
|
||||||
|
title="Server error ({c})".format(c=r.status_code))
|
||||||
|
|
||||||
print("Model successfully installed to %s" % data_path, file=sys.stderr)
|
comp = r.json()['spacy']
|
||||||
|
if version not in comp:
|
||||||
|
util.sys_exit(
|
||||||
|
"No compatible models found for v{v} of spaCy.".format(v=version),
|
||||||
|
title="Compatibility error")
|
||||||
|
else:
|
||||||
|
return comp[version]
|
||||||
|
|
||||||
|
|
||||||
|
def get_version(model, comp):
|
||||||
|
if model not in comp:
|
||||||
|
util.sys_exit(
|
||||||
|
"No compatible model found for "
|
||||||
|
"{m} (spaCy v{v}).".format(m=model, v=about.__version__),
|
||||||
|
title="Compatibility error")
|
||||||
|
return comp[model][0]
|
||||||
|
|
||||||
|
|
||||||
|
def download_model(filename):
|
||||||
|
util.print_msg("Downloading {f}".format(f=filename))
|
||||||
|
download_url = path.join(about.__download_url__, filename)
|
||||||
|
pip.main(['install', download_url])
|
||||||
|
|
||||||
|
|
||||||
|
def check_error_depr(model):
|
||||||
|
if not model:
|
||||||
|
util.sys_exit(
|
||||||
|
"python -m spacy.download [name or shortcut]",
|
||||||
|
title="Missing model name or shortcut")
|
||||||
|
|
||||||
|
if model == 'all':
|
||||||
|
util.sys_exit(
|
||||||
|
"As of v1.7.0, the download all command is deprecated. Please "
|
||||||
|
"download the models individually via spacy.download [model name] "
|
||||||
|
"or pip install. For more info on this, see the documentation: "
|
||||||
|
"{d}".format(d=about.__docs__),
|
||||||
|
title="Deprecated command")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
plac.call(download)
|
||||||
|
|
|
@ -1,19 +1,16 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from os import path
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from ..util import match_best_version
|
|
||||||
from ..util import get_data_path
|
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..lemmatizer import Lemmatizer
|
from ..lemmatizer import Lemmatizer
|
||||||
from ..vocab import Vocab
|
from ..vocab import Vocab
|
||||||
from ..tokenizer import Tokenizer
|
from ..tokenizer import Tokenizer
|
||||||
from ..attrs import LANG
|
from ..attrs import LANG
|
||||||
|
from ..deprecated import fix_glove_vectors_loading
|
||||||
|
|
||||||
from .language_data import *
|
from .language_data import *
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
basestring
|
basestring
|
||||||
except NameError:
|
except NameError:
|
||||||
|
@ -38,34 +35,6 @@ class English(Language):
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, **overrides):
|
def __init__(self, **overrides):
|
||||||
# Make a special-case hack for loading the GloVe vectors, to support
|
# Special-case hack for loading the GloVe vectors, to support <1.0
|
||||||
# deprecated <1.0 stuff. Phase this out once the data is fixed.
|
overrides = fix_glove_vectors_loading(overrides)
|
||||||
overrides = _fix_deprecated_glove_vectors_loading(overrides)
|
|
||||||
Language.__init__(self, **overrides)
|
Language.__init__(self, **overrides)
|
||||||
|
|
||||||
|
|
||||||
def _fix_deprecated_glove_vectors_loading(overrides):
|
|
||||||
if 'data_dir' in overrides and 'path' not in overrides:
|
|
||||||
raise ValueError("The argument 'data_dir' has been renamed to 'path'")
|
|
||||||
if overrides.get('path') is False:
|
|
||||||
return overrides
|
|
||||||
if overrides.get('path') in (None, True):
|
|
||||||
data_path = get_data_path()
|
|
||||||
else:
|
|
||||||
path = overrides['path']
|
|
||||||
if isinstance(path, basestring):
|
|
||||||
path = Path(path)
|
|
||||||
data_path = path.parent
|
|
||||||
vec_path = None
|
|
||||||
if 'add_vectors' not in overrides:
|
|
||||||
if 'vectors' in overrides:
|
|
||||||
vec_path = match_best_version(overrides['vectors'], None, data_path)
|
|
||||||
if vec_path is None:
|
|
||||||
return overrides
|
|
||||||
else:
|
|
||||||
vec_path = match_best_version('en_glove_cc_300_1m_vectors', None, data_path)
|
|
||||||
if vec_path is not None:
|
|
||||||
vec_path = vec_path / 'vocab' / 'vec.bin'
|
|
||||||
if vec_path is not None:
|
|
||||||
overrides['add_vectors'] = lambda vocab: vocab.load_vectors_from_bin_loc(vec_path)
|
|
||||||
return overrides
|
|
||||||
|
|
|
@ -1,25 +1,5 @@
|
||||||
import plac
|
from ..deprecated import ModelDownload as download
|
||||||
import sputnik
|
|
||||||
|
|
||||||
from ..download import download
|
|
||||||
from .. import about
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
|
||||||
force=("Force overwrite", "flag", "f", bool),
|
|
||||||
data_path=("Path to download model", "option", "d", str)
|
|
||||||
)
|
|
||||||
def main(data_size='all', force=False, data_path=None):
|
|
||||||
if force:
|
|
||||||
sputnik.purge(about.__title__, about.__version__)
|
|
||||||
|
|
||||||
if data_size in ('all', 'parser'):
|
|
||||||
print("Downloading parsing model")
|
|
||||||
download('en', force=False, data_path=data_path)
|
|
||||||
if data_size in ('all', 'glove'):
|
|
||||||
print("Downloading GloVe vectors")
|
|
||||||
download('en_glove_cc_300_1m_vectors', force=False, data_path=data_path)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
plac.call(main)
|
download.en()
|
||||||
|
|
|
@ -281,6 +281,7 @@ class Language(object):
|
||||||
if path is True:
|
if path is True:
|
||||||
path = util.match_best_version(self.lang, '', util.get_data_path())
|
path = util.match_best_version(self.lang, '', util.get_data_path())
|
||||||
|
|
||||||
|
self.meta = overrides.get('meta', {})
|
||||||
self.path = path
|
self.path = path
|
||||||
|
|
||||||
self.vocab = self.Defaults.create_vocab(self) \
|
self.vocab = self.Defaults.create_vocab(self) \
|
||||||
|
|
72
spacy/link.py
Normal file
72
spacy/link.py
Normal file
|
@ -0,0 +1,72 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
import pip
|
||||||
|
import site
|
||||||
|
import plac
|
||||||
|
from . import util
|
||||||
|
|
||||||
|
|
||||||
|
@plac.annotations(
|
||||||
|
origin=("Package name or path to model", "positional", None, str),
|
||||||
|
link_name=("Name of link", "positional", None, str),
|
||||||
|
force=("Force overwriting existing link", "flag", "f", bool)
|
||||||
|
)
|
||||||
|
def link(origin, link_name, force=False):
|
||||||
|
"""Create a symlink for models within the spacy/data directory. Accepts
|
||||||
|
either the name of a pip package, or the local path to the model data
|
||||||
|
directory. Linking models allows loading them via spacy.load(link_name)."""
|
||||||
|
|
||||||
|
if is_package(origin):
|
||||||
|
package_path = site.getsitepackages()[0]
|
||||||
|
meta = get_meta(package_path, origin)
|
||||||
|
data_dir = origin + '-' + meta['version']
|
||||||
|
model_path = os.path.join(package_path, origin, data_dir)
|
||||||
|
symlink(model_path, link_name, force)
|
||||||
|
else:
|
||||||
|
symlink(origin, link_name, force)
|
||||||
|
|
||||||
|
|
||||||
|
def symlink(model_path, link_name, force):
|
||||||
|
if not os.path.isdir(model_path):
|
||||||
|
util.sys_exit(
|
||||||
|
"The data should be located in {p}".format(p=model_path),
|
||||||
|
title="Can't locate model data")
|
||||||
|
|
||||||
|
data_path = str(util.get_data_path())
|
||||||
|
link_path = os.path.join(os.path.abspath(__file__ + '/../../'), data_path, link_name)
|
||||||
|
|
||||||
|
if os.path.isdir(link_path):
|
||||||
|
if force:
|
||||||
|
os.unlink(link_path)
|
||||||
|
else:
|
||||||
|
util.sys_exit(
|
||||||
|
"To overwrite an existing link, use the --force flag.",
|
||||||
|
title="Link {l} already exists".format(l=link_name))
|
||||||
|
|
||||||
|
os.symlink(model_path, link_path)
|
||||||
|
util.print_msg(
|
||||||
|
"{a} --> {b}".format(a=model_path, b=link_path),
|
||||||
|
"You can now load the model via spacy.load('{l}').".format(l=link_name),
|
||||||
|
title="Linking successful")
|
||||||
|
|
||||||
|
|
||||||
|
def get_meta(package_path, package):
|
||||||
|
meta = util.parse_package_meta(package_path, package)
|
||||||
|
if not meta:
|
||||||
|
util.sys_exit()
|
||||||
|
return meta
|
||||||
|
|
||||||
|
|
||||||
|
def is_package(origin):
|
||||||
|
packages = pip.get_installed_distributions()
|
||||||
|
for package in packages:
|
||||||
|
if package.project_name.replace('-', '_') == origin:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
plac.call(link)
|
36
spacy/tests/test_download.py
Normal file
36
spacy/tests/test_download.py
Normal file
|
@ -0,0 +1,36 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..download import download, get_compatibility, get_version, check_error_depr
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_fetch_compatibility():
|
||||||
|
compatibility = get_compatibility()
|
||||||
|
assert type(compatibility) == dict
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.slow
|
||||||
|
@pytest.mark.parametrize('model', ['en_core_web_md-1.2.0'])
|
||||||
|
def test_download_direct_download(model):
|
||||||
|
download(model, direct=True)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('model', ['en_core_web_md'])
|
||||||
|
def test_download_get_matching_version_succeeds(model):
|
||||||
|
comp = { model: ['1.7.0', '0.100.0'] }
|
||||||
|
assert get_version(model, comp)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('model', ['en_core_web_md'])
|
||||||
|
def test_download_get_matching_version_fails(model):
|
||||||
|
diff_model = 'test_' + model
|
||||||
|
comp = { diff_model: ['1.7.0', '0.100.0'] }
|
||||||
|
with pytest.raises(SystemExit):
|
||||||
|
assert get_version(model, comp)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('model', [False, None, '', 'all'])
|
||||||
|
def test_download_no_model_depr_error(model):
|
||||||
|
with pytest.raises(SystemExit):
|
||||||
|
check_error_depr(model)
|
|
@ -1,40 +1,36 @@
|
||||||
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import io
|
|
||||||
import pytest
|
import pytest
|
||||||
import dill as pickle
|
import dill as pickle
|
||||||
|
|
||||||
from ..strings import StringStore
|
|
||||||
from ..vocab import Vocab
|
from ..vocab import Vocab
|
||||||
from ..attrs import NORM
|
from ..attrs import NORM
|
||||||
|
|
||||||
|
|
||||||
def test_pickle_string_store():
|
@pytest.mark.parametrize('text1,text2', [('hello', 'bye')])
|
||||||
sstore = StringStore()
|
def test_pickle_string_store(stringstore, text1, text2):
|
||||||
hello = sstore['hello']
|
store1 = stringstore[text1]
|
||||||
bye = sstore['bye']
|
store2 = stringstore[text2]
|
||||||
bdata = pickle.dumps(sstore, protocol=-1)
|
data = pickle.dumps(stringstore, protocol=-1)
|
||||||
unpickled = pickle.loads(bdata)
|
unpickled = pickle.loads(data)
|
||||||
assert unpickled['hello'] == hello
|
assert unpickled[text1] == store1
|
||||||
assert unpickled['bye'] == bye
|
assert unpickled[text2] == store2
|
||||||
assert len(sstore) == len(unpickled)
|
assert len(stringstore) == len(unpickled)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
@pytest.mark.xfail
|
||||||
def test_pickle_vocab():
|
@pytest.mark.parametrize('text1,text2', [('dog', 'cat')])
|
||||||
|
def test_pickle_vocab(text1, text2):
|
||||||
vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]})
|
vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]})
|
||||||
dog = vocab[u'dog']
|
lex1 = vocab[text1]
|
||||||
cat = vocab[u'cat']
|
lex2 = vocab[text2]
|
||||||
assert dog.norm_ == 'do'
|
assert lex1.norm_ == text1[:-1]
|
||||||
assert cat.norm_ == 'ca'
|
assert lex2.norm_ == text2[:-1]
|
||||||
|
data = pickle.dumps(vocab)
|
||||||
bdata = pickle.dumps(vocab)
|
unpickled = pickle.loads(data)
|
||||||
unpickled = pickle.loads(bdata)
|
assert unpickled[text1].orth == lex1.orth
|
||||||
|
assert unpickled[text2].orth == lex2.orth
|
||||||
assert unpickled[u'dog'].orth == dog.orth
|
assert unpickled[text1].norm == lex1.norm
|
||||||
assert unpickled[u'cat'].orth == cat.orth
|
assert unpickled[text2].norm == lex2.norm
|
||||||
assert unpickled[u'dog'].norm == dog.norm
|
assert unpickled[text1].norm != unpickled[text2].norm
|
||||||
assert unpickled[u'cat'].norm == cat.norm
|
|
||||||
dog_ = unpickled[u'dog']
|
|
||||||
cat_ = unpickled[u'cat']
|
|
||||||
assert dog_.norm != cat_.norm
|
|
||||||
|
|
|
@ -1,13 +1,16 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals, print_function
|
||||||
import os
|
import os
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
import os.path
|
import os.path
|
||||||
import pathlib
|
import pathlib
|
||||||
|
import sys
|
||||||
|
|
||||||
import six
|
import six
|
||||||
|
import textwrap
|
||||||
|
|
||||||
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -144,3 +147,53 @@ def check_renamed_kwargs(renamed, kwargs):
|
||||||
for old, new in renamed.items():
|
for old, new in renamed.items():
|
||||||
if old in kwargs:
|
if old in kwargs:
|
||||||
raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
|
raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
|
||||||
|
|
||||||
|
|
||||||
|
def parse_package_meta(package_path, package):
|
||||||
|
location = os.path.join(str(package_path), package, 'meta.json')
|
||||||
|
if not os.path.isfile(location):
|
||||||
|
print_msg("'{p}' doesn't seem to be a valid model package.".format(p=package),
|
||||||
|
title="No meta.json found")
|
||||||
|
else:
|
||||||
|
with io.open(location, encoding='utf8') as f:
|
||||||
|
meta = json.load(f)
|
||||||
|
return meta
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def print_msg(*text, **kwargs):
|
||||||
|
"""Print formatted message. Each positional argument is rendered as newline-
|
||||||
|
separated paragraph. If kwarg 'title' exist, title is printed above the text
|
||||||
|
and highlighted (using ANSI escape sequences manually to avoid unnecessary
|
||||||
|
dependency)."""
|
||||||
|
|
||||||
|
message = '\n\n'.join([_wrap_text(t) for t in text])
|
||||||
|
tpl_msg = '\n{msg}\n'
|
||||||
|
tpl_title = '\n\033[93m{msg}\033[0m'
|
||||||
|
|
||||||
|
if 'title' in kwargs and kwargs['title']:
|
||||||
|
title = _wrap_text(kwargs['title'])
|
||||||
|
print(tpl_title.format(msg=title))
|
||||||
|
print(tpl_msg.format(msg=message))
|
||||||
|
|
||||||
|
|
||||||
|
def _wrap_text(text):
|
||||||
|
"""Wrap text at given width using textwrap module. Indent should consist of
|
||||||
|
spaces. Its length is deducted from wrap width to ensure exact wrapping."""
|
||||||
|
|
||||||
|
wrap_max = 80
|
||||||
|
indent = ' '
|
||||||
|
wrap_width = wrap_max - len(indent)
|
||||||
|
return textwrap.fill(text, width=wrap_width, initial_indent=indent,
|
||||||
|
subsequent_indent=indent, break_long_words=False,
|
||||||
|
break_on_hyphens=False)
|
||||||
|
|
||||||
|
|
||||||
|
def sys_exit(*messages, **kwargs):
|
||||||
|
"""Performs SystemExit. For modules used from the command line, like
|
||||||
|
download and link. To print message, use the same arguments as for
|
||||||
|
print_msg()."""
|
||||||
|
|
||||||
|
if messages:
|
||||||
|
print_msg(*messages, **kwargs)
|
||||||
|
sys.exit(0)
|
||||||
|
|
|
@ -57,20 +57,6 @@ p Many of the associated tools and resources that we're developing alongside spa
|
||||||
+cell
|
+cell
|
||||||
| Super sparse multi-class machine learning with Cython.
|
| Super sparse multi-class machine learning with Cython.
|
||||||
|
|
||||||
+row
|
|
||||||
+cell
|
|
||||||
+src(gh("sputnik")) Sputnik
|
|
||||||
|
|
||||||
+cell
|
|
||||||
| Data package manager library for spaCy.
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell
|
|
||||||
+src(gh("sputnik-server")) Sputnik Server
|
|
||||||
|
|
||||||
+cell
|
|
||||||
| Index service for the Sputnik data package manager for spaCy.
|
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell
|
+cell
|
||||||
+src(gh("cymem")) Cymem
|
+src(gh("cymem")) Cymem
|
||||||
|
|
Loading…
Reference in New Issue
Block a user