Mirror of https://github.com/explosion/spaCy.git

Commit 8843b84bd1: Merge remote-tracking branch 'origin/develop-downloads'
@@ -8,5 +8,5 @@ murmurhash>=0.26,<0.27
plac<0.9.3
six
ujson>=1.35
sputnik>=0.9.2,<0.10.0
dill>=0.2,<0.3
requests>=2.13.0,<3.0.0
setup.py (4 changed lines)

@@ -240,9 +240,9 @@ def setup_package():
        'plac<0.9.3',
        'six',
        'pathlib',
        'sputnik>=0.9.2,<0.10.0',
        'ujson>=1.35',
        'dill>=0.2,<0.3'],
        'dill>=0.2,<0.3',
        'requests>=2.13.0,<3.0.0'],
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Environment :: Console',
@@ -1,7 +1,9 @@
import pathlib
# coding: utf8
from __future__ import unicode_literals, print_function

from .util import set_lang_class, get_lang_class
from .about import __version__
import json
from pathlib import Path
from .util import set_lang_class, get_lang_class, parse_package_meta

from . import en
from . import de

@@ -16,11 +18,6 @@ from . import sv
from . import fi
from . import bn

try:
    basestring
except NameError:
    basestring = str


set_lang_class(en.English.lang, en.English)
set_lang_class(de.German.lang, de.German)

@@ -36,11 +33,16 @@ set_lang_class(fi.Finnish.lang, fi.Finnish)
set_lang_class(bn.Bengali.lang, bn.Bengali)


def load(name, **overrides):
    target_name, target_version = util.split_data_name(name)
    data_path = overrides.get('path', util.get_data_path())
    path = util.match_best_version(target_name, target_version, data_path)
    cls = get_lang_class(target_name)
    overrides['path'] = path
    meta = parse_package_meta(data_path, name)
    lang = meta['lang'] if meta and 'lang' in meta else 'en'
    cls = get_lang_class(lang)
    overrides['meta'] = meta
    overrides['path'] = Path(data_path / name)
    return cls(**overrides)


def info(name):
    meta = parse_package_meta(util.get_data_path(), name)
    print(json.dumps(meta, indent=2))
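The hunks above appear to be spaCy's package-level __init__.py. The rewritten load() no longer asks sputnik to match a versioned data package; it reads the installed package's meta.json via parse_package_meta() and picks the language class from the meta's 'lang' field (falling back to 'en'), and the new info() helper pretty-prints that metadata. A minimal usage sketch under that reading; it assumes a model or link named 'en_core_web_sm' already sits in the data directory:

import spacy

# Reads <data_path>/en_core_web_sm/meta.json and instantiates the Language
# subclass registered for meta['lang'] (defaulting to 'en' if missing).
nlp = spacy.load('en_core_web_sm')

# Pretty-prints the same meta.json (name, version, lang, description, ...).
spacy.info('en_core_web_sm')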
@@ -1,5 +1,4 @@
# inspired from:

# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

@@ -10,7 +9,8 @@ __uri__ = 'https://spacy.io'
__author__ = 'Matthew Honnibal'
__email__ = 'matt@explosion.ai'
__license__ = 'MIT'
__models__ = {
    'en': 'en>=1.1.0,<1.2.0',
    'de': 'de>=1.0.0,<1.1.0',
}

__docs__ = 'https://spacy.io/docs/usage'
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json?token=ANAt54fi5zcUtnwGhMLw2klWwcAyHkZGks5Y0nw1wA%3D%3D'
__shortcuts__ = {'en': 'en_core_web_sm', 'de': 'de_core_web_md', 'vectors': 'en_vectors_glove_md'}
@@ -1,14 +1,5 @@
import plac
from ..download import download


@plac.annotations(
    force=("Force overwrite", "flag", "f", bool),
    data_path=("Path to download model", "option", "d", str)
)
def main(data_size='all', force=False, data_path=None):
    download('de', force=force, data_path=data_path)
from ..deprecated import ModelDownload as download


if __name__ == '__main__':
    plac.call(main)
    download.de()
@@ -1,35 +1,13 @@
from sputnik.dir_package import DirPackage
from sputnik.package_list import (PackageNotFoundException,
                                  CompatiblePackageNotFoundException)

import sputnik
from pathlib import Path
from . import about
from . import util
from .download import download


def get_package(data_dir):
    if not isinstance(data_dir, six.string_types):
        raise RuntimeError('data_dir must be a string')
    return DirPackage(data_dir)


def get_package_by_name(name=None, via=None):
    if name is None:
        return
    lang = get_lang_class(name)
    try:
        return sputnik.package(about.__title__, about.__version__,
                               name, data_path=via)
    except PackageNotFoundException as e:
        raise RuntimeError("Model '%s' not installed. Please run 'python -m "
                           "%s.download' to install latest compatible "
                           "model." % (name, lang.__module__))
    except CompatiblePackageNotFoundException as e:
        raise RuntimeError("Installed model is not compatible with spaCy "
                           "version. Please run 'python -m %s.download "
                           "--force' to install latest compatible model." %
                           (lang.__module__))


try:
    basestring
except NameError:
    basestring = str


def read_lang_data(package):

@@ -43,7 +21,6 @@ read_lang_data(package)
    return tokenization, prefix, suffix, infix


def align_tokens(ref, indices): # Deprecated, surely?
    start = 0
    queue = list(indices)

@@ -79,4 +56,55 @@ def detokenize(token_rules, words): # Deprecated?
    return positions


def fix_glove_vectors_loading(overrides):
    """Special-case hack for loading the GloVe vectors, to support deprecated
    <1.0 stuff. Phase this out once the data is fixed."""

    if 'data_dir' in overrides and 'path' not in overrides:
        raise ValueError("The argument 'data_dir' has been renamed to 'path'")
    if overrides.get('path') is False:
        return overrides
    if overrides.get('path') in (None, True):
        data_path = util.get_data_path()
    else:
        path = overrides['path']
        if isinstance(path, basestring):
            path = Path(path)
        data_path = path.parent
    vec_path = None
    if 'add_vectors' not in overrides:
        if 'vectors' in overrides:
            vec_path = util.match_best_version(overrides['vectors'], None, data_path)
            if vec_path is None:
                return overrides
        else:
            vec_path = util.match_best_version('en_glove_cc_300_1m_vectors', None, data_path)
        if vec_path is not None:
            vec_path = vec_path / 'vocab' / 'vec.bin'
    if vec_path is not None:
        overrides['add_vectors'] = lambda vocab: vocab.load_vectors_from_bin_loc(vec_path)
    return overrides


class ModelDownload():
    """Replace download modules within en and de with deprecation warning and
    download default language model (using shortcut). Use classmethods to allow
    importing ModelDownload as download and calling download.en() etc."""

    @classmethod
    def load(self, lang):
        util.print_msg(
            "The spacy.{l}.download command is now deprecated. Please use "
            "spacy.download [model name or shortcut] instead. For more "
            "info and available models, see the documentation: {d}. "
            "Downloading default '{l}' model now...".format(d=about.__docs__, l=lang),
            title="Warning: deprecated command")
        download(lang)

    @classmethod
    def en(cls, *args, **kwargs):
        cls.load('en')

    @classmethod
    def de(cls, *args, **kwargs):
        cls.load('de')
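The last hunk above adds two things to what looks like spacy/deprecated.py: fix_glove_vectors_loading(), which also exists almost verbatim in the English package further down this diff, and a ModelDownload shim that keeps the old per-language download entry points alive. A short sketch of how the shim is exercised, mirroring the rewritten de and en download modules elsewhere in this commit:

from spacy.deprecated import ModelDownload as download

# Prints the "Warning: deprecated command" notice via util.print_msg(), then
# forwards to the new pip-based downloader using the default 'en' shortcut.
download.en()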
@@ -1,47 +1,80 @@
from __future__ import print_function

import sys
import shutil

import sputnik
from sputnik.package_list import (PackageNotFoundException,
                                  CompatiblePackageNotFoundException)
# coding: utf8
from __future__ import unicode_literals

import pip
import plac
import requests
from os import path
from . import about
from . import util


def download(lang, force=False, fail_on_exist=True, data_path=None):
    if not data_path:
        data_path = util.get_data_path(require_exists=False)
@plac.annotations(
    model=("Model to download", "positional", None, str),
    direct=("Force direct download", "flag", "d", bool)
)
def download(model=None, direct=False):
    """Download compatible model from default download path using pip."""

    # spaCy uses pathlib, and util.get_data_path returns a pathlib.Path object,
    # but sputnik (which we're using below) doesn't use pathlib and requires
    # its data_path parameters to be strings, so we coerce the data_path to a
    # str here.
    data_path = str(data_path)
    check_error_depr(model)

    try:
        pkg = sputnik.package(about.__title__, about.__version__,
                              about.__models__.get(lang, lang), data_path)
        if force:
            shutil.rmtree(pkg.path)
        elif fail_on_exist:
            print("Model already installed. Please run 'python -m "
                  "spacy.%s.download --force' to reinstall." % lang, file=sys.stderr)
            sys.exit(0)
    except (PackageNotFoundException, CompatiblePackageNotFoundException):
        pass
    if direct:
        download_model('{m}/{m}.tar.gz'.format(m=model))
    else:
        model = about.__shortcuts__[model] if model in about.__shortcuts__ else model
        compatibility = get_compatibility()
        version = get_version(model, compatibility)
        download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model, v=version))

    package = sputnik.install(about.__title__, about.__version__,
                              about.__models__.get(lang, lang), data_path)

    try:
        sputnik.package(about.__title__, about.__version__,
                        about.__models__.get(lang, lang), data_path)
    except (PackageNotFoundException, CompatiblePackageNotFoundException):
        print("Model failed to install. Please run 'python -m "
              "spacy.%s.download --force'." % lang, file=sys.stderr)
        sys.exit(1)
def get_compatibility():
    version = about.__version__
    r = requests.get(about.__compatibility__)
    if r.status_code != 200:
        util.sys_exit(
            "Couldn't fetch compatibility table. Please find the right model for "
            "your spaCy installation (v{v}), and download it manually:".format(v=version),
            "python -m spacy.download [full model name + version] --direct",
            title="Server error ({c})".format(c=r.status_code))

    print("Model successfully installed to %s" % data_path, file=sys.stderr)
    comp = r.json()['spacy']
    if version not in comp:
        util.sys_exit(
            "No compatible models found for v{v} of spaCy.".format(v=version),
            title="Compatibility error")
    else:
        return comp[version]


def get_version(model, comp):
    if model not in comp:
        util.sys_exit(
            "No compatible model found for "
            "{m} (spaCy v{v}).".format(m=model, v=about.__version__),
            title="Compatibility error")
    return comp[model][0]


def download_model(filename):
    util.print_msg("Downloading {f}".format(f=filename))
    download_url = path.join(about.__download_url__, filename)
    pip.main(['install', download_url])


def check_error_depr(model):
    if not model:
        util.sys_exit(
            "python -m spacy.download [name or shortcut]",
            title="Missing model name or shortcut")

    if model == 'all':
        util.sys_exit(
            "As of v1.7.0, the download all command is deprecated. Please "
            "download the models individually via spacy.download [model name] "
            "or pip install. For more info on this, see the documentation: "
            "{d}".format(d=about.__docs__),
            title="Deprecated command")


if __name__ == '__main__':
    plac.call(download)
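The hunk above swaps the sputnik-based downloader for a pip-based one: the model argument is resolved through about.__shortcuts__, checked against a compatibility table fetched from about.__compatibility__, and the matching release archive under about.__download_url__ is handed to pip. A sketch of the intended command-line calls, plus the table shape implied by get_compatibility() and get_version(); the version numbers are made up for illustration:

# Shortcut or model name, resolved against the compatibility table:
#     python -m spacy.download en
# Exact release, skipping the compatibility lookup:
#     python -m spacy.download en_core_web_md-1.2.0 --direct

compatibility = {
    'spacy': {
        '1.7.0': {
            'en_core_web_sm': ['1.2.0', '1.1.0'],
            'de_core_web_md': ['1.0.0'],
        }
    }
}
# get_compatibility() returns compatibility['spacy'][about.__version__];
# get_version('en_core_web_sm', ...) then picks the first entry, '1.2.0'.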
@@ -1,19 +1,16 @@
# coding: utf8
from __future__ import unicode_literals, print_function
from __future__ import unicode_literals

from os import path
from pathlib import Path

from ..util import match_best_version
from ..util import get_data_path
from ..language import Language
from ..lemmatizer import Lemmatizer
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..attrs import LANG
from ..deprecated import fix_glove_vectors_loading

from .language_data import *


try:
    basestring
except NameError:

@@ -38,34 +35,6 @@ class English(Language):

    def __init__(self, **overrides):
        # Make a special-case hack for loading the GloVe vectors, to support
        # deprecated <1.0 stuff. Phase this out once the data is fixed.
        overrides = _fix_deprecated_glove_vectors_loading(overrides)
        # Special-case hack for loading the GloVe vectors, to support <1.0
        overrides = fix_glove_vectors_loading(overrides)
        Language.__init__(self, **overrides)


def _fix_deprecated_glove_vectors_loading(overrides):
    if 'data_dir' in overrides and 'path' not in overrides:
        raise ValueError("The argument 'data_dir' has been renamed to 'path'")
    if overrides.get('path') is False:
        return overrides
    if overrides.get('path') in (None, True):
        data_path = get_data_path()
    else:
        path = overrides['path']
        if isinstance(path, basestring):
            path = Path(path)
        data_path = path.parent
    vec_path = None
    if 'add_vectors' not in overrides:
        if 'vectors' in overrides:
            vec_path = match_best_version(overrides['vectors'], None, data_path)
            if vec_path is None:
                return overrides
        else:
            vec_path = match_best_version('en_glove_cc_300_1m_vectors', None, data_path)
        if vec_path is not None:
            vec_path = vec_path / 'vocab' / 'vec.bin'
    if vec_path is not None:
        overrides['add_vectors'] = lambda vocab: vocab.load_vectors_from_bin_loc(vec_path)
    return overrides
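The two hunks above look like the English package __init__: English.__init__ now routes its overrides through the shared fix_glove_vectors_loading() from spacy.deprecated instead of the module-private _fix_deprecated_glove_vectors_loading() being deleted here. One behaviour that carries over, verifiable from either copy of the function:

from spacy.deprecated import fix_glove_vectors_loading

# path=False opts out of the GloVe special-casing; the overrides pass through untouched.
overrides = fix_glove_vectors_loading({'path': False})
assert overrides == {'path': False}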
@@ -1,25 +1,5 @@
import plac
import sputnik

from ..download import download
from .. import about


@plac.annotations(
    force=("Force overwrite", "flag", "f", bool),
    data_path=("Path to download model", "option", "d", str)
)
def main(data_size='all', force=False, data_path=None):
    if force:
        sputnik.purge(about.__title__, about.__version__)

    if data_size in ('all', 'parser'):
        print("Downloading parsing model")
        download('en', force=False, data_path=data_path)
    if data_size in ('all', 'glove'):
        print("Downloading GloVe vectors")
        download('en_glove_cc_300_1m_vectors', force=False, data_path=data_path)
from ..deprecated import ModelDownload as download


if __name__ == '__main__':
    plac.call(main)
    download.en()
@@ -281,6 +281,7 @@ class Language(object):
        if path is True:
            path = util.match_best_version(self.lang, '', util.get_data_path())

        self.meta = overrides.get('meta', {})
        self.path = path

        self.vocab = self.Defaults.create_vocab(self) \
spacy/link.py (new file, 72 lines)

@@ -0,0 +1,72 @@
# coding: utf8
from __future__ import unicode_literals

import io
import os
import pip
import site
import plac
from . import util


@plac.annotations(
    origin=("Package name or path to model", "positional", None, str),
    link_name=("Name of link", "positional", None, str),
    force=("Force overwriting existing link", "flag", "f", bool)
)
def link(origin, link_name, force=False):
    """Create a symlink for models within the spacy/data directory. Accepts
    either the name of a pip package, or the local path to the model data
    directory. Linking models allows loading them via spacy.load(link_name)."""

    if is_package(origin):
        package_path = site.getsitepackages()[0]
        meta = get_meta(package_path, origin)
        data_dir = origin + '-' + meta['version']
        model_path = os.path.join(package_path, origin, data_dir)
        symlink(model_path, link_name, force)
    else:
        symlink(origin, link_name, force)


def symlink(model_path, link_name, force):
    if not os.path.isdir(model_path):
        util.sys_exit(
            "The data should be located in {p}".format(p=model_path),
            title="Can't locate model data")

    data_path = str(util.get_data_path())
    link_path = os.path.join(os.path.abspath(__file__ + '/../../'), data_path, link_name)

    if os.path.isdir(link_path):
        if force:
            os.unlink(link_path)
        else:
            util.sys_exit(
                "To overwrite an existing link, use the --force flag.",
                title="Link {l} already exists".format(l=link_name))

    os.symlink(model_path, link_path)
    util.print_msg(
        "{a} --> {b}".format(a=model_path, b=link_path),
        "You can now load the model via spacy.load('{l}').".format(l=link_name),
        title="Linking successful")


def get_meta(package_path, package):
    meta = util.parse_package_meta(package_path, package)
    if not meta:
        util.sys_exit()
    return meta


def is_package(origin):
    packages = pip.get_installed_distributions()
    for package in packages:
        if package.project_name.replace('-', '_') == origin:
            return True
    return False


if __name__ == '__main__':
    plac.call(link)
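spacy/link.py is new: it symlinks either an installed pip package (located via site-packages and its meta.json) or a local model directory into spacy/data, so the link name becomes loadable. A usage sketch; the package name, path and link name are only examples:

# Link an installed model package under a short name, replacing any old link:
#     python -m spacy.link en_core_web_sm en --force
# Or link a local model directory:
#     python -m spacy.link /tmp/en_core_web_sm-1.2.0 en_sm

import spacy
nlp = spacy.load('en')   # now resolves through the symlink in spacy/data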
spacy/tests/test_download.py (new file, 36 lines)

@@ -0,0 +1,36 @@
# coding: utf-8
from __future__ import unicode_literals

from ..download import download, get_compatibility, get_version, check_error_depr
import pytest


def test_download_fetch_compatibility():
    compatibility = get_compatibility()
    assert type(compatibility) == dict


@pytest.mark.slow
@pytest.mark.parametrize('model', ['en_core_web_md-1.2.0'])
def test_download_direct_download(model):
    download(model, direct=True)


@pytest.mark.parametrize('model', ['en_core_web_md'])
def test_download_get_matching_version_succeeds(model):
    comp = { model: ['1.7.0', '0.100.0'] }
    assert get_version(model, comp)


@pytest.mark.parametrize('model', ['en_core_web_md'])
def test_download_get_matching_version_fails(model):
    diff_model = 'test_' + model
    comp = { diff_model: ['1.7.0', '0.100.0'] }
    with pytest.raises(SystemExit):
        assert get_version(model, comp)


@pytest.mark.parametrize('model', [False, None, '', 'all'])
def test_download_no_model_depr_error(model):
    with pytest.raises(SystemExit):
        check_error_depr(model)
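The new test module above talks to the live compatibility endpoint, and the direct-download case (marked slow) actually pip-installs a model. A note on running it and on what get_version() asserts; the pytest invocation is an assumption about the local setup:

# Fast checks only, skipping the slow direct-download test:
#     python -m pytest spacy/tests/test_download.py -m "not slow"

from spacy.download import get_version
# get_version() returns the first version listed for the model:
assert get_version('en_core_web_md', {'en_core_web_md': ['1.7.0', '0.100.0']}) == '1.7.0'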
@@ -1,40 +1,36 @@
# coding: utf-8
from __future__ import unicode_literals

import io
import pytest
import dill as pickle

from ..strings import StringStore
from ..vocab import Vocab
from ..attrs import NORM


def test_pickle_string_store():
    sstore = StringStore()
    hello = sstore['hello']
    bye = sstore['bye']
    bdata = pickle.dumps(sstore, protocol=-1)
    unpickled = pickle.loads(bdata)
    assert unpickled['hello'] == hello
    assert unpickled['bye'] == bye
    assert len(sstore) == len(unpickled)
@pytest.mark.parametrize('text1,text2', [('hello', 'bye')])
def test_pickle_string_store(stringstore, text1, text2):
    store1 = stringstore[text1]
    store2 = stringstore[text2]
    data = pickle.dumps(stringstore, protocol=-1)
    unpickled = pickle.loads(data)
    assert unpickled[text1] == store1
    assert unpickled[text2] == store2
    assert len(stringstore) == len(unpickled)


@pytest.mark.xfail
def test_pickle_vocab():
@pytest.mark.parametrize('text1,text2', [('dog', 'cat')])
def test_pickle_vocab(text1, text2):
    vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]})
    dog = vocab[u'dog']
    cat = vocab[u'cat']
    assert dog.norm_ == 'do'
    assert cat.norm_ == 'ca'

    bdata = pickle.dumps(vocab)
    unpickled = pickle.loads(bdata)

    assert unpickled[u'dog'].orth == dog.orth
    assert unpickled[u'cat'].orth == cat.orth
    assert unpickled[u'dog'].norm == dog.norm
    assert unpickled[u'cat'].norm == cat.norm
    dog_ = unpickled[u'dog']
    cat_ = unpickled[u'cat']
    assert dog_.norm != cat_.norm
    lex1 = vocab[text1]
    lex2 = vocab[text2]
    assert lex1.norm_ == text1[:-1]
    assert lex2.norm_ == text2[:-1]
    data = pickle.dumps(vocab)
    unpickled = pickle.loads(data)
    assert unpickled[text1].orth == lex1.orth
    assert unpickled[text2].orth == lex2.orth
    assert unpickled[text1].norm == lex1.norm
    assert unpickled[text2].norm == lex2.norm
    assert unpickled[text1].norm != unpickled[text2].norm
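The rewritten pickle tests above now take a stringstore fixture instead of constructing a StringStore inline; the fixture itself is not part of this diff. A minimal version of what it presumably looks like in the tests' shared conftest:

import pytest
from spacy.strings import StringStore

# Hypothetical fixture; the real one would live in spacy/tests/conftest.py.
@pytest.fixture
def stringstore():
    return StringStore()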
@@ -1,13 +1,16 @@
# coding: utf8
from __future__ import unicode_literals
from __future__ import unicode_literals, print_function
import os
import io
import json
import re
import os.path
import pathlib
import sys

import six
import textwrap

from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE

try:

@@ -144,3 +147,53 @@ def check_renamed_kwargs(renamed, kwargs):
    for old, new in renamed.items():
        if old in kwargs:
            raise TypeError("Keyword argument %s now renamed to %s" % (old, new))


def parse_package_meta(package_path, package):
    location = os.path.join(str(package_path), package, 'meta.json')
    if not os.path.isfile(location):
        print_msg("'{p}' doesn't seem to be a valid model package.".format(p=package),
                  title="No meta.json found")
    else:
        with io.open(location, encoding='utf8') as f:
            meta = json.load(f)
        return meta
    return False


def print_msg(*text, **kwargs):
    """Print formatted message. Each positional argument is rendered as newline-
    separated paragraph. If kwarg 'title' exist, title is printed above the text
    and highlighted (using ANSI escape sequences manually to avoid unnecessary
    dependency)."""

    message = '\n\n'.join([_wrap_text(t) for t in text])
    tpl_msg = '\n{msg}\n'
    tpl_title = '\n\033[93m{msg}\033[0m'

    if 'title' in kwargs and kwargs['title']:
        title = _wrap_text(kwargs['title'])
        print(tpl_title.format(msg=title))
    print(tpl_msg.format(msg=message))


def _wrap_text(text):
    """Wrap text at given width using textwrap module. Indent should consist of
    spaces. Its length is deducted from wrap width to ensure exact wrapping."""

    wrap_max = 80
    indent = '    '
    wrap_width = wrap_max - len(indent)
    return textwrap.fill(text, width=wrap_width, initial_indent=indent,
                         subsequent_indent=indent, break_long_words=False,
                         break_on_hyphens=False)


def sys_exit(*messages, **kwargs):
    """Performs SystemExit. For modules used from the command line, like
    download and link. To print message, use the same arguments as for
    print_msg()."""

    if messages:
        print_msg(*messages, **kwargs)
    sys.exit(0)
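The second hunk above adds the messaging helpers in spacy/util.py that the new download and link commands rely on. A small sketch of how they compose; the message strings are illustrative:

from spacy import util

# Prints a highlighted title (ANSI bright yellow) followed by wrapped paragraphs.
util.print_msg(
    "Model successfully installed.",
    "You can now load it via spacy.load('en').",
    title="Download complete")

# Same formatting, then exits with status 0; used by the CLI-facing modules.
util.sys_exit(
    "python -m spacy.download [name or shortcut]",
    title="Missing model name or shortcut")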
@@ -57,20 +57,6 @@ p Many of the associated tools and resources that we're developing alongside spa
    +cell
        | Super sparse multi-class machine learning with Cython.

    +row
        +cell
            +src(gh("sputnik")) Sputnik

        +cell
            | Data package manager library for spaCy.

    +row
        +cell
            +src(gh("sputnik-server")) Sputnik Server

        +cell
            | Index service for the Sputnik data package manager for spaCy.

    +row
        +cell
            +src(gh("cymem")) Cymem