Remove symlinks, data dir and related stuff

This commit is contained in:
Ines Montani 2020-02-18 17:20:17 +01:00
parent 2ed49404e3
commit 09cbeaef27
10 changed files with 82 additions and 347 deletions

View File

@ -1,13 +1,21 @@
from wasabi import msg
from .download import download # noqa: F401 from .download import download # noqa: F401
from .info import info # noqa: F401 from .info import info # noqa: F401
from .link import link # noqa: F401
from .package import package # noqa: F401 from .package import package # noqa: F401
from .profile import profile # noqa: F401 from .profile import profile # noqa: F401
from .train import train # noqa: F401 from .train import train # noqa: F401
from .train_from_config import train_from_config_cli # noqa: F401 from .train_from_config import train_from_config_cli # noqa: F401
from .pretrain import pretrain # noqa: F401 from .pretrain import pretrain # noqa: F401
from .debug_data import debug_data # noqa: F401 from .debug_data import debug_data # noqa: F401
from .evaluate import evaluate # noqa: F401 from .evaluate import evaluate # noqa: F401
from .convert import convert # noqa: F401 from .convert import convert # noqa: F401
from .init_model import init_model # noqa: F401 from .init_model import init_model # noqa: F401
from .validate import validate # noqa: F401 from .validate import validate # noqa: F401
def link(*args, **kwargs):
msg.warn(
"As of spaCy v3.0, model symlinks are deprecated. You can load models "
"using their full names or from a directory path."
)

View File

@ -4,8 +4,6 @@ import subprocess
import sys import sys
from wasabi import msg from wasabi import msg
from .link import link
from ..util import get_package_path
from .. import about from .. import about
@ -15,9 +13,9 @@ def download(
*pip_args: ("Additional arguments to be passed to `pip install` on model install"), *pip_args: ("Additional arguments to be passed to `pip install` on model install"),
): ):
""" """
Download compatible model from default download path using pip. Model Download compatible model from default download path using pip. If --direct
can be shortcut, model name or, if --direct flag is set, full model name flag is set, the command expects the full model name with version.
with version. For direct downloads, the compatibility check will be skipped. For direct downloads, the compatibility check will be skipped.
""" """
if not require_package("spacy") and "--no-deps" not in pip_args: if not require_package("spacy") and "--no-deps" not in pip_args:
msg.warn( msg.warn(
@ -47,28 +45,6 @@ def download(
"Download and installation successful", "Download and installation successful",
f"You can now load the model via spacy.load('{model_name}')", f"You can now load the model via spacy.load('{model_name}')",
) )
# Only create symlink if the model is installed via a shortcut like 'en'.
# There's no real advantage over an additional symlink for en_core_web_sm
# and if anything, it's more error prone and causes more confusion.
if model in shortcuts:
try:
# Get package path here because link uses
# pip.get_installed_distributions() to check if model is a
# package, which fails if model was just installed via
# subprocess
package_path = get_package_path(model_name)
link(model_name, model, force=True, model_path=package_path)
except: # noqa: E722
# Dirty, but since spacy.download and the auto-linking is
# mostly a convenience wrapper, it's best to show a success
# message and loading instructions, even if linking fails.
msg.warn(
"Download successful but linking failed",
f"Creating a shortcut link for '{model}' didn't work (maybe you "
f"don't have admin permissions?), but you can still load "
f"the model via its full package name: "
f"nlp = spacy.load('{model_name}')",
)
# If a model is downloaded and then loaded within the same process, our # If a model is downloaded and then loaded within the same process, our
# is_package check currently fails, because pkg_resources.working_set # is_package check currently fails, because pkg_resources.working_set
# is not refreshed automatically (see #3923). We're trying to work # is not refreshed automatically (see #3923). We're trying to work
@ -114,8 +90,7 @@ def get_version(model, comp):
model = model.rsplit(".dev", 1)[0] model = model.rsplit(".dev", 1)[0]
if model not in comp: if model not in comp:
msg.fail( msg.fail(
f"No compatible model found for '{model}' " f"No compatible model found for '{model}' (spaCy v{about.__version__})",
f"(spaCy v{about.__version__}).",
exits=1, exits=1,
) )
return comp[model][0] return comp[model][0]

View File

@ -3,25 +3,26 @@ from pathlib import Path
from wasabi import msg from wasabi import msg
import srsly import srsly
from .validate import get_model_pkgs
from .. import util from .. import util
from .. import about from .. import about
def info( def info(
model: ("Optional shortcut link of model", "positional", None, str) = None, model: ("Optional model name", "positional", None, str) = None,
markdown: ("Generate Markdown for GitHub issues", "flag", "md", str) = False, markdown: ("Generate Markdown for GitHub issues", "flag", "md", str) = False,
silent: ("Don't print anything (just return)", "flag", "s") = False, silent: ("Don't print anything (just return)", "flag", "s") = False,
): ):
""" """
Print info about spaCy installation. If a model shortcut link is Print info about spaCy installation. If a model is speficied as an argument,
speficied as an argument, print model information. Flag --markdown print model information. Flag --markdown prints details in Markdown for easy
prints details in Markdown for easy copy-pasting to GitHub issues. copy-pasting to GitHub issues.
""" """
if model: if model:
if util.is_package(model): if util.is_package(model):
model_path = util.get_package_path(model) model_path = util.get_package_path(model)
else: else:
model_path = util.get_data_path() / model model_path = model
meta_path = model_path / "meta.json" meta_path = model_path / "meta.json"
if not meta_path.is_file(): if not meta_path.is_file():
msg.fail("Can't find model meta.json", meta_path, exits=1) msg.fail("Can't find model meta.json", meta_path, exits=1)
@ -41,12 +42,13 @@ def info(
else: else:
msg.table(model_meta, title=title) msg.table(model_meta, title=title)
return meta return meta
all_models, _ = get_model_pkgs()
data = { data = {
"spaCy version": about.__version__, "spaCy version": about.__version__,
"Location": str(Path(__file__).parent.parent), "Location": str(Path(__file__).parent.parent),
"Platform": platform.platform(), "Platform": platform.platform(),
"Python version": platform.python_version(), "Python version": platform.python_version(),
"Models": list_models(), "Models": ", ".join(model["name"] for model in all_models.values()),
} }
if not silent: if not silent:
title = "Info about spaCy" title = "Info about spaCy"
@ -57,19 +59,6 @@ def info(
return data return data
def list_models():
def exclude_dir(dir_name):
# exclude common cache directories and hidden directories
exclude = ("cache", "pycache", "__pycache__")
return dir_name in exclude or dir_name.startswith(".")
data_path = util.get_data_path()
if data_path:
models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
return ", ".join([m for m in models if not exclude_dir(m)])
return "-"
def print_markdown(data, title=None): def print_markdown(data, title=None):
"""Print data in GitHub-flavoured Markdown format for issues etc. """Print data in GitHub-flavoured Markdown format for issues etc.

View File

@ -1,73 +0,0 @@
from pathlib import Path
from wasabi import msg
from ..compat import symlink_to
from .. import util
def link(
origin: ("package name or local path to model", "positional", None, str),
link_name: ("name of shortuct link to create", "positional", None, str),
force: ("force overwriting of existing link", "flag", "f", bool) = False,
model_path=None,
):
"""
Create a symlink for models within the spacy/data directory. Accepts
either the name of a pip package, or the local path to the model data
directory. Linking models allows loading them via spacy.load(link_name).
"""
if util.is_package(origin):
model_path = util.get_package_path(origin)
else:
model_path = Path(origin) if model_path is None else Path(model_path)
if not model_path.exists():
msg.fail(
"Can't locate model data",
f"The data should be located in {model_path}",
exits=1,
)
data_path = util.get_data_path()
if not data_path or not data_path.exists():
spacy_loc = Path(__file__).parent.parent
msg.fail(
f"Can't find the spaCy data path to create model symlink",
f"Make sure a directory `/data` exists within your spaCy "
f"installation and try again. The data directory should be located "
f"here: {spacy_loc}",
exits=1,
)
link_path = util.get_data_path() / link_name
if link_path.is_symlink() and not force:
msg.fail(
f"Link '{link_name}' already exists",
"To overwrite an existing link, use the --force flag",
exits=1,
)
elif link_path.is_symlink(): # does a symlink exist?
# NB: It's important to check for is_symlink here and not for exists,
# because invalid/outdated symlinks would return False otherwise.
link_path.unlink()
elif link_path.exists(): # does it exist otherwise?
# NB: Check this last because valid symlinks also "exist".
msg.fail(
f"Can't overwrite symlink '{link_name}'",
"This can happen if your data directory contains a directory or "
"file of the same name.",
exits=1,
)
details = f"{model_path} --> {link_path}"
try:
symlink_to(link_path, model_path)
except: # noqa: E722
# This is quite dirty, but just making sure other errors are caught.
msg.fail(
f"Couldn't link model to '{link_name}'",
"Creating a symlink in spacy/data failed. Make sure you have the "
"required permissions and try re-running the command as admin, or "
"use a virtualenv. You can still import the model as a module and "
"call its load() method, or create the symlink manually.",
)
msg.text(details)
raise
msg.good("Linking successful", details)
msg.text(f"You can now load the model via spacy.load('{link_name}')")

View File

@ -1,10 +1,8 @@
from pathlib import Path from pathlib import Path
import sys import sys
import requests import requests
import srsly
from wasabi import msg from wasabi import msg
from ..util import get_data_path
from .. import about from .. import about
@ -13,6 +11,50 @@ def validate():
Validate that the currently installed version of spaCy is compatible Validate that the currently installed version of spaCy is compatible
with the installed models. Should be run after `pip install -U spacy`. with the installed models. Should be run after `pip install -U spacy`.
""" """
model_pkgs, compat = get_model_pkgs()
spacy_version = about.__version__.rsplit(".dev", 1)[0]
current_compat = compat.get(spacy_version, {})
if not current_compat:
msg.warn(f"No compatible models found for v{spacy_version} of spaCy")
incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
na_models = [m for m in incompat_models if m not in current_compat]
update_models = [m for m in incompat_models if m in current_compat]
spacy_dir = Path(__file__).parent.parent
msg.divider(f"Installed models (spaCy v{about.__version__})")
msg.info(f"spaCy installation: {spacy_dir}")
if model_pkgs:
header = ("NAME", "VERSION", "")
rows = []
for name, data in model_pkgs.items():
if data["compat"]:
comp = msg.text("", color="green", icon="good", no_print=True)
version = msg.text(data["version"], color="green", no_print=True)
else:
version = msg.text(data["version"], color="red", no_print=True)
comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
rows.append((data["name"], version, comp))
msg.table(rows, header=header)
else:
msg.text("No models found in your current environment.", exits=0)
if update_models:
msg.divider("Install updates")
msg.text("Use the following commands to update the model packages:")
cmd = "python -m spacy download {}"
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
if na_models:
msg.warn(
f"The following models are not available for spaCy v{about.__version__}:",
", ".join(na_models),
)
if incompat_models:
sys.exit(1)
def get_model_pkgs():
import pkg_resources
with msg.loading("Loading compatibility table..."): with msg.loading("Loading compatibility table..."):
r = requests.get(about.__compatibility__) r = requests.get(about.__compatibility__)
if r.status_code != 200: if r.status_code != 200:
@ -23,88 +65,11 @@ def validate():
) )
msg.good("Loaded compatibility table") msg.good("Loaded compatibility table")
compat = r.json()["spacy"] compat = r.json()["spacy"]
version = about.__version__
version = version.rsplit(".dev", 1)[0]
current_compat = compat.get(version)
if not current_compat:
msg.fail(
f"Can't find spaCy v{version} in compatibility table",
about.__compatibility__,
exits=1,
)
all_models = set() all_models = set()
for spacy_v, models in dict(compat).items(): for spacy_v, models in dict(compat).items():
all_models.update(models.keys()) all_models.update(models.keys())
for model, model_vs in models.items(): for model, model_vs in models.items():
compat[spacy_v][model] = [reformat_version(v) for v in model_vs] compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
model_links = get_model_links(current_compat)
model_pkgs = get_model_pkgs(current_compat, all_models)
incompat_links = {l for l, d in model_links.items() if not d["compat"]}
incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
incompat_models.update(
[d["name"] for _, d in model_links.items() if not d["compat"]]
)
na_models = [m for m in incompat_models if m not in current_compat]
update_models = [m for m in incompat_models if m in current_compat]
spacy_dir = Path(__file__).parent.parent
msg.divider(f"Installed models (spaCy v{about.__version__})")
msg.info(f"spaCy installation: {spacy_dir}")
if model_links or model_pkgs:
header = ("TYPE", "NAME", "MODEL", "VERSION", "")
rows = []
for name, data in model_pkgs.items():
rows.append(get_model_row(current_compat, name, data, msg))
for name, data in model_links.items():
rows.append(get_model_row(current_compat, name, data, msg, "link"))
msg.table(rows, header=header)
else:
msg.text("No models found in your current environment.", exits=0)
if update_models:
msg.divider("Install updates")
msg.text("Use the following commands to update the model packages:")
cmd = "python -m spacy download {}"
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
if na_models:
msg.text(
f"The following models are not available for spaCy "
f"v{about.__version__}: {', '.join(na_models)}"
)
if incompat_links:
msg.text(
f"You may also want to overwrite the incompatible links using the "
f"`python -m spacy link` command with `--force`, or remove them "
f"from the data directory. "
f"Data path: {get_data_path()}"
)
if incompat_models or incompat_links:
sys.exit(1)
def get_model_links(compat):
links = {}
data_path = get_data_path()
if data_path:
models = [p for p in data_path.iterdir() if is_model_path(p)]
for model in models:
meta_path = Path(model) / "meta.json"
if not meta_path.exists():
continue
meta = srsly.read_json(meta_path)
link = model.parts[-1]
name = meta["lang"] + "_" + meta["name"]
links[link] = {
"name": name,
"version": meta["version"],
"compat": is_compat(compat, name, meta["version"]),
}
return links
def get_model_pkgs(compat, all_models):
import pkg_resources
pkgs = {} pkgs = {}
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items(): for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
package = pkg_name.replace("-", "_") package = pkg_name.replace("-", "_")
@ -113,29 +78,9 @@ def get_model_pkgs(compat, all_models):
pkgs[pkg_name] = { pkgs[pkg_name] = {
"name": package, "name": package,
"version": version, "version": version,
"compat": is_compat(compat, package, version), "compat": package in compat and version in compat[package],
} }
return pkgs return pkgs, compat
def get_model_row(compat, name, data, msg, model_type="package"):
if data["compat"]:
comp = msg.text("", color="green", icon="good", no_print=True)
version = msg.text(data["version"], color="green", no_print=True)
else:
version = msg.text(data["version"], color="red", no_print=True)
comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
return (model_type, name, data["name"], version, comp)
def is_model_path(model_path):
exclude = ["cache", "pycache", "__pycache__"]
name = model_path.parts[-1]
return model_path.is_dir() and name not in exclude and not name.startswith(".")
def is_compat(compat, name, version):
return name in compat and version in compat[name]
def reformat_version(version): def reformat_version(version):

View File

@ -5,7 +5,6 @@ e.g. `unicode_`.
DOCS: https://spacy.io/api/top-level#compat DOCS: https://spacy.io/api/top-level#compat
""" """
import os
import sys import sys
from thinc.util import copy_array from thinc.util import copy_array
@ -43,33 +42,6 @@ is_linux = sys.platform.startswith("linux")
is_osx = sys.platform == "darwin" is_osx = sys.platform == "darwin"
def symlink_to(orig, dest):
"""Create a symlink. Used for model shortcut links.
orig (unicode / Path): The origin path.
dest (unicode / Path): The destination path of the symlink.
"""
if is_windows:
import subprocess
subprocess.check_call(["mklink", "/d", str(orig), str(dest)], shell=True)
else:
orig.symlink_to(dest)
def symlink_remove(link):
"""Remove a symlink. Used for model shortcut links.
link (unicode / Path): The path to the symlink.
"""
# https://stackoverflow.com/q/26554135/6400719
if os.path.isdir(str(link)) and is_windows:
# this should only be on Py2.7 and windows
os.rmdir(str(link))
else:
os.unlink(str(link))
def is_config(windows=None, linux=None, osx=None, **kwargs): def is_config(windows=None, linux=None, osx=None, **kwargs):
"""Check if a specific configuration of Python version and operating system """Check if a specific configuration of Python version and operating system
matches the user's setup. Mostly used to display targeted error messages. matches the user's setup. Mostly used to display targeted error messages.

View File

View File

@ -224,13 +224,8 @@ class Errors(object):
E047 = ("Can't assign a value to unregistered extension attribute " E047 = ("Can't assign a value to unregistered extension attribute "
"'{name}'. Did you forget to call the `set_extension` method?") "'{name}'. Did you forget to call the `set_extension` method?")
E048 = ("Can't import language {lang} from spacy.lang: {err}") E048 = ("Can't import language {lang} from spacy.lang: {err}")
E049 = ("Can't find spaCy data directory: '{path}'. Check your " E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
"installation and permissions, or use spacy.util.set_data_path " "package or a valid path to a data directory.")
"to customise the location if necessary.")
E050 = ("Can't find model '{name}'. It doesn't seem to be a shortcut "
"link, a Python package or a valid path to a data directory.")
E051 = ("Cant' load '{name}'. If you're using a shortcut link, make sure "
"it points to a valid package (not just a data directory).")
E052 = ("Can't find model directory: {path}") E052 = ("Can't find model directory: {path}")
E053 = ("Could not read meta.json from {path}") E053 = ("Could not read meta.json from {path}")
E054 = ("No valid '{setting}' setting found in model meta.json.") E054 = ("No valid '{setting}' setting found in model meta.json.")

View File

@ -4,36 +4,8 @@ import ctypes
from pathlib import Path from pathlib import Path
from spacy import util from spacy import util
from spacy import prefer_gpu, require_gpu from spacy import prefer_gpu, require_gpu
from spacy.compat import symlink_to, symlink_remove, is_windows
from spacy.ml._layers import PrecomputableAffine from spacy.ml._layers import PrecomputableAffine
from spacy.ml._layers import _backprop_precomputable_affine_padding from spacy.ml._layers import _backprop_precomputable_affine_padding
from subprocess import CalledProcessError
@pytest.fixture
def symlink_target():
return Path("./foo-target")
@pytest.fixture
def symlink():
return Path("./foo-symlink")
@pytest.fixture(scope="function")
def symlink_setup_target(request, symlink_target, symlink):
if not symlink_target.exists():
os.mkdir(str(symlink_target))
# yield -- need to cleanup even if assertion fails
# https://github.com/pytest-dev/pytest/issues/2508#issuecomment-309934240
def cleanup():
# Remove symlink only if it was created
if symlink.exists():
symlink_remove(symlink)
os.rmdir(str(symlink_target))
request.addfinalizer(cleanup)
@pytest.fixture @pytest.fixture
@ -109,25 +81,6 @@ def test_require_gpu():
require_gpu() require_gpu()
def test_create_symlink_windows(
symlink_setup_target, symlink_target, symlink, is_admin
):
"""Test the creation of symlinks on windows. If run as admin or not on windows it should succeed, otherwise a CalledProcessError should be raised."""
assert symlink_target.exists()
if is_admin or not is_windows:
try:
symlink_to(symlink, symlink_target)
assert symlink.exists()
except CalledProcessError as e:
pytest.fail(e)
else:
with pytest.raises(CalledProcessError):
symlink_to(symlink, symlink_target)
assert not symlink.exists()
def test_ascii_filenames(): def test_ascii_filenames():
"""Test that all filenames in the project are ASCII. """Test that all filenames in the project are ASCII.
See: https://twitter.com/_inesmontani/status/1177941471632211968 See: https://twitter.com/_inesmontani/status/1177941471632211968

View File

@ -29,7 +29,6 @@ from .symbols import ORTH
from .compat import cupy, CudaStream from .compat import cupy, CudaStream
from .errors import Errors, Warnings, deprecation_warning, user_warning from .errors import Errors, Warnings, deprecation_warning, user_warning
_data_path = Path(__file__).parent / "data"
_PRINT_ENV = False _PRINT_ENV = False
@ -84,27 +83,6 @@ def set_lang_class(name, cls):
registry.languages.register(name, func=cls) registry.languages.register(name, func=cls)
def get_data_path(require_exists=True):
"""Get path to spaCy data directory.
require_exists (bool): Only return path if it exists, otherwise None.
RETURNS (Path or None): Data path or None.
"""
if not require_exists:
return _data_path
else:
return _data_path if _data_path.exists() else None
def set_data_path(path):
"""Set path to spaCy data directory.
path (unicode or Path): Path to new data directory.
"""
global _data_path
_data_path = ensure_path(path)
def make_layer(arch_config): def make_layer(arch_config):
arch_func = registry.architectures.get(arch_config["arch"]) arch_func = registry.architectures.get(arch_config["arch"])
return arch_func(arch_config["config"]) return arch_func(arch_config["config"])
@ -145,18 +123,13 @@ def get_module_path(module):
def load_model(name, **overrides): def load_model(name, **overrides):
"""Load a model from a shortcut link, package or data path. """Load a model from a package or data path.
name (unicode): Package name, shortcut link or model path. name (unicode): Package name or model path.
**overrides: Specific overrides, like pipeline components to disable. **overrides: Specific overrides, like pipeline components to disable.
RETURNS (Language): `Language` class with the loaded model. RETURNS (Language): `Language` class with the loaded model.
""" """
data_path = get_data_path() if isinstance(name, str): # name or string path
if not data_path or not data_path.exists():
raise IOError(Errors.E049.format(path=data_path))
if isinstance(name, str): # in data dir / shortcut
if name in set([d.name for d in data_path.iterdir()]):
return load_model_from_link(name, **overrides)
if is_package(name): # installed as package if is_package(name): # installed as package
return load_model_from_package(name, **overrides) return load_model_from_package(name, **overrides)
if Path(name).exists(): # path to model data directory if Path(name).exists(): # path to model data directory
@ -166,16 +139,6 @@ def load_model(name, **overrides):
raise IOError(Errors.E050.format(name=name)) raise IOError(Errors.E050.format(name=name))
def load_model_from_link(name, **overrides):
"""Load a model from a shortcut link, or directory in spaCy data path."""
path = get_data_path() / name / "__init__.py"
try:
cls = import_file(name, path)
except AttributeError:
raise IOError(Errors.E051.format(name=name))
return cls.load(**overrides)
def load_model_from_package(name, **overrides): def load_model_from_package(name, **overrides):
"""Load a model from an installed package.""" """Load a model from an installed package."""
cls = importlib.import_module(name) cls = importlib.import_module(name)
@ -797,5 +760,13 @@ def create_default_optimizer():
eps = env_opt("optimizer_eps", 1e-8) eps = env_opt("optimizer_eps", 1e-8)
L2 = env_opt("L2_penalty", 1e-6) L2 = env_opt("L2_penalty", 1e-6)
grad_clip = env_opt("grad_norm_clip", 1.0) grad_clip = env_opt("grad_norm_clip", 1.0)
optimizer = Adam(learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps, ops=ops, grad_clip=grad_clip) optimizer = Adam(
learn_rate,
L2=L2,
beta1=beta1,
beta2=beta2,
eps=eps,
ops=ops,
grad_clip=grad_clip,
)
return optimizer return optimizer