Better model compatibility and validation

This commit is contained in:
Ines Montani 2020-05-22 15:42:46 +02:00
parent 25b51f4fc8
commit 6e6db6afb6
8 changed files with 96 additions and 19 deletions

View File

@ -13,6 +13,7 @@ numpy>=1.15.0
requests>=2.13.0,<3.0.0
plac>=0.9.6,<1.2.0
tqdm>=4.38.0,<5.0.0
importlib_metadata>=0.20; python_version < "3.8"
# Optional dependencies
jsonschema>=2.6.0,<3.1.0
pydantic>=1.3.0,<2.0.0

View File

@ -56,6 +56,7 @@ install_requires =
requests>=2.13.0,<3.0.0
pydantic>=1.3.0,<2.0.0
tqdm>=4.38.0,<5.0.0
importlib_metadata>=0.20; python_version < "3.8"
[options.extras_require]
lookups =

View File

@ -48,7 +48,9 @@ def info(
"Location": str(Path(__file__).parent.parent),
"Platform": platform.platform(),
"Python version": platform.python_version(),
"Models": ", ".join(model["name"] for model in all_models.values()),
"Models": ", ".join(
f"{m['name']} ({m['version']})" for m in all_models.values()
),
}
if not silent:
title = "Info about spaCy"

View File

@ -83,14 +83,14 @@ def generate_meta(model_path, existing_meta, msg):
("lang", "Model language", meta.get("lang", "en")),
("name", "Model name", meta.get("name", "model")),
("version", "Model version", meta.get("version", "0.0.0")),
("spacy_version", "Required spaCy version", f">={about.__version__},<3.0.0"),
("description", "Model description", meta.get("description", False)),
("author", "Author", meta.get("author", False)),
("email", "Author email", meta.get("email", False)),
("url", "Author website", meta.get("url", False)),
("license", "License", meta.get("license", "CC BY-SA 3.0")),
("license", "License", meta.get("license", "MIT")),
]
nlp = util.load_model_from_path(Path(model_path))
meta["spacy_version"] = about.__version__
meta["pipeline"] = nlp.pipe_names
meta["vectors"] = {
"width": nlp.vocab.vectors_length,
@ -168,6 +168,7 @@ def setup_package():
package_data={model_name: list_files(model_dir)},
install_requires=list_requirements(meta),
zip_safe=False,
entry_points={'spacy_models': ['{m} = {m}'.format(m=model_name)]}
)

View File

@ -467,7 +467,7 @@ def train(
# Update model meta.json
meta["lang"] = nlp.lang
meta["pipeline"] = nlp.pipe_names
meta["spacy_version"] = f">={about.__version__}"
meta["spacy_version"] = about.__version__
if beam_width == 1:
meta["speed"] = {
"nwords": nwords,

View File

@ -4,6 +4,8 @@ import requests
from wasabi import msg
from .. import about
from ..util import get_package_version, get_installed_models, split_version
from ..util import get_package_path, get_model_meta, is_compatible_model
def validate():
@ -25,7 +27,7 @@ def validate():
msg.info(f"spaCy installation: {spacy_dir}")
if model_pkgs:
header = ("NAME", "VERSION", "")
header = ("NAME", "SPACY", "VERSION", "")
rows = []
for name, data in model_pkgs.items():
if data["compat"]:
@ -34,7 +36,7 @@ def validate():
else:
version = msg.text(data["version"], color="red", no_print=True)
comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
rows.append((data["name"], version, comp))
rows.append((data["name"], data["spacy"], version, comp))
msg.table(rows, header=header)
else:
msg.text("No models found in your current environment.", exits=0)
@ -44,8 +46,9 @@ def validate():
cmd = "python -m spacy download {}"
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
if na_models:
msg.warn(
f"The following models are not available for spaCy v{about.__version__}:",
msg.info(
f"The following models are custom spaCy models or not "
f"available for spaCy v{about.__version__}:",
", ".join(na_models),
)
if incompat_models:
@ -53,8 +56,6 @@ def validate():
def get_model_pkgs():
import pkg_resources
with msg.loading("Loading compatibility table..."):
r = requests.get(about.__compatibility__)
if r.status_code != 200:
@ -66,20 +67,30 @@ def get_model_pkgs():
msg.good("Loaded compatibility table")
compat = r.json()["spacy"]
all_models = set()
installed_models = get_installed_models()
for spacy_v, models in dict(compat).items():
all_models.update(models.keys())
for model, model_vs in models.items():
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
pkgs = {}
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
for pkg_name in installed_models:
package = pkg_name.replace("-", "_")
if package in all_models:
version = pkg_data.version
pkgs[pkg_name] = {
"name": package,
"version": version,
"compat": package in compat and version in compat[package],
}
version = get_package_version(pkg_name)
if package in compat:
is_compat = version in compat[package]
v_maj, v_min = split_version(about.__version__)
spacy_version = f"{v_maj}.{v_min}"
else:
model_path = get_package_path(package)
model_meta = get_model_meta(model_path)
is_compat = is_compatible_model(model_meta)
spacy_version = model_meta.get("spacy_version", "n/a")
pkgs[pkg_name] = {
"name": package,
"version": version,
"spacy": spacy_version,
"compat": is_compat,
}
return pkgs, compat

View File

@ -196,7 +196,7 @@ class Language(object):
self._meta.setdefault("lang", self.lang)
self._meta.setdefault("name", "model")
self._meta.setdefault("version", "0.0.0")
self._meta.setdefault("spacy_version", f">={about.__version__}")
self._meta.setdefault("spacy_version", about.__version__)
self._meta.setdefault("description", "")
self._meta.setdefault("author", "")
self._meta.setdefault("email", "")

View File

@ -21,9 +21,16 @@ try:
except ImportError:
cupy = None
try: # Python 3.8
import importlib.metadata as importlib_metadata
except ImportError:
import importlib_metadata
from .symbols import ORTH
from .compat import cupy, CudaStream
from .errors import Errors, Warnings
from . import about
_PRINT_ENV = False
@ -35,6 +42,10 @@ class registry(thinc.registry):
factories = catalogue.create("spacy", "factories", entry_points=True)
displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
assets = catalogue.create("spacy", "assets", entry_points=True)
# This is mostly used to get a list of all installed models in the current
# environment. spaCy models packaged with `spacy package` will "advertise"
# themselves via entry points.
models = catalogue.create("spacy", "models", entry_points=True)
def set_env_log(value):
@ -204,6 +215,56 @@ def load_model_from_init_py(init_file, **overrides):
return load_model_from_path(data_path, meta, **overrides)
def get_installed_models():
    """Return the names of all model packages found in the `models` registry.

    RETURNS (list): The string names of the models.
    """
    installed = registry.models.get_all()
    return list(installed.keys())
def get_package_version(name):
    """Look up the version of an installed package, e.g. a model package.

    name (unicode): The name of the installed Python package.
    RETURNS (unicode / None): The version or None if package not installed.
    """
    try:
        version = importlib_metadata.version(name)
    except importlib_metadata.PackageNotFoundError:
        # Not installed: signal this with None instead of raising
        version = None
    return version
def split_version(version):
    """Extract the major and minor components of a dot-separated version.

    version (unicode): The version string, e.g. "2.3.0".
    RETURNS (tuple): Two integers, the major and minor spaCy version.
    """
    parts = version.split(".", 3)
    # Only the first two components are used; anything after them is ignored
    major = int(parts[0])
    minor = int(parts[1])
    return major, minor
def is_compatible_model(meta):
    """Check if a model is compatible with the current version of spaCy, based
    on its meta.json. We compare the version of spaCy the model was created
    with with the current version. If the major or minor version is different,
    it's considered incompatible.

    meta (dict): The model's meta.
    RETURNS (bool / None): Whether the model is compatible with the current
        spaCy or None if we don't have enough info (no "spacy_version" entry,
        a non-string value, or a value no major.minor can be parsed from).
    """
    cur_v = about.__version__
    pkg_v = meta.get("spacy_version")
    if not pkg_v or not isinstance(pkg_v, str):
        return None
    # Handle spacy_version values like >=x,<y, just in case: keep only the
    # first constraint and strip everything that isn't a digit or a dot
    pkg_v = re.sub(r"[^0-9.]", "", pkg_v.split(",")[0])
    cur_major, cur_minor = split_version(cur_v)
    try:
        pkg_major, pkg_minor = split_version(pkg_v)
    except (ValueError, IndexError):
        # Values like "latest" or ">=2" leave nothing usable after stripping.
        # Report "unknown" instead of crashing the caller.
        return None
    return cur_major == pkg_major and cur_minor == pkg_minor
def load_config(path, create_objects=False):
"""Load a Thinc-formatted config file, optionally filling in objects where
the config references registry entries. See "Thinc config files" for details.