mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Better model compatibility and validation
This commit is contained in:
parent
25b51f4fc8
commit
6e6db6afb6
|
@ -13,6 +13,7 @@ numpy>=1.15.0
|
|||
requests>=2.13.0,<3.0.0
|
||||
plac>=0.9.6,<1.2.0
|
||||
tqdm>=4.38.0,<5.0.0
|
||||
importlib_metadata>=0.20; python_version < "3.8"
|
||||
# Optional dependencies
|
||||
jsonschema>=2.6.0,<3.1.0
|
||||
pydantic>=1.3.0,<2.0.0
|
||||
|
|
|
@ -56,6 +56,7 @@ install_requires =
|
|||
requests>=2.13.0,<3.0.0
|
||||
pydantic>=1.3.0,<2.0.0
|
||||
tqdm>=4.38.0,<5.0.0
|
||||
importlib_metadata>=0.20; python_version < "3.8"
|
||||
|
||||
[options.extras_require]
|
||||
lookups =
|
||||
|
|
|
@ -48,7 +48,9 @@ def info(
|
|||
"Location": str(Path(__file__).parent.parent),
|
||||
"Platform": platform.platform(),
|
||||
"Python version": platform.python_version(),
|
||||
"Models": ", ".join(model["name"] for model in all_models.values()),
|
||||
"Models": ", ".join(
|
||||
f"{m['name']} ({m['version']})" for m in all_models.values()
|
||||
),
|
||||
}
|
||||
if not silent:
|
||||
title = "Info about spaCy"
|
||||
|
|
|
@ -83,14 +83,14 @@ def generate_meta(model_path, existing_meta, msg):
|
|||
("lang", "Model language", meta.get("lang", "en")),
|
||||
("name", "Model name", meta.get("name", "model")),
|
||||
("version", "Model version", meta.get("version", "0.0.0")),
|
||||
("spacy_version", "Required spaCy version", f">={about.__version__},<3.0.0"),
|
||||
("description", "Model description", meta.get("description", False)),
|
||||
("author", "Author", meta.get("author", False)),
|
||||
("email", "Author email", meta.get("email", False)),
|
||||
("url", "Author website", meta.get("url", False)),
|
||||
("license", "License", meta.get("license", "CC BY-SA 3.0")),
|
||||
("license", "License", meta.get("license", "MIT")),
|
||||
]
|
||||
nlp = util.load_model_from_path(Path(model_path))
|
||||
meta["spacy_version"] = about.__version__
|
||||
meta["pipeline"] = nlp.pipe_names
|
||||
meta["vectors"] = {
|
||||
"width": nlp.vocab.vectors_length,
|
||||
|
@ -168,6 +168,7 @@ def setup_package():
|
|||
package_data={model_name: list_files(model_dir)},
|
||||
install_requires=list_requirements(meta),
|
||||
zip_safe=False,
|
||||
entry_points={'spacy_models': ['{m} = {m}'.format(m=model_name)]}
|
||||
)
|
||||
|
||||
|
||||
|
|
|
@ -467,7 +467,7 @@ def train(
|
|||
# Update model meta.json
|
||||
meta["lang"] = nlp.lang
|
||||
meta["pipeline"] = nlp.pipe_names
|
||||
meta["spacy_version"] = f">={about.__version__}"
|
||||
meta["spacy_version"] = about.__version__
|
||||
if beam_width == 1:
|
||||
meta["speed"] = {
|
||||
"nwords": nwords,
|
||||
|
|
|
@ -4,6 +4,8 @@ import requests
|
|||
from wasabi import msg
|
||||
|
||||
from .. import about
|
||||
from ..util import get_package_version, get_installed_models, split_version
|
||||
from ..util import get_package_path, get_model_meta, is_compatible_model
|
||||
|
||||
|
||||
def validate():
|
||||
|
@ -25,7 +27,7 @@ def validate():
|
|||
msg.info(f"spaCy installation: {spacy_dir}")
|
||||
|
||||
if model_pkgs:
|
||||
header = ("NAME", "VERSION", "")
|
||||
header = ("NAME", "SPACY", "VERSION", "")
|
||||
rows = []
|
||||
for name, data in model_pkgs.items():
|
||||
if data["compat"]:
|
||||
|
@ -34,7 +36,7 @@ def validate():
|
|||
else:
|
||||
version = msg.text(data["version"], color="red", no_print=True)
|
||||
comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
|
||||
rows.append((data["name"], version, comp))
|
||||
rows.append((data["name"], data["spacy"], version, comp))
|
||||
msg.table(rows, header=header)
|
||||
else:
|
||||
msg.text("No models found in your current environment.", exits=0)
|
||||
|
@ -44,8 +46,9 @@ def validate():
|
|||
cmd = "python -m spacy download {}"
|
||||
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
|
||||
if na_models:
|
||||
msg.warn(
|
||||
f"The following models are not available for spaCy v{about.__version__}:",
|
||||
msg.info(
|
||||
f"The following models are custom spaCy models or not "
|
||||
f"available for spaCy v{about.__version__}:",
|
||||
", ".join(na_models),
|
||||
)
|
||||
if incompat_models:
|
||||
|
@ -53,8 +56,6 @@ def validate():
|
|||
|
||||
|
||||
def get_model_pkgs():
|
||||
import pkg_resources
|
||||
|
||||
with msg.loading("Loading compatibility table..."):
|
||||
r = requests.get(about.__compatibility__)
|
||||
if r.status_code != 200:
|
||||
|
@ -66,19 +67,29 @@ def get_model_pkgs():
|
|||
msg.good("Loaded compatibility table")
|
||||
compat = r.json()["spacy"]
|
||||
all_models = set()
|
||||
installed_models = get_installed_models()
|
||||
for spacy_v, models in dict(compat).items():
|
||||
all_models.update(models.keys())
|
||||
for model, model_vs in models.items():
|
||||
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
|
||||
pkgs = {}
|
||||
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
|
||||
for pkg_name in installed_models:
|
||||
package = pkg_name.replace("-", "_")
|
||||
if package in all_models:
|
||||
version = pkg_data.version
|
||||
version = get_package_version(pkg_name)
|
||||
if package in compat:
|
||||
is_compat = version in compat[package]
|
||||
v_maj, v_min = split_version(about.__version__)
|
||||
spacy_version = f"{v_maj}.{v_min}"
|
||||
else:
|
||||
model_path = get_package_path(package)
|
||||
model_meta = get_model_meta(model_path)
|
||||
is_compat = is_compatible_model(model_meta)
|
||||
spacy_version = model_meta.get("spacy_version", "n/a")
|
||||
pkgs[pkg_name] = {
|
||||
"name": package,
|
||||
"version": version,
|
||||
"compat": package in compat and version in compat[package],
|
||||
"spacy": spacy_version,
|
||||
"compat": is_compat,
|
||||
}
|
||||
return pkgs, compat
|
||||
|
||||
|
|
|
@ -196,7 +196,7 @@ class Language(object):
|
|||
self._meta.setdefault("lang", self.lang)
|
||||
self._meta.setdefault("name", "model")
|
||||
self._meta.setdefault("version", "0.0.0")
|
||||
self._meta.setdefault("spacy_version", f">={about.__version__}")
|
||||
self._meta.setdefault("spacy_version", about.__version__)
|
||||
self._meta.setdefault("description", "")
|
||||
self._meta.setdefault("author", "")
|
||||
self._meta.setdefault("email", "")
|
||||
|
|
|
@ -21,9 +21,16 @@ try:
|
|||
except ImportError:
|
||||
cupy = None
|
||||
|
||||
try: # Python 3.8
|
||||
import importlib.metadata as importlib_metadata
|
||||
except ImportError:
|
||||
import importlib_metadata
|
||||
|
||||
from .symbols import ORTH
|
||||
from .compat import cupy, CudaStream
|
||||
from .errors import Errors, Warnings
|
||||
from . import about
|
||||
|
||||
|
||||
_PRINT_ENV = False
|
||||
|
||||
|
@ -35,6 +42,10 @@ class registry(thinc.registry):
|
|||
factories = catalogue.create("spacy", "factories", entry_points=True)
|
||||
displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
|
||||
assets = catalogue.create("spacy", "assets", entry_points=True)
|
||||
# This is mostly used to get a list of all installed models in the current
|
||||
# environment. spaCy models packaged with `spacy package` will "advertise"
|
||||
# themselves via entry points.
|
||||
models = catalogue.create("spacy", "models", entry_points=True)
|
||||
|
||||
|
||||
def set_env_log(value):
|
||||
|
@ -204,6 +215,56 @@ def load_model_from_init_py(init_file, **overrides):
|
|||
return load_model_from_path(data_path, meta, **overrides)
|
||||
|
||||
|
||||
def get_installed_models():
    """Collect the names of every model package installed in the current
    environment, as registered in the "models" registry.

    RETURNS (list): The string names of the models.
    """
    installed = registry.models.get_all()
    return [model_name for model_name in installed.keys()]
|
||||
|
||||
|
||||
def get_package_version(name):
    """Look up the version of an installed Python package. Mostly used to
    find out which version of a model package is installed.

    name (unicode): The name of the installed Python package.
    RETURNS (unicode / None): The version or None if package not installed.
    """
    try:
        version = importlib_metadata.version(name)
    except importlib_metadata.PackageNotFoundError:
        # No distribution with that name in the environment.
        version = None
    return version
|
||||
|
||||
|
||||
def split_version(version):
    """Extract the major and minor components of a dotted version string.

    version (unicode): The version string, e.g. "2.3.0".
    RETURNS (tuple): Two integers, the major and minor spaCy version.
    """
    components = version.split(".", 3)
    major = int(components[0])
    minor = int(components[1])
    return (major, minor)
|
||||
|
||||
|
||||
def is_compatible_model(meta):
    """Check if a model is compatible with the current version of spaCy, based
    on its meta.json. The major and minor version of the spaCy release recorded
    in the meta are compared with those of the running spaCy. If either one
    differs, the model is considered incompatible.

    meta (dict): The model's meta.
    RETURNS (bool / None): Whether the model is compatible with the current
        spaCy or None if we don't have enough info (the "spacy_version" entry
        is missing, not a string, or can't be parsed into major.minor).
    """
    pkg_v = meta.get("spacy_version")
    if not pkg_v or not isinstance(pkg_v, str):
        return None
    # Handle spacy_version values like >=x,<y, just in case: keep only the
    # first constraint and strip everything that isn't a digit or a dot.
    pkg_v = re.sub(r"[^0-9.]", "", pkg_v.split(",")[0])
    try:
        pkg_major, pkg_minor = split_version(pkg_v)
    except (ValueError, IndexError):
        # Values such as "n/a" (empty after stripping) or "3" (no minor part)
        # don't tell us anything, so report "unknown" instead of crashing.
        return None
    cur_major, cur_minor = split_version(about.__version__)
    return (cur_major, cur_minor) == (pkg_major, pkg_minor)
|
||||
|
||||
|
||||
def load_config(path, create_objects=False):
|
||||
"""Load a Thinc-formatted config file, optionally filling in objects where
|
||||
the config references registry entries. See "Thinc config files" for details.
|
||||
|
|
Loading…
Reference in New Issue
Block a user