mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Better model compatibility and validation
This commit is contained in:
parent
25b51f4fc8
commit
6e6db6afb6
|
@ -13,6 +13,7 @@ numpy>=1.15.0
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
plac>=0.9.6,<1.2.0
|
plac>=0.9.6,<1.2.0
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
|
importlib_metadata>=0.20; python_version < "3.8"
|
||||||
# Optional dependencies
|
# Optional dependencies
|
||||||
jsonschema>=2.6.0,<3.1.0
|
jsonschema>=2.6.0,<3.1.0
|
||||||
pydantic>=1.3.0,<2.0.0
|
pydantic>=1.3.0,<2.0.0
|
||||||
|
|
|
@ -56,6 +56,7 @@ install_requires =
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
pydantic>=1.3.0,<2.0.0
|
pydantic>=1.3.0,<2.0.0
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
|
importlib_metadata>=0.20; python_version < "3.8"
|
||||||
|
|
||||||
[options.extras_require]
|
[options.extras_require]
|
||||||
lookups =
|
lookups =
|
||||||
|
|
|
@ -48,7 +48,9 @@ def info(
|
||||||
"Location": str(Path(__file__).parent.parent),
|
"Location": str(Path(__file__).parent.parent),
|
||||||
"Platform": platform.platform(),
|
"Platform": platform.platform(),
|
||||||
"Python version": platform.python_version(),
|
"Python version": platform.python_version(),
|
||||||
"Models": ", ".join(model["name"] for model in all_models.values()),
|
"Models": ", ".join(
|
||||||
|
f"{m['name']} ({m['version']})" for m in all_models.values()
|
||||||
|
),
|
||||||
}
|
}
|
||||||
if not silent:
|
if not silent:
|
||||||
title = "Info about spaCy"
|
title = "Info about spaCy"
|
||||||
|
|
|
@ -83,14 +83,14 @@ def generate_meta(model_path, existing_meta, msg):
|
||||||
("lang", "Model language", meta.get("lang", "en")),
|
("lang", "Model language", meta.get("lang", "en")),
|
||||||
("name", "Model name", meta.get("name", "model")),
|
("name", "Model name", meta.get("name", "model")),
|
||||||
("version", "Model version", meta.get("version", "0.0.0")),
|
("version", "Model version", meta.get("version", "0.0.0")),
|
||||||
("spacy_version", "Required spaCy version", f">={about.__version__},<3.0.0"),
|
|
||||||
("description", "Model description", meta.get("description", False)),
|
("description", "Model description", meta.get("description", False)),
|
||||||
("author", "Author", meta.get("author", False)),
|
("author", "Author", meta.get("author", False)),
|
||||||
("email", "Author email", meta.get("email", False)),
|
("email", "Author email", meta.get("email", False)),
|
||||||
("url", "Author website", meta.get("url", False)),
|
("url", "Author website", meta.get("url", False)),
|
||||||
("license", "License", meta.get("license", "CC BY-SA 3.0")),
|
("license", "License", meta.get("license", "MIT")),
|
||||||
]
|
]
|
||||||
nlp = util.load_model_from_path(Path(model_path))
|
nlp = util.load_model_from_path(Path(model_path))
|
||||||
|
meta["spacy_version"] = about.__version__
|
||||||
meta["pipeline"] = nlp.pipe_names
|
meta["pipeline"] = nlp.pipe_names
|
||||||
meta["vectors"] = {
|
meta["vectors"] = {
|
||||||
"width": nlp.vocab.vectors_length,
|
"width": nlp.vocab.vectors_length,
|
||||||
|
@ -168,6 +168,7 @@ def setup_package():
|
||||||
package_data={model_name: list_files(model_dir)},
|
package_data={model_name: list_files(model_dir)},
|
||||||
install_requires=list_requirements(meta),
|
install_requires=list_requirements(meta),
|
||||||
zip_safe=False,
|
zip_safe=False,
|
||||||
|
entry_points={'spacy_models': ['{m} = {m}'.format(m=model_name)]}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -467,7 +467,7 @@ def train(
|
||||||
# Update model meta.json
|
# Update model meta.json
|
||||||
meta["lang"] = nlp.lang
|
meta["lang"] = nlp.lang
|
||||||
meta["pipeline"] = nlp.pipe_names
|
meta["pipeline"] = nlp.pipe_names
|
||||||
meta["spacy_version"] = f">={about.__version__}"
|
meta["spacy_version"] = about.__version__
|
||||||
if beam_width == 1:
|
if beam_width == 1:
|
||||||
meta["speed"] = {
|
meta["speed"] = {
|
||||||
"nwords": nwords,
|
"nwords": nwords,
|
||||||
|
|
|
@ -4,6 +4,8 @@ import requests
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
|
||||||
from .. import about
|
from .. import about
|
||||||
|
from ..util import get_package_version, get_installed_models, split_version
|
||||||
|
from ..util import get_package_path, get_model_meta, is_compatible_model
|
||||||
|
|
||||||
|
|
||||||
def validate():
|
def validate():
|
||||||
|
@ -25,7 +27,7 @@ def validate():
|
||||||
msg.info(f"spaCy installation: {spacy_dir}")
|
msg.info(f"spaCy installation: {spacy_dir}")
|
||||||
|
|
||||||
if model_pkgs:
|
if model_pkgs:
|
||||||
header = ("NAME", "VERSION", "")
|
header = ("NAME", "SPACY", "VERSION", "")
|
||||||
rows = []
|
rows = []
|
||||||
for name, data in model_pkgs.items():
|
for name, data in model_pkgs.items():
|
||||||
if data["compat"]:
|
if data["compat"]:
|
||||||
|
@ -34,7 +36,7 @@ def validate():
|
||||||
else:
|
else:
|
||||||
version = msg.text(data["version"], color="red", no_print=True)
|
version = msg.text(data["version"], color="red", no_print=True)
|
||||||
comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
|
comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
|
||||||
rows.append((data["name"], version, comp))
|
rows.append((data["name"], data["spacy"], version, comp))
|
||||||
msg.table(rows, header=header)
|
msg.table(rows, header=header)
|
||||||
else:
|
else:
|
||||||
msg.text("No models found in your current environment.", exits=0)
|
msg.text("No models found in your current environment.", exits=0)
|
||||||
|
@ -44,8 +46,9 @@ def validate():
|
||||||
cmd = "python -m spacy download {}"
|
cmd = "python -m spacy download {}"
|
||||||
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
|
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
|
||||||
if na_models:
|
if na_models:
|
||||||
msg.warn(
|
msg.info(
|
||||||
f"The following models are not available for spaCy v{about.__version__}:",
|
f"The following models are custom spaCy models or not "
|
||||||
|
f"available for spaCy v{about.__version__}:",
|
||||||
", ".join(na_models),
|
", ".join(na_models),
|
||||||
)
|
)
|
||||||
if incompat_models:
|
if incompat_models:
|
||||||
|
@ -53,8 +56,6 @@ def validate():
|
||||||
|
|
||||||
|
|
||||||
def get_model_pkgs():
|
def get_model_pkgs():
|
||||||
import pkg_resources
|
|
||||||
|
|
||||||
with msg.loading("Loading compatibility table..."):
|
with msg.loading("Loading compatibility table..."):
|
||||||
r = requests.get(about.__compatibility__)
|
r = requests.get(about.__compatibility__)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
|
@ -66,20 +67,30 @@ def get_model_pkgs():
|
||||||
msg.good("Loaded compatibility table")
|
msg.good("Loaded compatibility table")
|
||||||
compat = r.json()["spacy"]
|
compat = r.json()["spacy"]
|
||||||
all_models = set()
|
all_models = set()
|
||||||
|
installed_models = get_installed_models()
|
||||||
for spacy_v, models in dict(compat).items():
|
for spacy_v, models in dict(compat).items():
|
||||||
all_models.update(models.keys())
|
all_models.update(models.keys())
|
||||||
for model, model_vs in models.items():
|
for model, model_vs in models.items():
|
||||||
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
|
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
|
||||||
pkgs = {}
|
pkgs = {}
|
||||||
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
|
for pkg_name in installed_models:
|
||||||
package = pkg_name.replace("-", "_")
|
package = pkg_name.replace("-", "_")
|
||||||
if package in all_models:
|
version = get_package_version(pkg_name)
|
||||||
version = pkg_data.version
|
if package in compat:
|
||||||
pkgs[pkg_name] = {
|
is_compat = version in compat[package]
|
||||||
"name": package,
|
v_maj, v_min = split_version(about.__version__)
|
||||||
"version": version,
|
spacy_version = f"{v_maj}.{v_min}"
|
||||||
"compat": package in compat and version in compat[package],
|
else:
|
||||||
}
|
model_path = get_package_path(package)
|
||||||
|
model_meta = get_model_meta(model_path)
|
||||||
|
is_compat = is_compatible_model(model_meta)
|
||||||
|
spacy_version = model_meta.get("spacy_version", "n/a")
|
||||||
|
pkgs[pkg_name] = {
|
||||||
|
"name": package,
|
||||||
|
"version": version,
|
||||||
|
"spacy": spacy_version,
|
||||||
|
"compat": is_compat,
|
||||||
|
}
|
||||||
return pkgs, compat
|
return pkgs, compat
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -196,7 +196,7 @@ class Language(object):
|
||||||
self._meta.setdefault("lang", self.lang)
|
self._meta.setdefault("lang", self.lang)
|
||||||
self._meta.setdefault("name", "model")
|
self._meta.setdefault("name", "model")
|
||||||
self._meta.setdefault("version", "0.0.0")
|
self._meta.setdefault("version", "0.0.0")
|
||||||
self._meta.setdefault("spacy_version", f">={about.__version__}")
|
self._meta.setdefault("spacy_version", about.__version__)
|
||||||
self._meta.setdefault("description", "")
|
self._meta.setdefault("description", "")
|
||||||
self._meta.setdefault("author", "")
|
self._meta.setdefault("author", "")
|
||||||
self._meta.setdefault("email", "")
|
self._meta.setdefault("email", "")
|
||||||
|
|
|
@ -21,9 +21,16 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
cupy = None
|
cupy = None
|
||||||
|
|
||||||
|
try: # Python 3.8
|
||||||
|
import importlib.metadata as importlib_metadata
|
||||||
|
except ImportError:
|
||||||
|
import importlib_metadata
|
||||||
|
|
||||||
from .symbols import ORTH
|
from .symbols import ORTH
|
||||||
from .compat import cupy, CudaStream
|
from .compat import cupy, CudaStream
|
||||||
from .errors import Errors, Warnings
|
from .errors import Errors, Warnings
|
||||||
|
from . import about
|
||||||
|
|
||||||
|
|
||||||
_PRINT_ENV = False
|
_PRINT_ENV = False
|
||||||
|
|
||||||
|
@ -35,6 +42,10 @@ class registry(thinc.registry):
|
||||||
factories = catalogue.create("spacy", "factories", entry_points=True)
|
factories = catalogue.create("spacy", "factories", entry_points=True)
|
||||||
displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
|
displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
|
||||||
assets = catalogue.create("spacy", "assets", entry_points=True)
|
assets = catalogue.create("spacy", "assets", entry_points=True)
|
||||||
|
# This is mostly used to get a list of all installed models in the current
|
||||||
|
# environment. spaCy models packaged with `spacy package` will "advertise"
|
||||||
|
# themselves via entry points.
|
||||||
|
models = catalogue.create("spacy", "models", entry_points=True)
|
||||||
|
|
||||||
|
|
||||||
def set_env_log(value):
|
def set_env_log(value):
|
||||||
|
@ -204,6 +215,56 @@ def load_model_from_init_py(init_file, **overrides):
|
||||||
return load_model_from_path(data_path, meta, **overrides)
|
return load_model_from_path(data_path, meta, **overrides)
|
||||||
|
|
||||||
|
|
||||||
|
def get_installed_models():
    """Collect the names of all model packages installed in the environment.

    The names are read from the "models" registry, i.e. whatever has been
    registered there (including via entry points).

    RETURNS (list): The string names of the models.
    """
    registered = registry.models.get_all()
    return list(registered.keys())
|
||||||
|
|
||||||
|
|
||||||
|
def get_package_version(name):
    """Get the version of an installed package. Typically used to get model
    package versions.

    name (unicode): The name of the installed Python package.
    RETURNS (unicode / None): The version or None if package not installed
        (also None if no usable name was given).
    """
    # An empty or missing name can never identify an installed package.
    # Depending on the importlib.metadata release, passing it through may
    # raise ValueError or match an unrelated distribution, so short-circuit
    # before the lookup and report "not installed".
    if not name:
        return None
    try:
        return importlib_metadata.version(name)
    except importlib_metadata.PackageNotFoundError:
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def split_version(version):
    """Split a version string into its major and minor components.

    A version without a minor component ("3") is treated as "3.0", and a
    pre-release tag attached directly to the minor component ("2.3a1") is
    ignored.

    version (unicode): The version string, e.g. "2.3.0" or "3.0.0.dev0".
    RETURNS (tuple): Two integers, the major and minor spaCy version.
    """
    # Only the first two components are needed; the remainder stays unsplit.
    pieces = version.split(".", 2)
    major = int(pieces[0])
    if len(pieces) < 2:
        # Bare major version: there is no minor component to parse.
        return major, 0
    try:
        return major, int(pieces[1])
    except ValueError:
        # Fall back to the leading digits so suffixes such as "a1" / "rc1"
        # glued onto the minor component don't make the whole parse fail.
        minor = pieces[1].lstrip()
        end = 0
        while end < len(minor) and minor[end] in "0123456789":
            end += 1
        if not end:
            # Nothing numeric at all: surface the original ValueError.
            raise
        return major, int(minor[:end])
|
||||||
|
|
||||||
|
|
||||||
|
def is_compatible_model(meta):
    """Check if a model is compatible with the current version of spaCy, based
    on its meta.json. We compare the version of spaCy the model was created
    with against the current version. If the major or minor version is
    different, it's considered incompatible.

    meta (dict): The model's meta.
    RETURNS (bool / None): Whether the model is compatible with the current
        spaCy or None if we don't have enough info (no "spacy_version" entry,
        a non-string value, or a value without a major.minor version in it).
    """
    pkg_v = meta.get("spacy_version")
    if not pkg_v or not isinstance(pkg_v, str):
        return None
    # Handle spacy_version values like >=x,<y, just in case: only the first
    # constraint is looked at. The major.minor pair is extracted directly
    # rather than by deleting all non-digit characters, which would crash on
    # values without digits ("n/a") and glue suffixes onto the number
    # ("2.3a1" -> "2.31").
    found = re.search(r"(\d+)\.(\d+)", pkg_v.split(",")[0])
    if found is None:
        return None
    pkg_major, pkg_minor = int(found.group(1)), int(found.group(2))
    cur_major, cur_minor = split_version(about.__version__)
    return cur_major == pkg_major and cur_minor == pkg_minor
|
||||||
|
|
||||||
|
|
||||||
def load_config(path, create_objects=False):
|
def load_config(path, create_objects=False):
|
||||||
"""Load a Thinc-formatted config file, optionally filling in objects where
|
"""Load a Thinc-formatted config file, optionally filling in objects where
|
||||||
the config references registry entries. See "Thinc config files" for details.
|
the config references registry entries. See "Thinc config files" for details.
|
||||||
|
|
Loading…
Reference in New Issue
Block a user