Auto-detect package dependencies in spacy package (#8948)

* Auto-detect package dependencies in spacy package

* Add simple get_third_party_dependencies test

* Import packages_distributions explicitly

* Inline packages_distributions

* Fix docstring [ci skip]

* Relax catalogue requirement

* Move importlib_metadata to spacy.compat with note

* Include license information [ci skip]
This commit is contained in:
Ines Montani 2021-08-17 22:05:13 +10:00 committed by GitHub
parent 0a6b68848f
commit d94ddd5686
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 130 additions and 15 deletions

View File

@ -104,3 +104,26 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE. SOFTWARE.
importlib_metadata
------------------
* Files: util.py
The implementation of packages_distributions() is adapted from
importlib_metadata, which is distributed under the following license:
Copyright 2017-2019 Jason R. Coombs, Barry Warsaw
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -2,6 +2,8 @@ from typing import Optional, Union, Any, Dict, List, Tuple
import shutil import shutil
from pathlib import Path from pathlib import Path
from wasabi import Printer, MarkdownRenderer, get_raw_input from wasabi import Printer, MarkdownRenderer, get_raw_input
from thinc.api import Config
from collections import defaultdict
import srsly import srsly
import sys import sys
@ -99,6 +101,12 @@ def package(
msg.fail("Can't load pipeline meta.json", meta_path, exits=1) msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
meta = srsly.read_json(meta_path) meta = srsly.read_json(meta_path)
meta = get_meta(input_dir, meta) meta = get_meta(input_dir, meta)
if meta["requirements"]:
msg.good(
f"Including {len(meta['requirements'])} package requirement(s) from "
f"meta and config",
", ".join(meta["requirements"]),
)
if name is not None: if name is not None:
meta["name"] = name meta["name"] = name
if version is not None: if version is not None:
@ -175,6 +183,51 @@ def has_wheel() -> bool:
return False return False
def get_third_party_dependencies(
    config: Config, exclude: List[str] = util.SimpleFrozenList()
) -> List[str]:
    """If the config includes references to registered functions that are
    provided by third-party packages (spacy-transformers, other libraries), we
    want to include them in meta["requirements"] so that the package specifies
    them as dependencies and the user won't have to do it manually.

    We do this by:
    - traversing the config to check for registered functions (@ keys)
    - looking up the functions and getting their module
    - looking up the module version and generating an appropriate version range

    config (Config): The pipeline config.
    exclude (list): List of packages to exclude (e.g. that already exist in meta).
    RETURNS (list): The versioned requirements, ordered by top-level module
        name and without duplicates.
    """
    own_packages = ("spacy", "spacy-nightly", "thinc", "srsly")
    distributions = util.packages_distributions()
    # Collect all function references, grouped by registry name: a key like
    # "@foo" contributes its value to funcs["foo"].
    funcs = defaultdict(set)
    for path, value in util.walk_dict(config):
        if path[-1].startswith("@"):
            funcs[path[-1][1:]].add(value)
    # Resolve every referenced function to the top-level module it lives in.
    modules = set()
    for reg_name, func_names in funcs.items():
        sub_registry = getattr(util.registry, reg_name)
        for func_name in func_names:
            func_info = sub_registry.find(func_name)
            module_name = func_info.get("module")
            if module_name:  # the code is part of a module, not a --code file
                modules.add(module_name.split(".")[0])
    dependencies = []
    seen_packages = set()
    # Sets have no guaranteed iteration order, so sort the module names to
    # make the generated requirements identical from one run to the next.
    for module_name in sorted(modules):
        dist = distributions.get(module_name)
        if not dist:
            continue
        pkg = dist[0]
        if pkg in own_packages or pkg in exclude:
            continue
        # Several top-level modules can map to the same distribution: only
        # list that distribution once.
        if pkg in seen_packages:
            continue
        seen_packages.add(pkg)
        version = util.get_package_version(pkg)
        version_range = util.get_minor_version_range(version)
        dependencies.append(f"{pkg}{version_range}")
    return dependencies
def get_build_formats(formats: List[str]) -> Tuple[bool, bool]: def get_build_formats(formats: List[str]) -> Tuple[bool, bool]:
supported = ["sdist", "wheel", "none"] supported = ["sdist", "wheel", "none"]
for form in formats: for form in formats:
@ -208,7 +261,7 @@ def get_meta(
nlp = util.load_model_from_path(Path(model_path)) nlp = util.load_model_from_path(Path(model_path))
meta.update(nlp.meta) meta.update(nlp.meta)
meta.update(existing_meta) meta.update(existing_meta)
meta["spacy_version"] = util.get_model_version_range(about.__version__) meta["spacy_version"] = util.get_minor_version_range(about.__version__)
meta["vectors"] = { meta["vectors"] = {
"width": nlp.vocab.vectors_length, "width": nlp.vocab.vectors_length,
"vectors": len(nlp.vocab.vectors), "vectors": len(nlp.vocab.vectors),
@ -217,6 +270,11 @@ def get_meta(
} }
if about.__title__ != "spacy": if about.__title__ != "spacy":
meta["parent_package"] = about.__title__ meta["parent_package"] = about.__title__
meta.setdefault("requirements", [])
# Update the requirements with all third-party packages in the config
existing_reqs = [util.split_requirement(req)[0] for req in meta["requirements"]]
reqs = get_third_party_dependencies(nlp.config, exclude=existing_reqs)
meta["requirements"].extend(reqs)
return meta return meta

View File

@ -27,6 +27,14 @@ try: # Python 3.8+
except ImportError: except ImportError:
from typing_extensions import Literal # noqa: F401 from typing_extensions import Literal # noqa: F401
# Important note: The importlib_metadata "backport" includes functionality
# that's not part of the built-in importlib.metadata. We should treat this
# import like the built-in and only use what's available there.
try: # Python 3.8+
import importlib.metadata as importlib_metadata
except ImportError:
from catalogue import _importlib_metadata as importlib_metadata # noqa: F401
from thinc.api import Optimizer # noqa: F401 from thinc.api import Optimizer # noqa: F401
pickle = pickle pickle = pickle

View File

@ -199,7 +199,7 @@ class Language:
DOCS: https://spacy.io/api/language#meta DOCS: https://spacy.io/api/language#meta
""" """
spacy_version = util.get_model_version_range(about.__version__) spacy_version = util.get_minor_version_range(about.__version__)
if self.vocab.lang: if self.vocab.lang:
self._meta.setdefault("lang", self.vocab.lang) self._meta.setdefault("lang", self.vocab.lang)
else: else:

View File

@ -14,6 +14,7 @@ from spacy import about
from spacy.util import get_minor_version from spacy.util import get_minor_version
from spacy.cli.validate import get_model_pkgs from spacy.cli.validate import get_model_pkgs
from spacy.cli.download import get_compatibility, get_version from spacy.cli.download import get_compatibility, get_version
from spacy.cli.package import get_third_party_dependencies
from thinc.api import ConfigValidationError, Config from thinc.api import ConfigValidationError, Config
import srsly import srsly
import os import os
@ -532,3 +533,10 @@ def test_init_labels(component_name):
assert len(nlp2.get_pipe(component_name).labels) == 0 assert len(nlp2.get_pipe(component_name).labels) == 0
nlp2.initialize() nlp2.initialize()
assert len(nlp2.get_pipe(component_name).labels) == 4 assert len(nlp2.get_pipe(component_name).labels) == 4
def test_get_third_party_dependencies_runs():
    """Smoke test for get_third_party_dependencies on a blank Dutch pipeline."""
    # Detecting real third-party packages is hard to set up here, so this
    # only verifies that the function (and the importlib-based lookup it
    # relies on) executes and yields no requirements for a blank pipeline.
    config = Dutch().config
    detected = get_third_party_dependencies(config)
    assert detected == []

View File

@ -20,8 +20,10 @@ import sys
import warnings import warnings
from packaging.specifiers import SpecifierSet, InvalidSpecifier from packaging.specifiers import SpecifierSet, InvalidSpecifier
from packaging.version import Version, InvalidVersion from packaging.version import Version, InvalidVersion
from packaging.requirements import Requirement
import subprocess import subprocess
from contextlib import contextmanager from contextlib import contextmanager
from collections import defaultdict
import tempfile import tempfile
import shutil import shutil
import shlex import shlex
@ -33,11 +35,6 @@ try:
except ImportError: except ImportError:
cupy = None cupy = None
try: # Python 3.8
import importlib.metadata as importlib_metadata
except ImportError:
from catalogue import _importlib_metadata as importlib_metadata
# These are functions that were previously (v2.x) available from spacy.util # These are functions that were previously (v2.x) available from spacy.util
# and have since moved to Thinc. We're importing them here so people's code # and have since moved to Thinc. We're importing them here so people's code
# doesn't break, but they should always be imported from Thinc from now on, # doesn't break, but they should always be imported from Thinc from now on,
@ -46,7 +43,7 @@ from thinc.api import fix_random_seed, compounding, decaying # noqa: F401
from .symbols import ORTH from .symbols import ORTH
from .compat import cupy, CudaStream, is_windows from .compat import cupy, CudaStream, is_windows, importlib_metadata
from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS
from . import about from . import about
@ -639,13 +636,18 @@ def is_unconstrained_version(
return True return True
def get_model_version_range(spacy_version: str) -> str: def split_requirement(requirement: str) -> Tuple[str, str]:
"""Generate a version range like >=1.2.3,<1.3.0 based on a given spaCy """Split a requirement like spacy>=1.2.3 into ("spacy", ">=1.2.3")."""
version. Models are always compatible across patch versions but not req = Requirement(requirement)
across minor or major versions. return (req.name, str(req.specifier))
def get_minor_version_range(version: str) -> str:
"""Generate a version range like >=1.2.3,<1.3.0 based on a given version
(e.g. of spaCy).
""" """
release = Version(spacy_version).release release = Version(version).release
return f">={spacy_version},<{release[0]}.{release[1] + 1}.0" return f">={version},<{release[0]}.{release[1] + 1}.0"
def get_model_lower_version(constraint: str) -> Optional[str]: def get_model_lower_version(constraint: str) -> Optional[str]:
@ -733,7 +735,7 @@ def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
model=f"{meta['lang']}_{meta['name']}", model=f"{meta['lang']}_{meta['name']}",
model_version=meta["version"], model_version=meta["version"],
version=meta["spacy_version"], version=meta["spacy_version"],
example=get_model_version_range(about.__version__), example=get_minor_version_range(about.__version__),
) )
warnings.warn(warn_msg) warnings.warn(warn_msg)
return meta return meta
@ -1549,3 +1551,19 @@ def to_ternary_int(val) -> int:
return 0 return 0
else: else:
return -1 return -1
# The following implementation of packages_distributions() is adapted from
# importlib_metadata, which is distributed under the Apache 2.0 License.
# Copyright (c) 2017-2019 Jason R. Coombs, Barry Warsaw
# See licenses/3rd_party_licenses.txt
def packages_distributions() -> Dict[str, List[str]]:
    """Build a lookup from top-level package names to the names of the
    distributions that list them in their top_level.txt. This helper is
    inlined from the importlib_metadata "backport", since it's not available
    in the builtin importlib.metadata.

    RETURNS (Dict[str, List[str]]): Distribution names keyed by top-level
        package name.
    """
    mapping = defaultdict(list)
    for distribution in importlib_metadata.distributions():
        # read_text can come back empty when a distribution ships no
        # top_level.txt; treat that as "no top-level packages".
        top_level = distribution.read_text("top_level.txt") or ""
        for package in top_level.split():
            mapping[package].append(distribution.metadata["Name"])
    # Hand back a plain dict so lookups of unknown packages don't create keys.
    return dict(mapping)