diff --git a/.github/workflows/spacy_universe_alert.yml b/.github/workflows/spacy_universe_alert.yml index f507e0594..837aaeb33 100644 --- a/.github/workflows/spacy_universe_alert.yml +++ b/.github/workflows/spacy_universe_alert.yml @@ -19,6 +19,8 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 + with: + python-version: '3.10' - name: Install Bernadette app dependency and send an alert env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index ebab7471e..a109c4a5a 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -53,6 +53,7 @@ def project_run( force: bool = False, dry: bool = False, capture: bool = False, + skip_requirements_check: bool = False, ) -> None: """Run a named script defined in the project.yml. If the script is part of the default pipeline (defined in the "run" section), DVC is used to @@ -69,6 +70,7 @@ def project_run( sys.exit will be called with the return code. You should use capture=False when you want to turn over execution to the command, and capture=True when you want to run the command more like a function. + skip_requirements_check (bool): Whether to skip the requirements check. """ config = load_project_config(project_dir, overrides=overrides) commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} @@ -76,9 +78,10 @@ def project_run( validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand) req_path = project_dir / "requirements.txt" - if config.get("check_requirements", True) and os.path.exists(req_path): - with req_path.open() as requirements_file: - _check_requirements([req.replace("\n", "") for req in requirements_file]) + if not skip_requirements_check: + if config.get("check_requirements", True) and os.path.exists(req_path): + with req_path.open() as requirements_file: + _check_requirements([req.strip() for req in requirements_file]) if subcommand in workflows: msg.info(f"Running workflow '{subcommand}'") @@ -90,6 +93,7 @@ def project_run( force=force, dry=dry, capture=capture, + skip_requirements_check=True, ) else: cmd = commands[subcommand] @@ -338,6 +342,12 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]: failed_pkgs_msgs.append(dnf.report()) except pkg_resources.VersionConflict as vc: conflicting_pkgs_msgs.append(vc.report()) + except Exception: + msg.warn( + f"Unable to check requirement: {req} " + "Checks are currently limited to requirement specifiers " + "(PEP 508)" + ) if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs): msg.warn( diff --git a/spacy/errors.py b/spacy/errors.py index 2f8a3996f..278e5496a 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -212,8 +212,8 @@ class Warnings(metaclass=ErrorsWithCodes): W121 = ("Attempting to trace non-existent method '{method}' in pipe '{pipe}'") W122 = ("Couldn't trace method '{method}' in pipe '{pipe}'. This can happen if the pipe class " "is a Cython extension type.") - W123 = ("Argument {arg} with value {arg_value} is used instead of {config_value} as specified in the config. Be " - "aware that this might affect other components in your pipeline.") + W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option " + "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.") class Errors(metaclass=ErrorsWithCodes): diff --git a/spacy/language.py b/spacy/language.py index d391f15ab..967af1e62 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1879,31 +1879,22 @@ class Language: if isinstance(exclude, str): exclude = [exclude] - def fetch_pipes_status(value: Iterable[str], key: str) -> Iterable[str]: - """Fetch value for `enable` or `disable` w.r.t. the specified config and passed arguments passed to - .load(). If both arguments and config specified values for this field, the passed arguments take precedence - and a warning is printed. - value (Iterable[str]): Passed value for `enable` or `disable`. - key (str): Key for field in config (either "enabled" or "disabled"). - RETURN (Iterable[str]): - """ - # We assume that no argument was passed if the value is the specified default value. - if id(value) == id(_DEFAULT_EMPTY_PIPES): - return config["nlp"].get(key, []) - else: - if len(config["nlp"].get(key, [])): - warnings.warn( - Warnings.W123.format( - arg=key[:-1], - arg_value=value, - config_value=config["nlp"][key], - ) + # `enable` should not be merged with `enabled` (the opposite is true for `disable`/`disabled`). If the config + # specifies values for `enabled` not included in `enable`, emit warning. + if id(enable) != id(_DEFAULT_EMPTY_PIPES): + enabled = config["nlp"].get("enabled", []) + if len(enabled) and not set(enabled).issubset(enable): + warnings.warn( + Warnings.W123.format( + enable=enable, + enabled=enabled, ) - return value + ) + # Ensure sets of disabled/enabled pipe names are not contradictory. disabled_pipes = cls._resolve_component_status( - fetch_pipes_status(disable, "disabled"), - fetch_pipes_status(enable, "enabled"), + list({*disable, *config["nlp"].get("disabled", [])}), + enable, config["nlp"]["pipeline"], ) nlp._disabled = set(p for p in disabled_pipes if p not in exclude) @@ -2084,10 +2075,12 @@ class Language: if enable: if isinstance(enable, str): enable = [enable] - to_disable = [ - pipe_name for pipe_name in pipe_names if pipe_name not in enable - ] - if disable and disable != to_disable: + to_disable = { + *[pipe_name for pipe_name in pipe_names if pipe_name not in enable], + *disable, + } + # If any pipe to be enabled is in to_disable, the specification is inconsistent. + if len(set(enable) & to_disable): raise ValueError(Errors.E1042.format(enable=enable, disable=disable)) return tuple(to_disable) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 14a7a36e5..4dd7bae16 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -615,20 +615,18 @@ def test_enable_disable_conflict_with_config(): with make_tempdir() as tmp_dir: nlp.to_disk(tmp_dir) - # Expected to fail, as config and arguments conflict. - with pytest.raises(ValueError): - spacy.load( - tmp_dir, enable=["tagger"], config={"nlp": {"disabled": ["senter"]}} - ) + # Expected to succeed, as config and arguments do not conflict. + assert spacy.load( + tmp_dir, enable=["tagger"], config={"nlp": {"disabled": ["senter"]}} + ).disabled == ["senter", "sentencizer"] # Expected to succeed without warning due to the lack of a conflicting config option. spacy.load(tmp_dir, enable=["tagger"]) - # Expected to succeed with a warning, as disable=[] should override the config setting. - with pytest.warns(UserWarning): + # Expected to fail due to conflict between enable and disabled. + with pytest.raises(ValueError): spacy.load( tmp_dir, - enable=["tagger"], - disable=[], - config={"nlp": {"disabled": ["senter"]}}, + enable=["senter"], + config={"nlp": {"disabled": ["senter", "tagger"]}}, ) diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index b948bb76c..9fcf18e2d 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -404,11 +404,10 @@ def test_serialize_pipeline_disable_enable(): assert nlp3.component_names == ["ner", "tagger"] with make_tempdir() as d: nlp3.to_disk(d) - with pytest.warns(UserWarning): - nlp4 = spacy.load(d, disable=["ner"]) - assert nlp4.pipe_names == ["tagger"] + nlp4 = spacy.load(d, disable=["ner"]) + assert nlp4.pipe_names == [] assert nlp4.component_names == ["ner", "tagger"] - assert nlp4.disabled == ["ner"] + assert nlp4.disabled == ["ner", "tagger"] with make_tempdir() as d: nlp.to_disk(d) nlp5 = spacy.load(d, exclude=["tagger"]) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 53709d660..5d24a0047 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,5 +1,6 @@ import os import math +import pkg_resources from random import sample from typing import Counter @@ -26,6 +27,7 @@ from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config from spacy.cli.package import get_third_party_dependencies from spacy.cli.package import _is_permitted_package_name from spacy.cli.project.remote_storage import RemoteStorage +from spacy.cli.project.run import _check_requirements from spacy.cli.validate import get_model_pkgs from spacy.lang.en import English from spacy.lang.nl import Dutch @@ -897,3 +899,42 @@ def test_local_remote_storage(): remote.pull(filename) with loc_file.open(mode="r") as file_: assert file_.read() == content + + +@pytest.mark.parametrize( + "reqs,output", + [ + [ + """ + spacy + + # comment + + thinc""", + (False, False), + ], + [ + """# comment + --some-flag + spacy""", + (False, False), + ], + [ + """# comment + --some-flag + spacy; python_version >= '3.6'""", + (False, False), + ], + [ + """# comment + spacyunknowndoesnotexist12345""", + (True, False), + ], + ], +) +def test_project_check_requirements(reqs, output): + # excessive guard against unlikely package name + try: + pkg_resources.require("spacyunknowndoesnotexist12345") + except pkg_resources.DistributionNotFound: + assert output == _check_requirements([req.strip() for req in reqs.split("\n")]) diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 767a7450a..504640d57 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -63,18 +63,18 @@ spaCy loads a model under the hood based on its > nlp = Language.from_config(config) > ``` -| Name | Description | -| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ | -| _keyword-only_ | | -| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ | -| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | -| `enable` 3.4 | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | -| `exclude` | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ | -| `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ | -| `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ | -| `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ | -| **RETURNS** | The initialized object. ~~Language~~ | +| Name | Description | +| ------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ | +| _keyword-only_ | | +| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ | +| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). Is merged with the config entry `nlp.disabled`. ~~Union[str, Iterable[str]]~~ | +| `enable` 3.4 | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [nlp.enable_pipe](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | +| `exclude` | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ | +| `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ | +| `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ | +| `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ | +| **RETURNS** | The initialized object. ~~Language~~ | ## Language.component {#component tag="classmethod" new="3"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index bc53fc868..c798f2a8d 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -45,16 +45,16 @@ specified separately using the new `exclude` keyword argument. > nlp = spacy.load("en_core_web_sm", exclude=["parser", "tagger"]) > ``` -| Name | Description | -| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ | -| _keyword-only_ | | -| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | -| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | -| `enable` 3.4 | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled. ~~Union[str, Iterable[str]]~~ | -| `exclude` 3 | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ | -| `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | -| **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ | +| Name | Description | +| ------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | +| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). Is merged with the config entry `nlp.disabled`. ~~Union[str, Iterable[str]]~~ | +| `enable` 3.4 | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled. ~~Union[str, Iterable[str]]~~ | +| `exclude` 3 | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ | +| `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | +| **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ | Essentially, `spacy.load()` is a convenience wrapper that reads the pipeline's [`config.cfg`](/api/data-formats#config), uses the language and pipeline diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index bd28810ae..0b63cdcb8 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -363,7 +363,8 @@ nlp.enable_pipe("tagger") ``` In addition to `disable`, `spacy.load()` also accepts `enable`. If `enable` is -set, all components except for those in `enable` are disabled. +set, all components except for those in `enable` are disabled. If `enable` and +`disable` conflict (i.e. the same component is included in both), an error is raised. ```python # Load the complete pipeline, but disable all components except for tok2vec and tagger diff --git a/website/meta/universe.json b/website/meta/universe.json index d7c99956b..fa765f640 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,31 @@ { "resources": [ + { + "id": "grecy", + "title": "greCy", + "slogan": "Ancient Greek pipelines for spaCy", + "description": "greCy offers state-of-the-art pipelines for ancient Greek NLP. The repository makes language models available in various sizes, some of them containing floret word vectors and a BERT transformer layer.", + "github": "jmyerston/greCy", + "code_example": [ + "import spacy", + "#After installing the grc_ud_proiel_trf wheel package from the greCy repository", + "", + "nlp = spacy.load('grc_ud_proiel_trf')", + "doc = nlp('δοκῶ μοι περὶ ὧν πυνθάνεσθε οὐκ ἀμελέτητος εἶναι.')", + "", + "for token in doc:", + " print(token.text, token.norm_, token.lemma_, token.pos_, token.tag_)" + ], + "code_language": "python", + "author": "Jacobo Myerston", + "author_links": { + "twitter": "@jcbmyrstn", + "github": "jmyerston", + "website": "https://huggingface.co/spaces/Jacobo/syntax" + }, + "category": ["pipeline", "research"], + "tags": ["ancient Greek"] + }, { "id": "spacy-cleaner", "title": "spacy-cleaner",