From b901dc22bec8aa66ec7da951d18272f79000cffb Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 25 Oct 2022 14:53:18 +0200 Subject: [PATCH 01/18] Rename test helper method with non-test_ name (#11701) --- spacy/tests/test_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index 2306cabb7..d91ed1201 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -23,7 +23,7 @@ def get_textcat_bow_kwargs(): def get_textcat_cnn_kwargs(): - return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13} + return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13} def get_all_params(model): @@ -65,7 +65,7 @@ def get_tok2vec_kwargs(): } -def test_tok2vec(): +def make_test_tok2vec(): return build_Tok2Vec_model(**get_tok2vec_kwargs()) From eac142e5c3b146b7a59a90e9ed83acbcb16ee131 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 25 Oct 2022 14:54:54 +0200 Subject: [PATCH 02/18] Update languages and version in README and website (#11694) --- README.md | 6 +++--- website/meta/languages.json | 28 ++++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index d9ef83e01..abfc3da67 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ be used in real products. spaCy comes with [pretrained pipelines](https://spacy.io/models) and -currently supports tokenization and training for **60+ languages**. It features +currently supports tokenization and training for **70+ languages**. It features state-of-the-art speed and **neural network models** for tagging, parsing, **named entity recognition**, **text classification** and more, multi-task learning with pretrained **transformers** like BERT, as well as a @@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy model packaging, deployment and workflow management. 
spaCy is commercial open-source software, released under the MIT license. -💫 **Version 3.4.0 out now!** +💫 **Version 3.4 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8) @@ -79,7 +79,7 @@ more people can benefit from it. ## Features -- Support for **60+ languages** +- Support for **70+ languages** - **Trained pipelines** for different languages and tasks - Multi-task learning with pretrained **transformers** like BERT - Support for pretrained **word vectors** and embeddings diff --git a/website/meta/languages.json b/website/meta/languages.json index 79e1fc5d5..06cd005de 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -4,12 +4,22 @@ "code": "af", "name": "Afrikaans" }, + { + "code": "am", + "name": "Amharic", + "has_examples": true + }, { "code": "ar", "name": "Arabic", "example": "هذه جملة", "has_examples": true }, + { + "code": "az", + "name": "Azerbaijani", + "has_examples": true + }, { "code": "bg", "name": "Bulgarian", @@ -65,7 +75,7 @@ { "code": "dsb", "name": "Lower Sorbian", - "has_examples": true + "has_examples": true }, { "code": "el", @@ -142,6 +152,11 @@ "code": "ga", "name": "Irish" }, + { + "code": "grc", + "name": "Ancient Greek", + "has_examples": true + }, { "code": "gu", "name": "Gujarati", @@ -172,7 +187,7 @@ { "code": "hsb", "name": "Upper Sorbian", - "has_examples": true + "has_examples": true }, { "code": "hu", @@ -260,6 +275,10 @@ "example": "Адамга эң кыйыны — күн сайын адам болуу", "has_examples": true }, + { + "code": "la", + "name": "Latin" + }, { "code": "lb", "name": "Luxembourgish", @@ -448,6 +467,11 @@ "example": "นี่คือประโยค", "has_examples": true }, + { + "code": "ti", + "name": "Tigrinya", + "has_examples": true + }, { "code": "tl", "name": "Tagalog" 
From cb983eff1dee5690c1baa715613a5556dfc26ee1 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 25 Oct 2022 19:38:23 +0200 Subject: [PATCH 03/18] Reduce python 3.10 in CI to one OS (#11703) --- azure-pipelines.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 357cce835..eea07cb7a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -76,15 +76,15 @@ jobs: # Python39Mac: # imageName: "macos-latest" # python.version: "3.9" - Python310Linux: - imageName: "ubuntu-latest" - python.version: "3.10" + # Python310Linux: + # imageName: "ubuntu-latest" + # python.version: "3.10" Python310Windows: imageName: "windows-latest" python.version: "3.10" - Python310Mac: - imageName: "macos-latest" - python.version: "3.10" + # Python310Mac: + # imageName: "macos-latest" + # python.version: "3.10" Python311Linux: imageName: 'ubuntu-latest' python.version: '3.11.0-rc.2' From 6070aeb8306d0d966e4e71bcce23acb879ea7750 Mon Sep 17 00:00:00 2001 From: Ryn Daniels <397565+ryndaniels@users.noreply.github.com> Date: Wed, 26 Oct 2022 09:15:13 +0300 Subject: [PATCH 04/18] update github actions to deal with deprecations (#11702) --- .github/workflows/autoblack.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml index 8d0282650..3ad4cf408 100644 --- a/.github/workflows/autoblack.yml +++ b/.github/workflows/autoblack.yml @@ -12,10 +12,10 @@ jobs: if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: ref: ${{ github.head_ref }} - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v3 - run: pip install black - name: Auto-format code if needed run: black spacy @@ -23,10 +23,11 @@ jobs: # code and makes GitHub think the action failed - name: Check for modified files id: git-check - run: echo ::set-output name=modified::$(if 
git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) + run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT + - name: Create Pull Request if: steps.git-check.outputs.modified == 'true' - uses: peter-evans/create-pull-request@v3 + uses: peter-evans/create-pull-request@v4 with: title: Auto-format code with black labels: meta From 7c4bc6629ac4841b509ef9448cce79781b346598 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 27 Oct 2022 22:08:24 +0900 Subject: [PATCH 05/18] Add warning to install widget for M1 GPUs (#11666) * Add warning to install widget for M1 GPUs * Use Thinc tracking issue instead * Update website/src/widgets/quickstart-install.js Co-authored-by: Adriane Boyd * Underline URL in warning * Update website/src/widgets/quickstart-install.js Co-authored-by: Adriane Boyd * Don't install cupy on m1 gpus Co-authored-by: Adriane Boyd --- website/src/styles/quickstart.module.sass | 3 +++ website/src/widgets/quickstart-install.js | 11 ++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/website/src/styles/quickstart.module.sass b/website/src/styles/quickstart.module.sass index 8ad106a78..d0f9db551 100644 --- a/website/src/styles/quickstart.module.sass +++ b/website/src/styles/quickstart.module.sass @@ -149,6 +149,9 @@ & > span display: block + a + text-decoration: underline + .small font-size: var(--font-size-code) line-height: 1.65 diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js index 0d2186acb..28dd14ecc 100644 --- a/website/src/widgets/quickstart-install.js +++ b/website/src/widgets/quickstart-install.js @@ -159,6 +159,9 @@ const QuickstartInstall = ({ id, title }) => { setters={setters} showDropdown={showDropdown} > + + # Note M1 GPU support is experimental, see Thinc issue #792 + python -m venv .env @@ -198,7 +201,13 @@ const QuickstartInstall = ({ id, title }) => { {nightly ? 
' --pre' : ''} conda install -c conda-forge spacy - + + conda install -c conda-forge cupy + + + conda install -c conda-forge cupy + + conda install -c conda-forge cupy From 0ab9edefa8372f1641767720baab0941cbcd7dd5 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Fri, 28 Oct 2022 17:25:34 +0900 Subject: [PATCH 06/18] Handle Docs with no entities in EntityLinker (#11640) * Handle docs with no entities If a whole batch contains no entities it won't make it to the model, but it's possible for individual Docs to have no entities. Before this commit, those Docs would cause an error when attempting to concatenate arrays because the dimensions didn't match. It turns out the process of preparing the Ragged at the end of the span maker forward was a little different from list2ragged, which just uses the flatten function directly. Letting list2ragged do the conversion avoids the dimension issue. This did not come up before because in NEL demo projects it's typical for data with no entities to be discarded before it reaches the NEL component. This includes a simple direct test that shows the issue and checks it's resolved. It doesn't check if there are any downstream changes, so a more complete test could be added. A full run was tested by adding an example with no entities to the Emerson sample project. * Add a blank instance to default training data in tests Rather than adding a specific test, since not failing on instances with no entities is basic functionality, it makes sense to add it to the default set. * Fix without modifying architecture If the architecture is modified this would have to be a new version, but this change isn't big enough to merit that. 
--- spacy/ml/models/entity_linker.py | 7 +++---- spacy/tests/pipeline/test_entity_linker.py | 22 +++++++++++++++++++++- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index d847342a3..0293f87e9 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -70,11 +70,10 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callab cands.append((start_token, end_token)) candidates.append(ops.asarray2i(cands)) - candlens = ops.asarray1i([len(cands) for cands in candidates]) - candidates = ops.xp.concatenate(candidates) - outputs = Ragged(candidates, candlens) + lengths = model.ops.asarray1i([len(cands) for cands in candidates]) + out = Ragged(model.ops.flatten(candidates), lengths) # because this is just rearranging docs, the backprop does nothing - return outputs, lambda x: [] + return out, lambda x: [] @registry.misc("spacy.KBFromFile.v1") diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 82bc976bb..1c8e49a09 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -9,6 +9,7 @@ from spacy.compat import pickle from spacy.kb import Candidate, KnowledgeBase, get_candidates from spacy.lang.en import English from spacy.ml import load_kb +from spacy.ml.models.entity_linker import build_span_maker from spacy.pipeline import EntityLinker from spacy.pipeline.legacy import EntityLinker_v1 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL @@ -701,7 +702,11 @@ TRAIN_DATA = [ ("Russ Cochran was a member of University of Kentucky's golf team.", {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}, "entities": [(0, 12, "PERSON"), (43, 51, "LOC")], - "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}) + "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}), + # having a blank instance shouldn't break things + ("The weather is 
nice today.", + {"links": {}, "entities": [], + "sent_starts": [1, -1, 0, 0, 0, 0]}) ] GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"] # fmt: on @@ -1176,3 +1181,18 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]): assert len(doc.ents) == 1 assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL + + +def test_span_maker_forward_with_empty(): + """The forward pass of the span maker may have a doc with no entities.""" + nlp = English() + doc1 = nlp("a b c") + ent = doc1[0:1] + ent.label_ = "X" + doc1.ents = [ent] + # no entities + doc2 = nlp("x y z") + + # just to get a model + span_maker = build_span_maker() + span_maker([doc1, doc2], False) From eda0ee2c896dced36a847cf705a9b5e9cdf7ac88 Mon Sep 17 00:00:00 2001 From: Aaron Zipp <15341396+aaronzipp@users.noreply.github.com> Date: Mon, 31 Oct 2022 05:27:12 +0100 Subject: [PATCH 07/18] Spelling mistake in rule-based-matching.md (#11717) Changed retokenize to retokenizer --- website/docs/usage/rule-based-matching.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index f096890cb..64bbf8e7b 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -1792,7 +1792,7 @@ the entity `Span` – for example `._.orgs` or `._.prev_orgs` and > [`Doc.retokenize`](/api/doc#retokenize) context manager: > > ```python -> with doc.retokenize() as retokenize: +> with doc.retokenize() as retokenizer: > for ent in doc.ents: > retokenizer.merge(ent) > ``` From 9cd252016363c2503aa55f335a13b42169fd288c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 2 Nov 2022 13:42:20 +0100 Subject: [PATCH 08/18] Switch CI to Python 3.11.0 (#11737) --- azure-pipelines.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index eea07cb7a..bf3672b8b 100644 --- a/azure-pipelines.yml +++ 
b/azure-pipelines.yml @@ -87,13 +87,13 @@ jobs: # python.version: "3.10" Python311Linux: imageName: 'ubuntu-latest' - python.version: '3.11.0-rc.2' + python.version: '3.11.0' Python311Windows: imageName: 'windows-latest' - python.version: '3.11.0-rc.2' + python.version: '3.11.0' Python311Mac: imageName: 'macos-latest' - python.version: '3.11.0-rc.2' + python.version: '3.11.0' maxParallel: 4 pool: vmImage: $(imageName) From 2e322298fe75902f62650fff0fda0fb11f4be5fb Mon Sep 17 00:00:00 2001 From: Ryn Daniels <397565+ryndaniels@users.noreply.github.com> Date: Wed, 2 Nov 2022 16:36:30 +0200 Subject: [PATCH 09/18] More version updates for github action deprecation warnings (#11705) * More version updates for github action deprecation warnings * fix the deprecated set-output commands * bump explosion-bot to run on ubuntu-latest --- .github/workflows/autoblack.yml | 2 +- .github/workflows/explosionbot.yml | 6 +++--- .github/workflows/slowtests.yml | 6 +++--- .github/workflows/spacy_universe_alert.yml | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml index 3ad4cf408..70882c3cc 100644 --- a/.github/workflows/autoblack.yml +++ b/.github/workflows/autoblack.yml @@ -15,7 +15,7 @@ jobs: - uses: actions/checkout@v3 with: ref: ${{ github.head_ref }} - - uses: actions/setup-python@v3 + - uses: actions/setup-python@v4 - run: pip install black - name: Auto-format code if needed run: black spacy diff --git a/.github/workflows/explosionbot.yml b/.github/workflows/explosionbot.yml index d585ecd9c..6b472cd12 100644 --- a/.github/workflows/explosionbot.yml +++ b/.github/workflows/explosionbot.yml @@ -8,14 +8,14 @@ on: jobs: explosion-bot: - runs-on: ubuntu-18.04 + runs-on: ubuntu-latest steps: - name: Dump GitHub context env: GITHUB_CONTEXT: ${{ toJson(github) }} run: echo "$GITHUB_CONTEXT" - - uses: actions/checkout@v1 - - uses: actions/setup-python@v1 + - uses: actions/checkout@v3 + - uses: 
actions/setup-python@v4 - name: Install and run explosion-bot run: | pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml index 38ceb18c6..f9fd3e817 100644 --- a/.github/workflows/slowtests.yml +++ b/.github/workflows/slowtests.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v1 + uses: actions/checkout@v3 with: ref: ${{ matrix.branch }} - name: Get commits from past 24 hours @@ -23,9 +23,9 @@ jobs: today=$(date '+%Y-%m-%d %H:%M:%S') yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S') if git log --after="$yesterday" --before="$today" | grep commit ; then - echo "::set-output name=run_tests::true" + echo run_tests=true >> $GITHUB_OUTPUT else - echo "::set-output name=run_tests::false" + echo run_tests=false >> $GITHUB_OUTPUT fi - name: Trigger buildkite build diff --git a/.github/workflows/spacy_universe_alert.yml b/.github/workflows/spacy_universe_alert.yml index cbbf14c6e..f507e0594 100644 --- a/.github/workflows/spacy_universe_alert.yml +++ b/.github/workflows/spacy_universe_alert.yml @@ -17,8 +17,8 @@ jobs: run: | echo "$GITHUB_CONTEXT" - - uses: actions/checkout@v1 - - uses: actions/setup-python@v1 + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 - name: Install Bernadette app dependency and send an alert env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} From a6c3701613efe69f83ded04d124c345874ce06e2 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 3 Nov 2022 09:29:46 +0100 Subject: [PATCH 10/18] Modernize and simplify CI steps (#11738) * Use `build` instead of `python setup.py sdist` * Remove in-place build with `setup.py` * Remove `gpu` parameter and GPU tests * Keep `architecture` and `num_build_jobs` in azure steps with CI defaults * Fix use of `num_build_jobs` parameters * Remove now-unused `prefix` parameter * Test imports and CLI before installing test 
requirements * Remove `*.egg-info` directory in addition to source directory for a warning-free `import spacy` * Switch `thinc-apple-ops` test to python 3.11 (as most recent python that is tested across platforms) --- .github/azure-steps.yml | 70 +++++++++++++++++++---------------------- azure-pipelines.yml | 17 ---------- 2 files changed, 33 insertions(+), 54 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index cc0247b3a..b2bc80dd6 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -1,9 +1,7 @@ parameters: python_version: '' - architecture: '' - prefix: '' - gpu: false - num_build_jobs: 1 + architecture: 'x64' + num_build_jobs: 2 steps: - task: UsePythonVersion@0 @@ -17,16 +15,16 @@ steps: displayName: 'Set variables' - script: | - ${{ parameters.prefix }} python -m pip install -U pip setuptools - ${{ parameters.prefix }} python -m pip install -U -r requirements.txt + python -m pip install -U build pip setuptools + python -m pip install -U -r requirements.txt displayName: "Install dependencies" - script: | - ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }} - ${{ parameters.prefix }} python setup.py sdist --formats=gztar - displayName: "Compile and build sdist" + python -m build --sdist + displayName: "Build sdist" - - script: python -m mypy spacy + - script: | + python -m mypy spacy displayName: 'Run mypy' condition: ne(variables['python_version'], '3.6') @@ -35,35 +33,24 @@ steps: contents: "spacy" displayName: "Delete source directory" + - task: DeleteFiles@1 + inputs: + contents: "*.egg-info" + displayName: "Delete egg-info directory" + - script: | - ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt - ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt + python -m pip freeze > installed.txt + python -m pip uninstall -y -r installed.txt displayName: "Uninstall all packages" - bash: | - ${{ 
parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) - ${{ parameters.prefix }} SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST + SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) + SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST displayName: "Install from sdist" - script: | - ${{ parameters.prefix }} python -m pip install -U -r requirements.txt - displayName: "Install test requirements" - - - script: | - ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0 - ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html - displayName: "Install GPU requirements" - condition: eq(${{ parameters.gpu }}, true) - - - script: | - ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error - displayName: "Run CPU tests" - condition: eq(${{ parameters.gpu }}, false) - - - script: | - ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error -p spacy.tests.enable_gpu - displayName: "Run GPU tests" - condition: eq(${{ parameters.gpu }}, true) + python -W error -c "import spacy" + displayName: "Test import" - script: | python -m spacy download ca_core_news_sm @@ -106,13 +93,22 @@ steps: displayName: 'Test assemble CLI vectors warning' condition: eq(variables['python_version'], '3.8') + - script: | + python -m pip install -U -r requirements.txt + displayName: "Install test requirements" + + - script: | + python -m pytest --pyargs spacy -W error + displayName: "Run CPU tests" + + - script: | + python -m pip install --pre thinc-apple-ops + python -m pytest --pyargs spacy + displayName: "Run CPU tests with thinc-apple-ops" + condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11')) + - script: | python .github/validate_universe_json.py website/meta/universe.json displayName: 'Test website/meta/universe.json' 
condition: eq(variables['python_version'], '3.8') - - script: | - ${{ parameters.prefix }} python -m pip install --pre thinc-apple-ops - ${{ parameters.prefix }} python -m pytest --pyargs spacy - displayName: "Run CPU tests with thinc-apple-ops" - condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10')) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index bf3672b8b..3499042cb 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -101,20 +101,3 @@ jobs: - template: .github/azure-steps.yml parameters: python_version: '$(python.version)' - architecture: 'x64' - -# - job: "TestGPU" -# dependsOn: "Validate" -# strategy: -# matrix: -# Python38LinuxX64_GPU: -# python.version: '3.8' -# pool: -# name: "LinuxX64_GPU" -# steps: -# - template: .github/azure-steps.yml -# parameters: -# python_version: '$(python.version)' -# architecture: 'x64' -# gpu: true -# num_build_jobs: 24 From 7b0c36660cc76b58efc4068516a696e1695c4741 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 3 Nov 2022 18:52:59 +0900 Subject: [PATCH 11/18] Fix default parameters for load functions (fix #11706) (#11713) * Fix default parameters for load functions Some load functions used SimpleFrozenList() directly instead of the _DEFAULT_EMPTY_PIPES parameter. That mostly worked as intended, but the changes in #11459 check for equality using identity, not value, so a warning is incorrectly raised sometimes, as in #11706. This change just has all the load functions use the singleton value instead. * Add test that there are no warnings on module-based load This will succeed due to changes in this branch, but local tests with the latest release failed as intended. * Try reverting commit and see if CI changes There is an error in CI that is probably unrelated. Revert "Fix default parameters for load functions" This reverts commit dc46b35687e92e4793e64edb11997d44b88c6a8b. 
* Revert "Try reverting commit and see if CI changes" This reverts commit 2514ed07ef29851b5ac60015442a7ce44c69decc. Co-authored-by: Adriane Boyd --- .github/azure-steps.yml | 5 +++++ spacy/util.py | 12 ++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index b2bc80dd6..e8bd0d212 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -59,6 +59,11 @@ steps: displayName: 'Test download CLI' condition: eq(variables['python_version'], '3.8') + - script: | + python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" + displayName: 'Test no warnings on load (#11713)' + condition: eq(variables['python_version'], '3.8') + - script: | python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . displayName: 'Test convert CLI' diff --git a/spacy/util.py b/spacy/util.py index 3034808ba..76a1e0bfa 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -443,9 +443,9 @@ def load_model_from_package( name: str, *, vocab: Union["Vocab", bool] = True, - disable: Union[str, Iterable[str]] = SimpleFrozenList(), - enable: Union[str, Iterable[str]] = SimpleFrozenList(), - exclude: Union[str, Iterable[str]] = SimpleFrozenList(), + disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, + enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, + exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), ) -> "Language": """Load a model from an installed package. 
@@ -619,9 +619,9 @@ def load_model_from_init_py( init_file: Union[Path, str], *, vocab: Union["Vocab", bool] = True, - disable: Union[str, Iterable[str]] = SimpleFrozenList(), - enable: Union[str, Iterable[str]] = SimpleFrozenList(), - exclude: Union[str, Iterable[str]] = SimpleFrozenList(), + disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, + enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, + exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), ) -> "Language": """Helper function to use in the `load()` method of a model package's From 3257718a73624324c9e926d2d5cc4a518a315fc4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 3 Nov 2022 11:49:08 +0100 Subject: [PATCH 12/18] Restore Doc attr getter values in Doc.to_json (#11700) --- spacy/tests/doc/test_json_doc_conversion.py | 9 +++++++ spacy/tokens/doc.pyx | 27 ++++++++++++++------- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/spacy/tests/doc/test_json_doc_conversion.py b/spacy/tests/doc/test_json_doc_conversion.py index 19698cfb2..11a1817e6 100644 --- a/spacy/tests/doc/test_json_doc_conversion.py +++ b/spacy/tests/doc/test_json_doc_conversion.py @@ -370,3 +370,12 @@ def test_json_to_doc_validation_error(doc): doc_json.pop("tokens") with pytest.raises(ValueError): Doc(doc.vocab).from_json(doc_json, validate=True) + + +def test_to_json_underscore_doc_getters(doc): + def get_text_length(doc): + return len(doc.text) + + Doc.set_extension("text_length", getter=get_text_length) + doc_json = doc.to_json(underscore=["text_length"]) + assert doc_json["_"]["text_length"] == get_text_length(doc) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 295f91c28..f2621292c 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1668,6 +1668,20 @@ cdef class Doc: if underscore: user_keys = set() + # Handle doc attributes with .get to include values from getters + # and not only values stored in user_data, 
for backwards + # compatibility + for attr in underscore: + if self.has_extension(attr): + if "_" not in data: + data["_"] = {} + value = self._.get(attr) + if not srsly.is_json_serializable(value): + raise ValueError(Errors.E107.format(attr=attr, value=repr(value))) + data["_"][attr] = value + user_keys.add(attr) + # Token and span attributes only include values stored in user_data + # and not values generated by getters if self.user_data: for data_key, value in self.user_data.copy().items(): if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.": @@ -1678,20 +1692,15 @@ cdef class Doc: user_keys.add(attr) if not srsly.is_json_serializable(value): raise ValueError(Errors.E107.format(attr=attr, value=repr(value))) - # Check if doc attribute - if start is None: - if "_" not in data: - data["_"] = {} - data["_"][attr] = value - # Check if token attribute - elif end is None: + # Token attribute + if start is not None and end is None: if "underscore_token" not in data: data["underscore_token"] = {} if attr not in data["underscore_token"]: data["underscore_token"][attr] = [] data["underscore_token"][attr].append({"start": start, "value": value}) - # Else span attribute - else: + # Span attribute + elif start is not None and end is not None: if "underscore_span" not in data: data["underscore_span"] = {} if attr not in data["underscore_span"]: From 451e0eccad74cd9332c26e3b22c5c8083d89cbf9 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 7 Nov 2022 08:11:13 +0100 Subject: [PATCH 13/18] Fix types for Span.id and Span.id_ (#11744) --- spacy/tokens/span.pyi | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi index 617e3d19d..0a6f306a6 100644 --- a/spacy/tokens/span.pyi +++ b/spacy/tokens/span.pyi @@ -117,15 +117,13 @@ class Span: end_char: int label: int kb_id: int + id: int ent_id: int ent_id_: str @property - def id(self) -> int: ... - @property - def id_(self) -> str: ... 
- @property def orth_(self) -> str: ... @property def lemma_(self) -> str: ... label_: str kb_id_: str + id_: str From 8fc74a7fb08c4ffbde901c11153839924949e033 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 7 Nov 2022 16:11:55 +0900 Subject: [PATCH 14/18] Raise Typer limit (#11720) * Raise typer limit to <0.7.0 * Raise limit to <0.8.0 --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9d6bbb2c4..d91a3b3d4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0 wasabi>=0.9.1,<1.1.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 -typer>=0.3.0,<0.5.0 +typer>=0.3.0,<0.8.0 pathy>=0.3.5 # Third party dependencies numpy>=1.15.0 diff --git a/setup.cfg b/setup.cfg index c2653feba..82d4d2758 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,7 +51,7 @@ install_requires = srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 # Third-party dependencies - typer>=0.3.0,<0.5.0 + typer>=0.3.0,<0.8.0 pathy>=0.3.5 tqdm>=4.38.0,<5.0.0 numpy>=1.15.0 From 808a5c6bff09b9e89dc293a4a82f0ee39983dc89 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 7 Nov 2022 13:25:40 +0100 Subject: [PATCH 15/18] Switch CI to python 3.11 (#11765) --- azure-pipelines.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 3499042cb..9c3b92f06 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -87,13 +87,13 @@ jobs: # python.version: "3.10" Python311Linux: imageName: 'ubuntu-latest' - python.version: '3.11.0' + python.version: '3.11' Python311Windows: imageName: 'windows-latest' - python.version: '3.11.0' + python.version: '3.11' Python311Mac: imageName: 'macos-latest' - python.version: '3.11.0' + python.version: '3.11' maxParallel: 4 pool: vmImage: $(imageName) From 43bfc2ea9b7f178330483636fecf1019a162bd63 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 7 Nov 2022 14:46:08 +0100 
Subject: [PATCH 16/18] Add fallback in requirements check, only check once (#11735) * Add fallback in requirements check, only check once * Rename to skip_requirements_check * Update spacy/cli/project/run.py Co-authored-by: Paul O'Leary McCann Co-authored-by: Paul O'Leary McCann --- spacy/cli/project/run.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index ebab7471e..638e7fab1 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -53,6 +53,7 @@ def project_run( force: bool = False, dry: bool = False, capture: bool = False, + skip_requirements_check: bool = False, ) -> None: """Run a named script defined in the project.yml. If the script is part of the default pipeline (defined in the "run" section), DVC is used to @@ -69,6 +70,7 @@ def project_run( sys.exit will be called with the return code. You should use capture=False when you want to turn over execution to the command, and capture=True when you want to run the command more like a function. + skip_requirements_check (bool): Whether to skip the requirements check. 
""" config = load_project_config(project_dir, overrides=overrides) commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} @@ -76,9 +78,10 @@ def project_run( validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand) req_path = project_dir / "requirements.txt" - if config.get("check_requirements", True) and os.path.exists(req_path): - with req_path.open() as requirements_file: - _check_requirements([req.replace("\n", "") for req in requirements_file]) + if not skip_requirements_check: + if config.get("check_requirements", True) and os.path.exists(req_path): + with req_path.open() as requirements_file: + _check_requirements([req.strip() for req in requirements_file]) if subcommand in workflows: msg.info(f"Running workflow '{subcommand}'") @@ -90,6 +93,7 @@ def project_run( force=force, dry=dry, capture=capture, + skip_requirements_check=True, ) else: cmd = commands[subcommand] @@ -338,6 +342,11 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]: failed_pkgs_msgs.append(dnf.report()) except pkg_resources.VersionConflict as vc: conflicting_pkgs_msgs.append(vc.report()) + except Exception: + msg.warn(f"Unable to check requirement: {req} " + "Check that the requirement is formatted according to PEP " + "440, in particular that URLs are formatted as " + "'package_name @ URL'") if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs): msg.warn( From 940306f7863a19b7c01930db00e22848fa1d267c Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 8 Nov 2022 14:58:10 +0100 Subject: [PATCH 17/18] Revert disable/disabled merging behavior (#11745) * Merge disable with disabled. Adjust warnings, errors and tests. * Replace any() with set operation. * Update spacy/tests/pipeline/test_pipe_methods.py Co-authored-by: Adriane Boyd * Update docs. * Remove reference to config entry nlp.enabled from docs. 
Co-authored-by: Adriane Boyd --- spacy/errors.py | 4 +- spacy/language.py | 45 ++++++++----------- spacy/tests/pipeline/test_pipe_methods.py | 18 ++++---- .../serialize/test_serialize_pipeline.py | 7 ++- website/docs/api/language.md | 24 +++++----- website/docs/api/top-level.md | 20 ++++----- website/docs/usage/processing-pipelines.md | 3 +- 7 files changed, 56 insertions(+), 65 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index c035f684d..3cc9fd494 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -212,8 +212,8 @@ class Warnings(metaclass=ErrorsWithCodes): W121 = ("Attempting to trace non-existent method '{method}' in pipe '{pipe}'") W122 = ("Couldn't trace method '{method}' in pipe '{pipe}'. This can happen if the pipe class " "is a Cython extension type.") - W123 = ("Argument {arg} with value {arg_value} is used instead of {config_value} as specified in the config. Be " - "aware that this might affect other components in your pipeline.") + W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option " + "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.") class Errors(metaclass=ErrorsWithCodes): diff --git a/spacy/language.py b/spacy/language.py index d391f15ab..967af1e62 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1879,31 +1879,22 @@ class Language: if isinstance(exclude, str): exclude = [exclude] - def fetch_pipes_status(value: Iterable[str], key: str) -> Iterable[str]: - """Fetch value for `enable` or `disable` w.r.t. the specified config and passed arguments passed to - .load(). If both arguments and config specified values for this field, the passed arguments take precedence - and a warning is printed. - value (Iterable[str]): Passed value for `enable` or `disable`. - key (str): Key for field in config (either "enabled" or "disabled"). 
- RETURN (Iterable[str]): - """ - # We assume that no argument was passed if the value is the specified default value. - if id(value) == id(_DEFAULT_EMPTY_PIPES): - return config["nlp"].get(key, []) - else: - if len(config["nlp"].get(key, [])): - warnings.warn( - Warnings.W123.format( - arg=key[:-1], - arg_value=value, - config_value=config["nlp"][key], - ) + # `enable` should not be merged with `enabled` (the opposite is true for `disable`/`disabled`). If the config + # specifies values for `enabled` not included in `enable`, emit warning. + if id(enable) != id(_DEFAULT_EMPTY_PIPES): + enabled = config["nlp"].get("enabled", []) + if len(enabled) and not set(enabled).issubset(enable): + warnings.warn( + Warnings.W123.format( + enable=enable, + enabled=enabled, ) - return value + ) + # Ensure sets of disabled/enabled pipe names are not contradictory. disabled_pipes = cls._resolve_component_status( - fetch_pipes_status(disable, "disabled"), - fetch_pipes_status(enable, "enabled"), + list({*disable, *config["nlp"].get("disabled", [])}), + enable, config["nlp"]["pipeline"], ) nlp._disabled = set(p for p in disabled_pipes if p not in exclude) @@ -2084,10 +2075,12 @@ class Language: if enable: if isinstance(enable, str): enable = [enable] - to_disable = [ - pipe_name for pipe_name in pipe_names if pipe_name not in enable - ] - if disable and disable != to_disable: + to_disable = { + *[pipe_name for pipe_name in pipe_names if pipe_name not in enable], + *disable, + } + # If any pipe to be enabled is in to_disable, the specification is inconsistent. 
+ if len(set(enable) & to_disable): raise ValueError(Errors.E1042.format(enable=enable, disable=disable)) return tuple(to_disable) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 14a7a36e5..4dd7bae16 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -615,20 +615,18 @@ def test_enable_disable_conflict_with_config(): with make_tempdir() as tmp_dir: nlp.to_disk(tmp_dir) - # Expected to fail, as config and arguments conflict. - with pytest.raises(ValueError): - spacy.load( - tmp_dir, enable=["tagger"], config={"nlp": {"disabled": ["senter"]}} - ) + # Expected to succeed, as config and arguments do not conflict. + assert spacy.load( + tmp_dir, enable=["tagger"], config={"nlp": {"disabled": ["senter"]}} + ).disabled == ["senter", "sentencizer"] # Expected to succeed without warning due to the lack of a conflicting config option. spacy.load(tmp_dir, enable=["tagger"]) - # Expected to succeed with a warning, as disable=[] should override the config setting. - with pytest.warns(UserWarning): + # Expected to fail due to conflict between enable and disabled. 
+ with pytest.raises(ValueError): spacy.load( tmp_dir, - enable=["tagger"], - disable=[], - config={"nlp": {"disabled": ["senter"]}}, + enable=["senter"], + config={"nlp": {"disabled": ["senter", "tagger"]}}, ) diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index b948bb76c..9fcf18e2d 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -404,11 +404,10 @@ def test_serialize_pipeline_disable_enable(): assert nlp3.component_names == ["ner", "tagger"] with make_tempdir() as d: nlp3.to_disk(d) - with pytest.warns(UserWarning): - nlp4 = spacy.load(d, disable=["ner"]) - assert nlp4.pipe_names == ["tagger"] + nlp4 = spacy.load(d, disable=["ner"]) + assert nlp4.pipe_names == [] assert nlp4.component_names == ["ner", "tagger"] - assert nlp4.disabled == ["ner"] + assert nlp4.disabled == ["ner", "tagger"] with make_tempdir() as d: nlp.to_disk(d) nlp5 = spacy.load(d, exclude=["tagger"]) diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 767a7450a..504640d57 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -63,18 +63,18 @@ spaCy loads a model under the hood based on its > nlp = Language.from_config(config) > ``` -| Name | Description | -| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ | -| _keyword-only_ | | -| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ | -| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). 
Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | -| `enable` 3.4 | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | -| `exclude` | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ | -| `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ | -| `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ | -| `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ | -| **RETURNS** | The initialized object. ~~Language~~ | +| Name | Description | +| ------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ | +| _keyword-only_ | | +| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ | +| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). Is merged with the config entry `nlp.disabled`. 
~~Union[str, Iterable[str]]~~ | +| `enable` 3.4 | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [nlp.enable_pipe](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | +| `exclude` | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ | +| `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ | +| `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ | +| `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ | +| **RETURNS** | The initialized object. ~~Language~~ | ## Language.component {#component tag="classmethod" new="3"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index bc53fc868..c798f2a8d 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -45,16 +45,16 @@ specified separately using the new `exclude` keyword argument. > nlp = spacy.load("en_core_web_sm", exclude=["parser", "tagger"]) > ``` -| Name | Description | -| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ | -| _keyword-only_ | | -| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | -| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). 
Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | -| `enable` 3.4 | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled. ~~Union[str, Iterable[str]]~~ | -| `exclude` 3 | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ | -| `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | -| **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ | +| Name | Description | +| ------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | +| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). Is merged with the config entry `nlp.disabled`. ~~Union[str, Iterable[str]]~~ | +| `enable` 3.4 | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled. ~~Union[str, Iterable[str]]~~ | +| `exclude` 3 | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). 
Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ | +| `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | +| **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ | Essentially, `spacy.load()` is a convenience wrapper that reads the pipeline's [`config.cfg`](/api/data-formats#config), uses the language and pipeline diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index bd28810ae..0b63cdcb8 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -363,7 +363,8 @@ nlp.enable_pipe("tagger") ``` In addition to `disable`, `spacy.load()` also accepts `enable`. If `enable` is -set, all components except for those in `enable` are disabled. +set, all components except for those in `enable` are disabled. If `enable` and +`disable` conflict (i.e. the same component is included in both), an error is raised. 
```python # Load the complete pipeline, but disable all components except for tok2vec and tagger From 4cd6dc81c763622026d15594607904644bc8125e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 9 Nov 2022 10:59:28 +0100 Subject: [PATCH 18/18] Update warning, add tests for project requirements check (#11777) * Update warning, add tests for project requirements check * Make warning more general for differences between PEP 508 and pip * Add tests for _check_requirements * Parameterize test --- spacy/cli/project/run.py | 5 ++--- spacy/tests/test_cli.py | 41 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 638e7fab1..5db9e14f4 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -344,9 +344,8 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]: conflicting_pkgs_msgs.append(vc.report()) except Exception: msg.warn(f"Unable to check requirement: {req} " - "Check that the requirement is formatted according to PEP " - "440, in particular that URLs are formatted as " - "'package_name @ URL'") + "Checks are currently limited to requirement specifiers " + "(PEP 508)") if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs): msg.warn( diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 838e00369..8225e14f1 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,5 +1,6 @@ import os import math +import pkg_resources from random import sample from typing import Counter @@ -25,6 +26,7 @@ from spacy.cli.download import get_compatibility, get_version from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config from spacy.cli.package import get_third_party_dependencies from spacy.cli.package import _is_permitted_package_name +from spacy.cli.project.run import _check_requirements from spacy.cli.validate import get_model_pkgs from spacy.lang.en import English from spacy.lang.nl import 
Dutch @@ -855,3 +857,42 @@ def test_span_length_freq_dist_output_must_be_correct(): span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold) assert sum(span_freqs.values()) >= threshold assert list(span_freqs.keys()) == [3, 1, 4, 5, 2] + + +@pytest.mark.parametrize( + "reqs,output", + [ + [ + """ + spacy + + # comment + + thinc""", + (False, False), + ], + [ + """# comment + --some-flag + spacy""", + (False, False), + ], + [ + """# comment + --some-flag + spacy; python_version >= '3.6'""", + (False, False), + ], + [ + """# comment + spacyunknowndoesnotexist12345""", + (True, False), + ], + ], +) +def test_project_check_requirements(reqs, output): + # excessive guard against unlikely package name + try: + pkg_resources.require("spacyunknowndoesnotexist12345") + except pkg_resources.DistributionNotFound: + assert output == _check_requirements([req.strip() for req in reqs.split("\n")])