diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index cc0247b3a..e8bd0d212 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -1,9 +1,7 @@
parameters:
python_version: ''
- architecture: ''
- prefix: ''
- gpu: false
- num_build_jobs: 1
+ architecture: 'x64'
+ num_build_jobs: 2
steps:
- task: UsePythonVersion@0
@@ -17,16 +15,16 @@ steps:
displayName: 'Set variables'
- script: |
- ${{ parameters.prefix }} python -m pip install -U pip setuptools
- ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+ python -m pip install -U build pip setuptools
+ python -m pip install -U -r requirements.txt
displayName: "Install dependencies"
- script: |
- ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
- ${{ parameters.prefix }} python setup.py sdist --formats=gztar
- displayName: "Compile and build sdist"
+ python -m build --sdist
+ displayName: "Build sdist"
- - script: python -m mypy spacy
+ - script: |
+ python -m mypy spacy
displayName: 'Run mypy'
condition: ne(variables['python_version'], '3.6')
@@ -35,35 +33,24 @@ steps:
contents: "spacy"
displayName: "Delete source directory"
+ - task: DeleteFiles@1
+ inputs:
+ contents: "*.egg-info"
+ displayName: "Delete egg-info directory"
+
- script: |
- ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
- ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
+ python -m pip freeze > installed.txt
+ python -m pip uninstall -y -r installed.txt
displayName: "Uninstall all packages"
- bash: |
- ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
- ${{ parameters.prefix }} SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
+ SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+ SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
displayName: "Install from sdist"
- script: |
- ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
- displayName: "Install test requirements"
-
- - script: |
- ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
- ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
- displayName: "Install GPU requirements"
- condition: eq(${{ parameters.gpu }}, true)
-
- - script: |
- ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error
- displayName: "Run CPU tests"
- condition: eq(${{ parameters.gpu }}, false)
-
- - script: |
- ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error -p spacy.tests.enable_gpu
- displayName: "Run GPU tests"
- condition: eq(${{ parameters.gpu }}, true)
+ python -W error -c "import spacy"
+ displayName: "Test import"
- script: |
python -m spacy download ca_core_news_sm
@@ -72,6 +59,11 @@ steps:
displayName: 'Test download CLI'
condition: eq(variables['python_version'], '3.8')
+ - script: |
+ python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+ displayName: 'Test no warnings on load (#11713)'
+ condition: eq(variables['python_version'], '3.8')
+
- script: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
displayName: 'Test convert CLI'
@@ -106,13 +98,22 @@ steps:
displayName: 'Test assemble CLI vectors warning'
condition: eq(variables['python_version'], '3.8')
+ - script: |
+ python -m pip install -U -r requirements.txt
+ displayName: "Install test requirements"
+
+ - script: |
+ python -m pytest --pyargs spacy -W error
+ displayName: "Run CPU tests"
+
+ - script: |
+ python -m pip install --pre thinc-apple-ops
+ python -m pytest --pyargs spacy
+ displayName: "Run CPU tests with thinc-apple-ops"
+ condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
+
- script: |
python .github/validate_universe_json.py website/meta/universe.json
displayName: 'Test website/meta/universe.json'
condition: eq(variables['python_version'], '3.8')
- - script: |
- ${{ parameters.prefix }} python -m pip install --pre thinc-apple-ops
- ${{ parameters.prefix }} python -m pytest --pyargs spacy
- displayName: "Run CPU tests with thinc-apple-ops"
- condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10'))
diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml
index 8d0282650..70882c3cc 100644
--- a/.github/workflows/autoblack.yml
+++ b/.github/workflows/autoblack.yml
@@ -12,10 +12,10 @@ jobs:
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
with:
ref: ${{ github.head_ref }}
- - uses: actions/setup-python@v2
+ - uses: actions/setup-python@v4
- run: pip install black
- name: Auto-format code if needed
run: black spacy
@@ -23,10 +23,11 @@ jobs:
# code and makes GitHub think the action failed
- name: Check for modified files
id: git-check
- run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi)
+ run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
+
- name: Create Pull Request
if: steps.git-check.outputs.modified == 'true'
- uses: peter-evans/create-pull-request@v3
+ uses: peter-evans/create-pull-request@v4
with:
title: Auto-format code with black
labels: meta
diff --git a/.github/workflows/explosionbot.yml b/.github/workflows/explosionbot.yml
index d585ecd9c..6b472cd12 100644
--- a/.github/workflows/explosionbot.yml
+++ b/.github/workflows/explosionbot.yml
@@ -8,14 +8,14 @@ on:
jobs:
explosion-bot:
- runs-on: ubuntu-18.04
+ runs-on: ubuntu-latest
steps:
- name: Dump GitHub context
env:
GITHUB_CONTEXT: ${{ toJson(github) }}
run: echo "$GITHUB_CONTEXT"
- - uses: actions/checkout@v1
- - uses: actions/setup-python@v1
+ - uses: actions/checkout@v3
+ - uses: actions/setup-python@v4
- name: Install and run explosion-bot
run: |
pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot
diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml
index 38ceb18c6..f9fd3e817 100644
--- a/.github/workflows/slowtests.yml
+++ b/.github/workflows/slowtests.yml
@@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
- uses: actions/checkout@v1
+ uses: actions/checkout@v3
with:
ref: ${{ matrix.branch }}
- name: Get commits from past 24 hours
@@ -23,9 +23,9 @@ jobs:
today=$(date '+%Y-%m-%d %H:%M:%S')
yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
if git log --after="$yesterday" --before="$today" | grep commit ; then
- echo "::set-output name=run_tests::true"
+ echo run_tests=true >> $GITHUB_OUTPUT
else
- echo "::set-output name=run_tests::false"
+ echo run_tests=false >> $GITHUB_OUTPUT
fi
- name: Trigger buildkite build
diff --git a/.github/workflows/spacy_universe_alert.yml b/.github/workflows/spacy_universe_alert.yml
index cbbf14c6e..f507e0594 100644
--- a/.github/workflows/spacy_universe_alert.yml
+++ b/.github/workflows/spacy_universe_alert.yml
@@ -17,8 +17,8 @@ jobs:
run: |
echo "$GITHUB_CONTEXT"
- - uses: actions/checkout@v1
- - uses: actions/setup-python@v1
+ - uses: actions/checkout@v3
+ - uses: actions/setup-python@v4
- name: Install Bernadette app dependency and send an alert
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
diff --git a/README.md b/README.md
index d9ef83e01..abfc3da67 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ be used in real products.
spaCy comes with
[pretrained pipelines](https://spacy.io/models) and
-currently supports tokenization and training for **60+ languages**. It features
+currently supports tokenization and training for **70+ languages**. It features
state-of-the-art speed and **neural network models** for tagging,
parsing, **named entity recognition**, **text classification** and more,
multi-task learning with pretrained **transformers** like BERT, as well as a
@@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial
open-source software, released under the MIT license.
-💫 **Version 3.4.0 out now!**
+💫 **Version 3.4 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@@ -79,7 +79,7 @@ more people can benefit from it.
## Features
-- Support for **60+ languages**
+- Support for **70+ languages**
- **Trained pipelines** for different languages and tasks
- Multi-task learning with pretrained **transformers** like BERT
- Support for pretrained **word vectors** and embeddings
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 357cce835..9c3b92f06 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -76,24 +76,24 @@ jobs:
# Python39Mac:
# imageName: "macos-latest"
# python.version: "3.9"
- Python310Linux:
- imageName: "ubuntu-latest"
- python.version: "3.10"
+ # Python310Linux:
+ # imageName: "ubuntu-latest"
+ # python.version: "3.10"
Python310Windows:
imageName: "windows-latest"
python.version: "3.10"
- Python310Mac:
- imageName: "macos-latest"
- python.version: "3.10"
+ # Python310Mac:
+ # imageName: "macos-latest"
+ # python.version: "3.10"
Python311Linux:
imageName: 'ubuntu-latest'
- python.version: '3.11.0-rc.2'
+ python.version: '3.11'
Python311Windows:
imageName: 'windows-latest'
- python.version: '3.11.0-rc.2'
+ python.version: '3.11'
Python311Mac:
imageName: 'macos-latest'
- python.version: '3.11.0-rc.2'
+ python.version: '3.11'
maxParallel: 4
pool:
vmImage: $(imageName)
@@ -101,20 +101,3 @@ jobs:
- template: .github/azure-steps.yml
parameters:
python_version: '$(python.version)'
- architecture: 'x64'
-
-# - job: "TestGPU"
-# dependsOn: "Validate"
-# strategy:
-# matrix:
-# Python38LinuxX64_GPU:
-# python.version: '3.8'
-# pool:
-# name: "LinuxX64_GPU"
-# steps:
-# - template: .github/azure-steps.yml
-# parameters:
-# python_version: '$(python.version)'
-# architecture: 'x64'
-# gpu: true
-# num_build_jobs: 24
diff --git a/requirements.txt b/requirements.txt
index 9d6bbb2c4..d91a3b3d4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.1.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
-typer>=0.3.0,<0.5.0
+typer>=0.3.0,<0.8.0
pathy>=0.3.5
# Third party dependencies
numpy>=1.15.0
diff --git a/setup.cfg b/setup.cfg
index c2653feba..82d4d2758 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -51,7 +51,7 @@ install_requires =
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
# Third-party dependencies
- typer>=0.3.0,<0.5.0
+ typer>=0.3.0,<0.8.0
pathy>=0.3.5
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0
diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py
index ebab7471e..5db9e14f4 100644
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@@ -53,6 +53,7 @@ def project_run(
force: bool = False,
dry: bool = False,
capture: bool = False,
+ skip_requirements_check: bool = False,
) -> None:
"""Run a named script defined in the project.yml. If the script is part
of the default pipeline (defined in the "run" section), DVC is used to
@@ -69,6 +70,7 @@ def project_run(
sys.exit will be called with the return code. You should use capture=False
when you want to turn over execution to the command, and capture=True
when you want to run the command more like a function.
+ skip_requirements_check (bool): Whether to skip the requirements check.
"""
config = load_project_config(project_dir, overrides=overrides)
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
@@ -76,9 +78,10 @@ def project_run(
validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
req_path = project_dir / "requirements.txt"
- if config.get("check_requirements", True) and os.path.exists(req_path):
- with req_path.open() as requirements_file:
- _check_requirements([req.replace("\n", "") for req in requirements_file])
+ if not skip_requirements_check:
+ if config.get("check_requirements", True) and os.path.exists(req_path):
+ with req_path.open() as requirements_file:
+ _check_requirements([req.strip() for req in requirements_file])
if subcommand in workflows:
msg.info(f"Running workflow '{subcommand}'")
@@ -90,6 +93,7 @@ def project_run(
force=force,
dry=dry,
capture=capture,
+ skip_requirements_check=True,
)
else:
cmd = commands[subcommand]
@@ -338,6 +342,10 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
failed_pkgs_msgs.append(dnf.report())
except pkg_resources.VersionConflict as vc:
conflicting_pkgs_msgs.append(vc.report())
+ except Exception:
+ msg.warn(f"Unable to check requirement: {req} "
+ "Checks are currently limited to requirement specifiers "
+ "(PEP 508)")
if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs):
msg.warn(
diff --git a/spacy/errors.py b/spacy/errors.py
index c035f684d..3cc9fd494 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -212,8 +212,8 @@ class Warnings(metaclass=ErrorsWithCodes):
W121 = ("Attempting to trace non-existent method '{method}' in pipe '{pipe}'")
W122 = ("Couldn't trace method '{method}' in pipe '{pipe}'. This can happen if the pipe class "
"is a Cython extension type.")
- W123 = ("Argument {arg} with value {arg_value} is used instead of {config_value} as specified in the config. Be "
- "aware that this might affect other components in your pipeline.")
+ W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
+ "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
class Errors(metaclass=ErrorsWithCodes):
diff --git a/spacy/language.py b/spacy/language.py
index d391f15ab..967af1e62 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1879,31 +1879,22 @@ class Language:
if isinstance(exclude, str):
exclude = [exclude]
- def fetch_pipes_status(value: Iterable[str], key: str) -> Iterable[str]:
- """Fetch value for `enable` or `disable` w.r.t. the specified config and passed arguments passed to
- .load(). If both arguments and config specified values for this field, the passed arguments take precedence
- and a warning is printed.
- value (Iterable[str]): Passed value for `enable` or `disable`.
- key (str): Key for field in config (either "enabled" or "disabled").
- RETURN (Iterable[str]):
- """
- # We assume that no argument was passed if the value is the specified default value.
- if id(value) == id(_DEFAULT_EMPTY_PIPES):
- return config["nlp"].get(key, [])
- else:
- if len(config["nlp"].get(key, [])):
- warnings.warn(
- Warnings.W123.format(
- arg=key[:-1],
- arg_value=value,
- config_value=config["nlp"][key],
- )
+ # `enable` should not be merged with `enabled` (the opposite is true for `disable`/`disabled`). If the config
+ # specifies values for `enabled` not included in `enable`, emit warning.
+ if id(enable) != id(_DEFAULT_EMPTY_PIPES):
+ enabled = config["nlp"].get("enabled", [])
+ if len(enabled) and not set(enabled).issubset(enable):
+ warnings.warn(
+ Warnings.W123.format(
+ enable=enable,
+ enabled=enabled,
)
- return value
+ )
+ # Ensure sets of disabled/enabled pipe names are not contradictory.
disabled_pipes = cls._resolve_component_status(
- fetch_pipes_status(disable, "disabled"),
- fetch_pipes_status(enable, "enabled"),
+ list({*disable, *config["nlp"].get("disabled", [])}),
+ enable,
config["nlp"]["pipeline"],
)
nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
@@ -2084,10 +2075,12 @@ class Language:
if enable:
if isinstance(enable, str):
enable = [enable]
- to_disable = [
- pipe_name for pipe_name in pipe_names if pipe_name not in enable
- ]
- if disable and disable != to_disable:
+ to_disable = {
+ *[pipe_name for pipe_name in pipe_names if pipe_name not in enable],
+ *disable,
+ }
+ # If any pipe to be enabled is in to_disable, the specification is inconsistent.
+ if len(set(enable) & to_disable):
raise ValueError(Errors.E1042.format(enable=enable, disable=disable))
return tuple(to_disable)
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index d847342a3..0293f87e9 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -70,11 +70,10 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callab
cands.append((start_token, end_token))
candidates.append(ops.asarray2i(cands))
- candlens = ops.asarray1i([len(cands) for cands in candidates])
- candidates = ops.xp.concatenate(candidates)
- outputs = Ragged(candidates, candlens)
+ lengths = model.ops.asarray1i([len(cands) for cands in candidates])
+ out = Ragged(model.ops.flatten(candidates), lengths)
# because this is just rearranging docs, the backprop does nothing
- return outputs, lambda x: []
+ return out, lambda x: []
@registry.misc("spacy.KBFromFile.v1")
diff --git a/spacy/tests/doc/test_json_doc_conversion.py b/spacy/tests/doc/test_json_doc_conversion.py
index 19698cfb2..11a1817e6 100644
--- a/spacy/tests/doc/test_json_doc_conversion.py
+++ b/spacy/tests/doc/test_json_doc_conversion.py
@@ -370,3 +370,12 @@ def test_json_to_doc_validation_error(doc):
doc_json.pop("tokens")
with pytest.raises(ValueError):
Doc(doc.vocab).from_json(doc_json, validate=True)
+
+
+def test_to_json_underscore_doc_getters(doc):
+    """Doc extensions backed by a getter must be serialized by `Doc.to_json`."""
+    # Extensions are registered on the Doc class: overwrite leftovers, then clean up.
+    Doc.set_extension("text_length", getter=lambda d: len(d.text), force=True)
+    doc_json = doc.to_json(underscore=["text_length"])
+    assert doc_json["_"]["text_length"] == len(doc.text)
+    Doc.remove_extension("text_length")
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 82bc976bb..1c8e49a09 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -9,6 +9,7 @@ from spacy.compat import pickle
from spacy.kb import Candidate, KnowledgeBase, get_candidates
from spacy.lang.en import English
from spacy.ml import load_kb
+from spacy.ml.models.entity_linker import build_span_maker
from spacy.pipeline import EntityLinker
from spacy.pipeline.legacy import EntityLinker_v1
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
@@ -701,7 +702,11 @@ TRAIN_DATA = [
("Russ Cochran was a member of University of Kentucky's golf team.",
{"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
"entities": [(0, 12, "PERSON"), (43, 51, "LOC")],
- "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})
+ "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}),
+ # having a blank instance shouldn't break things
+ ("The weather is nice today.",
+ {"links": {}, "entities": [],
+ "sent_starts": [1, -1, 0, 0, 0, 0]})
]
GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
# fmt: on
@@ -1176,3 +1181,18 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]):
assert len(doc.ents) == 1
assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL
+
+
+def test_span_maker_forward_with_empty():
+    """A batch that mixes a doc with entities and a doc without any must not
+    break the span maker's forward pass."""
+    nlp = English()
+    with_ent, without_ent = nlp("a b c"), nlp("x y z")
+    # give the first doc a single one-token entity; the second doc keeps none
+    span = with_ent[0:1]
+    span.label_ = "X"
+    with_ent.ents = [span]
+
+    # any span maker will do; the test only checks that the call does not raise
+    model = build_span_maker()
+    model([with_ent, without_ent], False)
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index 14a7a36e5..4dd7bae16 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -615,20 +615,18 @@ def test_enable_disable_conflict_with_config():
with make_tempdir() as tmp_dir:
nlp.to_disk(tmp_dir)
- # Expected to fail, as config and arguments conflict.
- with pytest.raises(ValueError):
- spacy.load(
- tmp_dir, enable=["tagger"], config={"nlp": {"disabled": ["senter"]}}
- )
+ # Expected to succeed, as config and arguments do not conflict.
+ assert spacy.load(
+ tmp_dir, enable=["tagger"], config={"nlp": {"disabled": ["senter"]}}
+ ).disabled == ["senter", "sentencizer"]
# Expected to succeed without warning due to the lack of a conflicting config option.
spacy.load(tmp_dir, enable=["tagger"])
- # Expected to succeed with a warning, as disable=[] should override the config setting.
- with pytest.warns(UserWarning):
+ # Expected to fail due to conflict between enable and disabled.
+ with pytest.raises(ValueError):
spacy.load(
tmp_dir,
- enable=["tagger"],
- disable=[],
- config={"nlp": {"disabled": ["senter"]}},
+ enable=["senter"],
+ config={"nlp": {"disabled": ["senter", "tagger"]}},
)
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index b948bb76c..9fcf18e2d 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -404,11 +404,10 @@ def test_serialize_pipeline_disable_enable():
assert nlp3.component_names == ["ner", "tagger"]
with make_tempdir() as d:
nlp3.to_disk(d)
- with pytest.warns(UserWarning):
- nlp4 = spacy.load(d, disable=["ner"])
- assert nlp4.pipe_names == ["tagger"]
+ nlp4 = spacy.load(d, disable=["ner"])
+ assert nlp4.pipe_names == []
assert nlp4.component_names == ["ner", "tagger"]
- assert nlp4.disabled == ["ner"]
+ assert nlp4.disabled == ["ner", "tagger"]
with make_tempdir() as d:
nlp.to_disk(d)
nlp5 = spacy.load(d, exclude=["tagger"])
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 838e00369..8225e14f1 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -1,5 +1,6 @@
import os
import math
+import pkg_resources
from random import sample
from typing import Counter
@@ -25,6 +26,7 @@ from spacy.cli.download import get_compatibility, get_version
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
from spacy.cli.package import get_third_party_dependencies
from spacy.cli.package import _is_permitted_package_name
+from spacy.cli.project.run import _check_requirements
from spacy.cli.validate import get_model_pkgs
from spacy.lang.en import English
from spacy.lang.nl import Dutch
@@ -855,3 +857,42 @@ def test_span_length_freq_dist_output_must_be_correct():
span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold)
assert sum(span_freqs.values()) >= threshold
assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
+
+
+@pytest.mark.parametrize(
+    "reqs,output",
+    [
+        [
+            """
+            spacy
+
+            # comment
+
+            thinc""",
+            (False, False),  # blank lines and comment lines raise no complaint
+        ],
+        [
+            """# comment
+            --some-flag
+            spacy""",
+            (False, False),  # option lines such as --some-flag are tolerated
+        ],
+        [
+            """# comment
+            --some-flag
+            spacy; python_version >= '3.6'""",
+            (False, False),  # environment markers are accepted
+        ],
+        [
+            """# comment
+            spacyunknowndoesnotexist12345""",
+            (True, False),  # unknown package sets the first flag (presumably "missing")
+        ],
+    ],
+)
+def test_project_check_requirements(reqs, output):
+    try:  # guard: the sentinel name must not resolve; if it does, skip visibly
+        pkg_resources.require("spacyunknowndoesnotexist12345")
+        pytest.skip("sentinel package is unexpectedly installed")
+    except pkg_resources.DistributionNotFound:
+        assert output == _check_requirements([req.strip() for req in reqs.split("\n")])
diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py
index 2306cabb7..d91ed1201 100644
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@@ -23,7 +23,7 @@ def get_textcat_bow_kwargs():
def get_textcat_cnn_kwargs():
- return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
+ return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}
def get_all_params(model):
@@ -65,7 +65,7 @@ def get_tok2vec_kwargs():
}
-def test_tok2vec():
+def make_test_tok2vec():
return build_Tok2Vec_model(**get_tok2vec_kwargs())
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 295f91c28..f2621292c 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1668,6 +1668,20 @@ cdef class Doc:
if underscore:
user_keys = set()
+ # Handle doc attributes with .get to include values from getters
+ # and not only values stored in user_data, for backwards
+ # compatibility
+ for attr in underscore:
+ if self.has_extension(attr):
+ if "_" not in data:
+ data["_"] = {}
+ value = self._.get(attr)
+ if not srsly.is_json_serializable(value):
+ raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
+ data["_"][attr] = value
+ user_keys.add(attr)
+ # Token and span attributes only include values stored in user_data
+ # and not values generated by getters
if self.user_data:
for data_key, value in self.user_data.copy().items():
if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.":
@@ -1678,20 +1692,15 @@ cdef class Doc:
user_keys.add(attr)
if not srsly.is_json_serializable(value):
raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
- # Check if doc attribute
- if start is None:
- if "_" not in data:
- data["_"] = {}
- data["_"][attr] = value
- # Check if token attribute
- elif end is None:
+ # Token attribute
+ if start is not None and end is None:
if "underscore_token" not in data:
data["underscore_token"] = {}
if attr not in data["underscore_token"]:
data["underscore_token"][attr] = []
data["underscore_token"][attr].append({"start": start, "value": value})
- # Else span attribute
- else:
+ # Span attribute
+ elif start is not None and end is not None:
if "underscore_span" not in data:
data["underscore_span"] = {}
if attr not in data["underscore_span"]:
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index 617e3d19d..0a6f306a6 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -117,15 +117,13 @@ class Span:
end_char: int
label: int
kb_id: int
+ id: int
ent_id: int
ent_id_: str
@property
- def id(self) -> int: ...
- @property
- def id_(self) -> str: ...
- @property
def orth_(self) -> str: ...
@property
def lemma_(self) -> str: ...
label_: str
kb_id_: str
+ id_: str
diff --git a/spacy/util.py b/spacy/util.py
index 3034808ba..76a1e0bfa 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -443,9 +443,9 @@ def load_model_from_package(
name: str,
*,
vocab: Union["Vocab", bool] = True,
- disable: Union[str, Iterable[str]] = SimpleFrozenList(),
- enable: Union[str, Iterable[str]] = SimpleFrozenList(),
- exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
+ disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
+ enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
+ exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from an installed package.
@@ -619,9 +619,9 @@ def load_model_from_init_py(
init_file: Union[Path, str],
*,
vocab: Union["Vocab", bool] = True,
- disable: Union[str, Iterable[str]] = SimpleFrozenList(),
- enable: Union[str, Iterable[str]] = SimpleFrozenList(),
- exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
+ disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
+ enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
+ exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Helper function to use in the `load()` method of a model package's
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index 767a7450a..504640d57 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -63,18 +63,18 @@ spaCy loads a model under the hood based on its
> nlp = Language.from_config(config)
> ```
-| Name | Description |
-| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ |
-| _keyword-only_ | |
-| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
-| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
-| `enable` 3.4 | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
-| `exclude` | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
-| `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ |
-| `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ |
-| `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
-| **RETURNS** | The initialized object. ~~Language~~ |
+| Name | Description |
+| ------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ |
+| _keyword-only_ | |
+| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
+| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). Is merged with the config entry `nlp.disabled`. ~~Union[str, Iterable[str]]~~ |
+| `enable` 3.4 | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [nlp.enable_pipe](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
+| `exclude` | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
+| `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ |
+| `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ |
+| `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
+| **RETURNS** | The initialized object. ~~Language~~ |
## Language.component {#component tag="classmethod" new="3"}
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index bc53fc868..c798f2a8d 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -45,16 +45,16 @@ specified separately using the new `exclude` keyword argument.
> nlp = spacy.load("en_core_web_sm", exclude=["parser", "tagger"])
> ```
-| Name | Description |
-| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ |
-| _keyword-only_ | |
-| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
-| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
-| `enable` 3.4 | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled. ~~Union[str, Iterable[str]]~~ |
-| `exclude` 3 | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
-| `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
-| **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ |
+| Name | Description |
+| ------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ |
+| _keyword-only_ | |
+| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
+| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). Is merged with the config entry `nlp.disabled`. ~~Union[str, Iterable[str]]~~ |
+| `enable` 3.4 | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled. ~~Union[str, Iterable[str]]~~ |
+| `exclude` 3 | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
+| `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
+| **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ |
Essentially, `spacy.load()` is a convenience wrapper that reads the pipeline's
[`config.cfg`](/api/data-formats#config), uses the language and pipeline
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index bd28810ae..0b63cdcb8 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -363,7 +363,8 @@ nlp.enable_pipe("tagger")
```
In addition to `disable`, `spacy.load()` also accepts `enable`. If `enable` is
-set, all components except for those in `enable` are disabled.
+set, all components except for those in `enable` are disabled. If `enable` and
+`disable` conflict (i.e. the same component is included in both), an error is raised.
```python
# Load the complete pipeline, but disable all components except for tok2vec and tagger
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index f096890cb..64bbf8e7b 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -1792,7 +1792,7 @@ the entity `Span` – for example `._.orgs` or `._.prev_orgs` and
> [`Doc.retokenize`](/api/doc#retokenize) context manager:
>
> ```python
-> with doc.retokenize() as retokenize:
+> with doc.retokenize() as retokenizer:
> for ent in doc.ents:
> retokenizer.merge(ent)
> ```
diff --git a/website/meta/languages.json b/website/meta/languages.json
index 79e1fc5d5..06cd005de 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -4,12 +4,22 @@
"code": "af",
"name": "Afrikaans"
},
+ {
+ "code": "am",
+ "name": "Amharic",
+ "has_examples": true
+ },
{
"code": "ar",
"name": "Arabic",
"example": "هذه جملة",
"has_examples": true
},
+ {
+ "code": "az",
+ "name": "Azerbaijani",
+ "has_examples": true
+ },
{
"code": "bg",
"name": "Bulgarian",
@@ -65,7 +75,7 @@
{
"code": "dsb",
"name": "Lower Sorbian",
- "has_examples": true
+ "has_examples": true
},
{
"code": "el",
@@ -142,6 +152,11 @@
"code": "ga",
"name": "Irish"
},
+ {
+ "code": "grc",
+ "name": "Ancient Greek",
+ "has_examples": true
+ },
{
"code": "gu",
"name": "Gujarati",
@@ -172,7 +187,7 @@
{
"code": "hsb",
"name": "Upper Sorbian",
- "has_examples": true
+ "has_examples": true
},
{
"code": "hu",
@@ -260,6 +275,10 @@
"example": "Адамга эң кыйыны — күн сайын адам болуу",
"has_examples": true
},
+ {
+ "code": "la",
+ "name": "Latin"
+ },
{
"code": "lb",
"name": "Luxembourgish",
@@ -448,6 +467,11 @@
"example": "นี่คือประโยค",
"has_examples": true
},
+ {
+ "code": "ti",
+ "name": "Tigrinya",
+ "has_examples": true
+ },
{
"code": "tl",
"name": "Tagalog"
diff --git a/website/src/styles/quickstart.module.sass b/website/src/styles/quickstart.module.sass
index 8ad106a78..d0f9db551 100644
--- a/website/src/styles/quickstart.module.sass
+++ b/website/src/styles/quickstart.module.sass
@@ -149,6 +149,9 @@
& > span
display: block
+ a
+ text-decoration: underline
+
.small
font-size: var(--font-size-code)
line-height: 1.65
diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js
index 0d2186acb..28dd14ecc 100644
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@@ -159,6 +159,9 @@ const QuickstartInstall = ({ id, title }) => {
setters={setters}
showDropdown={showDropdown}
>
+
+ # Note M1 GPU support is experimental, see Thinc issue #792
+
python -m venv .env
@@ -198,7 +201,13 @@ const QuickstartInstall = ({ id, title }) => {
{nightly ? ' --pre' : ''}
conda install -c conda-forge spacy
-
+
+ conda install -c conda-forge cupy
+
+
+ conda install -c conda-forge cupy
+
+
conda install -c conda-forge cupy