Merge branch 'master' of https://github.com/explosion/spaCy into feature/etl

This commit is contained in:
richardpaulhudson 2022-11-09 12:24:36 +01:00
commit 54bdc11353
37 changed files with 377 additions and 208 deletions

View File

@ -1,9 +1,7 @@
parameters: parameters:
python_version: '' python_version: ''
architecture: '' architecture: 'x64'
prefix: '' num_build_jobs: 2
gpu: false
num_build_jobs: 1
steps: steps:
- task: UsePythonVersion@0 - task: UsePythonVersion@0
@ -17,16 +15,16 @@ steps:
displayName: 'Set variables' displayName: 'Set variables'
- script: | - script: |
${{ parameters.prefix }} python -m pip install -U pip setuptools python -m pip install -U build pip setuptools
${{ parameters.prefix }} python -m pip install -U -r requirements.txt python -m pip install -U -r requirements.txt
displayName: "Install dependencies" displayName: "Install dependencies"
- script: | - script: |
${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }} python -m build --sdist
${{ parameters.prefix }} python setup.py sdist --formats=gztar displayName: "Build sdist"
displayName: "Compile and build sdist"
- script: python -m mypy spacy - script: |
python -m mypy spacy
displayName: 'Run mypy' displayName: 'Run mypy'
condition: ne(variables['python_version'], '3.6') condition: ne(variables['python_version'], '3.6')
@ -35,35 +33,24 @@ steps:
contents: "spacy" contents: "spacy"
displayName: "Delete source directory" displayName: "Delete source directory"
- task: DeleteFiles@1
inputs:
contents: "*.egg-info"
displayName: "Delete egg-info directory"
- script: | - script: |
${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt python -m pip freeze > installed.txt
${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt python -m pip uninstall -y -r installed.txt
displayName: "Uninstall all packages" displayName: "Uninstall all packages"
- bash: | - bash: |
${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
${{ parameters.prefix }} SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
displayName: "Install from sdist" displayName: "Install from sdist"
- script: | - script: |
${{ parameters.prefix }} python -m pip install -U -r requirements.txt python -W error -c "import spacy"
displayName: "Install test requirements" displayName: "Test import"
- script: |
${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
displayName: "Install GPU requirements"
condition: eq(${{ parameters.gpu }}, true)
- script: |
${{ parameters.prefix }} python -m pytest --pyargs spacy -W error
displayName: "Run CPU tests"
condition: eq(${{ parameters.gpu }}, false)
- script: |
${{ parameters.prefix }} python -m pytest --pyargs spacy -W error -p spacy.tests.enable_gpu
displayName: "Run GPU tests"
condition: eq(${{ parameters.gpu }}, true)
- script: | - script: |
python -m spacy download ca_core_news_sm python -m spacy download ca_core_news_sm
@ -72,6 +59,11 @@ steps:
displayName: 'Test download CLI' displayName: 'Test download CLI'
condition: eq(variables['python_version'], '3.8') condition: eq(variables['python_version'], '3.8')
- script: |
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
displayName: 'Test no warnings on load (#11713)'
condition: eq(variables['python_version'], '3.8')
- script: | - script: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
displayName: 'Test convert CLI' displayName: 'Test convert CLI'
@ -106,13 +98,22 @@ steps:
displayName: 'Test assemble CLI vectors warning' displayName: 'Test assemble CLI vectors warning'
condition: eq(variables['python_version'], '3.8') condition: eq(variables['python_version'], '3.8')
- script: |
python -m pip install -U -r requirements.txt
displayName: "Install test requirements"
- script: |
python -m pytest --pyargs spacy -W error
displayName: "Run CPU tests"
- script: |
python -m pip install --pre thinc-apple-ops
python -m pytest --pyargs spacy
displayName: "Run CPU tests with thinc-apple-ops"
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
- script: | - script: |
python .github/validate_universe_json.py website/meta/universe.json python .github/validate_universe_json.py website/meta/universe.json
displayName: 'Test website/meta/universe.json' displayName: 'Test website/meta/universe.json'
condition: eq(variables['python_version'], '3.8') condition: eq(variables['python_version'], '3.8')
- script: |
${{ parameters.prefix }} python -m pip install --pre thinc-apple-ops
${{ parameters.prefix }} python -m pytest --pyargs spacy
displayName: "Run CPU tests with thinc-apple-ops"
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10'))

View File

@ -12,10 +12,10 @@ jobs:
if: github.repository_owner == 'explosion' if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v3
with: with:
ref: ${{ github.head_ref }} ref: ${{ github.head_ref }}
- uses: actions/setup-python@v2 - uses: actions/setup-python@v4
- run: pip install black - run: pip install black
- name: Auto-format code if needed - name: Auto-format code if needed
run: black spacy run: black spacy
@ -23,10 +23,11 @@ jobs:
# code and makes GitHub think the action failed # code and makes GitHub think the action failed
- name: Check for modified files - name: Check for modified files
id: git-check id: git-check
run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
- name: Create Pull Request - name: Create Pull Request
if: steps.git-check.outputs.modified == 'true' if: steps.git-check.outputs.modified == 'true'
uses: peter-evans/create-pull-request@v3 uses: peter-evans/create-pull-request@v4
with: with:
title: Auto-format code with black title: Auto-format code with black
labels: meta labels: meta

View File

@ -8,14 +8,14 @@ on:
jobs: jobs:
explosion-bot: explosion-bot:
runs-on: ubuntu-18.04 runs-on: ubuntu-latest
steps: steps:
- name: Dump GitHub context - name: Dump GitHub context
env: env:
GITHUB_CONTEXT: ${{ toJson(github) }} GITHUB_CONTEXT: ${{ toJson(github) }}
run: echo "$GITHUB_CONTEXT" run: echo "$GITHUB_CONTEXT"
- uses: actions/checkout@v1 - uses: actions/checkout@v3
- uses: actions/setup-python@v1 - uses: actions/setup-python@v4
- name: Install and run explosion-bot - name: Install and run explosion-bot
run: | run: |
pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot

View File

@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Checkout - name: Checkout
uses: actions/checkout@v1 uses: actions/checkout@v3
with: with:
ref: ${{ matrix.branch }} ref: ${{ matrix.branch }}
- name: Get commits from past 24 hours - name: Get commits from past 24 hours
@ -23,9 +23,9 @@ jobs:
today=$(date '+%Y-%m-%d %H:%M:%S') today=$(date '+%Y-%m-%d %H:%M:%S')
yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S') yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
if git log --after="$yesterday" --before="$today" | grep commit ; then if git log --after="$yesterday" --before="$today" | grep commit ; then
echo "::set-output name=run_tests::true" echo run_tests=true >> $GITHUB_OUTPUT
else else
echo "::set-output name=run_tests::false" echo run_tests=false >> $GITHUB_OUTPUT
fi fi
- name: Trigger buildkite build - name: Trigger buildkite build

View File

@ -17,8 +17,10 @@ jobs:
run: | run: |
echo "$GITHUB_CONTEXT" echo "$GITHUB_CONTEXT"
- uses: actions/checkout@v1 - uses: actions/checkout@v3
- uses: actions/setup-python@v1 - uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Install Bernadette app dependency and send an alert - name: Install Bernadette app dependency and send an alert
env: env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

View File

@ -8,7 +8,7 @@ be used in real products.
spaCy comes with spaCy comes with
[pretrained pipelines](https://spacy.io/models) and [pretrained pipelines](https://spacy.io/models) and
currently supports tokenization and training for **60+ languages**. It features currently supports tokenization and training for **70+ languages**. It features
state-of-the-art speed and **neural network models** for tagging, state-of-the-art speed and **neural network models** for tagging,
parsing, **named entity recognition**, **text classification** and more, parsing, **named entity recognition**, **text classification** and more,
multi-task learning with pretrained **transformers** like BERT, as well as a multi-task learning with pretrained **transformers** like BERT, as well as a
@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial model packaging, deployment and workflow management. spaCy is commercial
open-source software, released under the MIT license. open-source software, released under the MIT license.
💫 **Version 3.4.0 out now!** 💫 **Version 3.4 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases) [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8) [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@ -79,7 +79,7 @@ more people can benefit from it.
## Features ## Features
- Support for **60+ languages** - Support for **70+ languages**
- **Trained pipelines** for different languages and tasks - **Trained pipelines** for different languages and tasks
- Multi-task learning with pretrained **transformers** like BERT - Multi-task learning with pretrained **transformers** like BERT
- Support for pretrained **word vectors** and embeddings - Support for pretrained **word vectors** and embeddings

View File

@ -76,24 +76,24 @@ jobs:
# Python39Mac: # Python39Mac:
# imageName: "macos-latest" # imageName: "macos-latest"
# python.version: "3.9" # python.version: "3.9"
Python310Linux: # Python310Linux:
imageName: "ubuntu-latest" # imageName: "ubuntu-latest"
python.version: "3.10" # python.version: "3.10"
Python310Windows: Python310Windows:
imageName: "windows-latest" imageName: "windows-latest"
python.version: "3.10" python.version: "3.10"
Python310Mac: # Python310Mac:
imageName: "macos-latest" # imageName: "macos-latest"
python.version: "3.10" # python.version: "3.10"
Python311Linux: Python311Linux:
imageName: 'ubuntu-latest' imageName: 'ubuntu-latest'
python.version: '3.11.0-rc.2' python.version: '3.11'
Python311Windows: Python311Windows:
imageName: 'windows-latest' imageName: 'windows-latest'
python.version: '3.11.0-rc.2' python.version: '3.11'
Python311Mac: Python311Mac:
imageName: 'macos-latest' imageName: 'macos-latest'
python.version: '3.11.0-rc.2' python.version: '3.11'
maxParallel: 4 maxParallel: 4
pool: pool:
vmImage: $(imageName) vmImage: $(imageName)
@ -101,20 +101,3 @@ jobs:
- template: .github/azure-steps.yml - template: .github/azure-steps.yml
parameters: parameters:
python_version: '$(python.version)' python_version: '$(python.version)'
architecture: 'x64'
# - job: "TestGPU"
# dependsOn: "Validate"
# strategy:
# matrix:
# Python38LinuxX64_GPU:
# python.version: '3.8'
# pool:
# name: "LinuxX64_GPU"
# steps:
# - template: .github/azure-steps.yml
# parameters:
# python_version: '$(python.version)'
# architecture: 'x64'
# gpu: true
# num_build_jobs: 24

View File

@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.1.0 wasabi>=0.9.1,<1.1.0
srsly>=2.4.3,<3.0.0 srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0 catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.5.0 typer>=0.3.0,<0.8.0
pathy>=0.3.5 pathy>=0.3.5
# Third party dependencies # Third party dependencies
numpy>=1.15.0 numpy>=1.15.0

View File

@ -51,7 +51,7 @@ install_requires =
srsly>=2.4.3,<3.0.0 srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0 catalogue>=2.0.6,<2.1.0
# Third-party dependencies # Third-party dependencies
typer>=0.3.0,<0.5.0 typer>=0.3.0,<0.8.0
pathy>=0.3.5 pathy>=0.3.5
tqdm>=4.38.0,<5.0.0 tqdm>=4.38.0,<5.0.0
numpy>=1.15.0 numpy>=1.15.0

View File

@ -10,6 +10,7 @@ from .._util import get_hash, get_checksum, download_file, ensure_pathy
from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
from ...git_info import GIT_VERSION from ...git_info import GIT_VERSION
from ... import about from ... import about
from ...errors import Errors
if TYPE_CHECKING: if TYPE_CHECKING:
from pathy import Pathy # noqa: F401 from pathy import Pathy # noqa: F401
@ -84,7 +85,23 @@ class RemoteStorage:
with tarfile.open(tar_loc, mode=mode_string) as tar_file: with tarfile.open(tar_loc, mode=mode_string) as tar_file:
# This requires that the path is added correctly, relative # This requires that the path is added correctly, relative
# to root. This is how we set things up in push() # to root. This is how we set things up in push()
tar_file.extractall(self.root)
# Disallow paths outside the current directory for the tar
# file (CVE-2007-4559, directory traversal vulnerability)
def is_within_directory(directory, target):
    """Check whether `target` is `directory` itself or a path beneath it.

    Both paths are made absolute and normalized before comparison, so
    ".." segments in `target` are resolved first.

    directory (str): Directory the target must stay inside.
    target (str): Candidate path to check.
    RETURNS (bool): True if `target` lies within `directory`.
    """
    abs_directory = os.path.abspath(directory)
    abs_target = os.path.abspath(target)
    # os.path.commonprefix compares character by character, so a sibling such
    # as "/data/root-evil" would be accepted for "/data/root". commonpath
    # compares whole path components instead.
    try:
        common = os.path.commonpath([abs_directory, abs_target])
    except ValueError:
        # Raised when the paths cannot share a root (e.g. different drives
        # on Windows); such a target is never inside the directory.
        return False
    return common == abs_directory
def safe_extract(tar, path):
    """Extract every member of `tar` into `path`, refusing archives that
    contain a member whose destination falls outside `path`.

    tar: Open tar archive providing `getmembers()` and `extractall()`.
    path: Destination directory.
    RAISES (ValueError): If any member fails the containment check; nothing
        is extracted in that case.
    """
    escapes_root = any(
        not is_within_directory(path, os.path.join(path, entry.name))
        for entry in tar.getmembers()
    )
    if escapes_root:
        raise ValueError(Errors.E852)
    # All members were validated above, so extract the whole archive at once.
    tar.extractall(path)
safe_extract(tar_file, self.root)
return url return url
def find( def find(

View File

@ -53,6 +53,7 @@ def project_run(
force: bool = False, force: bool = False,
dry: bool = False, dry: bool = False,
capture: bool = False, capture: bool = False,
skip_requirements_check: bool = False,
) -> None: ) -> None:
"""Run a named script defined in the project.yml. If the script is part """Run a named script defined in the project.yml. If the script is part
of the default pipeline (defined in the "run" section), DVC is used to of the default pipeline (defined in the "run" section), DVC is used to
@ -69,6 +70,7 @@ def project_run(
sys.exit will be called with the return code. You should use capture=False sys.exit will be called with the return code. You should use capture=False
when you want to turn over execution to the command, and capture=True when you want to turn over execution to the command, and capture=True
when you want to run the command more like a function. when you want to run the command more like a function.
skip_requirements_check (bool): Whether to skip the requirements check.
""" """
config = load_project_config(project_dir, overrides=overrides) config = load_project_config(project_dir, overrides=overrides)
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
@ -76,9 +78,10 @@ def project_run(
validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand) validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
req_path = project_dir / "requirements.txt" req_path = project_dir / "requirements.txt"
if config.get("check_requirements", True) and os.path.exists(req_path): if not skip_requirements_check:
with req_path.open() as requirements_file: if config.get("check_requirements", True) and os.path.exists(req_path):
_check_requirements([req.replace("\n", "") for req in requirements_file]) with req_path.open() as requirements_file:
_check_requirements([req.strip() for req in requirements_file])
if subcommand in workflows: if subcommand in workflows:
msg.info(f"Running workflow '{subcommand}'") msg.info(f"Running workflow '{subcommand}'")
@ -90,6 +93,7 @@ def project_run(
force=force, force=force,
dry=dry, dry=dry,
capture=capture, capture=capture,
skip_requirements_check=True,
) )
else: else:
cmd = commands[subcommand] cmd = commands[subcommand]
@ -338,6 +342,10 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
failed_pkgs_msgs.append(dnf.report()) failed_pkgs_msgs.append(dnf.report())
except pkg_resources.VersionConflict as vc: except pkg_resources.VersionConflict as vc:
conflicting_pkgs_msgs.append(vc.report()) conflicting_pkgs_msgs.append(vc.report())
except Exception:
msg.warn(f"Unable to check requirement: {req} "
"Checks are currently limited to requirement specifiers "
"(PEP 508)")
if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs): if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs):
msg.warn( msg.warn(

View File

@ -212,8 +212,8 @@ class Warnings(metaclass=ErrorsWithCodes):
W121 = ("Attempting to trace non-existent method '{method}' in pipe '{pipe}'") W121 = ("Attempting to trace non-existent method '{method}' in pipe '{pipe}'")
W122 = ("Couldn't trace method '{method}' in pipe '{pipe}'. This can happen if the pipe class " W122 = ("Couldn't trace method '{method}' in pipe '{pipe}'. This can happen if the pipe class "
"is a Cython extension type.") "is a Cython extension type.")
W123 = ("Argument {arg} with value {arg_value} is used instead of {config_value} as specified in the config. Be " W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
"aware that this might affect other components in your pipeline.") "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
W124 = ("Using the features PREFIX and/or SUFFIX in a RichFeatureExtractor configuration may lead to the same " W124 = ("Using the features PREFIX and/or SUFFIX in a RichFeatureExtractor configuration may lead to the same "
"information being fed forward twice if prefixes and suffixes of corresponding lengths are specified.") "information being fed forward twice if prefixes and suffixes of corresponding lengths are specified.")
@ -546,6 +546,8 @@ class Errors(metaclass=ErrorsWithCodes):
"during training, make sure to include it in 'annotating components'") "during training, make sure to include it in 'annotating components'")
# New errors added in v3.x # New errors added in v3.x
E852 = ("The tar file pulled from the remote attempted an unsafe path "
"traversal.")
E853 = ("Unsupported component factory name '{name}'. The character '.' is " E853 = ("Unsupported component factory name '{name}'. The character '.' is "
"not permitted in factory names.") "not permitted in factory names.")
E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not " E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "

View File

@ -1879,31 +1879,22 @@ class Language:
if isinstance(exclude, str): if isinstance(exclude, str):
exclude = [exclude] exclude = [exclude]
def fetch_pipes_status(value: Iterable[str], key: str) -> Iterable[str]: # `enable` should not be merged with `enabled` (the opposite is true for `disable`/`disabled`). If the config
"""Fetch value for `enable` or `disable` w.r.t. the specified config and passed arguments passed to # specifies values for `enabled` not included in `enable`, emit warning.
.load(). If both arguments and config specified values for this field, the passed arguments take precedence if id(enable) != id(_DEFAULT_EMPTY_PIPES):
and a warning is printed. enabled = config["nlp"].get("enabled", [])
value (Iterable[str]): Passed value for `enable` or `disable`. if len(enabled) and not set(enabled).issubset(enable):
key (str): Key for field in config (either "enabled" or "disabled"). warnings.warn(
RETURN (Iterable[str]): Warnings.W123.format(
""" enable=enable,
# We assume that no argument was passed if the value is the specified default value. enabled=enabled,
if id(value) == id(_DEFAULT_EMPTY_PIPES):
return config["nlp"].get(key, [])
else:
if len(config["nlp"].get(key, [])):
warnings.warn(
Warnings.W123.format(
arg=key[:-1],
arg_value=value,
config_value=config["nlp"][key],
)
) )
return value )
# Ensure sets of disabled/enabled pipe names are not contradictory.
disabled_pipes = cls._resolve_component_status( disabled_pipes = cls._resolve_component_status(
fetch_pipes_status(disable, "disabled"), list({*disable, *config["nlp"].get("disabled", [])}),
fetch_pipes_status(enable, "enabled"), enable,
config["nlp"]["pipeline"], config["nlp"]["pipeline"],
) )
nlp._disabled = set(p for p in disabled_pipes if p not in exclude) nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
@ -2084,10 +2075,12 @@ class Language:
if enable: if enable:
if isinstance(enable, str): if isinstance(enable, str):
enable = [enable] enable = [enable]
to_disable = [ to_disable = {
pipe_name for pipe_name in pipe_names if pipe_name not in enable *[pipe_name for pipe_name in pipe_names if pipe_name not in enable],
] *disable,
if disable and disable != to_disable: }
# If any pipe to be enabled is in to_disable, the specification is inconsistent.
if len(set(enable) & to_disable):
raise ValueError(Errors.E1042.format(enable=enable, disable=disable)) raise ValueError(Errors.E1042.format(enable=enable, disable=disable))
return tuple(to_disable) return tuple(to_disable)

View File

@ -71,11 +71,10 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callab
cands.append((start_token, end_token)) cands.append((start_token, end_token))
candidates.append(ops.asarray2i(cands)) candidates.append(ops.asarray2i(cands))
candlens = ops.asarray1i([len(cands) for cands in candidates]) lengths = model.ops.asarray1i([len(cands) for cands in candidates])
candidates = ops.xp.concatenate(candidates) out = Ragged(model.ops.flatten(candidates), lengths)
outputs = Ragged(candidates, candlens)
# because this is just rearranging docs, the backprop does nothing # because this is just rearranging docs, the backprop does nothing
return outputs, lambda x: [] return out, lambda x: []
@registry.misc("spacy.KBFromFile.v1") @registry.misc("spacy.KBFromFile.v1")

View File

@ -24,8 +24,8 @@ single_label_default_config = """
[model.tok2vec.embed] [model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2" @architectures = "spacy.MultiHashEmbed.v2"
width = 64 width = 64
rows = [2000, 2000, 1000, 1000, 1000, 1000] rows = [2000, 2000, 500, 1000, 500]
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false include_static_vectors = false
[model.tok2vec.encode] [model.tok2vec.encode]
@ -72,7 +72,7 @@ subword_features = true
"textcat", "textcat",
assigns=["doc.cats"], assigns=["doc.cats"],
default_config={ default_config={
"threshold": 0.5, "threshold": 0.0,
"model": DEFAULT_SINGLE_TEXTCAT_MODEL, "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_scorer.v1"}, "scorer": {"@scorers": "spacy.textcat_scorer.v1"},
}, },
@ -144,7 +144,8 @@ class TextCategorizer(TrainablePipe):
model (thinc.api.Model): The Thinc Model powering the pipeline component. model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the name (str): The component instance name, used to add entries to the
losses during training. losses during training.
threshold (float): Cutoff to consider a prediction "positive". threshold (float): Unused, not needed for single-label (exclusive
classes) classification.
scorer (Optional[Callable]): The scoring method. Defaults to scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_cats for the attribute "cats". Scorer.score_cats for the attribute "cats".
@ -154,7 +155,11 @@ class TextCategorizer(TrainablePipe):
self.model = model self.model = model
self.name = name self.name = name
self._rehearsal_model = None self._rehearsal_model = None
cfg = {"labels": [], "threshold": threshold, "positive_label": None} cfg: Dict[str, Any] = {
"labels": [],
"threshold": threshold,
"positive_label": None,
}
self.cfg = dict(cfg) self.cfg = dict(cfg)
self.scorer = scorer self.scorer = scorer

View File

@ -24,8 +24,8 @@ multi_label_default_config = """
[model.tok2vec.embed] [model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2" @architectures = "spacy.MultiHashEmbed.v2"
width = 64 width = 64
rows = [2000, 2000, 1000, 1000, 1000, 1000] rows = [2000, 2000, 500, 1000, 500]
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false include_static_vectors = false
[model.tok2vec.encode] [model.tok2vec.encode]

View File

@ -446,7 +446,7 @@ class Scorer:
labels (Iterable[str]): The set of possible labels. Defaults to []. labels (Iterable[str]): The set of possible labels. Defaults to [].
multi_label (bool): Whether the attribute allows multiple labels. multi_label (bool): Whether the attribute allows multiple labels.
Defaults to True. When set to False (exclusive labels), missing Defaults to True. When set to False (exclusive labels), missing
gold labels are interpreted as 0.0. gold labels are interpreted as 0.0 and the threshold is set to 0.0.
positive_label (str): The positive label for a binary task with positive_label (str): The positive label for a binary task with
exclusive classes. Defaults to None. exclusive classes. Defaults to None.
threshold (float): Cutoff to consider a prediction "positive". Defaults threshold (float): Cutoff to consider a prediction "positive". Defaults
@ -471,6 +471,8 @@ class Scorer:
""" """
if threshold is None: if threshold is None:
threshold = 0.5 if multi_label else 0.0 threshold = 0.5 if multi_label else 0.0
if not multi_label:
threshold = 0.0
f_per_type = {label: PRFScore() for label in labels} f_per_type = {label: PRFScore() for label in labels}
auc_per_type = {label: ROCAUCScore() for label in labels} auc_per_type = {label: ROCAUCScore() for label in labels}
labels = set(labels) labels = set(labels)
@ -505,20 +507,18 @@ class Scorer:
# Get the highest-scoring for each. # Get the highest-scoring for each.
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1]) pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1]) gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
if pred_label == gold_label and pred_score >= threshold: if pred_label == gold_label:
f_per_type[pred_label].tp += 1 f_per_type[pred_label].tp += 1
else: else:
f_per_type[gold_label].fn += 1 f_per_type[gold_label].fn += 1
if pred_score >= threshold: f_per_type[pred_label].fp += 1
f_per_type[pred_label].fp += 1
elif gold_cats: elif gold_cats:
gold_label, gold_score = max(gold_cats, key=lambda it: it[1]) gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
if gold_score > 0: if gold_score > 0:
f_per_type[gold_label].fn += 1 f_per_type[gold_label].fn += 1
elif pred_cats: elif pred_cats:
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1]) pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
if pred_score >= threshold: f_per_type[pred_label].fp += 1
f_per_type[pred_label].fp += 1
micro_prf = PRFScore() micro_prf = PRFScore()
for label_prf in f_per_type.values(): for label_prf in f_per_type.values():
micro_prf.tp += label_prf.tp micro_prf.tp += label_prf.tp

View File

@ -370,3 +370,12 @@ def test_json_to_doc_validation_error(doc):
doc_json.pop("tokens") doc_json.pop("tokens")
with pytest.raises(ValueError): with pytest.raises(ValueError):
Doc(doc.vocab).from_json(doc_json, validate=True) Doc(doc.vocab).from_json(doc_json, validate=True)
def test_to_json_underscore_doc_getters(doc):
    """A getter-backed Doc extension requested via `underscore` should appear
    under the "_" key of the JSON output with the getter's value."""

    def get_text_length(doc):
        return len(doc.text)

    # The extension is registered on the Doc class, not on this instance:
    # force=True keeps the test re-runnable if the name is already taken, and
    # the finally block unregisters it so it does not leak into other tests.
    Doc.set_extension("text_length", getter=get_text_length, force=True)
    try:
        doc_json = doc.to_json(underscore=["text_length"])
        assert doc_json["_"]["text_length"] == get_text_length(doc)
    finally:
        Doc.remove_extension("text_length")

View File

@ -9,6 +9,7 @@ from spacy.compat import pickle
from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
from spacy.lang.en import English from spacy.lang.en import English
from spacy.ml import load_kb from spacy.ml import load_kb
from spacy.ml.models.entity_linker import build_span_maker
from spacy.pipeline import EntityLinker from spacy.pipeline import EntityLinker
from spacy.pipeline.legacy import EntityLinker_v1 from spacy.pipeline.legacy import EntityLinker_v1
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
@ -715,7 +716,11 @@ TRAIN_DATA = [
("Russ Cochran was a member of University of Kentucky's golf team.", ("Russ Cochran was a member of University of Kentucky's golf team.",
{"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}, {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
"entities": [(0, 12, "PERSON"), (43, 51, "LOC")], "entities": [(0, 12, "PERSON"), (43, 51, "LOC")],
"sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}) "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}),
# having a blank instance shouldn't break things
("The weather is nice today.",
{"links": {}, "entities": [],
"sent_starts": [1, -1, 0, 0, 0, 0]})
] ]
GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"] GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
# fmt: on # fmt: on
@ -1196,3 +1201,18 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]):
assert len(doc.ents) == 1 assert len(doc.ents) == 1
assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL
def test_span_maker_forward_with_empty():
    """The forward pass of the span maker may have a doc with no entities."""
    nlp = English()

    # First doc carries exactly one entity span.
    doc_with_ent = nlp("a b c")
    first_token_span = doc_with_ent[0:1]
    first_token_span.label_ = "X"
    doc_with_ent.ents = [first_token_span]

    # Second doc has no entities at all.
    doc_without_ents = nlp("x y z")

    # Only the forward pass matters here: the call must not raise.
    model = build_span_maker()
    model([doc_with_ent, doc_without_ents], False)

View File

@ -615,20 +615,18 @@ def test_enable_disable_conflict_with_config():
with make_tempdir() as tmp_dir: with make_tempdir() as tmp_dir:
nlp.to_disk(tmp_dir) nlp.to_disk(tmp_dir)
# Expected to fail, as config and arguments conflict. # Expected to succeed, as config and arguments do not conflict.
with pytest.raises(ValueError): assert spacy.load(
spacy.load( tmp_dir, enable=["tagger"], config={"nlp": {"disabled": ["senter"]}}
tmp_dir, enable=["tagger"], config={"nlp": {"disabled": ["senter"]}} ).disabled == ["senter", "sentencizer"]
)
# Expected to succeed without warning due to the lack of a conflicting config option. # Expected to succeed without warning due to the lack of a conflicting config option.
spacy.load(tmp_dir, enable=["tagger"]) spacy.load(tmp_dir, enable=["tagger"])
# Expected to succeed with a warning, as disable=[] should override the config setting. # Expected to fail due to conflict between enable and disabled.
with pytest.warns(UserWarning): with pytest.raises(ValueError):
spacy.load( spacy.load(
tmp_dir, tmp_dir,
enable=["tagger"], enable=["senter"],
disable=[], config={"nlp": {"disabled": ["senter", "tagger"]}},
config={"nlp": {"disabled": ["senter"]}},
) )

View File

@ -823,10 +823,10 @@ def test_textcat_loss(multi_label: bool, expected_loss: float):
assert loss == expected_loss assert loss == expected_loss
def test_textcat_threshold(): def test_textcat_multilabel_threshold():
# Ensure the scorer can be called with a different threshold # Ensure the scorer can be called with a different threshold
nlp = English() nlp = English()
nlp.add_pipe("textcat") nlp.add_pipe("textcat_multilabel")
train_examples = [] train_examples = []
for text, annotations in TRAIN_DATA_SINGLE_LABEL: for text, annotations in TRAIN_DATA_SINGLE_LABEL:
@ -849,7 +849,7 @@ def test_textcat_threshold():
) )
pos_f = scores["cats_score"] pos_f = scores["cats_score"]
assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0 assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
assert pos_f > macro_f assert pos_f >= macro_f
def test_textcat_multi_threshold(): def test_textcat_multi_threshold():

View File

@ -404,11 +404,10 @@ def test_serialize_pipeline_disable_enable():
assert nlp3.component_names == ["ner", "tagger"] assert nlp3.component_names == ["ner", "tagger"]
with make_tempdir() as d: with make_tempdir() as d:
nlp3.to_disk(d) nlp3.to_disk(d)
with pytest.warns(UserWarning): nlp4 = spacy.load(d, disable=["ner"])
nlp4 = spacy.load(d, disable=["ner"]) assert nlp4.pipe_names == []
assert nlp4.pipe_names == ["tagger"]
assert nlp4.component_names == ["ner", "tagger"] assert nlp4.component_names == ["ner", "tagger"]
assert nlp4.disabled == ["ner"] assert nlp4.disabled == ["ner", "tagger"]
with make_tempdir() as d: with make_tempdir() as d:
nlp.to_disk(d) nlp.to_disk(d)
nlp5 = spacy.load(d, exclude=["tagger"]) nlp5 = spacy.load(d, exclude=["tagger"])

View File

@ -1,5 +1,6 @@
import os import os
import math import math
import pkg_resources
from random import sample from random import sample
from typing import Counter from typing import Counter
@ -25,6 +26,7 @@ from spacy.cli.download import get_compatibility, get_version
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
from spacy.cli.package import get_third_party_dependencies from spacy.cli.package import get_third_party_dependencies
from spacy.cli.package import _is_permitted_package_name from spacy.cli.package import _is_permitted_package_name
from spacy.cli.project.run import _check_requirements
from spacy.cli.validate import get_model_pkgs from spacy.cli.validate import get_model_pkgs
from spacy.lang.en import English from spacy.lang.en import English
from spacy.lang.nl import Dutch from spacy.lang.nl import Dutch
@ -855,3 +857,42 @@ def test_span_length_freq_dist_output_must_be_correct():
span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold) span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold)
assert sum(span_freqs.values()) >= threshold assert sum(span_freqs.values()) >= threshold
assert list(span_freqs.keys()) == [3, 1, 4, 5, 2] assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
@pytest.mark.parametrize(
    "reqs,output",
    [
        # plain package names with a comment line in between
        [
            "\n"
            "spacy\n"
            "# comment\n"
            "thinc",
            (False, False),
        ],
        # leading comment and a command-line style flag
        [
            "# comment\n"
            "--some-flag\n"
            "spacy",
            (False, False),
        ],
        # requirement carrying an environment marker
        [
            "# comment\n"
            "--some-flag\n"
            "spacy; python_version >= '3.6'",
            (False, False),
        ],
        # a package name that is not expected to be installed
        [
            "# comment\n"
            "spacyunknowndoesnotexist12345",
            (True, False),
        ],
    ],
)
def test_project_check_requirements(reqs, output):
    """Compare `_check_requirements` with the expected tuple for each snippet.

    reqs: requirements text; it is split on newlines and every line is
        stripped before being handed to `_check_requirements`.
    output: the tuple `_check_requirements` is expected to return.
    """
    # excessive guard against unlikely package name: the comparison only runs
    # when the placeholder package really is absent from the environment
    try:
        pkg_resources.require("spacyunknowndoesnotexist12345")
    except pkg_resources.DistributionNotFound:
        requirement_lines = [line.strip() for line in reqs.split("\n")]
        assert output == _check_requirements(requirement_lines)

View File

@ -23,7 +23,7 @@ def get_textcat_bow_kwargs():
def get_textcat_cnn_kwargs(): def get_textcat_cnn_kwargs():
return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13} return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}
def get_all_params(model): def get_all_params(model):
@ -65,7 +65,7 @@ def get_tok2vec_kwargs():
} }
def test_tok2vec(): def make_test_tok2vec():
return build_Tok2Vec_model(**get_tok2vec_kwargs()) return build_Tok2Vec_model(**get_tok2vec_kwargs())

View File

@ -474,3 +474,50 @@ def test_prf_score():
assert (a.precision, a.recall, a.fscore) == approx( assert (a.precision, a.recall, a.fscore) == approx(
(c.precision, c.recall, c.fscore) (c.precision, c.recall, c.fscore)
) )
def test_score_cats(en_tokenizer):
    """Check how `Scorer.score_cats` treats `threshold` in both label modes.

    One example is scored: gold cats POSITIVE=1.0 / NEGATIVE=0.0 against
    predicted cats POSITIVE=0.75 / NEGATIVE=0.25.
    """
    text = "some text"
    gold_doc = en_tokenizer(text)
    gold_doc.cats = {"POSITIVE": 1.0, "NEGATIVE": 0.0}
    pred_doc = en_tokenizer(text)
    pred_doc.cats = {"POSITIVE": 0.75, "NEGATIVE": 0.25}
    example = Example(pred_doc, gold_doc)

    def score(**options):
        # Fresh example and label lists are built for every call.
        return Scorer.score_cats(
            [example],
            "cats",
            labels=list(gold_doc.cats.keys()),
            **options,
        )

    # threshold is ignored for multi_label=False
    low_exclusive = score(
        multi_label=False, positive_label="POSITIVE", threshold=0.1
    )
    high_exclusive = score(
        multi_label=False, positive_label="POSITIVE", threshold=0.9
    )
    assert low_exclusive["cats_score"] == 1.0
    assert high_exclusive["cats_score"] == 1.0
    assert low_exclusive == high_exclusive

    # threshold is relevant for multi_label=True: both predicted scores
    # (0.75 and 0.25) are below 0.9
    strict_multi = score(multi_label=True, threshold=0.9)
    assert strict_multi["cats_macro_f"] == 0.0

    # threshold is relevant for multi_label=True: both predicted scores
    # are above 0.1
    lenient_multi = score(multi_label=True, threshold=0.1)
    assert lenient_multi["cats_macro_f"] == 0.5

View File

@ -1667,6 +1667,20 @@ cdef class Doc:
if underscore: if underscore:
user_keys = set() user_keys = set()
# Handle doc attributes with .get to include values from getters
# and not only values stored in user_data, for backwards
# compatibility
for attr in underscore:
if self.has_extension(attr):
if "_" not in data:
data["_"] = {}
value = self._.get(attr)
if not srsly.is_json_serializable(value):
raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
data["_"][attr] = value
user_keys.add(attr)
# Token and span attributes only include values stored in user_data
# and not values generated by getters
if self.user_data: if self.user_data:
for data_key, value in self.user_data.copy().items(): for data_key, value in self.user_data.copy().items():
if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.": if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.":
@ -1677,20 +1691,15 @@ cdef class Doc:
user_keys.add(attr) user_keys.add(attr)
if not srsly.is_json_serializable(value): if not srsly.is_json_serializable(value):
raise ValueError(Errors.E107.format(attr=attr, value=repr(value))) raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
# Check if doc attribute # Token attribute
if start is None: if start is not None and end is None:
if "_" not in data:
data["_"] = {}
data["_"][attr] = value
# Check if token attribute
elif end is None:
if "underscore_token" not in data: if "underscore_token" not in data:
data["underscore_token"] = {} data["underscore_token"] = {}
if attr not in data["underscore_token"]: if attr not in data["underscore_token"]:
data["underscore_token"][attr] = [] data["underscore_token"][attr] = []
data["underscore_token"][attr].append({"start": start, "value": value}) data["underscore_token"][attr].append({"start": start, "value": value})
# Else span attribute # Span attribute
else: elif start is not None and end is not None:
if "underscore_span" not in data: if "underscore_span" not in data:
data["underscore_span"] = {} data["underscore_span"] = {}
if attr not in data["underscore_span"]: if attr not in data["underscore_span"]:

View File

@ -117,15 +117,13 @@ class Span:
end_char: int end_char: int
label: int label: int
kb_id: int kb_id: int
id: int
ent_id: int ent_id: int
ent_id_: str ent_id_: str
@property @property
def id(self) -> int: ...
@property
def id_(self) -> str: ...
@property
def orth_(self) -> str: ... def orth_(self) -> str: ...
@property @property
def lemma_(self) -> str: ... def lemma_(self) -> str: ...
label_: str label_: str
kb_id_: str kb_id_: str
id_: str

View File

@ -443,9 +443,9 @@ def load_model_from_package(
name: str, name: str,
*, *,
vocab: Union["Vocab", bool] = True, vocab: Union["Vocab", bool] = True,
disable: Union[str, Iterable[str]] = SimpleFrozenList(), disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
enable: Union[str, Iterable[str]] = SimpleFrozenList(), enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
exclude: Union[str, Iterable[str]] = SimpleFrozenList(), exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
"""Load a model from an installed package. """Load a model from an installed package.
@ -619,9 +619,9 @@ def load_model_from_init_py(
init_file: Union[Path, str], init_file: Union[Path, str],
*, *,
vocab: Union["Vocab", bool] = True, vocab: Union["Vocab", bool] = True,
disable: Union[str, Iterable[str]] = SimpleFrozenList(), disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
enable: Union[str, Iterable[str]] = SimpleFrozenList(), enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
exclude: Union[str, Iterable[str]] = SimpleFrozenList(), exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
"""Helper function to use in the `load()` method of a model package's """Helper function to use in the `load()` method of a model package's

View File

@ -63,18 +63,18 @@ spaCy loads a model under the hood based on its
> nlp = Language.from_config(config) > nlp = Language.from_config(config)
> ``` > ```
| Name | Description | | Name | Description |
| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ | | `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ | | `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | | `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). Is merged with the config entry `nlp.disabled`. ~~Union[str, Iterable[str]]~~ |
| `enable` <Tag variant="new">3.4</Tag> | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | | `enable` <Tag variant="new">3.4</Tag> | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [nlp.enable_pipe](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
| `exclude` | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ | | `exclude` | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
| `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ | | `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ |
| `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ | | `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ |
| `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ | | `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
| **RETURNS** | The initialized object. ~~Language~~ | | **RETURNS** | The initialized object. ~~Language~~ |
## Language.component {#component tag="classmethod" new="3"} ## Language.component {#component tag="classmethod" new="3"}

View File

@ -229,16 +229,17 @@ The reported `{attr}_score` depends on the classification properties:
> print(scores["cats_macro_auc"]) > print(scores["cats_macro_auc"])
> ``` > ```
| Name | Description | | Name | Description |
| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | | ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | | `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr` | The attribute to score. ~~str~~ | | `attr` | The attribute to score. ~~str~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. ~~Callable[[Doc, str], Dict[str, float]]~~ | | `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. ~~Callable[[Doc, str], Dict[str, float]]~~ |
| `labels` | The set of possible labels. Defaults to `[]`. ~~Iterable[str]~~ | | `labels` | The set of possible labels. Defaults to `[]`. ~~Iterable[str]~~ |
| `multi_label` | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~ | | `multi_label` | Whether the attribute allows multiple labels. Defaults to `True`. When set to `False` (exclusive labels), missing gold labels are interpreted as `0.0` and the threshold is set to `0.0`. ~~bool~~ |
| `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~ | | `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~ |
| **RETURNS** | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~ | | `threshold` | Cutoff to consider a prediction "positive". Defaults to `0.5` for multi-label, and `0.0` (i.e. whatever's highest scoring) otherwise. ~~float~~ |
| **RETURNS** | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~ |
## Scorer.score_links {#score_links tag="staticmethod" new="3"} ## Scorer.score_links {#score_links tag="staticmethod" new="3"}

View File

@ -63,7 +63,6 @@ architectures and their arguments and hyperparameters.
> ```python > ```python
> from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL > from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
> config = { > config = {
> "threshold": 0.5,
> "model": DEFAULT_SINGLE_TEXTCAT_MODEL, > "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
> } > }
> nlp.add_pipe("textcat", config=config) > nlp.add_pipe("textcat", config=config)
@ -82,7 +81,7 @@ architectures and their arguments and hyperparameters.
| Setting | Description | | Setting | Description |
| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | | `threshold` | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~ |
| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ | | `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
| `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ | | `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
@ -123,7 +122,7 @@ shortcut for this and instantiate the component using its string name and
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | | `threshold` | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~ |
| `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ | | `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
## TextCategorizer.\_\_call\_\_ {#call tag="method"} ## TextCategorizer.\_\_call\_\_ {#call tag="method"}

View File

@ -45,16 +45,16 @@ specified separately using the new `exclude` keyword argument.
> nlp = spacy.load("en_core_web_sm", exclude=["parser", "tagger"]) > nlp = spacy.load("en_core_web_sm", exclude=["parser", "tagger"])
> ``` > ```
| Name | Description | | Name | Description |
| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ | | `name` | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | | `vocab` | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ | | `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). Is merged with the config entry `nlp.disabled`. ~~Union[str, Iterable[str]]~~ |
| `enable` <Tag variant="new">3.4</Tag> | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled. ~~Union[str, Iterable[str]]~~ | | `enable` <Tag variant="new">3.4</Tag> | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled. ~~Union[str, Iterable[str]]~~ |
| `exclude` <Tag variant="new">3</Tag> | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ | | `exclude` <Tag variant="new">3</Tag> | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | | `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ | | **RETURNS** | A `Language` object with the loaded pipeline. ~~Language~~ |
Essentially, `spacy.load()` is a convenience wrapper that reads the pipeline's Essentially, `spacy.load()` is a convenience wrapper that reads the pipeline's
[`config.cfg`](/api/data-formats#config), uses the language and pipeline [`config.cfg`](/api/data-formats#config), uses the language and pipeline

View File

@ -363,7 +363,8 @@ nlp.enable_pipe("tagger")
``` ```
In addition to `disable`, `spacy.load()` also accepts `enable`. If `enable` is In addition to `disable`, `spacy.load()` also accepts `enable`. If `enable` is
set, all components except for those in `enable` are disabled. set, all components except for those in `enable` are disabled. If `enable` and
`disable` conflict (i.e. the same component is included in both), an error is raised.
```python ```python
# Load the complete pipeline, but disable all components except for tok2vec and tagger # Load the complete pipeline, but disable all components except for tok2vec and tagger

View File

@ -1792,7 +1792,7 @@ the entity `Span` for example `._.orgs` or `._.prev_orgs` and
> [`Doc.retokenize`](/api/doc#retokenize) context manager: > [`Doc.retokenize`](/api/doc#retokenize) context manager:
> >
> ```python > ```python
> with doc.retokenize() as retokenize: > with doc.retokenize() as retokenizer:
> for ent in doc.ents: > for ent in doc.ents:
> retokenizer.merge(ent) > retokenizer.merge(ent)
> ``` > ```

View File

@ -4,12 +4,22 @@
"code": "af", "code": "af",
"name": "Afrikaans" "name": "Afrikaans"
}, },
{
"code": "am",
"name": "Amharic",
"has_examples": true
},
{ {
"code": "ar", "code": "ar",
"name": "Arabic", "name": "Arabic",
"example": "هذه جملة", "example": "هذه جملة",
"has_examples": true "has_examples": true
}, },
{
"code": "az",
"name": "Azerbaijani",
"has_examples": true
},
{ {
"code": "bg", "code": "bg",
"name": "Bulgarian", "name": "Bulgarian",
@ -65,7 +75,7 @@
{ {
"code": "dsb", "code": "dsb",
"name": "Lower Sorbian", "name": "Lower Sorbian",
"has_examples": true "has_examples": true
}, },
{ {
"code": "el", "code": "el",
@ -142,6 +152,11 @@
"code": "ga", "code": "ga",
"name": "Irish" "name": "Irish"
}, },
{
"code": "grc",
"name": "Ancient Greek",
"has_examples": true
},
{ {
"code": "gu", "code": "gu",
"name": "Gujarati", "name": "Gujarati",
@ -172,7 +187,7 @@
{ {
"code": "hsb", "code": "hsb",
"name": "Upper Sorbian", "name": "Upper Sorbian",
"has_examples": true "has_examples": true
}, },
{ {
"code": "hu", "code": "hu",
@ -260,6 +275,10 @@
"example": "Адамга эң кыйыны — күн сайын адам болуу", "example": "Адамга эң кыйыны — күн сайын адам болуу",
"has_examples": true "has_examples": true
}, },
{
"code": "la",
"name": "Latin"
},
{ {
"code": "lb", "code": "lb",
"name": "Luxembourgish", "name": "Luxembourgish",
@ -448,6 +467,11 @@
"example": "นี่คือประโยค", "example": "นี่คือประโยค",
"has_examples": true "has_examples": true
}, },
{
"code": "ti",
"name": "Tigrinya",
"has_examples": true
},
{ {
"code": "tl", "code": "tl",
"name": "Tagalog" "name": "Tagalog"

View File

@ -149,6 +149,9 @@
& > span & > span
display: block display: block
a
text-decoration: underline
.small .small
font-size: var(--font-size-code) font-size: var(--font-size-code)
line-height: 1.65 line-height: 1.65

View File

@ -159,6 +159,9 @@ const QuickstartInstall = ({ id, title }) => {
setters={setters} setters={setters}
showDropdown={showDropdown} showDropdown={showDropdown}
> >
<QS os="mac" hardware="gpu" platform="arm">
# Note M1 GPU support is experimental, see <a href="https://github.com/explosion/thinc/issues/792">Thinc issue #792</a>
</QS>
<QS package="pip" config="venv"> <QS package="pip" config="venv">
python -m venv .env python -m venv .env
</QS> </QS>
@ -198,7 +201,13 @@ const QuickstartInstall = ({ id, title }) => {
{nightly ? ' --pre' : ''} {nightly ? ' --pre' : ''}
</QS> </QS>
<QS package="conda">conda install -c conda-forge spacy</QS> <QS package="conda">conda install -c conda-forge spacy</QS>
<QS package="conda" hardware="gpu"> <QS package="conda" hardware="gpu" os="windows">
conda install -c conda-forge cupy
</QS>
<QS package="conda" hardware="gpu" os="linux">
conda install -c conda-forge cupy
</QS>
<QS package="conda" hardware="gpu" os="mac" platform="x86">
conda install -c conda-forge cupy conda install -c conda-forge cupy
</QS> </QS>
<QS package="conda" config="train"> <QS package="conda" config="train">