Mirror of https://github.com/explosion/spaCy.git, synced 2025-08-02 19:30:19 +03:00

commit 54bdc11353
Merge branch 'master' of https://github.com/explosion/spaCy into feature/etl
.github/azure-steps.yml (vendored, 75 changed lines)

@@ -1,9 +1,7 @@
 parameters:
   python_version: ''
-  architecture: ''
-  prefix: ''
-  gpu: false
-  num_build_jobs: 1
+  architecture: 'x64'
+  num_build_jobs: 2
 
 steps:
   - task: UsePythonVersion@0
@@ -17,16 +15,16 @@ steps:
     displayName: 'Set variables'
 
   - script: |
-      ${{ parameters.prefix }} python -m pip install -U pip setuptools
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+      python -m pip install -U build pip setuptools
+      python -m pip install -U -r requirements.txt
     displayName: "Install dependencies"
 
   - script: |
-      ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
-      ${{ parameters.prefix }} python setup.py sdist --formats=gztar
-    displayName: "Compile and build sdist"
+      python -m build --sdist
+    displayName: "Build sdist"
 
-  - script: python -m mypy spacy
+  - script: |
+      python -m mypy spacy
     displayName: 'Run mypy'
     condition: ne(variables['python_version'], '3.6')
 
@@ -35,35 +33,24 @@ steps:
       contents: "spacy"
     displayName: "Delete source directory"
 
+  - task: DeleteFiles@1
+    inputs:
+      contents: "*.egg-info"
+    displayName: "Delete egg-info directory"
+
   - script: |
-      ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
-      ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
+      python -m pip freeze > installed.txt
+      python -m pip uninstall -y -r installed.txt
     displayName: "Uninstall all packages"
 
   - bash: |
-      ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-      ${{ parameters.prefix }} SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
+      SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+      SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
     displayName: "Install from sdist"
 
   - script: |
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
-    displayName: "Install test requirements"
-
-  - script: |
-      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
-      ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
-    displayName: "Install GPU requirements"
-    condition: eq(${{ parameters.gpu }}, true)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error
-    displayName: "Run CPU tests"
-    condition: eq(${{ parameters.gpu }}, false)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error -p spacy.tests.enable_gpu
-    displayName: "Run GPU tests"
-    condition: eq(${{ parameters.gpu }}, true)
+      python -W error -c "import spacy"
+    displayName: "Test import"
 
   - script: |
       python -m spacy download ca_core_news_sm
@@ -72,6 +59,11 @@
     displayName: 'Test download CLI'
     condition: eq(variables['python_version'], '3.8')
 
+  - script: |
+      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+    displayName: 'Test no warnings on load (#11713)'
+    condition: eq(variables['python_version'], '3.8')
+
   - script: |
       python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
     displayName: 'Test convert CLI'
@@ -106,13 +98,22 @@
     displayName: 'Test assemble CLI vectors warning'
     condition: eq(variables['python_version'], '3.8')
 
+  - script: |
+      python -m pip install -U -r requirements.txt
+    displayName: "Install test requirements"
+
+  - script: |
+      python -m pytest --pyargs spacy -W error
+    displayName: "Run CPU tests"
+
+  - script: |
+      python -m pip install --pre thinc-apple-ops
+      python -m pytest --pyargs spacy
+    displayName: "Run CPU tests with thinc-apple-ops"
+    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
+
   - script: |
       python .github/validate_universe_json.py website/meta/universe.json
     displayName: 'Test website/meta/universe.json'
     condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      ${{ parameters.prefix }} python -m pip install --pre thinc-apple-ops
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy
-    displayName: "Run CPU tests with thinc-apple-ops"
-    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10'))
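Note on the restructure above: the pipeline now builds a PEP 517 sdist with `python -m build --sdist` instead of the deprecated `setup.py sdist`/`build_ext` calls, and smoke-tests the install with warnings escalated to errors. A rough local equivalent of those steps, as a sketch (assumes the `build` package is installed):

```python
# Hedged sketch: reproduce the CI's build-and-import smoke test locally.
import os
import subprocess
import sys

subprocess.run([sys.executable, "-m", "build", "--sdist"], check=True)
sdist = sorted(os.listdir("dist"))[-1]  # CI picks the last entry of dist/
subprocess.run([sys.executable, "-m", "pip", "install", os.path.join("dist", sdist)], check=True)
# "-W error" turns any warning raised on import into a failure ("Test import" step)
subprocess.run([sys.executable, "-W", "error", "-c", "import spacy"], check=True)
```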

9  .github/workflows/autoblack.yml (vendored, 9 changed lines)

@@ -12,10 +12,10 @@ jobs:
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
        with:
          ref: ${{ github.head_ref }}
-      - uses: actions/setup-python@v2
+      - uses: actions/setup-python@v4
      - run: pip install black
      - name: Auto-format code if needed
        run: black spacy
@@ -23,10 +23,11 @@ jobs:
      # code and makes GitHub think the action failed
      - name: Check for modified files
        id: git-check
-        run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi)
+        run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
 
      - name: Create Pull Request
        if: steps.git-check.outputs.modified == 'true'
-        uses: peter-evans/create-pull-request@v3
+        uses: peter-evans/create-pull-request@v4
        with:
          title: Auto-format code with black
          labels: meta
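The `::set-output` workflow command used above was deprecated by GitHub; step outputs are now written as `key=value` lines to the file named by `$GITHUB_OUTPUT` (slowtests.yml below gets the same migration). For reference, the equivalent from a Python step would be a sketch like:

```python
# Hedged sketch: writing a GitHub Actions step output from Python.
# GITHUB_OUTPUT names a file; each appended "key=value" line becomes
# available as steps.<step_id>.outputs.<key>.
import os

with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as fh:
    fh.write("modified=true\n")
```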

6  .github/workflows/explosionbot.yml (vendored, 6 changed lines)

@@ -8,14 +8,14 @@ on:
 
 jobs:
   explosion-bot:
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-latest
     steps:
      - name: Dump GitHub context
        env:
          GITHUB_CONTEXT: ${{ toJson(github) }}
        run: echo "$GITHUB_CONTEXT"
-      - uses: actions/checkout@v1
-      - uses: actions/setup-python@v1
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
      - name: Install and run explosion-bot
        run: |
          pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot

6  .github/workflows/slowtests.yml (vendored, 6 changed lines)

@@ -14,7 +14,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3
        with:
          ref: ${{ matrix.branch }}
      - name: Get commits from past 24 hours
@@ -23,9 +23,9 @@ jobs:
          today=$(date '+%Y-%m-%d %H:%M:%S')
          yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
          if git log --after="$yesterday" --before="$today" | grep commit ; then
-            echo "::set-output name=run_tests::true"
+            echo run_tests=true >> $GITHUB_OUTPUT
          else
-            echo "::set-output name=run_tests::false"
+            echo run_tests=false >> $GITHUB_OUTPUT
          fi
 
      - name: Trigger buildkite build

6  .github/workflows/spacy_universe_alert.yml (vendored, 6 changed lines)

@@ -17,8 +17,10 @@ jobs:
        run: |
          echo "$GITHUB_CONTEXT"
 
-      - uses: actions/checkout@v1
-      - uses: actions/setup-python@v1
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
      - name: Install Bernadette app dependency and send an alert
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

README.md

@@ -8,7 +8,7 @@ be used in real products.
 
 spaCy comes with
 [pretrained pipelines](https://spacy.io/models) and
-currently supports tokenization and training for **60+ languages**. It features
+currently supports tokenization and training for **70+ languages**. It features
 state-of-the-art speed and **neural network models** for tagging,
 parsing, **named entity recognition**, **text classification** and more,
 multi-task learning with pretrained **transformers** like BERT, as well as a
@@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
 open-source software, released under the MIT license.
 
-💫 **Version 3.4.0 out now!**
+💫 **Version 3.4 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
 
 [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@@ -79,7 +79,7 @@ more people can benefit from it.
 
 ## Features
 
-- Support for **60+ languages**
+- Support for **70+ languages**
 - **Trained pipelines** for different languages and tasks
 - Multi-task learning with pretrained **transformers** like BERT
 - Support for pretrained **word vectors** and embeddings

azure-pipelines.yml

@@ -76,24 +76,24 @@ jobs:
        # Python39Mac:
        #   imageName: "macos-latest"
        #   python.version: "3.9"
-        Python310Linux:
-          imageName: "ubuntu-latest"
-          python.version: "3.10"
+        # Python310Linux:
+        #   imageName: "ubuntu-latest"
+        #   python.version: "3.10"
        Python310Windows:
          imageName: "windows-latest"
          python.version: "3.10"
-        Python310Mac:
-          imageName: "macos-latest"
-          python.version: "3.10"
+        # Python310Mac:
+        #   imageName: "macos-latest"
+        #   python.version: "3.10"
        Python311Linux:
          imageName: 'ubuntu-latest'
-          python.version: '3.11.0-rc.2'
+          python.version: '3.11'
        Python311Windows:
          imageName: 'windows-latest'
-          python.version: '3.11.0-rc.2'
+          python.version: '3.11'
        Python311Mac:
          imageName: 'macos-latest'
-          python.version: '3.11.0-rc.2'
+          python.version: '3.11'
      maxParallel: 4
    pool:
      vmImage: $(imageName)
@@ -101,20 +101,3 @@ jobs:
      - template: .github/azure-steps.yml
        parameters:
          python_version: '$(python.version)'
-          architecture: 'x64'
-
-  # - job: "TestGPU"
-  #   dependsOn: "Validate"
-  #   strategy:
-  #     matrix:
-  #       Python38LinuxX64_GPU:
-  #         python.version: '3.8'
-  #   pool:
-  #     name: "LinuxX64_GPU"
-  #   steps:
-  #     - template: .github/azure-steps.yml
-  #       parameters:
-  #         python_version: '$(python.version)'
-  #         architecture: 'x64'
-  #         gpu: true
-  #         num_build_jobs: 24

requirements.txt

@@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.1.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
-typer>=0.3.0,<0.5.0
+typer>=0.3.0,<0.8.0
 pathy>=0.3.5
 # Third party dependencies
 numpy>=1.15.0

setup.cfg

@@ -51,7 +51,7 @@ install_requires =
    srsly>=2.4.3,<3.0.0
    catalogue>=2.0.6,<2.1.0
    # Third-party dependencies
-    typer>=0.3.0,<0.5.0
+    typer>=0.3.0,<0.8.0
    pathy>=0.3.5
    tqdm>=4.38.0,<5.0.0
    numpy>=1.15.0

spacy/cli/project/remote_storage.py

@@ -10,6 +10,7 @@ from .._util import get_hash, get_checksum, download_file, ensure_pathy
 from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
 from ...git_info import GIT_VERSION
 from ... import about
+from ...errors import Errors
 
 if TYPE_CHECKING:
    from pathy import Pathy  # noqa: F401
@@ -84,7 +85,23 @@ class RemoteStorage:
        with tarfile.open(tar_loc, mode=mode_string) as tar_file:
            # This requires that the path is added correctly, relative
            # to root. This is how we set things up in push()
-            tar_file.extractall(self.root)
+
+            # Disallow paths outside the current directory for the tar
+            # file (CVE-2007-4559, directory traversal vulnerability)
+            def is_within_directory(directory, target):
+                abs_directory = os.path.abspath(directory)
+                abs_target = os.path.abspath(target)
+                prefix = os.path.commonprefix([abs_directory, abs_target])
+                return prefix == abs_directory
+
+            def safe_extract(tar, path):
+                for member in tar.getmembers():
+                    member_path = os.path.join(path, member.name)
+                    if not is_within_directory(path, member_path):
+                        raise ValueError(Errors.E852)
+                tar.extractall(path)
+
+            safe_extract(tar_file, self.root)
        return url
 
    def find(
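The `safe_extract` helper added above guards against archive members like `../../etc/passwd` escaping the extraction root (CVE-2007-4559). The same check as a standalone sketch, outside the `RemoteStorage` context:

```python
# Hedged standalone sketch of the directory-traversal guard.
import os
import tarfile

def extract_safely(archive: str, dest: str) -> None:
    abs_dest = os.path.abspath(dest)
    with tarfile.open(archive) as tar:
        for member in tar.getmembers():
            target = os.path.abspath(os.path.join(dest, member.name))
            # A member like "../../etc/passwd" resolves outside abs_dest
            if os.path.commonprefix([abs_dest, target]) != abs_dest:
                raise ValueError(f"unsafe path in archive: {member.name}")
        tar.extractall(dest)
```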

spacy/cli/project/run.py

@@ -53,6 +53,7 @@ def project_run(
    force: bool = False,
    dry: bool = False,
    capture: bool = False,
+    skip_requirements_check: bool = False,
 ) -> None:
    """Run a named script defined in the project.yml. If the script is part
    of the default pipeline (defined in the "run" section), DVC is used to
@@ -69,6 +70,7 @@ def project_run(
    sys.exit will be called with the return code. You should use capture=False
    when you want to turn over execution to the command, and capture=True
    when you want to run the command more like a function.
+    skip_requirements_check (bool): Whether to skip the requirements check.
    """
    config = load_project_config(project_dir, overrides=overrides)
    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
@@ -76,9 +78,10 @@ def project_run(
    validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
 
    req_path = project_dir / "requirements.txt"
-    if config.get("check_requirements", True) and os.path.exists(req_path):
-        with req_path.open() as requirements_file:
-            _check_requirements([req.replace("\n", "") for req in requirements_file])
+    if not skip_requirements_check:
+        if config.get("check_requirements", True) and os.path.exists(req_path):
+            with req_path.open() as requirements_file:
+                _check_requirements([req.strip() for req in requirements_file])
 
    if subcommand in workflows:
        msg.info(f"Running workflow '{subcommand}'")
@@ -90,6 +93,7 @@ def project_run(
            force=force,
            dry=dry,
            capture=capture,
+            skip_requirements_check=True,
        )
    else:
        cmd = commands[subcommand]
@@ -338,6 +342,10 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
            failed_pkgs_msgs.append(dnf.report())
        except pkg_resources.VersionConflict as vc:
            conflicting_pkgs_msgs.append(vc.report())
+        except Exception:
+            msg.warn(f"Unable to check requirement: {req} "
+                     "Checks are currently limited to requirement specifiers "
+                     "(PEP 508)")
 
    if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs):
        msg.warn(
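`project_run` calls itself once per step when `subcommand` names a workflow; passing `skip_requirements_check=True` on those recursive calls means the requirements check runs once at the top level instead of once per step. The shape of that pattern, as a generic sketch:

```python
# Hedged sketch of the check-once-then-skip recursion pattern.
from typing import Callable, List, Union

Step = Union[str, List["Step"]]

def run(steps: List[Step], execute: Callable[[str], None],
        check: Callable[[], None], *, skip_check: bool = False) -> None:
    if not skip_check:
        check()  # expensive validation happens only on the outermost call
    for step in steps:
        if isinstance(step, list):
            run(step, execute, check, skip_check=True)  # don't re-validate
        else:
            execute(step)

run(["clean", ["train", "evaluate"]], execute=print,
    check=lambda: print("checking requirements once"))
```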

spacy/errors.py

@@ -212,8 +212,8 @@ class Warnings(metaclass=ErrorsWithCodes):
    W121 = ("Attempting to trace non-existent method '{method}' in pipe '{pipe}'")
    W122 = ("Couldn't trace method '{method}' in pipe '{pipe}'. This can happen if the pipe class "
            "is a Cython extension type.")
-    W123 = ("Argument {arg} with value {arg_value} is used instead of {config_value} as specified in the config. Be "
-            "aware that this might affect other components in your pipeline.")
+    W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
+            "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
    W124 = ("Using the features PREFIX and/or SUFFIX in a RichFeatureExtractor configuration may lead to the same "
            "information being fed forward twice if prefixes and suffixes of corresponding lengths are specified.")
 
@@ -546,6 +546,8 @@ class Errors(metaclass=ErrorsWithCodes):
            "during training, make sure to include it in 'annotating components'")
 
    # New errors added in v3.x
+    E852 = ("The tar file pulled from the remote attempted an unsafe path "
+            "traversal.")
    E853 = ("Unsupported component factory name '{name}'. The character '.' is "
            "not permitted in factory names.")
    E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "

spacy/language.py

@@ -1879,31 +1879,22 @@ class Language:
        if isinstance(exclude, str):
            exclude = [exclude]
 
-        def fetch_pipes_status(value: Iterable[str], key: str) -> Iterable[str]:
-            """Fetch value for `enable` or `disable` w.r.t. the specified config and passed arguments passed to
-            .load(). If both arguments and config specified values for this field, the passed arguments take precedence
-            and a warning is printed.
-            value (Iterable[str]): Passed value for `enable` or `disable`.
-            key (str): Key for field in config (either "enabled" or "disabled").
-            RETURN (Iterable[str]):
-            """
-            # We assume that no argument was passed if the value is the specified default value.
-            if id(value) == id(_DEFAULT_EMPTY_PIPES):
-                return config["nlp"].get(key, [])
-            else:
-                if len(config["nlp"].get(key, [])):
-                    warnings.warn(
-                        Warnings.W123.format(
-                            arg=key[:-1],
-                            arg_value=value,
-                            config_value=config["nlp"][key],
-                        )
+        # `enable` should not be merged with `enabled` (the opposite is true for `disable`/`disabled`). If the config
+        # specifies values for `enabled` not included in `enable`, emit warning.
+        if id(enable) != id(_DEFAULT_EMPTY_PIPES):
+            enabled = config["nlp"].get("enabled", [])
+            if len(enabled) and not set(enabled).issubset(enable):
+                warnings.warn(
+                    Warnings.W123.format(
+                        enable=enable,
+                        enabled=enabled,
                    )
-            return value
+                )
 
+        # Ensure sets of disabled/enabled pipe names are not contradictory.
        disabled_pipes = cls._resolve_component_status(
-            fetch_pipes_status(disable, "disabled"),
-            fetch_pipes_status(enable, "enabled"),
+            list({*disable, *config["nlp"].get("disabled", [])}),
+            enable,
            config["nlp"]["pipeline"],
        )
        nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
@@ -2084,10 +2075,12 @@ class Language:
        if enable:
            if isinstance(enable, str):
                enable = [enable]
-            to_disable = [
-                pipe_name for pipe_name in pipe_names if pipe_name not in enable
-            ]
-            if disable and disable != to_disable:
+            to_disable = {
+                *[pipe_name for pipe_name in pipe_names if pipe_name not in enable],
+                *disable,
+            }
+            # If any pipe to be enabled is in to_disable, the specification is inconsistent.
+            if len(set(enable) & to_disable):
                raise ValueError(Errors.E1042.format(enable=enable, disable=disable))
 
        return tuple(to_disable)
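In the new logic, `disable` is unioned with the config's `nlp.disabled`, while `enable` stands alone and only triggers W123 when the config's `enabled` list is not a subset of it; any overlap between `enable` and the resulting disable set raises E1042. Illustrative usage, with `pipeline_dir` standing in for a saved pipeline whose config sets `nlp.disabled = ["senter"]`:

```python
# Hedged sketch of the merged disable semantics ("pipeline_dir" is hypothetical).
import spacy

# disable=["ner"] is merged with the config's disabled=["senter"]:
nlp = spacy.load("pipeline_dir", disable=["ner"])
assert {"ner", "senter"} <= set(nlp.disabled)

# enable conflicting with the merged disable set raises E1042:
# spacy.load("pipeline_dir", enable=["senter"],
#            config={"nlp": {"disabled": ["senter", "tagger"]}})  # ValueError
```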

spacy/ml/models/entity_linker.py

@@ -71,11 +71,10 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callable]:
            cands.append((start_token, end_token))
 
        candidates.append(ops.asarray2i(cands))
-    candlens = ops.asarray1i([len(cands) for cands in candidates])
-    candidates = ops.xp.concatenate(candidates)
-    outputs = Ragged(candidates, candlens)
+    lengths = model.ops.asarray1i([len(cands) for cands in candidates])
+    out = Ragged(model.ops.flatten(candidates), lengths)
    # because this is just rearranging docs, the backprop does nothing
-    return outputs, lambda x: []
+    return out, lambda x: []
 
 
 @registry.misc("spacy.KBFromFile.v1")

spacy/pipeline/textcat.py

@@ -24,8 +24,8 @@ single_label_default_config = """
 [model.tok2vec.embed]
 @architectures = "spacy.MultiHashEmbed.v2"
 width = 64
-rows = [2000, 2000, 1000, 1000, 1000, 1000]
-attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+rows = [2000, 2000, 500, 1000, 500]
+attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
 include_static_vectors = false
 
 [model.tok2vec.encode]
@@ -72,7 +72,7 @@ subword_features = true
    "textcat",
    assigns=["doc.cats"],
    default_config={
-        "threshold": 0.5,
+        "threshold": 0.0,
        "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
        "scorer": {"@scorers": "spacy.textcat_scorer.v1"},
    },
@@ -144,7 +144,8 @@ class TextCategorizer(TrainablePipe):
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
-        threshold (float): Cutoff to consider a prediction "positive".
+        threshold (float): Unused, not needed for single-label (exclusive
+            classes) classification.
        scorer (Optional[Callable]): The scoring method. Defaults to
            Scorer.score_cats for the attribute "cats".
 
@@ -154,7 +155,11 @@ class TextCategorizer(TrainablePipe):
        self.model = model
        self.name = name
        self._rehearsal_model = None
-        cfg = {"labels": [], "threshold": threshold, "positive_label": None}
+        cfg: Dict[str, Any] = {
+            "labels": [],
+            "threshold": threshold,
+            "positive_label": None,
+        }
        self.cfg = dict(cfg)
        self.scorer = scorer
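With the default lowered to 0.0, the single-label `textcat` effectively takes the argmax and the cutoff no longer matters; a threshold remains meaningful only for `textcat_multilabel`, where several labels can clear it at once. Minimal usage sketch:

```python
# Hedged sketch: threshold only matters for the multi-label component now.
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("textcat")  # exclusive classes: highest score wins, threshold unused

nlp_multi = spacy.blank("en")
nlp_multi.add_pipe("textcat_multilabel", config={"threshold": 0.5})  # cutoff applies
```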

spacy/pipeline/textcat_multilabel.py

@@ -24,8 +24,8 @@ multi_label_default_config = """
 [model.tok2vec.embed]
 @architectures = "spacy.MultiHashEmbed.v2"
 width = 64
-rows = [2000, 2000, 1000, 1000, 1000, 1000]
-attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+rows = [2000, 2000, 500, 1000, 500]
+attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
 include_static_vectors = false
 
 [model.tok2vec.encode]

spacy/scorer.py

@@ -446,7 +446,7 @@ class Scorer:
        labels (Iterable[str]): The set of possible labels. Defaults to [].
        multi_label (bool): Whether the attribute allows multiple labels.
            Defaults to True. When set to False (exclusive labels), missing
-            gold labels are interpreted as 0.0.
+            gold labels are interpreted as 0.0 and the threshold is set to 0.0.
        positive_label (str): The positive label for a binary task with
            exclusive classes. Defaults to None.
        threshold (float): Cutoff to consider a prediction "positive". Defaults
@@ -471,6 +471,8 @@ class Scorer:
        """
        if threshold is None:
            threshold = 0.5 if multi_label else 0.0
+        if not multi_label:
+            threshold = 0.0
        f_per_type = {label: PRFScore() for label in labels}
        auc_per_type = {label: ROCAUCScore() for label in labels}
        labels = set(labels)
@@ -505,20 +507,18 @@ class Scorer:
            # Get the highest-scoring for each.
            pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
            gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
-            if pred_label == gold_label and pred_score >= threshold:
+            if pred_label == gold_label:
                f_per_type[pred_label].tp += 1
            else:
                f_per_type[gold_label].fn += 1
-                if pred_score >= threshold:
-                    f_per_type[pred_label].fp += 1
+                f_per_type[pred_label].fp += 1
        elif gold_cats:
            gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
            if gold_score > 0:
                f_per_type[gold_label].fn += 1
        elif pred_cats:
            pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
-            if pred_score >= threshold:
-                f_per_type[pred_label].fp += 1
+            f_per_type[pred_label].fp += 1
        micro_prf = PRFScore()
        for label_prf in f_per_type.values():
            micro_prf.tp += label_prf.tp

spacy/tests/doc/test_json_doc_conversion.py

@@ -370,3 +370,12 @@ def test_json_to_doc_validation_error(doc):
    doc_json.pop("tokens")
    with pytest.raises(ValueError):
        Doc(doc.vocab).from_json(doc_json, validate=True)
+
+
+def test_to_json_underscore_doc_getters(doc):
+    def get_text_length(doc):
+        return len(doc.text)
+
+    Doc.set_extension("text_length", getter=get_text_length)
+    doc_json = doc.to_json(underscore=["text_length"])
+    assert doc_json["_"]["text_length"] == get_text_length(doc)

spacy/tests/pipeline/test_entity_linker.py

@@ -9,6 +9,7 @@ from spacy.compat import pickle
 from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
 from spacy.lang.en import English
 from spacy.ml import load_kb
+from spacy.ml.models.entity_linker import build_span_maker
 from spacy.pipeline import EntityLinker
 from spacy.pipeline.legacy import EntityLinker_v1
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
@@ -715,7 +716,11 @@ TRAIN_DATA = [
    ("Russ Cochran was a member of University of Kentucky's golf team.",
        {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
        "entities": [(0, 12, "PERSON"), (43, 51, "LOC")],
-        "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})
+        "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}),
+    # having a blank instance shouldn't break things
+    ("The weather is nice today.",
+        {"links": {}, "entities": [],
+        "sent_starts": [1, -1, 0, 0, 0, 0]})
 ]
 GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 # fmt: on
@@ -1196,3 +1201,18 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]):
 
    assert len(doc.ents) == 1
    assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL
+
+
+def test_span_maker_forward_with_empty():
+    """The forward pass of the span maker may have a doc with no entities."""
+    nlp = English()
+    doc1 = nlp("a b c")
+    ent = doc1[0:1]
+    ent.label_ = "X"
+    doc1.ents = [ent]
+    # no entities
+    doc2 = nlp("x y z")
+
+    # just to get a model
+    span_maker = build_span_maker()
+    span_maker([doc1, doc2], False)

spacy/tests/pipeline/test_pipe_methods.py

@@ -615,20 +615,18 @@ def test_enable_disable_conflict_with_config():
 
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
-        # Expected to fail, as config and arguments conflict.
-        with pytest.raises(ValueError):
-            spacy.load(
-                tmp_dir, enable=["tagger"], config={"nlp": {"disabled": ["senter"]}}
-            )
+        # Expected to succeed, as config and arguments do not conflict.
+        assert spacy.load(
+            tmp_dir, enable=["tagger"], config={"nlp": {"disabled": ["senter"]}}
+        ).disabled == ["senter", "sentencizer"]
        # Expected to succeed without warning due to the lack of a conflicting config option.
        spacy.load(tmp_dir, enable=["tagger"])
-        # Expected to succeed with a warning, as disable=[] should override the config setting.
-        with pytest.warns(UserWarning):
+        # Expected to fail due to conflict between enable and disabled.
+        with pytest.raises(ValueError):
            spacy.load(
                tmp_dir,
-                enable=["tagger"],
-                disable=[],
-                config={"nlp": {"disabled": ["senter"]}},
+                enable=["senter"],
+                config={"nlp": {"disabled": ["senter", "tagger"]}},
            )

spacy/tests/pipeline/test_textcat.py

@@ -823,10 +823,10 @@ def test_textcat_loss(multi_label: bool, expected_loss: float):
    assert loss == expected_loss
 
 
-def test_textcat_threshold():
+def test_textcat_multilabel_threshold():
    # Ensure the scorer can be called with a different threshold
    nlp = English()
-    nlp.add_pipe("textcat")
+    nlp.add_pipe("textcat_multilabel")
 
    train_examples = []
    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
@@ -849,7 +849,7 @@ def test_textcat_threshold():
    )
    pos_f = scores["cats_score"]
    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
-    assert pos_f > macro_f
+    assert pos_f >= macro_f
 
 
 def test_textcat_multi_threshold():

spacy/tests/serialize/test_serialize_pipeline.py

@@ -404,11 +404,10 @@ def test_serialize_pipeline_disable_enable():
    assert nlp3.component_names == ["ner", "tagger"]
    with make_tempdir() as d:
        nlp3.to_disk(d)
-        with pytest.warns(UserWarning):
-            nlp4 = spacy.load(d, disable=["ner"])
-        assert nlp4.pipe_names == ["tagger"]
+        nlp4 = spacy.load(d, disable=["ner"])
+        assert nlp4.pipe_names == []
        assert nlp4.component_names == ["ner", "tagger"]
-        assert nlp4.disabled == ["ner"]
+        assert nlp4.disabled == ["ner", "tagger"]
    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp5 = spacy.load(d, exclude=["tagger"])

spacy/tests/test_cli.py

@@ -1,5 +1,6 @@
 import os
 import math
+import pkg_resources
 from random import sample
 from typing import Counter
 
@@ -25,6 +26,7 @@ from spacy.cli.download import get_compatibility, get_version
 from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
 from spacy.cli.package import get_third_party_dependencies
 from spacy.cli.package import _is_permitted_package_name
+from spacy.cli.project.run import _check_requirements
 from spacy.cli.validate import get_model_pkgs
 from spacy.lang.en import English
 from spacy.lang.nl import Dutch
@@ -855,3 +857,42 @@ def test_span_length_freq_dist_output_must_be_correct():
    span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold)
    assert sum(span_freqs.values()) >= threshold
    assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
+
+
+@pytest.mark.parametrize(
+    "reqs,output",
+    [
+        [
+            """
+            spacy
+
+            # comment
+
+            thinc""",
+            (False, False),
+        ],
+        [
+            """# comment
+            --some-flag
+            spacy""",
+            (False, False),
+        ],
+        [
+            """# comment
+            --some-flag
+            spacy; python_version >= '3.6'""",
+            (False, False),
+        ],
+        [
+            """# comment
+            spacyunknowndoesnotexist12345""",
+            (True, False),
+        ],
+    ],
+)
+def test_project_check_requirements(reqs, output):
+    # excessive guard against unlikely package name
+    try:
+        pkg_resources.require("spacyunknowndoesnotexist12345")
+    except pkg_resources.DistributionNotFound:
+        assert output == _check_requirements([req.strip() for req in reqs.split("\n")])

spacy/tests/pipeline/test_models.py

@@ -23,7 +23,7 @@ def get_textcat_bow_kwargs():
 
 
 def get_textcat_cnn_kwargs():
-    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
+    return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}
 
 
 def get_all_params(model):
@@ -65,7 +65,7 @@ def get_tok2vec_kwargs():
    }
 
 
-def test_tok2vec():
+def make_test_tok2vec():
    return build_Tok2Vec_model(**get_tok2vec_kwargs())

spacy/tests/test_scorer.py

@@ -474,3 +474,50 @@ def test_prf_score():
    assert (a.precision, a.recall, a.fscore) == approx(
        (c.precision, c.recall, c.fscore)
    )
+
+
+def test_score_cats(en_tokenizer):
+    text = "some text"
+    gold_doc = en_tokenizer(text)
+    gold_doc.cats = {"POSITIVE": 1.0, "NEGATIVE": 0.0}
+    pred_doc = en_tokenizer(text)
+    pred_doc.cats = {"POSITIVE": 0.75, "NEGATIVE": 0.25}
+    example = Example(pred_doc, gold_doc)
+    # threshold is ignored for multi_label=False
+    scores1 = Scorer.score_cats(
+        [example],
+        "cats",
+        labels=list(gold_doc.cats.keys()),
+        multi_label=False,
+        positive_label="POSITIVE",
+        threshold=0.1,
+    )
+    scores2 = Scorer.score_cats(
+        [example],
+        "cats",
+        labels=list(gold_doc.cats.keys()),
+        multi_label=False,
+        positive_label="POSITIVE",
+        threshold=0.9,
+    )
+    assert scores1["cats_score"] == 1.0
+    assert scores2["cats_score"] == 1.0
+    assert scores1 == scores2
+    # threshold is relevant for multi_label=True
+    scores = Scorer.score_cats(
+        [example],
+        "cats",
+        labels=list(gold_doc.cats.keys()),
+        multi_label=True,
+        threshold=0.9,
+    )
+    assert scores["cats_macro_f"] == 0.0
+    # threshold is relevant for multi_label=True
+    scores = Scorer.score_cats(
+        [example],
+        "cats",
+        labels=list(gold_doc.cats.keys()),
+        multi_label=True,
+        threshold=0.1,
+    )
+    assert scores["cats_macro_f"] == 0.5

spacy/tokens/doc.pyx

@@ -1667,6 +1667,20 @@ cdef class Doc:
 
        if underscore:
            user_keys = set()
+            # Handle doc attributes with .get to include values from getters
+            # and not only values stored in user_data, for backwards
+            # compatibility
+            for attr in underscore:
+                if self.has_extension(attr):
+                    if "_" not in data:
+                        data["_"] = {}
+                    value = self._.get(attr)
+                    if not srsly.is_json_serializable(value):
+                        raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
+                    data["_"][attr] = value
+                    user_keys.add(attr)
+            # Token and span attributes only include values stored in user_data
+            # and not values generated by getters
            if self.user_data:
                for data_key, value in self.user_data.copy().items():
                    if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.":
@@ -1677,20 +1691,15 @@ cdef class Doc:
                        user_keys.add(attr)
                        if not srsly.is_json_serializable(value):
                            raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
-                        # Check if doc attribute
-                        if start is None:
-                            if "_" not in data:
-                                data["_"] = {}
-                            data["_"][attr] = value
-                        # Check if token attribute
-                        elif end is None:
+                        # Token attribute
+                        if start is not None and end is None:
                            if "underscore_token" not in data:
                                data["underscore_token"] = {}
                            if attr not in data["underscore_token"]:
                                data["underscore_token"][attr] = []
                            data["underscore_token"][attr].append({"start": start, "value": value})
-                        # Else span attribute
-                        else:
+                        # Span attribute
+                        elif start is not None and end is not None:
                            if "underscore_span" not in data:
                                data["underscore_span"] = {}
                            if attr not in data["underscore_span"]:
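Because doc-level extensions are now read through `._.get`, values produced by getters serialize too, not only those stored in `user_data`. A usage sketch mirroring the test added in this commit:

```python
# Hedged usage sketch of getter-aware Doc.to_json serialization.
import spacy
from spacy.tokens import Doc

Doc.set_extension("n_tokens", getter=lambda doc: len(doc))

nlp = spacy.blank("en")
doc = nlp("one two three")
data = doc.to_json(underscore=["n_tokens"])
assert data["_"]["n_tokens"] == 3  # getter value included in the JSON
```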

spacy/tokens/span.pyi

@@ -117,15 +117,13 @@ class Span:
    end_char: int
    label: int
    kb_id: int
+    id: int
    ent_id: int
    ent_id_: str
    @property
-    def id(self) -> int: ...
-    @property
-    def id_(self) -> str: ...
-    @property
    def orth_(self) -> str: ...
    @property
    def lemma_(self) -> str: ...
    label_: str
    kb_id_: str
+    id_: str

spacy/util.py

@@ -443,9 +443,9 @@ def load_model_from_package(
    name: str,
    *,
    vocab: Union["Vocab", bool] = True,
-    disable: Union[str, Iterable[str]] = SimpleFrozenList(),
-    enable: Union[str, Iterable[str]] = SimpleFrozenList(),
-    exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
+    disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
+    enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
+    exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
 ) -> "Language":
    """Load a model from an installed package.
@@ -619,9 +619,9 @@ def load_model_from_init_py(
    init_file: Union[Path, str],
    *,
    vocab: Union["Vocab", bool] = True,
-    disable: Union[str, Iterable[str]] = SimpleFrozenList(),
-    enable: Union[str, Iterable[str]] = SimpleFrozenList(),
-    exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
+    disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
+    enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
+    exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
    config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
 ) -> "Language":
    """Helper function to use in the `load()` method of a model package's
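`_DEFAULT_EMPTY_PIPES` is a shared sentinel default: `Language.from_config` compares identities (`id(enable) != id(_DEFAULT_EMPTY_PIPES)`) to distinguish "the caller passed an empty list" from "the caller passed nothing at all", which a plain `SimpleFrozenList()` default cannot express. The general pattern, as a sketch:

```python
# Hedged sketch of the identity-sentinel default pattern.
from typing import Iterable

_SENTINEL: tuple = ()  # one shared immutable object used as the default

def load(enable: Iterable[str] = _SENTINEL) -> str:
    if enable is _SENTINEL:
        return "no value passed; fall back to the config"
    return f"caller explicitly passed {list(enable)!r}"

print(load())    # no value passed; fall back to the config
print(load([]))  # caller explicitly passed []
```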

website/docs/api/language.md

@@ -63,18 +63,18 @@ spaCy loads a model under the hood based on its
 > nlp = Language.from_config(config)
 > ```
 
 | Name | Description |
 | ---- | ----------- |
 | `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ |
 | _keyword-only_ | |
 | `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
-| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
+| `disable` | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). Is merged with the config entry `nlp.disabled`. ~~Union[str, Iterable[str]]~~ |
 | `enable` <Tag variant="new">3.4</Tag> | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled, but can be enabled again using [`nlp.enable_pipe`](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
 | `exclude` | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
 | `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ |
 | `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ |
 | `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
 | **RETURNS** | The initialized object. ~~Language~~ |
 
 ## Language.component {#component tag="classmethod" new="3"}
@@ -229,16 +229,17 @@ The reported `{attr}_score` depends on the classification properties:
 > print(scores["cats_macro_auc"])
 > ```

 | Name             | Description |
-| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
 | `attr`           | The attribute to score. ~~str~~ |
 | _keyword-only_   | |
 | `getter`         | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. ~~Callable[[Doc, str], Dict[str, float]]~~ |
 | `labels`         | The set of possible labels. Defaults to `[]`. ~~Iterable[str]~~ |
-| `multi_label`    | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~ |
+| `multi_label`    | Whether the attribute allows multiple labels. Defaults to `True`. When set to `False` (exclusive labels), missing gold labels are interpreted as `0.0` and the threshold is set to `0.0`. ~~bool~~ |
 | `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~ |
+| `threshold`      | Cutoff to consider a prediction "positive". Defaults to `0.5` for multi-label, and `0.0` (i.e. whatever's highest scoring) otherwise. ~~float~~ |
 | **RETURNS**      | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~ |

 ## Scorer.score_links {#score_links tag="staticmethod" new="3"}
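The new `threshold` argument documented in the hunk above can be exercised directly. A minimal sketch, with made-up labels and scores (not part of this commit):

```python
import spacy
from spacy.scorer import Scorer
from spacy.training import Example

nlp = spacy.blank("en")
pred = nlp.make_doc("This is great")
# Predicted scores live on the Doc; the gold answer comes from the dict.
pred.cats = {"POSITIVE": 0.9, "NEGATIVE": 0.1}
example = Example.from_dict(pred, {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})
scores = Scorer.score_cats(
    [example],
    "cats",
    labels=["POSITIVE", "NEGATIVE"],
    multi_label=False,  # exclusive classes: highest score wins, threshold 0.0
)
print(scores["cats_score"])
```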
@@ -63,7 +63,6 @@ architectures and their arguments and hyperparameters.
 > ```python
 > from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
 > config = {
->     "threshold": 0.5,
 >     "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
 > }
 > nlp.add_pipe("textcat", config=config)
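After this change, only `textcat_multilabel` still uses a cutoff; the single-label component simply takes the highest-scoring class. A minimal sketch of both factories:

```python
import spacy

nlp = spacy.blank("en")
# Single-label textcat: classes are exclusive and the highest-scoring label
# wins, so the default config no longer needs a "threshold" entry.
nlp.add_pipe("textcat")
# Multi-label textcat: each label is scored independently against a cutoff.
nlp.add_pipe("textcat_multilabel", config={"threshold": 0.5})
```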
@@ -82,7 +81,7 @@ architectures and their arguments and hyperparameters.

 | Setting     | Description |
 | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
+| `threshold` | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~ |
 | `model`     | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
 | `scorer`    | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
@@ -123,7 +122,7 @@ shortcut for this and instantiate the component using its string name and

 | `model`        | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
 | `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
 | _keyword-only_ | |
-| `threshold`    | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
+| `threshold`    | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~ |
 | `scorer`       | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |

 ## TextCategorizer.\_\_call\_\_ {#call tag="method"}
@@ -45,16 +45,16 @@ specified separately using the new `exclude` keyword argument.
 > nlp = spacy.load("en_core_web_sm", exclude=["parser", "tagger"])
 > ```

 | Name                                  | Description |
-| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| ------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `name`                                | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~ |
 | _keyword-only_                        | |
 | `vocab`                               | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
-| `disable`                             | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~Union[str, Iterable[str]]~~ |
+| `disable`                             | Name(s) of pipeline component(s) to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). Is merged with the config entry `nlp.disabled`. ~~Union[str, Iterable[str]]~~ |
 | `enable` <Tag variant="new">3.4</Tag> | Name(s) of pipeline component(s) to [enable](/usage/processing-pipelines#disabling). All other pipes will be disabled. ~~Union[str, Iterable[str]]~~ |
 | `exclude` <Tag variant="new">3</Tag>  | Name(s) of pipeline component(s) to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~Union[str, Iterable[str]]~~ |
 | `config` <Tag variant="new">3</Tag>   | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
 | **RETURNS**                           | A `Language` object with the loaded pipeline. ~~Language~~ |

 Essentially, `spacy.load()` is a convenience wrapper that reads the pipeline's
 [`config.cfg`](/api/data-formats#config), uses the language and pipeline
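The merge behavior added to the `disable` docs can be checked at runtime. A minimal sketch, assuming `en_core_web_sm` is installed:

```python
import spacy

# Components disabled here are merged with the "nlp.disabled" entry from
# the pipeline's config.cfg; both end up loaded but not run.
nlp = spacy.load("en_core_web_sm", disable=["parser"])
assert "parser" in nlp.disabled
assert "parser" not in nlp.pipe_names  # not part of the active pipeline
nlp.enable_pipe("parser")              # opt back in explicitly
```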
@@ -363,7 +363,8 @@ nlp.enable_pipe("tagger")
 ```

 In addition to `disable`, `spacy.load()` also accepts `enable`. If `enable` is
-set, all components except for those in `enable` are disabled.
+set, all components except for those in `enable` are disabled. If `enable` and
+`disable` conflict (i.e. the same component is included in both), an error is raised.

 ```python
 # Load the complete pipeline, but disable all components except for tok2vec and tagger
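A minimal sketch of `enable` and the newly documented conflict, assuming `en_core_web_sm` is installed; the docs only say an error is raised, so the `ValueError` below is an assumption:

```python
import spacy

# Everything except tok2vec and tagger is loaded but disabled.
nlp = spacy.load("en_core_web_sm", enable=["tok2vec", "tagger"])
assert set(nlp.pipe_names) == {"tok2vec", "tagger"}

# Listing the same component in both `enable` and `disable` now errors.
try:
    spacy.load("en_core_web_sm", enable=["tagger"], disable=["tagger"])
except ValueError as err:
    print("conflict:", err)
```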
@@ -1792,7 +1792,7 @@ the entity `Span` – for example `._.orgs` or `._.prev_orgs` and
 > [`Doc.retokenize`](/api/doc#retokenize) context manager:
 >
 > ```python
-> with doc.retokenize() as retokenize:
+> with doc.retokenize() as retokenizer:
 >     for ent in doc.ents:
 >         retokenizer.merge(ent)
 > ```
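With the variable name fixed, the `retokenizer.merge` call actually resolves. A fuller runnable sketch, assuming `en_core_web_sm` is installed:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup Acme Corp")
with doc.retokenize() as retokenizer:
    for ent in doc.ents:
        retokenizer.merge(ent)  # each entity span becomes a single token
print([token.text for token in doc])
```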
@@ -4,12 +4,22 @@
         "code": "af",
         "name": "Afrikaans"
     },
+    {
+        "code": "am",
+        "name": "Amharic",
+        "has_examples": true
+    },
     {
         "code": "ar",
         "name": "Arabic",
         "example": "هذه جملة",
         "has_examples": true
     },
+    {
+        "code": "az",
+        "name": "Azerbaijani",
+        "has_examples": true
+    },
     {
         "code": "bg",
         "name": "Bulgarian",
@@ -65,7 +75,7 @@
     {
         "code": "dsb",
         "name": "Lower Sorbian",
         "has_examples": true
     },
     {
         "code": "el",
@@ -142,6 +152,11 @@
         "code": "ga",
         "name": "Irish"
     },
+    {
+        "code": "grc",
+        "name": "Ancient Greek",
+        "has_examples": true
+    },
     {
         "code": "gu",
         "name": "Gujarati",
@@ -172,7 +187,7 @@
     {
         "code": "hsb",
         "name": "Upper Sorbian",
         "has_examples": true
     },
     {
         "code": "hu",
@@ -260,6 +275,10 @@
         "example": "Адамга эң кыйыны — күн сайын адам болуу",
         "has_examples": true
     },
+    {
+        "code": "la",
+        "name": "Latin"
+    },
     {
         "code": "lb",
         "name": "Luxembourgish",
@@ -448,6 +467,11 @@
         "example": "นี่คือประโยค",
         "has_examples": true
     },
+    {
+        "code": "ti",
+        "name": "Tigrinya",
+        "has_examples": true
+    },
     {
         "code": "tl",
         "name": "Tagalog"
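The new `languages.json` entries track language classes in spaCy core. A quick sanity check, assuming the installed spaCy version already ships these languages:

```python
import spacy

# Blank pipelines for the newly listed languages; each ships at least
# a tokenizer under spacy/lang/<code>.
for code in ["am", "az", "grc", "la", "ti"]:
    nlp = spacy.blank(code)
    print(code, type(nlp).__name__)
```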
@@ -149,6 +149,9 @@
     & > span
         display: block

+    a
+        text-decoration: underline
+
 .small
     font-size: var(--font-size-code)
     line-height: 1.65
@@ -159,6 +159,9 @@ const QuickstartInstall = ({ id, title }) => {
             setters={setters}
             showDropdown={showDropdown}
         >
+            <QS os="mac" hardware="gpu" platform="arm">
+                # Note M1 GPU support is experimental, see <a href="https://github.com/explosion/thinc/issues/792">Thinc issue #792</a>
+            </QS>
             <QS package="pip" config="venv">
                 python -m venv .env
             </QS>
@@ -198,7 +201,13 @@ const QuickstartInstall = ({ id, title }) => {
                 {nightly ? ' --pre' : ''}
             </QS>
             <QS package="conda">conda install -c conda-forge spacy</QS>
-            <QS package="conda" hardware="gpu">
+            <QS package="conda" hardware="gpu" os="windows">
+                conda install -c conda-forge cupy
+            </QS>
+            <QS package="conda" hardware="gpu" os="linux">
+                conda install -c conda-forge cupy
+            </QS>
+            <QS package="conda" hardware="gpu" os="mac" platform="x86">
                 conda install -c conda-forge cupy
             </QS>
             <QS package="conda" config="train">