Merge pull request #12494 from adrianeboyd/backport/v3.5.2-1

Backports for v3.5.2
Adriane Boyd authored on 2023-04-06 16:18:59 +02:00, committed by GitHub
commit e4bbdf7b50
GPG Key ID: 4AEE18F83AFDEB23
40 changed files with 604 additions and 240 deletions

View File

@ -57,51 +57,51 @@ steps:
      python -m spacy download ca_core_news_md
      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
    displayName: 'Test download CLI'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')
  - script: |
      python -W error -m spacy info ca_core_news_sm | grep -q download_url
    displayName: 'Test download_url in info CLI'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')
  - script: |
      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
    displayName: 'Test no warnings on load (#11713)'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')
  - script: |
      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
    displayName: 'Test convert CLI'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')
  - script: |
      python -m spacy init config -p ner -l ca ner.cfg
      python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
    displayName: 'Test debug config CLI'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')
  - script: |
      # will have errors due to sparse data, check for summary in output
      python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
    displayName: 'Test debug data CLI'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')
  - script: |
      python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
    displayName: 'Test train CLI'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')
  - script: |
      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
    displayName: 'Test assemble CLI'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')
  - script: |
      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
    displayName: 'Test assemble CLI vectors warning'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')
  - script: |
      python -m pip install -U -r requirements.txt
@ -116,9 +116,3 @@ steps:
      python -m pytest --pyargs spacy
    displayName: "Run CPU tests with thinc-apple-ops"
    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
-  - script: |
-      python .github/validate_universe_json.py website/meta/universe.json
-    displayName: 'Test website/meta/universe.json'
-    condition: eq(variables['python_version'], '3.8')

View File

@ -1,45 +0,0 @@
# GitHub Action that uses Black to reformat all Python code and submits a PR
# in regular intervals. Inspired by: https://github.com/cclauss/autoblack
name: autoblack
on:
workflow_dispatch: # allow manual trigger
schedule:
- cron: '0 8 * * 5' # every Friday at 8am UTC
jobs:
autoblack:
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.head_ref }}
- uses: actions/setup-python@v4
- run: pip install black -c requirements.txt
- name: Auto-format code if needed
run: black spacy
# We can't run black --check here because that returns a non-zero exit
# code and makes GitHub think the action failed
- name: Check for modified files
id: git-check
run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
- name: Create Pull Request
if: steps.git-check.outputs.modified == 'true'
uses: peter-evans/create-pull-request@v4
with:
title: Auto-format code with black
labels: meta
commit-message: Auto-format code with black
committer: GitHub <noreply@github.com>
author: explosion-bot <explosion-bot@users.noreply.github.com>
body: _This PR is auto-generated._
branch: autoblack
delete-branch: true
draft: false
- name: Check outputs
if: steps.git-check.outputs.modified == 'true'
run: |
echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"

View File

@ -8,6 +8,7 @@ on:
jobs:
  explosion-bot:
+    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    steps:
      - name: Dump GitHub context

View File

@ -13,6 +13,7 @@ on:
jobs:
  issue-manager:
+    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    steps:
      - uses: tiangolo/issue-manager@0.4.0

View File

@ -13,6 +13,7 @@ concurrency:
jobs:
  action:
+    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    steps:
      - uses: dessant/lock-threads@v4

View File

@ -7,6 +7,7 @@ on:
jobs:
  build:
+    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    steps:

.github/workflows/tests.yml (new file, 173 lines)
View File

@ -0,0 +1,173 @@
name: tests
on:
push:
branches-ignore:
- "spacy.io"
- "nightly.spacy.io"
- "v2.spacy.io"
paths-ignore:
- "*.md"
- "*.mdx"
- "website/**"
- ".github/workflows/**"
pull_request:
types: [opened, synchronize, reopened, edited]
paths-ignore:
- "*.md"
- "*.mdx"
- "website/**"
jobs:
validate:
name: Validate
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Configure Python version
uses: actions/setup-python@v4
with:
python-version: "3.7"
architecture: x64
- name: black
run: |
python -m pip install black -c requirements.txt
python -m black spacy --check
- name: flake8
run: |
python -m pip install flake8==5.0.4
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
tests:
name: Test
needs: Validate
strategy:
fail-fast: true
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
python_version: ["3.11"]
include:
- os: ubuntu-20.04
python_version: "3.6"
- os: windows-latest
python_version: "3.7"
- os: macos-latest
python_version: "3.8"
- os: ubuntu-latest
python_version: "3.9"
- os: windows-latest
python_version: "3.10"
runs-on: ${{ matrix.os }}
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Configure Python version
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python_version }}
architecture: x64
- name: Install dependencies
run: |
python -m pip install -U build pip setuptools
python -m pip install -U -r requirements.txt
- name: Build sdist
run: |
python -m build --sdist
- name: Run mypy
run: |
python -m mypy spacy
if: matrix.python_version != '3.6'
- name: Delete source directory and .egg-info
run: |
rm -rf spacy *.egg-info
shell: bash
- name: Uninstall all packages
run: |
python -m pip freeze
python -m pip freeze --exclude pywin32 > installed.txt
python -m pip uninstall -y -r installed.txt
- name: Install from sdist
run: |
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
shell: bash
- name: Test import
run: python -W error -c "import spacy"
- name: "Test download CLI"
run: |
python -m spacy download ca_core_news_sm
python -m spacy download ca_core_news_md
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
if: matrix.python_version == '3.9'
- name: "Test download_url in info CLI"
run: |
python -W error -m spacy info ca_core_news_sm | grep -q download_url
if: matrix.python_version == '3.9'
- name: "Test no warnings on load (#11713)"
run: |
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
if: matrix.python_version == '3.9'
- name: "Test convert CLI"
run: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
if: matrix.python_version == '3.9'
- name: "Test debug config CLI"
run: |
python -m spacy init config -p ner -l ca ner.cfg
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
if: matrix.python_version == '3.9'
- name: "Test debug data CLI"
run: |
# will have errors due to sparse data, check for summary in output
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
if: matrix.python_version == '3.9'
- name: "Test train CLI"
run: |
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
if: matrix.python_version == '3.9'
- name: "Test assemble CLI"
run: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
if: matrix.python_version == '3.9'
- name: "Test assemble CLI vectors warning"
run: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
if: matrix.python_version == '3.9'
- name: "Install test requirements"
run: |
python -m pip install -U -r requirements.txt
- name: "Run CPU tests"
run: |
python -m pytest --pyargs spacy -W error
- name: "Run CPU tests with thinc-apple-ops"
run: |
python -m pip install 'spacy[apple]'
python -m pytest --pyargs spacy
if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11'

View File

@ -0,0 +1,33 @@
name: universe validation
on:
push:
branches-ignore:
- "spacy.io"
- "nightly.spacy.io"
- "v2.spacy.io"
paths:
- "website/meta/universe.json"
pull_request:
types: [opened, synchronize, reopened, edited]
paths:
- "website/meta/universe.json"
jobs:
validate:
name: Validate
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Configure Python version
uses: actions/setup-python@v4
with:
python-version: "3.7"
architecture: x64
- name: Validate website/meta/universe.json
run: |
python .github/validate_universe_json.py website/meta/universe.json

View File

@ -48,6 +48,9 @@ jobs:
      pip install flake8==5.0.4
      python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
    displayName: "flake8"
+  - script: |
+      python .github/validate_universe_json.py website/meta/universe.json
+    displayName: 'Validate website/meta/universe.json'
  - job: "Test"
    dependsOn: "Validate"

View File

@ -78,41 +78,41 @@ transformers =
ray =
    spacy_ray>=0.1.0,<1.0.0
cuda =
-    cupy>=5.0.0b4,<12.0.0
+    cupy>=5.0.0b4,<13.0.0
cuda80 =
-    cupy-cuda80>=5.0.0b4,<12.0.0
+    cupy-cuda80>=5.0.0b4,<13.0.0
cuda90 =
-    cupy-cuda90>=5.0.0b4,<12.0.0
+    cupy-cuda90>=5.0.0b4,<13.0.0
cuda91 =
-    cupy-cuda91>=5.0.0b4,<12.0.0
+    cupy-cuda91>=5.0.0b4,<13.0.0
cuda92 =
-    cupy-cuda92>=5.0.0b4,<12.0.0
+    cupy-cuda92>=5.0.0b4,<13.0.0
cuda100 =
-    cupy-cuda100>=5.0.0b4,<12.0.0
+    cupy-cuda100>=5.0.0b4,<13.0.0
cuda101 =
-    cupy-cuda101>=5.0.0b4,<12.0.0
+    cupy-cuda101>=5.0.0b4,<13.0.0
cuda102 =
-    cupy-cuda102>=5.0.0b4,<12.0.0
+    cupy-cuda102>=5.0.0b4,<13.0.0
cuda110 =
-    cupy-cuda110>=5.0.0b4,<12.0.0
+    cupy-cuda110>=5.0.0b4,<13.0.0
cuda111 =
-    cupy-cuda111>=5.0.0b4,<12.0.0
+    cupy-cuda111>=5.0.0b4,<13.0.0
cuda112 =
-    cupy-cuda112>=5.0.0b4,<12.0.0
+    cupy-cuda112>=5.0.0b4,<13.0.0
cuda113 =
-    cupy-cuda113>=5.0.0b4,<12.0.0
+    cupy-cuda113>=5.0.0b4,<13.0.0
cuda114 =
-    cupy-cuda114>=5.0.0b4,<12.0.0
+    cupy-cuda114>=5.0.0b4,<13.0.0
cuda115 =
-    cupy-cuda115>=5.0.0b4,<12.0.0
+    cupy-cuda115>=5.0.0b4,<13.0.0
cuda116 =
-    cupy-cuda116>=5.0.0b4,<12.0.0
+    cupy-cuda116>=5.0.0b4,<13.0.0
cuda117 =
-    cupy-cuda117>=5.0.0b4,<12.0.0
+    cupy-cuda117>=5.0.0b4,<13.0.0
cuda11x =
-    cupy-cuda11x>=11.0.0,<12.0.0
+    cupy-cuda11x>=11.0.0,<13.0.0
cuda-autodetect =
-    cupy-wheel>=11.0.0,<12.0.0
+    cupy-wheel>=11.0.0,<13.0.0
apple =
    thinc-apple-ops>=0.1.0.dev0,<1.0.0
# Language tokenizers with external dependencies

View File

@ -35,7 +35,7 @@ def find_threshold_cli(
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
-    verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    # fmt: on
):
    """

View File

@ -23,6 +23,7 @@ def pretrain_cli(
    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
    # fmt: on
):
    """
@ -74,6 +75,7 @@ def pretrain_cli(
        epoch_resume=epoch_resume,
        use_gpu=use_gpu,
        silent=False,
+        skip_last=skip_last,
    )
    msg.good("Successfully finished pretrain")

View File

@ -125,13 +125,17 @@ def app(environ, start_response):
    return [res]


-def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
+def parse_deps(
+    orig_doc: Union[Doc, Span], options: Dict[str, Any] = {}
+) -> Dict[str, Any]:
    """Generate dependency parse in {'words': [], 'arcs': []} format.

-    orig_doc (Doc): Document to parse.
+    orig_doc (Union[Doc, Span]): Document to parse.
    options (Dict[str, Any]): Dependency parse specific visualisation options.
    RETURNS (dict): Generated dependency parse keyed by words and arcs.
    """
+    if isinstance(orig_doc, Span):
+        orig_doc = orig_doc.as_doc()
    doc = Doc(orig_doc.vocab).from_bytes(
        orig_doc.to_bytes(exclude=["user_data", "user_hooks"])
    )
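To illustrate the change above, here is a small self-contained sketch (not part of the commit) showing that `displacy.parse_deps` now accepts a `Span` as well as a `Doc`; the hand-built annotations mirror the updated test later in this diff.

```python
import spacy
from spacy import displacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
# Build a small parsed Doc by hand so no trained pipeline is required.
doc = Doc(
    nlp.vocab,
    words=["This", "is", "a", "sentence"],
    heads=[1, 1, 3, 1],
    deps=["nsubj", "ROOT", "det", "attr"],
    pos=["PRON", "AUX", "DET", "NOUN"],
)
# Passing a Span (here: the whole doc as a slice) now works like passing a Doc.
deps = displacy.parse_deps(doc[:])
print(deps["words"])
print(deps["arcs"])
```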

View File

@ -549,8 +549,8 @@ class Errors(metaclass=ErrorsWithCodes):
            "during training, make sure to include it in 'annotating components'")

    # New errors added in v3.x
-    E850 = ("The PretrainVectors objective currently only supports default "
-            "vectors, not {mode} vectors.")
+    E850 = ("The PretrainVectors objective currently only supports default or "
+            "floret vectors, not {mode} vectors.")
    E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
            "but found value of '{val}'.")
    E852 = ("The tar file pulled from the remote attempted an unsafe path "

View File

@ -1,5 +1,5 @@
from typing import Any, Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING, cast
-from thinc.types import Floats2d
+from thinc.types import Floats2d, Ints1d
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
from thinc.api import MultiSoftmax, list2array
from thinc.api import to_categorical, CosineDistance, L2Distance
@ -7,7 +7,7 @@ from thinc.loss import Loss
from ...util import registry, OOV_RANK
from ...errors import Errors
-from ...attrs import ID
+from ...attrs import ID, ORTH
from ...vectors import Mode as VectorsMode
import numpy
@ -24,8 +24,6 @@ def create_pretrain_vectors(
    maxout_pieces: int, hidden_size: int, loss: str
) -> Callable[["Vocab", Model], Model]:
    def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
-        if vocab.vectors.mode != VectorsMode.default:
-            raise ValueError(Errors.E850.format(mode=vocab.vectors.mode))
        if vocab.vectors.shape[1] == 0:
            raise ValueError(Errors.E875)
        model = build_cloze_multi_task_model(
@ -70,14 +68,23 @@ def get_vectors_loss(ops, docs, prediction, distance):
    """Compute a loss based on a distance between the documents' vectors and
    the prediction.
    """
+    vocab = docs[0].vocab
+    if vocab.vectors.mode == VectorsMode.default:
        # The simplest way to implement this would be to vstack the
        # token.vector values, but that's a bit inefficient, especially on GPU.
-        # Instead we fetch the index into the vectors table for each of our tokens,
-        # and look them up all at once. This prevents data copying.
+        # Instead we fetch the index into the vectors table for each of our
+        # tokens, and look them up all at once. This prevents data copying.
        ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
        target = docs[0].vocab.vectors.data[ids]
        target[ids == OOV_RANK] = 0
        d_target, loss = distance(prediction, target)
+    elif vocab.vectors.mode == VectorsMode.floret:
+        keys = ops.flatten([cast(Ints1d, doc.to_array(ORTH)) for doc in docs])
+        target = vocab.vectors.get_batch(keys)
+        target = ops.as_contig(target)
+        d_target, loss = distance(prediction, target)
+    else:
+        raise ValueError(Errors.E850.format(mode=vocab.vectors.mode))
    return loss, d_target
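As a quick illustration of what the `get_vectors_loss` change enables, here is a hedged sketch (mirroring the updated test further down in this diff) in which floret-mode vectors no longer trigger `E850`:

```python
from thinc.api import get_current_ops

from spacy.lang.en import English
from spacy.ml.models.multi_task import create_pretrain_vectors
from spacy.vectors import Vectors

nlp = English()
nlp.add_pipe("tok2vec")
nlp.initialize()

# Floret-mode vectors (hash-based, no fixed key table) are now accepted by the
# PretrainVectors objective; previously this raised E850.
ops = get_current_ops()
nlp.vocab.vectors = Vectors(data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1)
create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model)
```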

View File

@ -474,18 +474,24 @@ class EntityLinker(TrainablePipe):
            # Looping through each entity in batch (TODO: rewrite)
            for j, ent in enumerate(ent_batch):
-                sent_index = sentences.index(ent.sent)
-                assert sent_index >= 0
+                assert hasattr(ent, "sents")
+                sents = list(ent.sents)
+                sent_indices = (
+                    sentences.index(sents[0]),
+                    sentences.index(sents[-1]),
+                )
+                assert sent_indices[1] >= sent_indices[0] >= 0

                if self.incl_context:
                    # get n_neighbour sentences, clipped to the length of the document
-                    start_sentence = max(0, sent_index - self.n_sents)
+                    start_sentence = max(0, sent_indices[0] - self.n_sents)
                    end_sentence = min(
-                        len(sentences) - 1, sent_index + self.n_sents
+                        len(sentences) - 1, sent_indices[1] + self.n_sents
                    )
                    start_token = sentences[start_sentence].start
                    end_token = sentences[end_sentence].end
                    sent_doc = doc[start_token:end_token].as_doc()
                    # currently, the context is the same for each entity in a sentence (should be refined)
                    sentence_encoding = self.model.predict([sent_doc])[0]
                    sentence_encoding_t = sentence_encoding.T

View File

@ -1,5 +1,6 @@
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast, Union
from dataclasses import dataclass
+from functools import partial
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
from thinc.api import Optimizer
from thinc.types import Ragged, Ints2d, Floats2d
@ -82,13 +83,9 @@ class Suggester(Protocol):
    ...


-@registry.misc("spacy.ngram_suggester.v1")
-def build_ngram_suggester(sizes: List[int]) -> Suggester:
-    """Suggest all spans of the given lengths. Spans are returned as a ragged
-    array of integers. The array has two columns, indicating the start and end
-    position."""
-    def ngram_suggester(docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged:
+def ngram_suggester(
+    docs: Iterable[Doc], sizes: List[int], *, ops: Optional[Ops] = None
+) -> Ragged:
    if ops is None:
        ops = get_current_ops()
    spans = []
@ -114,7 +111,14 @@ def build_ngram_suggester(sizes: List[int]) -> Suggester:
    assert output.dataXd.ndim == 2
    return output

-    return ngram_suggester

+@registry.misc("spacy.ngram_suggester.v1")
+def build_ngram_suggester(sizes: List[int]) -> Suggester:
+    """Suggest all spans of the given lengths. Spans are returned as a ragged
+    array of integers. The array has two columns, indicating the start and end
+    position."""
+    return partial(ngram_suggester, sizes=sizes)


@registry.misc("spacy.ngram_range_suggester.v1")
@ -726,6 +730,7 @@ class SpanCategorizer(TrainablePipe):
        if not allow_overlap:
            # Get the probabilities
            sort_idx = (argmax_scores.squeeze() * -1).argsort()
+            argmax_scores = argmax_scores[sort_idx]
            predicted = predicted[sort_idx]
            indices = indices[sort_idx]
            keeps = keeps[sort_idx]
@ -748,4 +753,5 @@ class SpanCategorizer(TrainablePipe):
                attrs_scores.append(argmax_scores[i])
                spans.append(Span(doc, start, end, label=self.labels[label]))
+        spans.attrs["scores"] = numpy.array(attrs_scores)
        return spans
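The suggester refactor above keeps the registered factory's behavior identical; below is a small sketch (not from the commit) of how the registered function can be resolved and called on its own:

```python
import spacy
from spacy import registry

nlp = spacy.blank("en")
doc = nlp("Just a short sentence")

# The factory still returns a suggester callable; internally it is now a
# functools.partial around the module-level ngram_suggester.
suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2])
ragged = suggester([doc])
# Two columns per candidate span: start and end token offsets.
print(ragged.dataXd.shape, ragged.lengths)
```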

View File

@ -700,3 +700,34 @@ def test_span_group_copy(doc):
    assert len(doc.spans["test"]) == 3
    # check that the copy spans were not modified and this is an isolated doc
    assert len(doc_copy.spans["test"]) == 2
def test_for_partial_ent_sents():
"""Spans may be associated with multiple sentences. These .sents should always be complete, not partial, sentences,
which this tests for.
"""
doc = Doc(
English().vocab,
words=["Mahler's", "Symphony", "No.", "8", "was", "beautiful."],
sent_starts=[1, 0, 0, 1, 0, 0],
)
doc.set_ents([Span(doc, 1, 4, "WORK")])
# The specified entity is associated with both sentences in this doc, so we expect all sentences in the doc to be
# equal to the sentences referenced in ent.sents.
for doc_sent, ent_sent in zip(doc.sents, doc.ents[0].sents):
assert doc_sent == ent_sent
def test_for_no_ent_sents():
"""Span.sents() should set .sents correctly, even if Span in question is trailing and doesn't form a full
sentence.
"""
doc = Doc(
English().vocab,
words=["This", "is", "a", "test.", "ENTITY"],
sent_starts=[1, 0, 0, 0, 1],
)
doc.set_ents([Span(doc, 4, 5, "WORK")])
sents = list(doc.ents[0].sents)
assert len(sents) == 1
assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY"

View File

@ -1,9 +1,9 @@
-from typing import Callable, Iterable, Dict, Any
+from typing import Callable, Iterable, Dict, Any, Tuple

import pytest
from numpy.testing import assert_equal

-from spacy import registry, util
+from spacy import registry, util, Language
from spacy.attrs import ENT_KB_ID
from spacy.compat import pickle
from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
@ -108,18 +108,23 @@ def test_issue7065():

@pytest.mark.issue(7065)
-def test_issue7065_b():
+@pytest.mark.parametrize("entity_in_first_sentence", [True, False])
+def test_sentence_crossing_ents(entity_in_first_sentence: bool):
+    """Tests if NEL crashes if entities cross sentence boundaries and the first associated sentence doesn't have an
+    entity.
+    entity_in_prior_sentence (bool): Whether to include an entity in the first sentence associated with the
+    sentence-crossing entity.
+    """
    # Test that the NEL doesn't crash when an entity crosses a sentence boundary
    nlp = English()
    vector_length = 3
+    nlp.add_pipe("sentencizer")
    text = "Mahler 's Symphony No. 8 was beautiful."
-    entities = [(0, 6, "PERSON"), (10, 24, "WORK")]
-    links = {
-        (0, 6): {"Q7304": 1.0, "Q270853": 0.0},
-        (10, 24): {"Q7304": 0.0, "Q270853": 1.0},
-    }
-    sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
+    entities = [(10, 24, "WORK")]
+    links = {(10, 24): {"Q7304": 0.0, "Q270853": 1.0}}
+    if entity_in_first_sentence:
+        entities.append((0, 6, "PERSON"))
+        links[(0, 6)] = {"Q7304": 1.0, "Q270853": 0.0}
+    sent_starts = [1, -1, 0, 0, 0, 1, 0, 0, 0]
    doc = nlp(text)
    example = Example.from_dict(
        doc, {"entities": entities, "links": links, "sent_starts": sent_starts}
@ -145,31 +150,14 @@ def test_issue7065_b():
    # Create the Entity Linker component and add it to the pipeline
    entity_linker = nlp.add_pipe("entity_linker", last=True)
-    entity_linker.set_kb(create_kb)
+    entity_linker.set_kb(create_kb)  # type: ignore
    # train the NEL pipe
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(2):
-        losses = {}
-        nlp.update(train_examples, sgd=optimizer, losses=losses)
+        nlp.update(train_examples, sgd=optimizer)

-    # Add a custom rule-based component to mimick NER
-    patterns = [
-        {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]},
-        {
-            "label": "WORK",
-            "pattern": [
-                {"LOWER": "symphony"},
-                {"LOWER": "no"},
-                {"LOWER": "."},
-                {"LOWER": "8"},
-            ],
-        },
-    ]
-    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
-    ruler.add_patterns(patterns)
-    # test the trained model - this should not throw E148
-    doc = nlp(text)
-    assert doc
+    # This shouldn't crash.
+    entity_linker.predict([example.reference])  # type: ignore


def test_no_entities():

View File

@ -1,7 +1,7 @@
import pytest
import numpy
from numpy.testing import assert_array_equal, assert_almost_equal
-from thinc.api import get_current_ops, Ragged
+from thinc.api import get_current_ops, NumpyOps, Ragged

from spacy import util
from spacy.lang.en import English
@ -190,17 +190,19 @@ def test_make_spangroup_singlelabel(threshold, allow_overlap, nr_results):
    spangroup = spancat._make_span_group_singlelabel(
        doc, indices, scores, allow_overlap
    )
+    assert len(spangroup) == nr_results
    if threshold > 0.4:
        if allow_overlap:
            assert spangroup[0].text == "London"
            assert spangroup[0].label_ == "City"
+            assert_almost_equal(0.6, spangroup.attrs["scores"][0], 5)
            assert spangroup[1].text == "Greater London"
            assert spangroup[1].label_ == "GreatCity"
-            assert spangroup.attrs["scores"][1] == 0.9
+            assert_almost_equal(0.9, spangroup.attrs["scores"][1], 5)
        else:
            assert spangroup[0].text == "Greater London"
            assert spangroup[0].label_ == "GreatCity"
            assert spangroup.attrs["scores"][0] == 0.9
    else:
        if allow_overlap:
            assert spangroup[0].text == "Greater"
@ -256,22 +258,32 @@ def test_make_spangroup_negative_label():
    assert len(spangroup_single) == 2
    assert spangroup_single[0].text == "Greater"
    assert spangroup_single[0].label_ == "City"
+    assert_almost_equal(0.4, spangroup_single.attrs["scores"][0], 5)
    assert spangroup_single[1].text == "Greater London"
    assert spangroup_single[1].label_ == "GreatCity"
    assert spangroup_single.attrs["scores"][1] == 0.9
+    assert_almost_equal(0.9, spangroup_single.attrs["scores"][1], 5)

    assert len(spangroup_multi) == 6
    assert spangroup_multi[0].text == "Greater"
    assert spangroup_multi[0].label_ == "City"
+    assert_almost_equal(0.4, spangroup_multi.attrs["scores"][0], 5)
    assert spangroup_multi[1].text == "Greater"
    assert spangroup_multi[1].label_ == "Person"
+    assert_almost_equal(0.3, spangroup_multi.attrs["scores"][1], 5)
    assert spangroup_multi[2].text == "London"
    assert spangroup_multi[2].label_ == "City"
+    assert_almost_equal(0.6, spangroup_multi.attrs["scores"][2], 5)
    assert spangroup_multi[3].text == "London"
    assert spangroup_multi[3].label_ == "GreatCity"
+    assert_almost_equal(0.4, spangroup_multi.attrs["scores"][3], 5)
    assert spangroup_multi[4].text == "Greater London"
    assert spangroup_multi[4].label_ == "Thing"
    assert spangroup_multi[4].text == "Greater London"
+    assert_almost_equal(0.8, spangroup_multi.attrs["scores"][4], 5)
    assert spangroup_multi[5].text == "Greater London"
    assert spangroup_multi[5].label_ == "GreatCity"
+    assert_almost_equal(0.9, spangroup_multi.attrs["scores"][5], 5)


def test_ngram_suggester(en_tokenizer):
@ -565,3 +577,21 @@ def test_set_candidates(name):
    assert len(docs[0].spans["candidates"]) == 9
    assert docs[0].spans["candidates"][0].text == "Just"
    assert docs[0].spans["candidates"][4].text == "Just a"
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
@pytest.mark.parametrize("n_process", [1, 2])
def test_spancat_multiprocessing(name, n_process):
if isinstance(get_current_ops, NumpyOps) or n_process < 2:
nlp = Language()
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
train_examples = make_examples(nlp)
nlp.initialize(get_examples=lambda: train_examples)
texts = [
"Just a sentence.",
"I like London and Berlin",
"I like Berlin",
"I eat ham.",
]
docs = list(nlp.pipe(texts, n_process=n_process))
assert len(docs) == len(texts)

View File

@ -213,6 +213,13 @@ def test_serialize_doc_exclude(en_vocab):
def test_serialize_doc_span_groups(en_vocab):
    doc = Doc(en_vocab, words=["hello", "world", "!"])
-    doc.spans["content"] = [doc[0:2]]
+    span = doc[0:2]
+    span.label_ = "test_serialize_doc_span_groups_label"
+    span.id_ = "test_serialize_doc_span_groups_id"
+    span.kb_id_ = "test_serialize_doc_span_groups_kb_id"
+    doc.spans["content"] = [span]
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert len(new_doc.spans["content"]) == 1
+    assert new_doc.spans["content"][0].label_ == "test_serialize_doc_span_groups_label"
+    assert new_doc.spans["content"][0].id_ == "test_serialize_doc_span_groups_id"
+    assert new_doc.spans["content"][0].kb_id_ == "test_serialize_doc_span_groups_kb_id"

View File

@ -49,7 +49,11 @@ def test_serialize_doc_bin():
    nlp = English()
    for doc in nlp.pipe(texts):
        doc.cats = cats
-        doc.spans["start"] = [doc[0:2]]
+        span = doc[0:2]
+        span.label_ = "UNUSUAL_SPAN_LABEL"
+        span.id_ = "UNUSUAL_SPAN_ID"
+        span.kb_id_ = "UNUSUAL_SPAN_KB_ID"
+        doc.spans["start"] = [span]
        doc[0].norm_ = "UNUSUAL_TOKEN_NORM"
        doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID"
        doc_bin.add(doc)
@ -63,6 +67,9 @@ def test_serialize_doc_bin():
        assert doc.text == texts[i]
        assert doc.cats == cats
        assert len(doc.spans) == 1
+        assert doc.spans["start"][0].label_ == "UNUSUAL_SPAN_LABEL"
+        assert doc.spans["start"][0].id_ == "UNUSUAL_SPAN_ID"
+        assert doc.spans["start"][0].kb_id_ == "UNUSUAL_SPAN_KB_ID"
        assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM"
        assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID"

View File

@ -275,6 +275,20 @@ def test_displacy_parse_deps(en_vocab):
        {"start": 2, "end": 3, "label": "det", "dir": "left"},
        {"start": 1, "end": 3, "label": "attr", "dir": "right"},
    ]
# Test that displacy.parse_deps converts Span to Doc
deps = displacy.parse_deps(doc[:])
assert isinstance(deps, dict)
assert deps["words"] == [
{"lemma": None, "text": words[0], "tag": pos[0]},
{"lemma": None, "text": words[1], "tag": pos[1]},
{"lemma": None, "text": words[2], "tag": pos[2]},
{"lemma": None, "text": words[3], "tag": pos[3]},
]
assert deps["arcs"] == [
{"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
{"start": 2, "end": 3, "label": "det", "dir": "left"},
{"start": 1, "end": 3, "label": "attr", "dir": "right"},
]
def test_displacy_invalid_arcs(): def test_displacy_invalid_arcs():

View File

@ -165,7 +165,8 @@ def test_pretraining_default():
@pytest.mark.parametrize("objective", CHAR_OBJECTIVES)
-def test_pretraining_tok2vec_characters(objective):
+@pytest.mark.parametrize("skip_last", (True, False))
+def test_pretraining_tok2vec_characters(objective, skip_last):
    """Test that pretraining works with the character objective"""
    config = Config().from_str(pretrain_string_listener)
    config["pretraining"]["objective"] = objective
@ -178,10 +179,14 @@ def test_pretraining_tok2vec_characters(objective):
        filled["paths"]["raw_text"] = file_path
        filled = filled.interpolate()
        assert filled["pretraining"]["component"] == "tok2vec"
-        pretrain(filled, tmp_dir)
+        pretrain(filled, tmp_dir, skip_last=skip_last)
        assert Path(tmp_dir / "model0.bin").exists()
        assert Path(tmp_dir / "model4.bin").exists()
        assert not Path(tmp_dir / "model5.bin").exists()
+        if skip_last:
+            assert not Path(tmp_dir / "model-last.bin").exists()
+        else:
+            assert Path(tmp_dir / "model-last.bin").exists()


@pytest.mark.parametrize("objective", VECTOR_OBJECTIVES)
@ -237,6 +242,7 @@ def test_pretraining_tagger_tok2vec(config):
        pretrain(filled, tmp_dir)
        assert Path(tmp_dir / "model0.bin").exists()
        assert Path(tmp_dir / "model4.bin").exists()
+        assert Path(tmp_dir / "model-last.bin").exists()
        assert not Path(tmp_dir / "model5.bin").exists()
@ -359,19 +365,15 @@ def test_pretrain_default_vectors():
    nlp.vocab.vectors = Vectors(shape=(10, 10))
    create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model)
+    # floret vectors are supported
+    nlp.vocab.vectors = Vectors(
+        data=get_current_ops().xp.zeros((10, 10)), mode="floret", hash_count=1
+    )
+    create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model)
    # error for no vectors
    with pytest.raises(ValueError, match="E875"):
        nlp.vocab.vectors = Vectors()
        create_pretrain_vectors(1, 1, "cosine")(
            nlp.vocab, nlp.get_pipe("tok2vec").model
        )
-    # error for floret vectors
-    with pytest.raises(ValueError, match="E850"):
-        ops = get_current_ops()
-        nlp.vocab.vectors = Vectors(
-            data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1
-        )
-        create_pretrain_vectors(1, 1, "cosine")(
-            nlp.vocab, nlp.get_pipe("tok2vec").model
-        )

View File

@ -124,6 +124,10 @@ class DocBin:
        for key, group in doc.spans.items():
            for span in group:
                self.strings.add(span.label_)
+                if span.kb_id in span.doc.vocab.strings:
+                    self.strings.add(span.kb_id_)
+                if span.id in span.doc.vocab.strings:
+                    self.strings.add(span.id_)

    def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
        """Recover Doc objects from the annotations, using the given vocab.

View File

@ -544,10 +544,6 @@ cdef class Doc:
        DOCS: https://spacy.io/api/doc#char_span
        """
-        if not isinstance(label, int):
-            label = self.vocab.strings.add(label)
-        if not isinstance(kb_id, int):
-            kb_id = self.vocab.strings.add(kb_id)
        alignment_modes = ("strict", "contract", "expand")
        if alignment_mode not in alignment_modes:
            raise ValueError(
@ -1350,6 +1346,10 @@ cdef class Doc:
        for group in self.spans.values():
            for span in group:
                strings.add(span.label_)
+                if span.kb_id in span.doc.vocab.strings:
+                    strings.add(span.kb_id_)
+                if span.id in span.doc.vocab.strings:
+                    strings.add(span.id_)
        # Msgpack doesn't distinguish between lists and tuples, which is
        # vexing for user data. As a best guess, we *know* that within
        # keys, we must have tuples. In values we just have to hope
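A minimal round-trip sketch (not part of the commit, mirroring the updated serialization tests) of the user-visible effect of the `Doc` string changes above: span label and kb_id strings survive deserialization into a fresh vocab.

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc = nlp("hello world !")
span = doc[0:2]
span.label_ = "GREETING"
span.kb_id_ = "GREETING_KB_ID"
doc.spans["content"] = [span]

# Round-trip through bytes into a completely new vocab.
new_doc = Doc(spacy.blank("en").vocab).from_bytes(doc.to_bytes())
assert new_doc.spans["content"][0].label_ == "GREETING"
assert new_doc.spans["content"][0].kb_id_ == "GREETING_KB_ID"
```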

View File

@ -460,9 +460,12 @@ cdef class Span:
                start = i
            if start >= self.end:
                break
-        if start < self.end:
-            yield Span(self.doc, start, self.end)
+            elif i == self.doc.length - 1:
+                yield Span(self.doc, start, self.doc.length)
+        # Ensure that trailing parts of the Span instance are included in last element of .sents.
+        if start == self.doc.length - 1:
+            yield Span(self.doc, start, self.doc.length)

    @property
    def ents(self):
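The `Span.sents` change above is easiest to see from the user side; here is a small self-contained check (mirroring the new tests earlier in this diff):

```python
from spacy.lang.en import English
from spacy.tokens import Doc, Span

doc = Doc(
    English().vocab,
    words=["This", "is", "a", "test.", "ENTITY"],
    sent_starts=[1, 0, 0, 0, 1],
)
doc.set_ents([Span(doc, 4, 5, "WORK")])

# The trailing entity does not end a full sentence, but .sents now still
# yields that final sentence instead of dropping it.
sents = list(doc.ents[0].sents)
assert len(sents) == 1
assert sents[0].text == "ENTITY"
```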

View File

@ -24,6 +24,7 @@ def pretrain(
    epoch_resume: Optional[int] = None,
    use_gpu: int = -1,
    silent: bool = True,
+    skip_last: bool = False,
):
    msg = Printer(no_print=silent)
    if config["training"]["seed"] is not None:
@ -60,10 +61,14 @@ def pretrain(
    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

-    def _save_model(epoch, is_temp=False):
+    def _save_model(epoch, is_temp=False, is_last=False):
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
-            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
+            if is_last:
+                save_path = output_dir / f"model-last.bin"
+            else:
+                save_path = output_dir / f"model{epoch}{is_temp_str}.bin"
+            with (save_path).open("wb") as file_:
                file_.write(model.get_ref("tok2vec").to_bytes())
                log = {
                    "nr_word": tracker.nr_word,
@ -76,6 +81,7 @@ def pretrain(
    # TODO: I think we probably want this to look more like the
    # 'create_train_batches' function?
+    try:
        for epoch in range(epoch_resume, P["max_epochs"]):
            for batch_id, batch in enumerate(batcher(corpus(nlp))):
                docs = ensure_docs(batch)
@ -92,6 +98,9 @@ def pretrain(
            else:
                _save_model(epoch)
            tracker.epoch_loss = 0.0
+    finally:
+        if not skip_last:
+            _save_model(P["max_epochs"], is_last=True)


def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
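For context on the `skip_last` plumbing above, a hedged sketch of calling `pretrain` directly; the config path below is hypothetical and must point at a filled config with a `[pretraining]` block:

```python
from pathlib import Path

from spacy.training.pretrain import pretrain
from spacy.util import load_config

# Hypothetical config file with [paths], [training] and [pretraining] sections.
config = load_config("pretrain_config.cfg", interpolate=True)
output_dir = Path("pretrain_output")

# Writes model0.bin ... modelN.bin per epoch; with skip_last=False (the default)
# it now also writes model-last.bin when pretraining finishes or is interrupted.
pretrain(config, output_dir, use_gpu=-1, skip_last=False)
```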

View File

@ -1123,13 +1123,14 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [
```

| Name | Description |
| -------------------------------------------------- | ----------- |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
+| `--skip-last`, `-L` <Tag variant="new">3.5.2</Tag> | Skip saving `model-last.bin`. Defaults to `False`. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
| **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |
@ -1255,7 +1256,7 @@ be provided.
> ```

| Name | Description |
| ------------------------ | ----------- |
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
| `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ |
| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ |
@ -1265,7 +1266,7 @@ be provided.
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
-| `--silent`, `-V`, `-VV` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
+| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |

## assemble {id="assemble",tag="command"}

View File

@ -64,7 +64,7 @@ details on the architectures and their arguments and hyperparameters.
> config={
>     "model": DEFAULT_COREF_MODEL,
>     "span_cluster_prefix": DEFAULT_CLUSTER_PREFIX,
-> },
+> }
> nlp.add_pipe("experimental_coref", config=config)
> ```

View File

@ -20,8 +20,9 @@ output class probabilities are independent for each class. However, if you need
to predict at most one true class for a span, then use `spancat_singlelabel`. It
uses a `Softmax` layer and treats the task as a multi-class problem.

-Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc.
-Individual span scores can be found in `spangroup.attrs["scores"]`.
+Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc
+under `doc.spans[spans_key]`, where `spans_key` is a component config setting.
+Individual span scores are stored in `doc.spans[spans_key].attrs["scores"]`.

## Assigned Attributes {id="assigned-attributes"}
@ -29,7 +30,9 @@ Predictions will be saved to `Doc.spans[spans_key]` as a
[`SpanGroup`](/api/spangroup). The scores for the spans in the `SpanGroup` will
be saved in `SpanGroup.attrs["scores"]`.

-`spans_key` defaults to `"sc"`, but can be passed as a parameter.
+`spans_key` defaults to `"sc"`, but can be passed as a parameter. The `spancat`
+component will overwrite any existing spans under the spans key
+`doc.spans[spans_key]`.

| Location | Value |
| -------------------------------------- | -------------------------------------------------------- |

View File

@ -8,6 +8,13 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of
integer IDs. This ensures that strings always map to the same ID, even from
different `StringStores`.
<Infobox variant ="warning">
Note that a `StringStore` instance is not static. It increases in size as texts
with new tokens are processed.
</Infobox>
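A tiny illustration (not from the commit) of the note in the infobox above: processing text with previously unseen tokens grows the `StringStore`.

```python
import spacy

nlp = spacy.blank("en")
n_before = len(nlp.vocab.strings)
nlp("a sentence with some previously unseen tokens")
# New token texts are interned as they are encountered.
assert len(nlp.vocab.strings) > n_before
```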
## StringStore.\_\_init\_\_ {id="init",tag="method"}

Create the `StringStore`.

View File

@ -25,7 +25,10 @@ and call the package's own `load()` method. If a pipeline is loaded from a path,
spaCy will assume it's a data directory, load its
[`config.cfg`](/api/data-formats#config) and use the language and pipeline
information to construct the `Language` class. The data will be loaded in via
-[`Language.from_disk`](/api/language#from_disk).
+[`Language.from_disk`](/api/language#from_disk). Loading a pipeline from a
+package will also import any custom code, if present, whereas loading from a
+directory does not. For these cases, you need to manually import your custom
+code.

<Infobox variant="warning" title="Changed in v3.0">
@ -291,7 +294,7 @@ the `manual=True` argument in `displacy.render`.

| Name        | Description                                                          |
| ----------- | -------------------------------------------------------------------- |
-| `orig_doc`  | Doc to parse dependencies. ~~Doc~~                                    |
+| `orig_doc`  | Doc or span to parse dependencies. ~~Union[Doc, Span]~~               |
| `options`   | Dependency parse specific visualisation options. ~~Dict[str, Any]~~  |
| **RETURNS** | Generated dependency parse keyed by words and arcs. ~~dict~~         |
@ -577,7 +580,7 @@ start decreasing across epochs.

> ```ini
> [training.logger]
> @loggers = "spacy.ConsoleLogger.v3"
-> progress_bar = "all_steps"
+> progress_bar = "eval"
> console_output = true
> output_file = "training_log.jsonl"
> ```

View File

@ -10,6 +10,13 @@ The `Vocab` object provides a lookup table that allows you to access
[`StringStore`](/api/stringstore). It also owns underlying C-data that is shared
between `Doc` objects.
<Infobox variant ="warning">
Note that a `Vocab` instance is not static. It increases in size as texts with
new tokens are processed.
</Infobox>
## Vocab.\_\_init\_\_ {id="init",tag="method"}

Create the vocabulary.

View File

@ -746,13 +746,16 @@ this by setting `initialize.init_tok2vec` to the filename of the `.bin` file
that you want to use from pretraining.

A pretraining step that runs for 5 epochs with an output path of `pretrain/`, as
-an example, produces `pretrain/model0.bin` through `pretrain/model4.bin`. To
-make use of the final output, you could fill in this value in your config file:
+an example, produces `pretrain/model0.bin` through `pretrain/model4.bin` plus a
+copy of the last iteration as `pretrain/model-last.bin`. Additionally, you can
+configure `n_save_epoch` to tell pretraining in which epoch interval it should
+save the current training progress. To use the final output to initialize your
+`tok2vec` layer, you could fill in this value in your config file:

```ini {title="config.cfg"}
[paths]
-init_tok2vec = "pretrain/model4.bin"
+init_tok2vec = "pretrain/model-last.bin"

[initialize]
init_tok2vec = ${paths.init_tok2vec}

View File

@ -1684,6 +1684,8 @@ def expand_person_entities(doc):
                new_ents.append(new_ent)
            else:
                new_ents.append(ent)
+        else:
+            new_ents.append(ent)
    doc.ents = new_ents
    return doc

View File

@ -758,6 +758,15 @@ any custom architectures, functions or
your pipeline and registered when it's loaded. See the documentation on
[saving and loading pipelines](/usage/saving-loading#models-custom) for details.
<Infobox variant="warning">
Note that the unpackaged models produced by `spacy train` are data directories
that **do not include custom code**. You need to import the code in your script
before loading in unpackaged models. For more details, see
[`spacy.load`](/api/top-level#spacy.load).
</Infobox>
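To make the note above concrete, here is a hedged sketch (the module name and path are hypothetical) of loading an unpackaged pipeline that relies on custom components:

```python
import spacy

# The unpackaged model directory does not bundle code, so the module that
# registers the custom factories has to be imported first.
import my_custom_components  # noqa: F401  (hypothetical module with @Language.factory registrations)

nlp = spacy.load("training/model-best")  # hypothetical output directory of `spacy train`
doc = nlp("Custom components are available because their factories were registered on import.")
```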
#### Example: Modifying the nlp object {id="custom-code-nlp-callbacks"}

For many use cases, you don't necessarily want to implement the whole `Language`

View File

@ -3215,6 +3215,51 @@
    "category": ["pipeline"],
    "tags": ["syllables", "multilingual"]
  },
{
"id": "sentimental-onix",
"title": "Sentimental Onix",
"slogan": "Use onnx for sentiment models",
"description": "spaCy pipeline component for sentiment analysis using onnx",
"github": "sloev/sentimental-onix",
"pip": "sentimental-onix",
"code_example": [
"# Download model:",
"# python -m sentimental_onix download en",
"import spacy",
"from sentimental_onix import pipeline",
"",
"nlp = spacy.load(\"en_core_web_sm\")",
"nlp.add_pipe(\"sentencizer\")",
"nlp.add_pipe(\"sentimental_onix\", after=\"sentencizer\")",
"",
"sentences = [",
" (sent.text, sent._.sentiment)",
" for doc in nlp.pipe(",
" [",
" \"i hate pasta on tuesdays\",",
" \"i like movies on wednesdays\",",
" \"i find your argument ridiculous\",",
" \"soda with straws are my favorite\",",
" ]",
" )",
" for sent in doc.sents",
"]",
"",
"assert sentences == [",
" (\"i hate pasta on tuesdays\", \"Negative\"),",
" (\"i like movies on wednesdays\", \"Positive\"),",
" (\"i find your argument ridiculous\", \"Negative\"),",
" (\"soda with straws are my favorite\", \"Positive\"),",
"]"
],
"thumb": "https://raw.githubusercontent.com/sloev/sentimental-onix/master/.github/onix.webp",
"author": "Johannes Valbjørn",
"author_links": {
"github": "sloev"
},
"category": ["pipeline"],
"tags": ["sentiment", "english"]
},
  {
    "id": "gobbli",
    "title": "gobbli",

View File

@ -111,11 +111,12 @@
    line-height: var(--line-height-xs)
    text-align: center

-    @include breakpoint(max, xs)
-        .list
-            display: none
-    .alert
-        display: none
+    @include breakpoint(max, md)
+        .alert
+            display: none
+    @include breakpoint(max, xs)
+        .list
+            display: none

.has-alert

View File

@ -25,11 +25,6 @@ const AlertSpace = ({ nightly, legacy }) => {
  const isOnline = useOnlineStatus()
  return (
    <>
-      {isOnline && (
-        <Alert title="💥 We'd love to learn more about your experience with spaCy!">
-          <Link to="https://form.typeform.com/to/aMel9q9f">Take our survey here.</Link>
-        </Alert>
-      )}
      {nightly && (
        <Alert
          title="You're viewing the pre-release docs."
@ -62,9 +57,15 @@ const AlertSpace = ({ nightly, legacy }) => {
  )
}

+// const navAlert = (
+//   <Link to="/usage/v3-5" noLinkLayout>
+//     <strong>💥 Out now:</strong> spaCy v3.5
+//   </Link>
+// )

const navAlert = (
-  <Link to="/usage/v3-5" noLinkLayout>
-    <strong>💥 Out now:</strong> spaCy v3.5
+  <Link to="https://form.typeform.com/to/aMel9q9f" noLinkLayout>
+    <strong>💥 Take the user survey!</strong>
  </Link>
)