Merge branch 'master' into add-more-info-to-custom-code-doc

thomashacker 2023-03-31 10:21:26 +02:00
commit 5f57bdb588
53 changed files with 1248 additions and 348 deletions

View File

@@ -1,6 +1,6 @@
 parameters:
-  python_version: ''
-  architecture: 'x64'
+  python_version: ""
+  architecture: "x64"
   num_build_jobs: 2

 steps:
@@ -12,7 +12,7 @@ steps:
   - bash: |
      echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
-    displayName: 'Set variables'
+    displayName: "Set variables"

  - script: |
      python -m pip install -U build pip setuptools
@@ -25,7 +25,7 @@ steps:
  - script: |
      python -m mypy spacy
-    displayName: 'Run mypy'
+    displayName: "Run mypy"
    condition: ne(variables['python_version'], '3.6')

  - task: DeleteFiles@1
@@ -52,56 +52,56 @@ steps:
      python -W error -c "import spacy"
    displayName: "Test import"

-  # - script: |
-  #     python -m spacy download ca_core_news_sm
-  #     python -m spacy download ca_core_news_md
-  #     python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-  #   displayName: 'Test download CLI'
-  #   condition: eq(variables['python_version'], '3.8')
-  #
-  # - script: |
-  #     python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-  #   displayName: 'Test no warnings on load (#11713)'
-  #   condition: eq(variables['python_version'], '3.8')
-  #
-  # - script: |
-  #     python -m spacy download ca_core_news_sm 2>&1 | grep -q skipping
-  #   displayName: 'Test skip re-download (#12188)'
-  #   condition: eq(variables['python_version'], '3.8')
+  - script: |
+      python -m spacy download ca_core_news_sm
+      python -m spacy download ca_core_news_md
+      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+    displayName: "Test download CLI"
+    condition: eq(variables['python_version'], '3.9')
+
+  - script: |
+      python -W error -m spacy info ca_core_news_sm | grep -q download_url
+    displayName: "Test download_url in info CLI"
+    condition: eq(variables['python_version'], '3.9')
+
+  - script: |
+      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+    displayName: "Test no warnings on load (#11713)"
+    condition: eq(variables['python_version'], '3.9')

  - script: |
      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
-    displayName: 'Test convert CLI'
-    condition: eq(variables['python_version'], '3.8')
+    displayName: "Test convert CLI"
+    condition: eq(variables['python_version'], '3.9')

  - script: |
      python -m spacy init config -p ner -l ca ner.cfg
      python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
-    displayName: 'Test debug config CLI'
-    condition: eq(variables['python_version'], '3.8')
+    displayName: "Test debug config CLI"
+    condition: eq(variables['python_version'], '3.9')

  - script: |
      # will have errors due to sparse data, check for summary in output
      python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
-    displayName: 'Test debug data CLI'
-    condition: eq(variables['python_version'], '3.8')
+    displayName: "Test debug data CLI"
+    condition: eq(variables['python_version'], '3.9')

  - script: |
      python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
-    displayName: 'Test train CLI'
-    condition: eq(variables['python_version'], '3.8')
+    displayName: "Test train CLI"
+    condition: eq(variables['python_version'], '3.9')

-  # - script: |
-  #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-  #     PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-  #   displayName: 'Test assemble CLI'
-  #   condition: eq(variables['python_version'], '3.8')
-  #
-  # - script: |
-  #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-  #     python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-  #   displayName: 'Test assemble CLI vectors warning'
-  #   condition: eq(variables['python_version'], '3.8')
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+    displayName: "Test assemble CLI"
+    condition: eq(variables['python_version'], '3.9')
+
+  - script: |
+      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+    displayName: "Test assemble CLI vectors warning"
+    condition: eq(variables['python_version'], '3.9')

  - script: |
      python -m pip install -U -r requirements.txt

@@ -116,9 +116,3 @@ steps:
      python -m pytest --pyargs spacy
    displayName: "Run CPU tests with thinc-apple-ops"
    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
-
-  - script: |
-      python .github/validate_universe_json.py website/meta/universe.json
-    displayName: 'Test website/meta/universe.json'
-    condition: eq(variables['python_version'], '3.8')

View File

@@ -1,45 +0,0 @@
# GitHub Action that uses Black to reformat all Python code and submits a PR
# in regular intervals. Inspired by: https://github.com/cclauss/autoblack
name: autoblack
on:
workflow_dispatch: # allow manual trigger
schedule:
- cron: '0 8 * * 5' # every Friday at 8am UTC
jobs:
autoblack:
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.head_ref }}
- uses: actions/setup-python@v4
- run: pip install black -c requirements.txt
- name: Auto-format code if needed
run: black spacy
# We can't run black --check here because that returns a non-zero exit
# code and makes GitHub think the action failed
- name: Check for modified files
id: git-check
run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
- name: Create Pull Request
if: steps.git-check.outputs.modified == 'true'
uses: peter-evans/create-pull-request@v4
with:
title: Auto-format code with black
labels: meta
commit-message: Auto-format code with black
committer: GitHub <noreply@github.com>
author: explosion-bot <explosion-bot@users.noreply.github.com>
body: _This PR is auto-generated._
branch: autoblack
delete-branch: true
draft: false
- name: Check outputs
if: steps.git-check.outputs.modified == 'true'
run: |
echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"

View File

@@ -8,6 +8,7 @@ on:
 jobs:
   explosion-bot:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
       - name: Dump GitHub context

View File

@@ -13,6 +13,7 @@ on:
 jobs:
   issue-manager:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
       - uses: tiangolo/issue-manager@0.4.0

View File

@@ -13,6 +13,7 @@ concurrency:
 jobs:
   action:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
       - uses: dessant/lock-threads@v4

View File

@@ -7,6 +7,7 @@ on:
 jobs:
   build:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:

.github/workflows/tests.yml (vendored, new file, 173 lines)
View File

@@ -0,0 +1,173 @@
name: tests
on:
push:
branches-ignore:
- "spacy.io"
- "nightly.spacy.io"
- "v2.spacy.io"
paths-ignore:
- "*.md"
- "*.mdx"
- "website/**"
- ".github/workflows/**"
pull_request:
types: [opened, synchronize, reopened, edited]
paths-ignore:
- "*.md"
- "*.mdx"
- "website/**"
jobs:
validate:
name: Validate
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Configure Python version
uses: actions/setup-python@v4
with:
python-version: "3.7"
architecture: x64
- name: black
run: |
python -m pip install black -c requirements.txt
python -m black spacy --check
- name: flake8
run: |
python -m pip install flake8==5.0.4
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
tests:
name: Test
needs: Validate
strategy:
fail-fast: true
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
python_version: ["3.11"]
include:
- os: ubuntu-20.04
python_version: "3.6"
- os: windows-latest
python_version: "3.7"
- os: macos-latest
python_version: "3.8"
- os: ubuntu-latest
python_version: "3.9"
- os: windows-latest
python_version: "3.10"
runs-on: ${{ matrix.os }}
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Configure Python version
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python_version }}
architecture: x64
- name: Install dependencies
run: |
python -m pip install -U build pip setuptools
python -m pip install -U -r requirements.txt
- name: Build sdist
run: |
python -m build --sdist
- name: Run mypy
run: |
python -m mypy spacy
if: matrix.python_version != '3.6'
- name: Delete source directory and .egg-info
run: |
rm -rf spacy *.egg-info
shell: bash
- name: Uninstall all packages
run: |
python -m pip freeze
python -m pip freeze --exclude pywin32 > installed.txt
python -m pip uninstall -y -r installed.txt
- name: Install from sdist
run: |
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
shell: bash
- name: Test import
run: python -W error -c "import spacy"
- name: "Test download CLI"
run: |
python -m spacy download ca_core_news_sm
python -m spacy download ca_core_news_md
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
if: matrix.python_version == '3.9'
- name: "Test download_url in info CLI"
run: |
python -W error -m spacy info ca_core_news_sm | grep -q download_url
if: matrix.python_version == '3.9'
- name: "Test no warnings on load (#11713)"
run: |
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
if: matrix.python_version == '3.9'
- name: "Test convert CLI"
run: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
if: matrix.python_version == '3.9'
- name: "Test debug config CLI"
run: |
python -m spacy init config -p ner -l ca ner.cfg
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
if: matrix.python_version == '3.9'
- name: "Test debug data CLI"
run: |
# will have errors due to sparse data, check for summary in output
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
if: matrix.python_version == '3.9'
- name: "Test train CLI"
run: |
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
if: matrix.python_version == '3.9'
- name: "Test assemble CLI"
run: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
if: matrix.python_version == '3.9'
- name: "Test assemble CLI vectors warning"
run: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
if: matrix.python_version == '3.9'
- name: "Install test requirements"
run: |
python -m pip install -U -r requirements.txt
- name: "Run CPU tests"
run: |
python -m pytest --pyargs spacy -W error
- name: "Run CPU tests with thinc-apple-ops"
run: |
python -m pip install 'spacy[apple]'
python -m pytest --pyargs spacy
if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11'

View File

@@ -0,0 +1,33 @@
name: universe validation
on:
push:
branches-ignore:
- "spacy.io"
- "nightly.spacy.io"
- "v2.spacy.io"
paths:
- "website/meta/universe.json"
pull_request:
types: [opened, synchronize, reopened, edited]
paths:
- "website/meta/universe.json"
jobs:
validate:
name: Validate
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Configure Python version
uses: actions/setup-python@v4
with:
python-version: "3.7"
architecture: x64
- name: Validate website/meta/universe.json
run: |
python .github/validate_universe_json.py website/meta/universe.json

View File

@@ -16,6 +16,9 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
 open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).

+💥 **We'd love to hear more about your experience with spaCy!**
+[Fill out our survey here.](https://form.typeform.com/to/aMel9q9f)
+
 💫 **Version 3.5 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)

View File

@@ -48,6 +48,9 @@ jobs:
        pip install flake8==5.0.4
        python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
      displayName: "flake8"
+  - script: |
+      python .github/validate_universe_json.py website/meta/universe.json
+    displayName: 'Validate website/meta/universe.json'

 - job: "Test"
   dependsOn: "Validate"

View File

@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev2,<9.1.0",
+    "thinc>=8.1.8,<8.2.0",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"

View File

@@ -3,7 +3,7 @@ spacy-legacy>=4.0.0.dev0,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev2,<9.1.0
+thinc>=8.1.8,<8.2.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0

View File

@@ -29,7 +29,15 @@ project_urls =
 [options]
 zip_safe = false
 include_package_data = true
-python_requires = >=3.8
+python_requires = >=3.6
+setup_requires =
+    cython>=0.25,<3.0
+    numpy>=1.15.0
+    # We also need our Cython packages here to compile against
+    cymem>=2.0.2,<2.1.0
+    preshed>=3.0.2,<3.1.0
+    murmurhash>=0.28.0,<1.1.0
+    thinc>=8.1.8,<8.2.0
 install_requires =
     # Our libraries
     spacy-legacy>=4.0.0.dev0,<4.1.0
@@ -37,7 +45,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=9.0.0.dev2,<9.1.0
+    thinc>=8.1.8,<8.2.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0

View File

@@ -7,6 +7,7 @@ import srsly
 from wasabi import Printer, MESSAGES, msg
 import typer
 import math
+import numpy

 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
 from ._util import import_code, debug_cli, _format_number
@@ -520,9 +521,13 @@ def debug_data(
     if "tagger" in factory_names:
         msg.divider("Part-of-speech Tagging")
-        label_list = [label for label in gold_train_data["tags"]]
-        model_labels = _get_labels_from_model(nlp, "tagger")
+        label_list, counts = zip(*gold_train_data["tags"].items())
         msg.info(f"{len(label_list)} label(s) in train data")
+        p = numpy.array(counts)
+        p = p / p.sum()
+        norm_entropy = (-p * numpy.log2(p)).sum() / numpy.log2(len(label_list))
+        msg.info(f"{norm_entropy} is the normalised label entropy")
+        model_labels = _get_labels_from_model(nlp, "tagger")
         labels = set(label_list)
         missing_labels = model_labels - labels
         if missing_labels:
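For reference, the "normalised label entropy" reported above is the Shannon entropy of the tag distribution divided by its maximum value log2(K), so 1.0 means a perfectly uniform tag distribution and values near 0 mean a single tag dominates. A standalone sketch of the same computation (the tag counts below are made up):

```python
import numpy

# Hypothetical stand-in for gold_train_data["tags"]
tag_counts = {"NOUN": 50, "VERB": 30, "ADJ": 20}
label_list, counts = zip(*tag_counts.items())
p = numpy.array(counts, dtype=float)
p = p / p.sum()
# Entropy in bits, normalised by log2(K) so the result lies in [0, 1]
norm_entropy = (-p * numpy.log2(p)).sum() / numpy.log2(len(label_list))
print(norm_entropy)  # ~0.94 for these counts, i.e. fairly balanced
```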

View File

@@ -35,7 +35,7 @@ def find_threshold_cli(
     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
-    verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     # fmt: on
 ):
     """

View File

@@ -1,6 +1,5 @@
 from typing import Optional, Dict, Any, Union, List
 import platform
-import pkg_resources
 import json
 from pathlib import Path
 from wasabi import Printer, MarkdownRenderer
@@ -10,6 +9,7 @@ from ._util import app, Arg, Opt, string_to_list
 from .download import get_model_filename, get_latest_version
 from .. import util
 from .. import about
+from ..compat import importlib_metadata

 @app.command("info")
@@ -137,14 +137,13 @@ def info_installed_model_url(model: str) -> Optional[str]:
     dist-info available.
     """
     try:
-        dist = pkg_resources.get_distribution(model)
-        data = json.loads(dist.get_metadata("direct_url.json"))
-        return data["url"]
-    except pkg_resources.DistributionNotFound:
-        # no such package
-        return None
+        dist = importlib_metadata.distribution(model)
+        text = dist.read_text("direct_url.json")
+        if isinstance(text, str):
+            data = json.loads(text)
+            return data["url"]
     except Exception:
-        # something else, like no file or invalid JSON
-        return None
+        pass
+    return None
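The change above reads pip's PEP 610 `direct_url.json` metadata through `importlib.metadata` instead of the deprecated `pkg_resources`. A standalone sketch of the same lookup, using the stdlib module directly in place of spaCy's compat shim:

```python
import json
from importlib import metadata

def installed_package_url(name: str):
    """Return the direct URL a package was installed from, if any (PEP 610)."""
    try:
        dist = metadata.distribution(name)
        # direct_url.json only exists for URL/path installs, not index installs
        text = dist.read_text("direct_url.json")
        if isinstance(text, str):
            return json.loads(text)["url"]
    except Exception:
        pass
    return None

print(installed_package_url("pip"))  # usually None for normal index installs
```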

View File

@@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
 import os.path
 from pathlib import Path

-import pkg_resources
 from wasabi import msg
 from wasabi.util import locale_escape
 import sys
@@ -331,6 +330,7 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
     RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
         exist.
     """
+    import pkg_resources

     failed_pkgs_msgs: List[str] = []
     conflicting_pkgs_msgs: List[str] = []

View File

@@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
 {%- set use_transformer = hardware != "cpu" and transformer_data -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
-{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
+{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
 [paths]
 train = null
 dev = null
@@ -24,8 +24,11 @@ gpu_allocator = null
 lang = "{{ lang }}"
 {%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
 {%- set with_accuracy = optimize == "accuracy" -%}
-{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
-{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%}
+{# The BOW textcat doesn't need a source of features, so it can omit the
+tok2vec/transformer. #}
+{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
+{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
+{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
 {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
 {%- else -%}
 {%- set full_pipeline = components -%}
@@ -154,6 +157,36 @@ grad_factor = 1.0
 sizes = [1,2,3]
 {% endif -%}

+{% if "spancat_singlelabel" in components %}
+[components.spancat_singlelabel]
+factory = "spancat_singlelabel"
+negative_weight = 1.0
+allow_overlap = true
+scorer = {"@scorers":"spacy.spancat_scorer.v1"}
+spans_key = "sc"
+
+[components.spancat_singlelabel.model]
+@architectures = "spacy.SpanCategorizer.v1"
+
+[components.spancat_singlelabel.model.reducer]
+@layers = "spacy.mean_max_reducer.v1"
+hidden_size = 128
+
+[components.spancat_singlelabel.model.scorer]
+@layers = "Softmax.v2"
+
+[components.spancat_singlelabel.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.spancat_singlelabel.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+
+[components.spancat_singlelabel.suggester]
+@misc = "spacy.ngram_suggester.v1"
+sizes = [1,2,3]
+{% endif %}
+
 {% if "trainable_lemmatizer" in components -%}
 [components.trainable_lemmatizer]
 factory = "trainable_lemmatizer"
@@ -219,10 +252,16 @@ no_output_layer = false
 {% else -%}
 [components.textcat.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatCNN.v2"
 exclusive_classes = true
-ngram_size = 1
-no_output_layer = false
+nO = null
+
+[components.textcat.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
 {%- endif %}
 {%- endif %}
@@ -250,10 +289,16 @@ no_output_layer = false
 {% else -%}
 [components.textcat_multilabel.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatCNN.v2"
 exclusive_classes = false
-ngram_size = 1
-no_output_layer = false
+nO = null
+
+[components.textcat_multilabel.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat_multilabel.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
 {%- endif %}
 {%- endif %}
@@ -284,6 +329,7 @@ maxout_pieces = 3
 {% if "morphologizer" in components %}
 [components.morphologizer]
 factory = "morphologizer"
+label_smoothing = 0.05

 [components.morphologizer.model]
 @architectures = "spacy.Tagger.v2"
@@ -297,6 +343,7 @@ width = ${components.tok2vec.model.encode.width}
 {% if "tagger" in components %}
 [components.tagger]
 factory = "tagger"
+label_smoothing = 0.05

 [components.tagger.model]
 @architectures = "spacy.Tagger.v2"
@@ -370,6 +417,33 @@ width = ${components.tok2vec.model.encode.width}
 sizes = [1,2,3]
 {% endif %}

+{% if "spancat_singlelabel" in components %}
+[components.spancat_singlelabel]
+factory = "spancat_singlelabel"
+negative_weight = 1.0
+allow_overlap = true
+scorer = {"@scorers":"spacy.spancat_scorer.v1"}
+spans_key = "sc"
+
+[components.spancat_singlelabel.model]
+@architectures = "spacy.SpanCategorizer.v1"
+
+[components.spancat_singlelabel.model.reducer]
+@layers = "spacy.mean_max_reducer.v1"
+hidden_size = 128
+
+[components.spancat_singlelabel.model.scorer]
+@layers = "Softmax.v2"
+
+[components.spancat_singlelabel.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+
+[components.spancat_singlelabel.suggester]
+@misc = "spacy.ngram_suggester.v1"
+sizes = [1,2,3]
+{% endif %}
+
 {% if "trainable_lemmatizer" in components -%}
 [components.trainable_lemmatizer]
 factory = "trainable_lemmatizer"

View File

@@ -125,13 +125,17 @@ def app(environ, start_response):
     return [res]

-def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
+def parse_deps(
+    orig_doc: Union[Doc, Span], options: Dict[str, Any] = {}
+) -> Dict[str, Any]:
     """Generate dependency parse in {'words': [], 'arcs': []} format.

-    orig_doc (Doc): Document to parse.
+    orig_doc (Union[Doc, Span]): Document to parse.
     options (Dict[str, Any]): Dependency parse specific visualisation options.
     RETURNS (dict): Generated dependency parse keyed by words and arcs.
     """
+    if isinstance(orig_doc, Span):
+        orig_doc = orig_doc.as_doc()
     doc = Doc(orig_doc.vocab).from_bytes(
         orig_doc.to_bytes(exclude=["user_data", "user_hooks"])
     )
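With the change above, `parse_deps` accepts a `Span` and converts it to a standalone `Doc` before rendering. A quick usage sketch (assumes an installed `en_core_web_sm` pipeline):

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence. This is another one.")
sent = list(doc.sents)[0]          # a Span, not a Doc
parse = displacy.parse_deps(sent)  # previously this required a full Doc
print(len(parse["words"]), len(parse["arcs"]))
```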

View File

@@ -542,6 +542,8 @@ class Errors(metaclass=ErrorsWithCodes):
             "during training, make sure to include it in 'annotating components'")

     # New errors added in v3.x
+    E850 = ("The PretrainVectors objective currently only supports default or "
+            "floret vectors, not {mode} vectors.")
     E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
             "but found value of '{val}'.")
     E852 = ("The tar file pulled from the remote attempted an unsafe path "
@@ -951,6 +953,7 @@ class Errors(metaclass=ErrorsWithCodes):
             "with `displacy.serve(doc, port=port)`")
     E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
             "or use `auto_select_port=True` to pick an available port automatically.")
+    E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")

     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")

View File

@@ -1,11 +1,14 @@
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from ...language import Language, BaseDefaults

 class SerbianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

View File

@@ -0,0 +1,36 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
from ..char_classes import CURRENCY, UNITS, PUNCT
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
+ [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
)
_suffixes = (
LIST_PUNCT
+ LIST_ELLIPSES
+ LIST_QUOTES
+ LIST_ICONS
+ [
r"(?<=[0-9])\+",
r"(?<=°[FfCcKk])\.",
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[{a}{e}{p}(?:{q})])\.".format(
a=ALPHA, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
),
]
)
TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes
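Not part of the diff: a standalone check of one of the new infix patterns above, showing that a colon between a letter/digit and a letter becomes a split point (`ALPHA` is simplified to ASCII letters here for illustration; spaCy's real character class is much broader):

```python
import re

ALPHA = "a-zA-Z"
infix = r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA)
m = re.search(infix, "visina:dva")
print(m.group(), m.span())  # ':' (6, 7) -- the tokenizer splits at this match
```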

View File

@@ -24,7 +24,8 @@ class Lexeme:
     def orth_(self) -> str: ...
     @property
     def text(self) -> str: ...
-    lower: str
+    orth: int
+    lower: int
     norm: int
     shape: int
     prefix: int

View File

@@ -186,7 +186,7 @@ cdef class Lexeme:
         return self.orth_

     property lower:
-        """RETURNS (str): Lowercase form of the lexeme."""
+        """RETURNS (uint64): Lowercase form of the lexeme."""
         def __get__(self):
             return self.c.lower

View File

@@ -1,5 +1,5 @@
 from typing import Any, Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING, cast
-from thinc.types import Floats2d
+from thinc.types import Floats2d, Ints1d
 from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
 from thinc.api import MultiSoftmax, list2array
 from thinc.api import to_categorical, CosineDistance, L2Distance
@@ -7,7 +7,8 @@ from thinc.loss import Loss

 from ...util import registry, OOV_RANK
 from ...errors import Errors
-from ...attrs import ID
+from ...attrs import ID, ORTH
+from ...vectors import Mode as VectorsMode
 import numpy
 from functools import partial
@@ -67,14 +68,23 @@ def get_vectors_loss(ops, docs, prediction, distance):
     """Compute a loss based on a distance between the documents' vectors and
     the prediction.
     """
-    # The simplest way to implement this would be to vstack the
-    # token.vector values, but that's a bit inefficient, especially on GPU.
-    # Instead we fetch the index into the vectors table for each of our tokens,
-    # and look them up all at once. This prevents data copying.
-    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
-    target = docs[0].vocab.vectors.data[ids]
-    target[ids == OOV_RANK] = 0
-    d_target, loss = distance(prediction, target)
+    vocab = docs[0].vocab
+    if vocab.vectors.mode == VectorsMode.default:
+        # The simplest way to implement this would be to vstack the
+        # token.vector values, but that's a bit inefficient, especially on GPU.
+        # Instead we fetch the index into the vectors table for each of our
+        # tokens, and look them up all at once. This prevents data copying.
+        ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+        target = docs[0].vocab.vectors.data[ids]
+        target[ids == OOV_RANK] = 0
+        d_target, loss = distance(prediction, target)
+    elif vocab.vectors.mode == VectorsMode.floret:
+        keys = ops.flatten([cast(Ints1d, doc.to_array(ORTH)) for doc in docs])
+        target = vocab.vectors.get_batch(keys)
+        target = ops.as_contig(target)
+        d_target, loss = distance(prediction, target)
+    else:
+        raise ValueError(Errors.E850.format(mode=vocab.vectors.mode))
     return loss, d_target

View File

@@ -502,18 +502,24 @@ class EntityLinker(TrainablePipe):
                 # Looping through each entity in batch (TODO: rewrite)
                 for j, ent in enumerate(ent_batch):
-                    sent_index = sentences.index(ent.sent)
-                    assert sent_index >= 0
+                    assert hasattr(ent, "sents")
+                    sents = list(ent.sents)
+                    sent_indices = (
+                        sentences.index(sents[0]),
+                        sentences.index(sents[-1]),
+                    )
+                    assert sent_indices[1] >= sent_indices[0] >= 0

                     if self.incl_context:
                         # get n_neighbour sentences, clipped to the length of the document
-                        start_sentence = max(0, sent_index - self.n_sents)
+                        start_sentence = max(0, sent_indices[0] - self.n_sents)
                         end_sentence = min(
-                            len(sentences) - 1, sent_index + self.n_sents
+                            len(sentences) - 1, sent_indices[1] + self.n_sents
                         )
                         start_token = sentences[start_sentence].start
                         end_token = sentences[end_sentence].end
                         sent_doc = doc[start_token:end_token].as_doc()
                         # currently, the context is the same for each entity in a sentence (should be refined)
                         sentence_encoding = self.model.predict([sent_doc])[0]
                         sentence_encoding_t = sentence_encoding.T
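A toy version of the clipping arithmetic above: the sentence window is widened by `n_sents` on each side of the entity's first and last sentence and clamped to the document bounds (all values here are made up):

```python
n_sents = 2            # neighbouring sentences to include on each side
num_sentences = 5      # sentences in the document
sent_indices = (0, 1)  # entity starts in sentence 0 and ends in sentence 1

start_sentence = max(0, sent_indices[0] - n_sents)
end_sentence = min(num_sentences - 1, sent_indices[1] + n_sents)
print(start_sentence, end_sentence)  # 0 3 -- clipped at the document start
```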

View File

@@ -50,13 +50,8 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
 @Language.factory(
     "morphologizer",
     assigns=["token.morph", "token.pos"],
-    default_config={
-        "model": DEFAULT_MORPH_MODEL,
-        "overwrite": True,
-        "extend": False,
-        "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
-        "save_activations": False,
-    },
+    default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False,
+        "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, "label_smoothing": 0.0},
     default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
 )
 def make_morphologizer(
@@ -65,11 +60,11 @@ def make_morphologizer(
     name: str,
     overwrite: bool,
     extend: bool,
+    label_smoothing: float,
     scorer: Optional[Callable],
-    save_activations: bool,
 ):
-    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer,
-                         save_activations=save_activations)
+    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, label_smoothing=label_smoothing, scorer=scorer)

 def morphologizer_score(examples, **kwargs):
@@ -98,8 +93,9 @@ class Morphologizer(Tagger):
         model: Model,
         name: str = "morphologizer",
         *,
-        overwrite: bool = False,
-        extend: bool = False,
+        overwrite: bool = BACKWARD_OVERWRITE,
+        extend: bool = BACKWARD_EXTEND,
+        label_smoothing: float = 0.0,
         scorer: Optional[Callable] = morphologizer_score,
-        save_activations: bool = False,
     ):
@@ -131,6 +127,7 @@ class Morphologizer(Tagger):
             "labels_pos": {},
             "overwrite": overwrite,
             "extend": extend,
+            "label_smoothing": label_smoothing,
         }
         self.cfg = dict(sorted(cfg.items()))
         self.scorer = scorer
@@ -289,7 +286,8 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#get_loss
         """
         validate_examples(examples, "Morphologizer.get_loss")
-        loss_func = LegacySequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False,
+                                                    label_smoothing=self.cfg["label_smoothing"])
         truths = []
         for eg in examples:
             eg_truths = []
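For intuition, label smoothing replaces the hard one-hot targets of the cross-entropy loss with slightly softened ones. A numpy illustration of the common formulation (thinc's exact implementation may differ in detail):

```python
import numpy

n_labels = 4
smoothing = 0.05
one_hot = numpy.eye(n_labels)[1]  # gold label is class 1
smoothed = one_hot * (1 - smoothing) + smoothing / n_labels
print(one_hot)   # [0. 1. 0. 0.]
print(smoothed)  # [0.0125 0.9625 0.0125 0.0125]
```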

View File

@@ -1,5 +1,5 @@
-from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
-from typing import Union, Protocol, runtime_checkable
+from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast, Union
+from dataclasses import dataclass
 from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
 from thinc.api import Optimizer
 from thinc.types import Ragged, Ints2d, Floats2d
@@ -46,7 +46,36 @@ maxout_pieces = 3
 depth = 4
 """

+spancat_singlelabel_default_config = """
+[model]
+@architectures = "spacy.SpanCategorizer.v1"
+scorer = {"@layers": "Softmax.v2"}
+
+[model.reducer]
+@layers = "spacy.mean_max_reducer.v1"
+hidden_size = 128
+
+[model.tok2vec]
+@architectures = "spacy.Tok2Vec.v2"
+
+[model.tok2vec.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+width = 96
+rows = [5000, 1000, 2500, 1000]
+attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+include_static_vectors = false
+
+[model.tok2vec.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v2"
+width = ${model.tok2vec.embed.width}
+window_size = 1
+maxout_pieces = 3
+depth = 4
+"""
+
 DEFAULT_SPANCAT_MODEL = Config().from_str(spancat_default_config)["model"]
+DEFAULT_SPANCAT_SINGLELABEL_MODEL = Config().from_str(
+    spancat_singlelabel_default_config
+)["model"]

 @runtime_checkable
@@ -124,10 +153,14 @@ def make_spancat(
     max_positive: Optional[int],
     save_activations: bool,
 ) -> "SpanCategorizer":
-    """Create a SpanCategorizer component. The span categorizer consists of two
+    """Create a SpanCategorizer component and configure it for multi-label
+    classification to be able to assign multiple labels for each span.
+    The span categorizer consists of two
     parts: a suggester function that proposes candidate spans, and a labeller
     model that predicts one or more labels for each span.

+    name (str): The component instance name, used to add entries to the
+        losses during training.
     suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
         Spans are returned as a ragged array with two integer columns, for the
         start and end positions.
@@ -150,12 +183,80 @@ def make_spancat(
     """
     return SpanCategorizer(
         nlp.vocab,
-        suggester=suggester,
         model=model,
-        spans_key=spans_key,
-        threshold=threshold,
-        max_positive=max_positive,
+        suggester=suggester,
         name=name,
+        spans_key=spans_key,
+        negative_weight=None,
+        allow_overlap=True,
+        max_positive=max_positive,
+        threshold=threshold,
+        scorer=scorer,
+        add_negative_label=False,
+    )
+
+
+@Language.factory(
+    "spancat_singlelabel",
+    assigns=["doc.spans"],
+    default_config={
+        "spans_key": "sc",
+        "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
+        "negative_weight": 1.0,
+        "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
+        "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
+        "allow_overlap": True,
+    },
+    default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
+)
+def make_spancat_singlelabel(
+    nlp: Language,
+    name: str,
+    suggester: Suggester,
+    model: Model[Tuple[List[Doc], Ragged], Floats2d],
+    spans_key: str,
+    negative_weight: float,
+    allow_overlap: bool,
+    scorer: Optional[Callable],
+) -> "SpanCategorizer":
+    """Create a SpanCategorizer component and configure it for multi-class
+    classification. With this configuration each span can get at most one
+    label. The span categorizer consists of two
+    parts: a suggester function that proposes candidate spans, and a labeller
+    model that predicts one or more labels for each span.
+
+    name (str): The component instance name, used to add entries to the
+        losses during training.
+    suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
+        Spans are returned as a ragged array with two integer columns, for the
+        start and end positions.
+    model (Model[Tuple[List[Doc], Ragged], Floats2d]): A model instance that
+        is given a list of documents and (start, end) indices representing
+        candidate span offsets. The model predicts a probability for each category
+        for each span.
+    spans_key (str): Key of the doc.spans dict to save the spans under. During
+        initialization and training, the component will look for spans on the
+        reference document under the same key.
+    scorer (Optional[Callable]): The scoring method. Defaults to
+        Scorer.score_spans for the Doc.spans[spans_key] with overlapping
+        spans allowed.
+    negative_weight (float): Multiplier for the loss terms.
+        Can be used to downweight the negative samples if there are too many.
+    allow_overlap (bool): If True the data is assumed to contain overlapping spans.
+        Otherwise it produces non-overlapping spans greedily prioritizing
+        higher assigned label scores.
+    """
+    return SpanCategorizer(
+        nlp.vocab,
+        model=model,
+        suggester=suggester,
+        name=name,
+        spans_key=spans_key,
+        negative_weight=negative_weight,
+        allow_overlap=allow_overlap,
+        max_positive=1,
+        add_negative_label=True,
+        threshold=None,
         scorer=scorer,
         save_activations=save_activations,
     )
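A hedged usage sketch of the new `spancat_singlelabel` factory registered above, using only config keys that appear in this diff (real use would supply training examples rather than calling `initialize` with labels alone):

```python
import spacy

nlp = spacy.blank("en")
spancat = nlp.add_pipe(
    "spancat_singlelabel",
    config={"spans_key": "sc", "negative_weight": 1.0, "allow_overlap": True},
)
spancat.add_label("PERSON")
spancat.add_label("ORG")
nlp.initialize()
doc = nlp("Some text about ACME Inc.")
print(doc.spans["sc"])  # each predicted span carries at most one label
```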
@@ -179,6 +280,27 @@ def make_spancat_scorer():
     return spancat_score

+@dataclass
+class _Intervals:
+    """
+    Helper class to avoid storing overlapping spans.
+    """
+
+    def __init__(self):
+        self.ranges = set()
+
+    def add(self, i, j):
+        for e in range(i, j):
+            self.ranges.add(e)
+
+    def __contains__(self, rang):
+        i, j = rang
+        for e in range(i, j):
+            if e in self.ranges:
+                return True
+        return False
+
 class SpanCategorizer(TrainablePipe):
     """Pipeline component to label spans of text.
@@ -192,26 +314,44 @@ class SpanCategorizer(TrainablePipe):
         suggester: Suggester,
         name: str = "spancat",
         *,
+        add_negative_label: bool = False,
         spans_key: str = "spans",
-        threshold: float = 0.5,
+        negative_weight: Optional[float] = 1.0,
+        allow_overlap: Optional[bool] = True,
         max_positive: Optional[int] = None,
+        threshold: Optional[float] = 0.5,
         scorer: Optional[Callable] = spancat_score,
         save_activations: bool = False,
     ) -> None:
-        """Initialize the span categorizer.
+        """Initialize the multi-label or multi-class span categorizer.

         vocab (Vocab): The shared vocabulary.
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
+            For multi-class classification (single label per span) we recommend
+            using a Softmax classifier as the final layer, while for multi-label
+            classification (multiple possible labels per span) we recommend Logistic.
+        suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
+            Spans are returned as a ragged array with two integer columns, for the
+            start and end positions.
         name (str): The component instance name, used to add entries to the
             losses during training.
         spans_key (str): Key of the Doc.spans dict to save the spans under.
             During initialization and training, the component will look for
             spans on the reference document under the same key. Defaults to
             `"spans"`.
-        threshold (float): Minimum probability to consider a prediction
-            positive. Spans with a positive prediction will be saved on the Doc.
-            Defaults to 0.5.
+        add_negative_label (bool): Learn to predict a special 'negative_label'
+            when a Span is not annotated.
+        threshold (Optional[float]): Minimum probability to consider a prediction
+            positive. Defaults to 0.5. Spans with a positive prediction will be saved
+            on the Doc.
         max_positive (Optional[int]): Maximum number of labels to consider
             positive per span. Defaults to None, indicating no limit.
+        negative_weight (float): Multiplier for the loss terms.
+            Can be used to downweight the negative samples if there are too many
+            when add_negative_label is True. Otherwise it's unused.
+        allow_overlap (bool): If True the data is assumed to contain overlapping spans.
+            Otherwise it produces non-overlapping spans greedily prioritizing
+            higher assigned label scores. Only used when max_positive is 1.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_spans for the Doc.spans[spans_key] with overlapping
             spans allowed.
@@ -223,13 +363,17 @@ class SpanCategorizer(TrainablePipe):
             "spans_key": spans_key,
             "threshold": threshold,
             "max_positive": max_positive,
+            "negative_weight": negative_weight,
+            "allow_overlap": allow_overlap,
         }
         self.vocab = vocab
         self.suggester = suggester
         self.model = model
         self.name = name
         self.scorer = scorer
-        self.save_activations = save_activations
+        self.add_negative_label = add_negative_label
+        if not allow_overlap and max_positive is not None and max_positive > 1:
+            raise ValueError(Errors.E1051.format(max_positive=max_positive))

     @property
     def key(self) -> str:
@@ -239,6 +383,21 @@
         """
         return str(self.cfg["spans_key"])

+    def _allow_extra_label(self) -> None:
+        """Raise an error if the component can not add any more labels."""
+        nO = None
+        if self.model.has_dim("nO"):
+            nO = self.model.get_dim("nO")
+        elif self.model.has_ref("output_layer") and self.model.get_ref(
+            "output_layer"
+        ).has_dim("nO"):
+            nO = self.model.get_ref("output_layer").get_dim("nO")
+        if nO is not None and nO == self._n_labels:
+            if not self.is_resizable:
+                raise ValueError(
+                    Errors.E922.format(name=self.name, nO=self.model.get_dim("nO"))
+                )
+
     def add_label(self, label: str) -> int:
         """Add a new label to the pipe.
@@ -272,7 +431,28 @@
         """
         return list(self.labels)

-    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
+    @property
+    def _label_map(self) -> Dict[str, int]:
+        """RETURNS (Dict[str, int]): The label map."""
+        return {label: i for i, label in enumerate(self.labels)}
+
+    @property
+    def _n_labels(self) -> int:
+        """RETURNS (int): Number of labels."""
+        if self.add_negative_label:
+            return len(self.labels) + 1
+        else:
+            return len(self.labels)
+
+    @property
+    def _negative_label_i(self) -> Union[int, None]:
+        """RETURNS (Union[int, None]): Index of the negative label."""
+        if self.add_negative_label:
+            return len(self.label_data)
+        else:
+            return None
+
+    def predict(self, docs: Iterable[Doc]):
         """Apply the pipeline's model to a batch of docs, without modifying them.

         docs (Iterable[Doc]): The documents to predict.
@@ -313,23 +493,23 @@
         DOCS: https://spacy.io/api/spancategorizer#set_annotations
         """
-        labels = self.labels
-        indices = activations["indices"]
-        assert isinstance(indices, Ragged)
-        scores = cast(Floats2d, activations["scores"])
+        indices, scores = indices_scores
         offset = 0
         for i, doc in enumerate(docs):
             indices_i = indices[i].dataXd
-            if self.save_activations:
-                doc.activations[self.name] = {}
-                doc.activations[self.name]["indices"] = indices_i
-                doc.activations[self.name]["scores"] = scores[
-                    offset : offset + indices.lengths[i]
-                ]
-            doc.spans[self.key] = self._make_span_group(
-                doc, indices_i, scores[offset : offset + indices.lengths[i]], labels  # type: ignore[arg-type]
-            )
+            allow_overlap = cast(bool, self.cfg["allow_overlap"])
+            if self.cfg["max_positive"] == 1:
+                doc.spans[self.key] = self._make_span_group_singlelabel(
+                    doc,
+                    indices_i,
+                    scores[offset : offset + indices.lengths[i]],
+                    allow_overlap,
+                )
+            else:
+                doc.spans[self.key] = self._make_span_group_multilabel(
+                    doc,
+                    indices_i,
+                    scores[offset : offset + indices.lengths[i]],
+                )
             offset += indices.lengths[i]
@@ -390,9 +570,11 @@
         spans = Ragged(
             self.model.ops.to_numpy(spans.data), self.model.ops.to_numpy(spans.lengths)
         )
-        label_map = {label: i for i, label in enumerate(self.labels)}
         target = numpy.zeros(scores.shape, dtype=scores.dtype)
+        if self.add_negative_label:
+            negative_spans = numpy.ones((scores.shape[0]))
         offset = 0
+        label_map = self._label_map
         for i, eg in enumerate(examples):
             # Map (start, end) offset of spans to the row in the d_scores array,
             # so that we can adjust the gradient for predictions that were
@@ -409,10 +591,16 @@
                     row = spans_index[key]
                     k = label_map[gold_span.label_]
                     target[row, k] = 1.0
+                    if self.add_negative_label:
+                        # delete negative label target.
+                        negative_spans[row] = 0.0
             # The target is a flat array for all docs. Track the position
             # we're at within the flat array.
             offset += spans.lengths[i]
         target = self.model.ops.asarray(target, dtype="f")  # type: ignore
+        if self.add_negative_label:
+            negative_samples = numpy.nonzero(negative_spans)[0]
+            target[negative_samples, self._negative_label_i] = 1.0  # type: ignore
         # The target will have the values 0 (for untrue predictions) or 1
         # (for true predictions).
         # The scores should be in the range [0, 1].
@@ -421,6 +609,10 @@
         # If the prediction is 0.9 and it's false, the gradient will be
         # 0.9 (0.9 - 0.0)
         d_scores = scores - target
+        if self.add_negative_label:
+            neg_weight = cast(float, self.cfg["negative_weight"])
+            if neg_weight != 1.0:
+                d_scores[negative_samples] *= neg_weight
         loss = float((d_scores**2).sum())
         return loss, d_scores
@@ -457,7 +649,7 @@
         if subbatch:
             docs = [eg.x for eg in subbatch]
             spans = build_ngram_suggester(sizes=[1])(docs)
-            Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels))
+            Y = self.model.ops.alloc2f(spans.dataXd.shape[0], self._n_labels)
             self.model.initialize(X=(docs, spans), Y=Y)
         else:
             self.model.initialize()
@ -471,31 +663,98 @@ class SpanCategorizer(TrainablePipe):
eg.reference.spans.get(self.key, []), allow_overlap=True eg.reference.spans.get(self.key, []), allow_overlap=True
) )
-    def _make_span_group(
-        self, doc: Doc, indices: Ints2d, scores: Floats2d, labels: List[str]
-    ) -> SpanGroup:
+    def _make_span_group_multilabel(
+        self,
+        doc: Doc,
+        indices: Ints2d,
+        scores: Floats2d,
+    ) -> SpanGroup:
+        """Find the top-k labels for each span (k=max_positive)."""
         spans = SpanGroup(doc, name=self.key)
-        max_positive = self.cfg["max_positive"]
+        if scores.size == 0:
+            return spans
+        scores = self.model.ops.to_numpy(scores)
+        indices = self.model.ops.to_numpy(indices)
         threshold = self.cfg["threshold"]
+        max_positive = self.cfg["max_positive"]
         keeps = scores >= threshold
-        ranked = (scores * -1).argsort()  # type: ignore
         if max_positive is not None:
             assert isinstance(max_positive, int)
+            if self.add_negative_label:
+                negative_scores = numpy.copy(scores[:, self._negative_label_i])
+                scores[:, self._negative_label_i] = -numpy.inf
+                ranked = (scores * -1).argsort()  # type: ignore
+                scores[:, self._negative_label_i] = negative_scores
+            else:
+                ranked = (scores * -1).argsort()  # type: ignore
             span_filter = ranked[:, max_positive:]
             for i, row in enumerate(span_filter):
                 keeps[i, row] = False
-        spans.attrs["scores"] = scores[keeps].flatten()
-        indices = self.model.ops.to_numpy(indices)
-        keeps = self.model.ops.to_numpy(keeps)
+        attrs_scores = []
         for i in range(indices.shape[0]):
             start = indices[i, 0]
             end = indices[i, 1]
             for j, keep in enumerate(keeps[i]):
                 if keep:
-                    spans.append(Span(doc, start, end, label=labels[j]))
+                    if j != self._negative_label_i:
+                        spans.append(Span(doc, start, end, label=self.labels[j]))
+                        attrs_scores.append(scores[i, j])
+        spans.attrs["scores"] = numpy.array(attrs_scores)
         return spans
+    def _make_span_group_singlelabel(
+        self,
+        doc: Doc,
+        indices: Ints2d,
+        scores: Floats2d,
+        allow_overlap: bool = True,
+    ) -> SpanGroup:
+        """Find the argmax label for each span."""
+        # Handle cases when there are zero suggestions
+        if scores.size == 0:
+            return SpanGroup(doc, name=self.key)
+        scores = self.model.ops.to_numpy(scores)
+        indices = self.model.ops.to_numpy(indices)
+        predicted = scores.argmax(axis=1)
+        argmax_scores = numpy.take_along_axis(
+            scores, numpy.expand_dims(predicted, 1), axis=1
+        )
+        keeps = numpy.ones(predicted.shape, dtype=bool)
+        # Remove samples where the negative label is the argmax.
+        if self.add_negative_label:
+            keeps = numpy.logical_and(keeps, predicted != self._negative_label_i)
+        # Filter samples according to threshold.
+        threshold = self.cfg["threshold"]
+        if threshold is not None:
+            keeps = numpy.logical_and(keeps, (argmax_scores >= threshold).squeeze())
+        # Sort spans according to argmax probability
+        if not allow_overlap:
+            # Get the probabilities
+            sort_idx = (argmax_scores.squeeze() * -1).argsort()
+            argmax_scores = argmax_scores[sort_idx]
+            predicted = predicted[sort_idx]
+            indices = indices[sort_idx]
+            keeps = keeps[sort_idx]
+        seen = _Intervals()
+        spans = SpanGroup(doc, name=self.key)
+        attrs_scores = []
+        for i in range(indices.shape[0]):
+            if not keeps[i]:
+                continue
+            label = predicted[i]
+            start = indices[i, 0]
+            end = indices[i, 1]
+            if not allow_overlap:
+                if (start, end) in seen:
+                    continue
+                else:
+                    seen.add(start, end)
+            attrs_scores.append(argmax_scores[i])
+            spans.append(Span(doc, start, end, label=self.labels[label]))
+        spans.attrs["scores"] = numpy.array(attrs_scores)
+        return spans
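The single-label path above reduces to an argmax per span, a threshold check, and (optionally) a greedy overlap filter. A rough standalone sketch with made-up scores; the `_Intervals` helper is approximated here with a plain list of kept `(start, end)` pairs:

```python
import numpy

indices = numpy.array([[0, 1], [1, 2], [0, 2]])  # candidate (start, end) offsets
scores = numpy.array([[0.2, 0.4], [0.1, 0.6], [0.8, 0.7]], dtype="f")
labels = ["Thing", "City"]
threshold = 0.5

predicted = scores.argmax(axis=1)
argmax_scores = numpy.take_along_axis(
    scores, numpy.expand_dims(predicted, 1), axis=1
).squeeze(1)
keeps = argmax_scores >= threshold

# Visit spans by descending score and skip any span that overlaps one
# already kept (greedy, highest-confidence-first).
taken = []
for i in (argmax_scores * -1).argsort():
    if not keeps[i]:
        continue
    start, end = indices[i]
    if any(start < e and s < end for s, e in taken):
        continue
    taken.append((start, end))
    print((start, end), labels[predicted[i]], float(argmax_scores[i]))
```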

View File

@@ -47,13 +47,7 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
 @Language.factory(
     "tagger",
     assigns=["token.tag"],
-    default_config={
-        "model": DEFAULT_TAGGER_MODEL,
-        "overwrite": False,
-        "scorer": {"@scorers": "spacy.tagger_scorer.v1"},
-        "neg_prefix": "!",
-        "save_activations": False,
-    },
+    default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!", "label_smoothing": 0.0},
     default_score_weights={"tag_acc": 1.0},
 )
 def make_tagger(
@@ -63,7 +57,7 @@ def make_tagger(
     overwrite: bool,
     scorer: Optional[Callable],
     neg_prefix: str,
-    save_activations: bool,
+    label_smoothing: float,
 ):
     """Construct a part-of-speech tagger component.
@@ -72,8 +66,7 @@ def make_tagger(
     in size, and be normalized as probabilities (all scores between 0 and 1,
     with the rows summing to 1).
     """
-    return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix,
-                  save_activations=save_activations)
+    return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, label_smoothing=label_smoothing)
 def tagger_score(examples, **kwargs):
@@ -99,7 +92,7 @@ class Tagger(TrainablePipe):
         overwrite=False,
         scorer=tagger_score,
         neg_prefix="!",
-        save_activations: bool = False,
+        label_smoothing=0.0,
     ):
         """Initialize a part-of-speech tagger.
@@ -118,7 +111,7 @@ class Tagger(TrainablePipe):
         self.model = model
         self.name = name
         self._rehearsal_model = None
-        cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix}
+        cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix, "label_smoothing": label_smoothing}
         self.cfg = dict(sorted(cfg.items()))
         self.scorer = scorer
         self.save_activations = save_activations
@@ -294,7 +287,7 @@ class Tagger(TrainablePipe):
         DOCS: https://spacy.io/api/tagger#get_loss
         """
         validate_examples(examples, "Tagger.get_loss")
-        loss_func = LegacySequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"])
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"], label_smoothing=self.cfg["label_smoothing"])
         # Convert empty tag "" to missing value None so that both misaligned
         # tokens and tokens with missing annotation have the default missing
         # value None.

View File

@@ -689,21 +689,32 @@ def test_span_group_copy(doc):
     assert len(doc_copy.spans["test"]) == 2
-@pytest.mark.issue(11113)
-def test_span_ent_id(en_tokenizer):
-    doc = en_tokenizer("a b c d")
-    doc.ents = [Span(doc, 1, 3, label="A", span_id="ID0")]
-    span = doc.ents[0]
-    assert doc[1].ent_id_ == "ID0"
-    # setting Span.id sets Token.ent_id
-    span.id_ = "ID1"
-    doc.ents = [span]
-    assert doc.ents[0].ent_id_ == "ID1"
-    assert doc[1].ent_id_ == "ID1"
-    # Span.ent_id is an alias of Span.id
-    span.ent_id_ = "ID2"
-    doc.ents = [span]
-    assert doc.ents[0].ent_id_ == "ID2"
-    assert doc[1].ent_id_ == "ID2"
+def test_for_partial_ent_sents():
+    """Spans may be associated with multiple sentences. These .sents should
+    always be complete, not partial, sentences, which this tests for.
+    """
+    doc = Doc(
+        English().vocab,
+        words=["Mahler's", "Symphony", "No.", "8", "was", "beautiful."],
+        sent_starts=[1, 0, 0, 1, 0, 0],
+    )
+    doc.set_ents([Span(doc, 1, 4, "WORK")])
+    # The specified entity is associated with both sentences in this doc, so we
+    # expect all sentences in the doc to be equal to the sentences referenced
+    # in ent.sents.
+    for doc_sent, ent_sent in zip(doc.sents, doc.ents[0].sents):
+        assert doc_sent == ent_sent
+def test_for_no_ent_sents():
+    """Span.sents() should set .sents correctly, even if the Span in question
+    is trailing and doesn't form a full sentence.
+    """
+    doc = Doc(
+        English().vocab,
+        words=["This", "is", "a", "test.", "ENTITY"],
+        sent_starts=[1, 0, 0, 0, 1],
+    )
+    doc.set_ents([Span(doc, 4, 5, "WORK")])
+    sents = list(doc.ents[0].sents)
+    assert len(sents) == 1
+    assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY"

View File

@@ -9,6 +9,8 @@ from spacy.lang.en import English
 from spacy.lang.it import Italian
 from spacy.language import Language
 from spacy.lookups import Lookups
+from spacy.pipeline import EntityRecognizer
+from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.pipeline._parser_internals.ner import BiluoPushDown
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.tokens import Doc, Span
@@ -17,8 +19,6 @@ from thinc.api import fix_random_seed
 import logging
 from ..util import make_tempdir
-from ...pipeline import EntityRecognizer
-from ...pipeline.ner import DEFAULT_NER_MODEL
 TRAIN_DATA = [
     ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),

View File

@@ -10,13 +10,11 @@ from spacy.lang.en import English
 from spacy.training import Example
 from spacy.tokens import Doc
 from spacy.vocab import Vocab
+from spacy.pipeline import DependencyParser
+from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
+from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 from spacy import util, registry
 from thinc.api import fix_random_seed
-from ...pipeline import DependencyParser
-from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL
 from ..util import apply_transition_sequence, make_tempdir
-from ...pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
 TRAIN_DATA = [
     (

View File

@@ -1,10 +1,10 @@
-from typing import Callable, Iterable, Dict, Any, cast
+from typing import Callable, Iterable, Dict, Any, Tuple
 import pytest
 from numpy.testing import assert_equal
 from thinc.types import Ragged
-from spacy import registry, util
+from spacy import registry, util, Language
 from spacy.attrs import ENT_KB_ID
 from spacy.compat import pickle
 from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
@@ -108,18 +108,23 @@ def test_issue7065():
 @pytest.mark.issue(7065)
-def test_issue7065_b():
-    # Test that the NEL doesn't crash when an entity crosses a sentence boundary
+@pytest.mark.parametrize("entity_in_first_sentence", [True, False])
+def test_sentence_crossing_ents(entity_in_first_sentence: bool):
+    """Tests if NEL crashes if entities cross sentence boundaries and the first
+    associated sentence doesn't have an entity.
+    entity_in_prior_sentence (bool): Whether to include an entity in the first
+    sentence associated with the sentence-crossing entity.
+    """
     nlp = English()
     vector_length = 3
+    nlp.add_pipe("sentencizer")
     text = "Mahler 's Symphony No. 8 was beautiful."
-    entities = [(0, 6, "PERSON"), (10, 24, "WORK")]
-    links = {
-        (0, 6): {"Q7304": 1.0, "Q270853": 0.0},
-        (10, 24): {"Q7304": 0.0, "Q270853": 1.0},
-    }
-    sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
+    entities = [(10, 24, "WORK")]
+    links = {(10, 24): {"Q7304": 0.0, "Q270853": 1.0}}
+    if entity_in_first_sentence:
+        entities.append((0, 6, "PERSON"))
+        links[(0, 6)] = {"Q7304": 1.0, "Q270853": 0.0}
+    sent_starts = [1, -1, 0, 0, 0, 1, 0, 0, 0]
     doc = nlp(text)
     example = Example.from_dict(
         doc, {"entities": entities, "links": links, "sent_starts": sent_starts}
@@ -145,31 +150,14 @@ def test_sentence_crossing_ents(entity_in_first_sentence: bool):
     # Create the Entity Linker component and add it to the pipeline
     entity_linker = nlp.add_pipe("entity_linker", last=True)
-    entity_linker.set_kb(create_kb)
+    entity_linker.set_kb(create_kb)  # type: ignore
     # train the NEL pipe
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
     for i in range(2):
-        losses = {}
-        nlp.update(train_examples, sgd=optimizer, losses=losses)
-    # Add a custom rule-based component to mimick NER
-    patterns = [
-        {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]},
-        {
-            "label": "WORK",
-            "pattern": [
-                {"LOWER": "symphony"},
-                {"LOWER": "no"},
-                {"LOWER": "."},
-                {"LOWER": "8"},
-            ],
-        },
-    ]
-    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
-    ruler.add_patterns(patterns)
-    # test the trained model - this should not throw E148
-    doc = nlp(text)
-    assert doc
+        nlp.update(train_examples, sgd=optimizer)
+    # This shouldn't crash.
+    entity_linker.predict([example.reference])  # type: ignore
 def test_no_entities():

View File

@@ -1,6 +1,6 @@
 from typing import cast
 import pytest
-from numpy.testing import assert_equal
+from numpy.testing import assert_equal, assert_almost_equal
 from spacy import util
 from spacy.training import Example
@@ -21,6 +21,8 @@ def test_label_types():
         morphologizer.add_label(9)
+TAGS = ["Feat=N", "Feat=V", "Feat=J"]
 TRAIN_DATA = [
     (
         "I like green eggs",
@@ -34,6 +36,29 @@ TRAIN_DATA = [
 ]
+def test_label_smoothing():
+    nlp = Language()
+    morph_no_ls = nlp.add_pipe("morphologizer", "no_label_smoothing")
+    morph_ls = nlp.add_pipe(
+        "morphologizer", "label_smoothing", config=dict(label_smoothing=0.05)
+    )
+    train_examples = []
+    losses = {}
+    for tag in TAGS:
+        morph_no_ls.add_label(tag)
+        morph_ls.add_label(tag)
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    nlp.initialize(get_examples=lambda: train_examples)
+    tag_scores, bp_tag_scores = morph_ls.model.begin_update(
+        [eg.predicted for eg in train_examples]
+    )
+    no_ls_grads = morph_no_ls.get_loss(train_examples, tag_scores)[1][0]
+    ls_grads = morph_ls.get_loss(train_examples, tag_scores)[1][0]
+    assert_almost_equal(ls_grads / no_ls_grads, 0.94285715)
 def test_no_label():
     nlp = Language()
     nlp.add_pipe("morphologizer")

View File

@@ -15,6 +15,8 @@ OPS = get_current_ops()
 SPAN_KEY = "labeled_spans"
+SPANCAT_COMPONENTS = ["spancat", "spancat_singlelabel"]
 TRAIN_DATA = [
     ("Who is Shaka Khan?", {"spans": {SPAN_KEY: [(7, 17, "PERSON")]}}),
     (
@@ -41,38 +43,42 @@ def make_examples(nlp, data=TRAIN_DATA):
     return train_examples
-def test_no_label():
+@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
+def test_no_label(name):
     nlp = Language()
-    nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
+    nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
     with pytest.raises(ValueError):
         nlp.initialize()
-def test_no_resize():
+@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
+def test_no_resize(name):
     nlp = Language()
-    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
+    spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
     spancat.add_label("Thing")
     spancat.add_label("Phrase")
     assert spancat.labels == ("Thing", "Phrase")
     nlp.initialize()
-    assert spancat.model.get_dim("nO") == 2
+    assert spancat.model.get_dim("nO") == spancat._n_labels
     # this throws an error because the spancat can't be resized after initialization
     with pytest.raises(ValueError):
         spancat.add_label("Stuff")
-def test_implicit_labels():
+@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
+def test_implicit_labels(name):
     nlp = Language()
-    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
+    spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
     assert len(spancat.labels) == 0
     train_examples = make_examples(nlp)
     nlp.initialize(get_examples=lambda: train_examples)
     assert spancat.labels == ("PERSON", "LOC")
-def test_explicit_labels():
+@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
+def test_explicit_labels(name):
     nlp = Language()
-    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
+    spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
     assert len(spancat.labels) == 0
     spancat.add_label("PERSON")
     spancat.add_label("LOC")
@@ -102,13 +108,13 @@ def test_doc_gc():
     # XXX This fails with length 0 sometimes
     assert len(spangroup) > 0
     with pytest.raises(RuntimeError):
-        span = spangroup[0]
+        spangroup[0]
 @pytest.mark.parametrize(
     "max_positive,nr_results", [(None, 4), (1, 2), (2, 3), (3, 4), (4, 4)]
 )
-def test_make_spangroup(max_positive, nr_results):
+def test_make_spangroup_multilabel(max_positive, nr_results):
     fix_random_seed(0)
     nlp = Language()
     spancat = nlp.add_pipe(
@@ -120,10 +126,12 @@ def test_make_spangroup_multilabel(max_positive, nr_results):
     indices = ngram_suggester([doc])[0].dataXd
     assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
     labels = ["Thing", "City", "Person", "GreatCity"]
+    for label in labels:
+        spancat.add_label(label)
     scores = numpy.asarray(
         [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"
     )
-    spangroup = spancat._make_span_group(doc, indices, scores, labels)
+    spangroup = spancat._make_span_group_multilabel(doc, indices, scores)
     assert len(spangroup) == nr_results
     # first span is always the second token "London"
@@ -154,6 +162,130 @@ def test_make_spangroup_multilabel(max_positive, nr_results):
     assert_almost_equal(0.9, spangroup.attrs["scores"][-1], 5)
+@pytest.mark.parametrize(
+    "threshold,allow_overlap,nr_results",
+    [(0.05, True, 3), (0.05, False, 1), (0.5, True, 2), (0.5, False, 1)],
+)
+def test_make_spangroup_singlelabel(threshold, allow_overlap, nr_results):
+    fix_random_seed(0)
+    nlp = Language()
+    spancat = nlp.add_pipe(
+        "spancat",
+        config={
+            "spans_key": SPAN_KEY,
+            "threshold": threshold,
+            "max_positive": 1,
+        },
+    )
+    doc = nlp.make_doc("Greater London")
+    ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2])
+    indices = ngram_suggester([doc])[0].dataXd
+    assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
+    labels = ["Thing", "City", "Person", "GreatCity"]
+    for label in labels:
+        spancat.add_label(label)
+    scores = numpy.asarray(
+        [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"
+    )
+    spangroup = spancat._make_span_group_singlelabel(
+        doc, indices, scores, allow_overlap
+    )
+    if threshold > 0.4:
+        if allow_overlap:
+            assert spangroup[0].text == "London"
+            assert spangroup[0].label_ == "City"
+            assert_almost_equal(0.6, spangroup.attrs["scores"][0], 5)
+            assert spangroup[1].text == "Greater London"
+            assert spangroup[1].label_ == "GreatCity"
+            assert_almost_equal(0.9, spangroup.attrs["scores"][1], 5)
+        else:
+            assert spangroup[0].text == "Greater London"
+            assert spangroup[0].label_ == "GreatCity"
+            assert_almost_equal(0.9, spangroup.attrs["scores"][0], 5)
+    else:
+        if allow_overlap:
+            assert spangroup[0].text == "Greater"
+            assert spangroup[0].label_ == "City"
+            assert spangroup[1].text == "London"
+            assert spangroup[1].label_ == "City"
+            assert spangroup[2].text == "Greater London"
+            assert spangroup[2].label_ == "GreatCity"
+        else:
+            assert spangroup[0].text == "Greater London"
+def test_make_spangroup_negative_label():
+    fix_random_seed(0)
+    nlp_single = Language()
+    nlp_multi = Language()
+    spancat_single = nlp_single.add_pipe(
+        "spancat",
+        config={
+            "spans_key": SPAN_KEY,
+            "threshold": 0.1,
+            "max_positive": 1,
+        },
+    )
+    spancat_multi = nlp_multi.add_pipe(
+        "spancat",
+        config={
+            "spans_key": SPAN_KEY,
+            "threshold": 0.1,
+            "max_positive": 2,
+        },
+    )
+    spancat_single.add_negative_label = True
+    spancat_multi.add_negative_label = True
+    doc = nlp_single.make_doc("Greater London")
+    labels = ["Thing", "City", "Person", "GreatCity"]
+    for label in labels:
+        spancat_multi.add_label(label)
+        spancat_single.add_label(label)
+    ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2])
+    indices = ngram_suggester([doc])[0].dataXd
+    assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
+    scores = numpy.asarray(
+        [
+            [0.2, 0.4, 0.3, 0.1, 0.1],
+            [0.1, 0.6, 0.2, 0.4, 0.9],
+            [0.8, 0.7, 0.3, 0.9, 0.1],
+        ],
+        dtype="f",
+    )
+    spangroup_multi = spancat_multi._make_span_group_multilabel(doc, indices, scores)
+    spangroup_single = spancat_single._make_span_group_singlelabel(doc, indices, scores)
+    assert len(spangroup_single) == 2
+    assert spangroup_single[0].text == "Greater"
+    assert spangroup_single[0].label_ == "City"
+    assert_almost_equal(0.4, spangroup_single.attrs["scores"][0], 5)
+    assert spangroup_single[1].text == "Greater London"
+    assert spangroup_single[1].label_ == "GreatCity"
+    assert_almost_equal(0.9, spangroup_single.attrs["scores"][1], 5)
+    assert len(spangroup_multi) == 6
+    assert spangroup_multi[0].text == "Greater"
+    assert spangroup_multi[0].label_ == "City"
+    assert_almost_equal(0.4, spangroup_multi.attrs["scores"][0], 5)
+    assert spangroup_multi[1].text == "Greater"
+    assert spangroup_multi[1].label_ == "Person"
+    assert_almost_equal(0.3, spangroup_multi.attrs["scores"][1], 5)
+    assert spangroup_multi[2].text == "London"
+    assert spangroup_multi[2].label_ == "City"
+    assert_almost_equal(0.6, spangroup_multi.attrs["scores"][2], 5)
+    assert spangroup_multi[3].text == "London"
+    assert spangroup_multi[3].label_ == "GreatCity"
+    assert_almost_equal(0.4, spangroup_multi.attrs["scores"][3], 5)
+    assert spangroup_multi[4].text == "Greater London"
+    assert spangroup_multi[4].label_ == "Thing"
+    assert_almost_equal(0.8, spangroup_multi.attrs["scores"][4], 5)
+    assert spangroup_multi[5].text == "Greater London"
+    assert spangroup_multi[5].label_ == "GreatCity"
+    assert_almost_equal(0.9, spangroup_multi.attrs["scores"][5], 5)
 def test_ngram_suggester(en_tokenizer):
     # test different n-gram lengths
     for size in [1, 2, 3]:
@@ -371,9 +503,9 @@ def test_overfitting_IO_overlapping():
     assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"}
-def test_zero_suggestions():
+@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
+def test_zero_suggestions(name):
     # Test with a suggester that can return 0 suggestions
     @registry.misc("test_mixed_zero_suggester")
     def make_mixed_zero_suggester():
         def mixed_zero_suggester(docs, *, ops=None):
@@ -400,7 +532,7 @@ def test_zero_suggestions(name):
     fix_random_seed(0)
     nlp = English()
     spancat = nlp.add_pipe(
-        "spancat",
+        name,
         config={
             "suggester": {"@misc": "test_mixed_zero_suggester"},
             "spans_key": SPAN_KEY,
@@ -408,7 +540,7 @@ def test_zero_suggestions(name):
     )
     train_examples = make_examples(nlp)
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
-    assert spancat.model.get_dim("nO") == 2
+    assert spancat.model.get_dim("nO") == spancat._n_labels
     assert set(spancat.labels) == {"LOC", "PERSON"}
     nlp.update(train_examples, sgd=optimizer)
@@ -424,9 +556,10 @@ def test_zero_suggestions(name):
     list(nlp.pipe(["", "one", "three three three"]))
-def test_set_candidates():
+@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
+def test_set_candidates(name):
     nlp = Language()
-    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
+    spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
     train_examples = make_examples(nlp)
     nlp.initialize(get_examples=lambda: train_examples)
     texts = [

View File

@@ -1,6 +1,6 @@
 from typing import cast
 import pytest
-from numpy.testing import assert_equal
+from numpy.testing import assert_equal, assert_almost_equal
 from spacy.attrs import TAG
 from spacy import util
@@ -71,6 +71,29 @@ PARTIAL_DATA = [
 ]
+def test_label_smoothing():
+    nlp = Language()
+    tagger_no_ls = nlp.add_pipe("tagger", "no_label_smoothing")
+    tagger_ls = nlp.add_pipe(
+        "tagger", "label_smoothing", config=dict(label_smoothing=0.05)
+    )
+    train_examples = []
+    losses = {}
+    for tag in TAGS:
+        tagger_no_ls.add_label(tag)
+        tagger_ls.add_label(tag)
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    nlp.initialize(get_examples=lambda: train_examples)
+    tag_scores, bp_tag_scores = tagger_ls.model.begin_update(
+        [eg.predicted for eg in train_examples]
+    )
+    no_ls_grads = tagger_no_ls.get_loss(train_examples, tag_scores)[1][0]
+    ls_grads = tagger_ls.get_loss(train_examples, tag_scores)[1][0]
+    assert_almost_equal(ls_grads / no_ls_grads, 0.925)
 def test_no_label():
     nlp = Language()
     nlp.add_pipe("tagger")

View File

@@ -2,7 +2,6 @@ import os
 import math
 from collections import Counter
 from typing import Tuple, List, Dict, Any
-import pkg_resources
 import time
 from pathlib import Path
@@ -29,6 +28,7 @@ from spacy.cli.debug_data import _print_span_characteristics
 from spacy.cli.debug_data import _get_spans_length_freq_dist
 from spacy.cli.download import get_compatibility, get_version
 from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
+from spacy.cli.init_pipeline import _init_labels
 from spacy.cli.package import get_third_party_dependencies
 from spacy.cli.package import _is_permitted_package_name
 from spacy.cli.project.remote_storage import RemoteStorage
@@ -47,7 +47,6 @@ from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
 from spacy.training.converters import iob_to_docs
 from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config
-from ..cli.init_pipeline import _init_labels
 from .util import make_tempdir
@@ -553,7 +552,14 @@ def test_parse_cli_overrides():
 @pytest.mark.parametrize("lang", ["en", "nl"])
 @pytest.mark.parametrize(
-    "pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]]
+    "pipeline",
+    [
+        ["tagger", "parser", "ner"],
+        [],
+        ["ner", "textcat", "sentencizer"],
+        ["morphologizer", "spancat", "entity_linker"],
+        ["spancat_singlelabel", "textcat_multilabel"],
+    ],
 )
 @pytest.mark.parametrize("optimize", ["efficiency", "accuracy"])
 @pytest.mark.parametrize("pretraining", [True, False])
@@ -1126,6 +1132,7 @@ def test_cli_find_threshold(capsys):
 )
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
 @pytest.mark.parametrize(
     "reqs,output",
     [
@@ -1158,6 +1165,8 @@ def test_cli_find_threshold(capsys):
     ],
 )
 def test_project_check_requirements(reqs, output):
+    import pkg_resources
+
     # excessive guard against unlikely package name
     try:
         pkg_resources.require("spacyunknowndoesnotexist12345")

View File

@@ -5,10 +5,18 @@ import srsly
 from typer.testing import CliRunner
 from spacy.tokens import DocBin, Doc
-from spacy.cli._util import app
+from spacy.cli._util import app, get_git_version
 from .util import make_tempdir, normalize_whitespace
+def has_git():
+    try:
+        get_git_version()
+        return True
+    except RuntimeError:
+        return False
 def test_convert_auto():
     with make_tempdir() as d_in, make_tempdir() as d_out:
         for f in ["data1.iob", "data2.iob", "data3.iob"]:
@@ -181,6 +189,7 @@ def test_project_run(project_dir):
     assert "okokok" in result.stdout
+@pytest.mark.skipif(not has_git(), reason="git not installed")
 @pytest.mark.parametrize(
     "options",
     [

View File

@@ -275,6 +275,20 @@ def test_displacy_parse_deps(en_vocab):
         {"start": 2, "end": 3, "label": "det", "dir": "left"},
         {"start": 1, "end": 3, "label": "attr", "dir": "right"},
     ]
+    # Test that displacy.parse_deps converts Span to Doc
+    deps = displacy.parse_deps(doc[:])
+    assert isinstance(deps, dict)
+    assert deps["words"] == [
+        {"lemma": None, "text": words[0], "tag": pos[0]},
+        {"lemma": None, "text": words[1], "tag": pos[1]},
+        {"lemma": None, "text": words[2], "tag": pos[2]},
+        {"lemma": None, "text": words[3], "tag": pos[3]},
+    ]
+    assert deps["arcs"] == [
+        {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
+        {"start": 2, "end": 3, "label": "det", "dir": "left"},
+        {"start": 1, "end": 3, "label": "attr", "dir": "right"},
+    ]
 def test_displacy_invalid_arcs():

View File

@@ -2,17 +2,19 @@ from pathlib import Path
 import numpy as np
 import pytest
 import srsly
-from spacy.vocab import Vocab
-from thinc.api import Config
+from thinc.api import Config, get_current_ops
+from spacy import util
+from spacy.lang.en import English
+from spacy.training.initialize import init_nlp
+from spacy.training.loop import train
+from spacy.training.pretrain import pretrain
+from spacy.tokens import Doc, DocBin
+from spacy.language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH
+from spacy.ml.models.multi_task import create_pretrain_vectors
+from spacy.vectors import Vectors
+from spacy.vocab import Vocab
 from ..util import make_tempdir
-from ... import util
-from ...lang.en import English
-from ...training.initialize import init_nlp
-from ...training.loop import train
-from ...training.pretrain import pretrain
-from ...tokens import Doc, DocBin
-from ...language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH
 pretrain_string_listener = """
 [nlp]
@@ -346,3 +348,26 @@ def write_vectors_model(tmp_dir):
     nlp = English(vocab)
     nlp.to_disk(nlp_path)
     return str(nlp_path)
+def test_pretrain_default_vectors():
+    nlp = English()
+    nlp.add_pipe("tok2vec")
+    nlp.initialize()
+    # default vectors are supported
+    nlp.vocab.vectors = Vectors(shape=(10, 10))
+    create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model)
+    # floret vectors are supported
+    nlp.vocab.vectors = Vectors(
+        data=get_current_ops().xp.zeros((10, 10)), mode="floret", hash_count=1
+    )
+    create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model)
+    # error for no vectors
+    with pytest.raises(ValueError, match="E875"):
+        nlp.vocab.vectors = Vectors()
+        create_pretrain_vectors(1, 1, "cosine")(
+            nlp.vocab, nlp.get_pipe("tok2vec").model
+        )

View File

@@ -494,10 +494,12 @@ cdef class Span:
                 start = i
             if start >= self.end:
                 break
-        if start < self.end:
-            spans.append(Span(self.doc, start, self.end))
-        return tuple(spans)
+            elif i == self.doc.length - 1:
+                yield Span(self.doc, start, self.doc.length)
+        # Ensure that trailing parts of the Span instance are included in last element of .sents.
+        if start == self.doc.length - 1:
+            yield Span(self.doc, start, self.doc.length)
     @property
     def ents(self):
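The behavioral change is easiest to see on a trailing entity that is not a full sentence; a small sketch mirroring the tests earlier in this diff:

```python
from spacy.lang.en import English
from spacy.tokens import Doc, Span

doc = Doc(
    English().vocab,
    words=["This", "is", "a", "test.", "ENTITY"],
    sent_starts=[1, 0, 0, 0, 1],
)
doc.set_ents([Span(doc, 4, 5, "WORK")])
# .sents now yields the complete trailing sentence for the entity
assert [s.text for s in doc.ents[0].sents] == ["ENTITY"]
```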

View File

@@ -1254,7 +1254,7 @@ be provided.
 > ```
 | Name                     | Description |
 | ------------------------ | ----------- |
 | `model`                  | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
 | `data_path`              | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ |
 | `pipe_name`              | Name of pipe to examine thresholds for. ~~str (positional)~~ |
@@ -1264,7 +1264,7 @@ be provided.
 | `--code`, `-c`           | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
 | `--gpu-id`, `-g`         | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
 | `--gold-preproc`, `-G`   | Use gold preprocessing. ~~bool (flag)~~ |
-| `--silent`, `-V`, `-VV`  | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
+| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~ |
 | `--help`, `-h`           | Show help message and available arguments. ~~bool (flag)~~ |
 ## assemble {id="assemble",tag="command"}

View File

@@ -64,7 +64,7 @@ details on the architectures and their arguments and hyperparameters.
 >     config={
 >         "model": DEFAULT_COREF_MODEL,
 >         "span_cluster_prefix": DEFAULT_CLUSTER_PREFIX,
->     },
+>     }
 > nlp.add_pipe("experimental_coref", config=config)
 > ```

View File

@@ -43,12 +43,12 @@ architectures and their arguments and hyperparameters.
 > ```
 | Setting | Description |
 | ------- | ----------- |
 | `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
-| `overwrite` <Tag variant="new">3.2</Tag> | Whether the values of existing features are overwritten. Defaults to `False`. ~~bool~~ |
+| `overwrite` <Tag variant="new">3.2</Tag> | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ |
 | `extend` <Tag variant="new">3.2</Tag> | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ |
 | `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
-| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ |
+| `label_smoothing` <Tag variant="new">3.6</Tag> | [Label smoothing](https://arxiv.org/abs/1906.02629) factor. Defaults to `0.0`. ~~float~~ |
 ```python
 %%GITHUB_SPACY/spacy/pipeline/morphologizer.pyx
 ```
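As a quick usage sketch for the new morphologizer setting (a blank pipeline; the factor `0.05` is an arbitrary illustrative value):

```python
import spacy

nlp = spacy.blank("en")
# label_smoothing is read from the component config (v3.6+)
morphologizer = nlp.add_pipe("morphologizer", config={"label_smoothing": 0.05})
```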

View File

@@ -13,8 +13,16 @@ A span categorizer consists of two parts: a [suggester function](#suggesters)
 that proposes candidate spans, which may or may not overlap, and a labeler model
 that predicts zero or more labels for each candidate.
-Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc.
-Individual span scores can be found in `spangroup.attrs["scores"]`.
+This component comes in two forms: `spancat` and `spancat_singlelabel` (added in
+spaCy v3.5.1). When you need to perform multi-label classification on your
+spans, use `spancat`. The `spancat` component uses a `Logistic` layer where the
+output class probabilities are independent for each class. However, if you need
+to predict at most one true class for a span, then use `spancat_singlelabel`.
+It uses a `Softmax` layer and treats the task as a multi-class problem.
+Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc
+under `doc.spans[spans_key]`, where `spans_key` is a component config setting.
+Individual span scores are stored in `doc.spans[spans_key].attrs["scores"]`.
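As a quick illustration of that choice (a minimal sketch; the label and key names are illustrative):

```python
import spacy

nlp = spacy.blank("en")
# Multi-label: each span may receive several independent labels.
spancat = nlp.add_pipe("spancat", config={"spans_key": "sc"})
# Single-label alternative (v3.5.1+): at most one label per span.
# spancat = nlp.add_pipe("spancat_singlelabel", config={"spans_key": "sc"})
spancat.add_label("PERSON")
```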
 ## Assigned Attributes {id="assigned-attributes"}
@@ -22,7 +30,9 @@ Predictions will be saved to `Doc.spans[spans_key]` as a
 [`SpanGroup`](/api/spangroup). The scores for the spans in the `SpanGroup` will
 be saved in `SpanGroup.attrs["scores"]`.
-`spans_key` defaults to `"sc"`, but can be passed as a parameter.
+`spans_key` defaults to `"sc"`, but can be passed as a parameter. The `spancat`
+component will overwrite any existing spans under the spans key
+`doc.spans[spans_key]`.
 | Location | Value |
 | -------------------------------------- | -------------------------------------------------------- |
@@ -38,7 +48,7 @@ how the component should be configured. You can override its settings via the
 [model architectures](/api/architectures) documentation for details on the
 architectures and their arguments and hyperparameters.
-> #### Example
+> #### Example (spancat)
 >
 > ```python
 > from spacy.pipeline.spancat import DEFAULT_SPANCAT_MODEL
@@ -52,15 +62,33 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("spancat", config=config)
 > ```
+> #### Example (spancat_singlelabel)
+>
+> ```python
+> from spacy.pipeline.spancat import DEFAULT_SPANCAT_SINGLELABEL_MODEL
+> config = {
+>     "threshold": 0.5,
+>     "spans_key": "labeled_spans",
+>     "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
+>     "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
+>     # Additional spancat_singlelabel parameters
+>     "negative_weight": 0.8,
+>     "allow_overlap": True,
+> }
+> nlp.add_pipe("spancat_singlelabel", config=config)
+> ```
 | Setting | Description |
 | ------- | ----------- |
 | `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
 | `model` | A model instance that is given a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
 | `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
-| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ |
+| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Meant to be used in combination with the multi-class `spancat` component with a `Logistic` scoring layer. Defaults to `0.5`. ~~float~~ |
-| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ |
+| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. Meant to be used together with the `spancat` component and defaults to 0 with `spancat_singlelabel`. ~~Optional[int]~~ |
 | `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
-| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"indices"` and `"scores"`. ~~Union[bool, list[str]]~~ |
+| `add_negative_label` <Tag variant="new">3.5.1</Tag> | Whether to learn to predict a special negative label for each unannotated `Span`. This should be `True` when using a `Softmax` classifier layer, and so it's `True` by default for `spancat_singlelabel`. Spans with negative labels and their scores are not stored as annotations. ~~bool~~ |
+| `negative_weight` <Tag variant="new">3.5.1</Tag> | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~ |
+| `allow_overlap` <Tag variant="new">3.5.1</Tag> | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~ |
 ```python
 %%GITHUB_SPACY/spacy/pipeline/spancat.py
 ```
@@ -72,6 +100,7 @@ architectures and their arguments and hyperparameters.
 >
 > ```python
 > # Construction via add_pipe with default model
+> # Replace 'spancat' with 'spancat_singlelabel' for exclusive classes
 > spancat = nlp.add_pipe("spancat")
 >
 > # Construction via add_pipe with custom model
@@ -88,7 +117,7 @@ shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#create_pipe).
 | Name | Description |
 | ---- | ----------- |
 | `vocab` | The shared vocabulary. ~~Vocab~~ |
 | `model` | A model instance that is given a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
 | `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
@@ -97,6 +126,9 @@ shortcut for this and instantiate the component using its string name and
 | `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
 | `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ |
 | `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ |
+| `allow_overlap` <Tag variant="new">3.5.1</Tag> | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~ |
+| `add_negative_label` <Tag variant="new">3.5.1</Tag> | Whether to learn to predict a special negative label for each unannotated `Span`. This should be `True` when using a `Softmax` classifier layer, and so it's `True` by default for `spancat_singlelabel`. Spans with negative labels and their scores are not stored as annotations. ~~bool~~ |
+| `negative_weight` <Tag variant="new">3.5.1</Tag> | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~ |
 ## SpanCategorizer.\_\_call\_\_ {id="call",tag="method"}

View File

@@ -8,6 +8,13 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of
 integer IDs. This ensures that strings always map to the same ID, even from
 different `StringStores`.
+<Infobox variant="warning">
+Note that a `StringStore` instance is not static. It increases in size as texts
+with new tokens are processed.
+</Infobox>
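A quick sketch of that growth:

```python
from spacy.strings import StringStore

stringstore = StringStore(["apple"])
n_before = len(stringstore)
stringstore.add("orange")  # adding an unseen string grows the store
assert len(stringstore) == n_before + 1
```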
 ## StringStore.\_\_init\_\_ {id="init",tag="method"}
 Create the `StringStore`.

View File

@@ -41,12 +41,12 @@ architectures and their arguments and hyperparameters.
 > ```
 | Setting | Description |
 | ------- | ----------- |
 | `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
 | `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
 | `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ |
 | `neg_prefix` <Tag variant="new">3.2.1</Tag> | The prefix used to specify incorrect tags while training. The tagger will learn not to predict exactly this tag. Defaults to `!`. ~~str~~ |
-| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ |
+| `label_smoothing` <Tag variant="new">3.6</Tag> | [Label smoothing](https://arxiv.org/abs/1906.02629) factor. Defaults to `0.0`. ~~float~~ |
 ```python
 %%GITHUB_SPACY/spacy/pipeline/tagger.pyx
 ```
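A usage sketch for the corresponding tagger setting (blank pipeline, arbitrary smoothing factor):

```python
import spacy

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger", config={"label_smoothing": 0.05})
print(tagger.cfg["label_smoothing"])  # 0.05
```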

View File

@@ -294,7 +294,7 @@ the `manual=True` argument in `displacy.render`.
 | Name        | Description |
 | ----------- | ----------- |
-| `orig_doc`  | Doc to parse dependencies. ~~Doc~~ |
+| `orig_doc`  | Doc or span to parse dependencies. ~~Union[Doc, Span]~~ |
 | `options`   | Dependency parse specific visualisation options. ~~Dict[str, Any]~~ |
 | **RETURNS** | Generated dependency parse keyed by words and arcs. ~~dict~~ |
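Given the `Span` support documented above, a sentence slice can be rendered directly (a sketch; assumes a parsing-capable pipeline such as `en_core_web_sm` is installed):

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence. And another one.")
sent = next(doc.sents)            # a Span
deps = displacy.parse_deps(sent)  # parse_deps now accepts Doc or Span
html = displacy.render(deps, style="dep", manual=True)
```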

View File

@@ -10,6 +10,13 @@ The `Vocab` object provides a lookup table that allows you to access
 [`StringStore`](/api/stringstore). It also owns underlying C-data that is shared
 between `Doc` objects.
+<Infobox variant="warning">
+Note that a `Vocab` instance is not static. It increases in size as texts with
+new tokens are processed.
+</Infobox>
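A quick sketch of that growth:

```python
import spacy

nlp = spacy.blank("en")
n_before = len(nlp.vocab)
nlp("A doc with brand-new tokens")  # processing text adds lexemes lazily
assert len(nlp.vocab) >= n_before
```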
 ## Vocab.\_\_init\_\_ {id="init",tag="method"}
 Create the vocabulary.

View File

@@ -3219,6 +3219,51 @@
     "category": ["pipeline"],
     "tags": ["syllables", "multilingual"]
   },
+  {
+    "id": "sentimental-onix",
+    "title": "Sentimental Onix",
+    "slogan": "Use onnx for sentiment models",
+    "description": "spaCy pipeline component for sentiment analysis using onnx",
+    "github": "sloev/sentimental-onix",
+    "pip": "sentimental-onix",
+    "code_example": [
+      "# Download model:",
+      "# python -m sentimental_onix download en",
+      "import spacy",
+      "from sentimental_onix import pipeline",
+      "",
+      "nlp = spacy.load(\"en_core_web_sm\")",
+      "nlp.add_pipe(\"sentencizer\")",
+      "nlp.add_pipe(\"sentimental_onix\", after=\"sentencizer\")",
+      "",
+      "sentences = [",
+      "    (sent.text, sent._.sentiment)",
+      "    for doc in nlp.pipe(",
+      "        [",
+      "            \"i hate pasta on tuesdays\",",
+      "            \"i like movies on wednesdays\",",
+      "            \"i find your argument ridiculous\",",
+      "            \"soda with straws are my favorite\",",
+      "        ]",
+      "    )",
+      "    for sent in doc.sents",
+      "]",
+      "",
+      "assert sentences == [",
+      "    (\"i hate pasta on tuesdays\", \"Negative\"),",
+      "    (\"i like movies on wednesdays\", \"Positive\"),",
+      "    (\"i find your argument ridiculous\", \"Negative\"),",
+      "    (\"soda with straws are my favorite\", \"Positive\"),",
+      "]"
+    ],
+    "thumb": "https://raw.githubusercontent.com/sloev/sentimental-onix/master/.github/onix.webp",
+    "author": "Johannes Valbjørn",
+    "author_links": {
+      "github": "sloev"
+    },
+    "category": ["pipeline"],
+    "tags": ["sentiment", "english"]
+  },
   {
     "id": "gobbli",
     "title": "gobbli",

View File

@@ -6,6 +6,7 @@
     "dev": "next dev",
     "build": "next build && npm run sitemap && next export",
     "prebuild": "pip install -r setup/requirements.txt && sh setup/setup.sh",
+    "predev": "npm run prebuild",
     "sitemap": "next-sitemap --config next-sitemap.config.mjs",
     "start": "next start",
     "lint": "next lint",

View File

@@ -111,11 +111,12 @@
     line-height: var(--line-height-xs)
     text-align: center
-    @include breakpoint(max, xs)
-        .list
+    @include breakpoint(max, md)
+        .alert
             display: none
-    .alert
+    @include breakpoint(max, xs)
+        .list
             display: none
 .has-alert

View File

@@ -57,9 +57,15 @@ const AlertSpace = ({ nightly, legacy }) => {
   )
 }
+// const navAlert = (
+//   <Link to="/usage/v3-5" noLinkLayout>
+//     <strong>💥 Out now:</strong> spaCy v3.5
+//   </Link>
+// )
 const navAlert = (
-  <Link to="/usage/v3-5" noLinkLayout>
-    <strong>💥 Out now:</strong> spaCy v3.5
+  <Link to="https://form.typeform.com/to/aMel9q9f" noLinkLayout>
+    <strong>💥 Take the user survey!</strong>
   </Link>
 )