mirror of
https://github.com/explosion/spaCy.git
synced 2025-04-21 17:41:59 +03:00
Merge branch 'upstream_master' into feature/weasel
This commit is contained in:
commit
7fa617f5a6
24
.github/azure-steps.yml
vendored
24
.github/azure-steps.yml
vendored
|
@ -57,51 +57,51 @@ steps:
|
|||
python -m spacy download ca_core_news_md
|
||||
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
||||
displayName: 'Test download CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
condition: eq(variables['python_version'], '3.9')
|
||||
|
||||
- script: |
|
||||
python -W error -m spacy info ca_core_news_sm | grep -q download_url
|
||||
displayName: 'Test download_url in info CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
condition: eq(variables['python_version'], '3.9')
|
||||
|
||||
- script: |
|
||||
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
||||
displayName: 'Test no warnings on load (#11713)'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
condition: eq(variables['python_version'], '3.9')
|
||||
|
||||
- script: |
|
||||
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
|
||||
displayName: 'Test convert CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
condition: eq(variables['python_version'], '3.9')
|
||||
|
||||
- script: |
|
||||
python -m spacy init config -p ner -l ca ner.cfg
|
||||
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
|
||||
displayName: 'Test debug config CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
condition: eq(variables['python_version'], '3.9')
|
||||
|
||||
- script: |
|
||||
# will have errors due to sparse data, check for summary in output
|
||||
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
|
||||
displayName: 'Test debug data CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
condition: eq(variables['python_version'], '3.9')
|
||||
|
||||
- script: |
|
||||
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
|
||||
displayName: 'Test train CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
condition: eq(variables['python_version'], '3.9')
|
||||
|
||||
- script: |
|
||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
||||
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
||||
displayName: 'Test assemble CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
condition: eq(variables['python_version'], '3.9')
|
||||
|
||||
- script: |
|
||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
||||
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
||||
displayName: 'Test assemble CLI vectors warning'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
condition: eq(variables['python_version'], '3.9')
|
||||
|
||||
- script: |
|
||||
python -m pip install -U -r requirements.txt
|
||||
|
@ -116,9 +116,3 @@ steps:
|
|||
python -m pytest --pyargs spacy
|
||||
displayName: "Run CPU tests with thinc-apple-ops"
|
||||
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
|
||||
|
||||
- script: |
|
||||
python .github/validate_universe_json.py website/meta/universe.json
|
||||
displayName: 'Test website/meta/universe.json'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
|
|
45
.github/workflows/autoblack.yml
vendored
45
.github/workflows/autoblack.yml
vendored
|
@ -1,45 +0,0 @@
|
|||
# GitHub Action that uses Black to reformat all Python code and submits a PR
|
||||
# in regular intervals. Inspired by: https://github.com/cclauss/autoblack
|
||||
|
||||
name: autoblack
|
||||
on:
|
||||
workflow_dispatch: # allow manual trigger
|
||||
schedule:
|
||||
- cron: '0 8 * * 5' # every Friday at 8am UTC
|
||||
|
||||
jobs:
|
||||
autoblack:
|
||||
if: github.repository_owner == 'explosion'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ github.head_ref }}
|
||||
- uses: actions/setup-python@v4
|
||||
- run: pip install black -c requirements.txt
|
||||
- name: Auto-format code if needed
|
||||
run: black spacy
|
||||
# We can't run black --check here because that returns a non-zero exit
|
||||
# code and makes GitHub think the action failed
|
||||
- name: Check for modified files
|
||||
id: git-check
|
||||
run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Create Pull Request
|
||||
if: steps.git-check.outputs.modified == 'true'
|
||||
uses: peter-evans/create-pull-request@v4
|
||||
with:
|
||||
title: Auto-format code with black
|
||||
labels: meta
|
||||
commit-message: Auto-format code with black
|
||||
committer: GitHub <noreply@github.com>
|
||||
author: explosion-bot <explosion-bot@users.noreply.github.com>
|
||||
body: _This PR is auto-generated._
|
||||
branch: autoblack
|
||||
delete-branch: true
|
||||
draft: false
|
||||
- name: Check outputs
|
||||
if: steps.git-check.outputs.modified == 'true'
|
||||
run: |
|
||||
echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
|
||||
echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"
|
172
.github/workflows/tests.yml
vendored
Normal file
172
.github/workflows/tests.yml
vendored
Normal file
|
@ -0,0 +1,172 @@
|
|||
name: tests
|
||||
|
||||
on:
|
||||
push:
|
||||
branches-ignore:
|
||||
- "spacy.io"
|
||||
- "nightly.spacy.io"
|
||||
- "v2.spacy.io"
|
||||
paths-ignore:
|
||||
- "*.md"
|
||||
- "*.mdx"
|
||||
- "website/**"
|
||||
- ".github/workflows/**"
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened, edited]
|
||||
paths-ignore:
|
||||
- "*.md"
|
||||
- "*.mdx"
|
||||
- "website/**"
|
||||
|
||||
jobs:
|
||||
validate:
|
||||
name: Validate
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check out repo
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Configure Python version
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.7"
|
||||
architecture: x64
|
||||
|
||||
- name: black
|
||||
run: |
|
||||
python -m pip install black -c requirements.txt
|
||||
python -m black spacy --check
|
||||
- name: flake8
|
||||
run: |
|
||||
python -m pip install flake8==5.0.4
|
||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
||||
tests:
|
||||
name: Test
|
||||
needs: Validate
|
||||
strategy:
|
||||
fail-fast: true
|
||||
matrix:
|
||||
os: [ubuntu-latest, windows-latest, macos-latest]
|
||||
python_version: ["3.11"]
|
||||
include:
|
||||
- os: ubuntu-20.04
|
||||
python_version: "3.6"
|
||||
- os: windows-latest
|
||||
python_version: "3.7"
|
||||
- os: macos-latest
|
||||
python_version: "3.8"
|
||||
- os: ubuntu-latest
|
||||
python_version: "3.9"
|
||||
- os: windows-latest
|
||||
python_version: "3.10"
|
||||
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
steps:
|
||||
- name: Check out repo
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Configure Python version
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ matrix.python_version }}
|
||||
architecture: x64
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install -U build pip setuptools
|
||||
python -m pip install -U -r requirements.txt
|
||||
|
||||
- name: Build sdist
|
||||
run: |
|
||||
python -m build --sdist
|
||||
|
||||
- name: Run mypy
|
||||
run: |
|
||||
python -m mypy spacy
|
||||
if: matrix.python_version != '3.6'
|
||||
|
||||
- name: Delete source directory and .egg-info
|
||||
run: |
|
||||
rm -rf spacy *.egg-info
|
||||
shell: bash
|
||||
|
||||
- name: Uninstall all packages
|
||||
run: |
|
||||
python -m pip freeze
|
||||
python -m pip freeze --exclude pywin32 > installed.txt
|
||||
python -m pip uninstall -y -r installed.txt
|
||||
|
||||
- name: Install from sdist
|
||||
run: |
|
||||
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
|
||||
SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
|
||||
shell: bash
|
||||
|
||||
- name: Test import
|
||||
run: python -W error -c "import spacy"
|
||||
|
||||
- name: "Test download CLI"
|
||||
run: |
|
||||
python -m spacy download ca_core_news_sm
|
||||
python -m spacy download ca_core_news_md
|
||||
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test download_url in info CLI"
|
||||
run: |
|
||||
python -W error -m spacy info ca_core_news_sm | grep -q download_url
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test no warnings on load (#11713)"
|
||||
run: |
|
||||
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test convert CLI"
|
||||
run: |
|
||||
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test debug config CLI"
|
||||
run: |
|
||||
python -m spacy init config -p ner -l ca ner.cfg
|
||||
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test debug data CLI"
|
||||
run: |
|
||||
# will have errors due to sparse data, check for summary in output
|
||||
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test train CLI"
|
||||
run: |
|
||||
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test assemble CLI"
|
||||
run: |
|
||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
||||
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test assemble CLI vectors warning"
|
||||
run: |
|
||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
||||
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Install test requirements"
|
||||
run: |
|
||||
python -m pip install -U -r requirements.txt
|
||||
|
||||
- name: "Run CPU tests"
|
||||
run: |
|
||||
python -m pytest --pyargs spacy -W error
|
||||
|
||||
- name: "Run CPU tests with thinc-apple-ops"
|
||||
run: |
|
||||
python -m pip install 'spacy[apple]'
|
||||
python -m pytest --pyargs spacy
|
||||
if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11'
|
32
.github/workflows/universe_validation.yml
vendored
Normal file
32
.github/workflows/universe_validation.yml
vendored
Normal file
|
@ -0,0 +1,32 @@
|
|||
name: universe validation
|
||||
|
||||
on:
|
||||
push:
|
||||
branches-ignore:
|
||||
- "spacy.io"
|
||||
- "nightly.spacy.io"
|
||||
- "v2.spacy.io"
|
||||
paths:
|
||||
- "website/meta/universe.json"
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened, edited]
|
||||
paths:
|
||||
- "website/meta/universe.json"
|
||||
|
||||
jobs:
|
||||
validate:
|
||||
name: Validate
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check out repo
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Configure Python version
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.7"
|
||||
architecture: x64
|
||||
|
||||
- name: Validate website/meta/universe.json
|
||||
run: |
|
||||
python .github/validate_universe_json.py website/meta/universe.json
|
|
@ -16,6 +16,9 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
|
|||
model packaging, deployment and workflow management. spaCy is commercial
|
||||
open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
|
||||
|
||||
💥 **We'd love to hear more about your experience with spaCy!**
|
||||
[Fill out our survey here.](https://form.typeform.com/to/aMel9q9f)
|
||||
|
||||
💫 **Version 3.5 out now!**
|
||||
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
|
||||
|
||||
|
|
|
@ -48,6 +48,9 @@ jobs:
|
|||
pip install flake8==5.0.4
|
||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
||||
displayName: "flake8"
|
||||
- script: |
|
||||
python .github/validate_universe_json.py website/meta/universe.json
|
||||
displayName: 'Validate website/meta/universe.json'
|
||||
|
||||
- job: "Test"
|
||||
dependsOn: "Validate"
|
||||
|
|
|
@ -5,7 +5,7 @@ requires = [
|
|||
"cymem>=2.0.2,<2.1.0",
|
||||
"preshed>=3.0.2,<3.1.0",
|
||||
"murmurhash>=0.28.0,<1.1.0",
|
||||
"thinc>=8.1.0,<8.2.0",
|
||||
"thinc>=8.1.8,<8.2.0",
|
||||
"numpy>=1.15.0",
|
||||
]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
|
|
@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
|
|||
spacy-loggers>=1.0.0,<2.0.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.1.0,<8.2.0
|
||||
thinc>=8.1.8,<8.2.0
|
||||
ml_datasets>=0.2.0,<0.3.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
wasabi>=0.9.1,<1.2.0
|
||||
|
|
|
@ -39,7 +39,7 @@ setup_requires =
|
|||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
thinc>=8.1.0,<8.2.0
|
||||
thinc>=8.1.8,<8.2.0
|
||||
install_requires =
|
||||
# Our libraries
|
||||
spacy-legacy>=3.0.11,<3.1.0
|
||||
|
@ -47,7 +47,7 @@ install_requires =
|
|||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.1.0,<8.2.0
|
||||
thinc>=8.1.8,<8.2.0
|
||||
wasabi>=0.9.1,<1.2.0
|
||||
srsly>=2.4.3,<3.0.0
|
||||
catalogue>=2.0.6,<2.1.0
|
||||
|
|
|
@ -7,6 +7,7 @@ import srsly
|
|||
from wasabi import Printer, MESSAGES, msg
|
||||
import typer
|
||||
import math
|
||||
import numpy
|
||||
|
||||
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
|
||||
from ._util import import_code, debug_cli, _format_number
|
||||
|
@ -521,9 +522,13 @@ def debug_data(
|
|||
|
||||
if "tagger" in factory_names:
|
||||
msg.divider("Part-of-speech Tagging")
|
||||
label_list = [label for label in gold_train_data["tags"]]
|
||||
model_labels = _get_labels_from_model(nlp, "tagger")
|
||||
label_list, counts = zip(*gold_train_data["tags"].items())
|
||||
msg.info(f"{len(label_list)} label(s) in train data")
|
||||
p = numpy.array(counts)
|
||||
p = p / p.sum()
|
||||
norm_entropy = (-p * numpy.log2(p)).sum() / numpy.log2(len(label_list))
|
||||
msg.info(f"{norm_entropy} is the normalised label entropy")
|
||||
model_labels = _get_labels_from_model(nlp, "tagger")
|
||||
labels = set(label_list)
|
||||
missing_labels = model_labels - labels
|
||||
if missing_labels:
|
||||
|
|
|
@ -35,7 +35,7 @@ def find_threshold_cli(
|
|||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||
gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
|
||||
verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
|
|
@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
|
|||
can help generate the best possible configuration, given a user's requirements. #}
|
||||
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
|
||||
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
||||
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
|
||||
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
|
||||
[paths]
|
||||
train = null
|
||||
dev = null
|
||||
|
@ -24,8 +24,11 @@ gpu_allocator = null
|
|||
lang = "{{ lang }}"
|
||||
{%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
|
||||
{%- set with_accuracy = optimize == "accuracy" -%}
|
||||
{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
|
||||
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%}
|
||||
{# The BOW textcat doesn't need a source of features, so it can omit the
|
||||
tok2vec/transformer. #}
|
||||
{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
|
||||
{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
|
||||
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
|
||||
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
|
||||
{%- else -%}
|
||||
{%- set full_pipeline = components -%}
|
||||
|
@ -156,6 +159,36 @@ grad_factor = 1.0
|
|||
sizes = [1,2,3]
|
||||
{% endif -%}
|
||||
|
||||
{% if "spancat_singlelabel" in components %}
|
||||
[components.spancat_singlelabel]
|
||||
factory = "spancat_singlelabel"
|
||||
negative_weight = 1.0
|
||||
allow_overlap = true
|
||||
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
|
||||
spans_key = "sc"
|
||||
|
||||
[components.spancat_singlelabel.model]
|
||||
@architectures = "spacy.SpanCategorizer.v1"
|
||||
|
||||
[components.spancat_singlelabel.model.reducer]
|
||||
@layers = "spacy.mean_max_reducer.v1"
|
||||
hidden_size = 128
|
||||
|
||||
[components.spancat_singlelabel.model.scorer]
|
||||
@layers = "Softmax.v2"
|
||||
|
||||
[components.spancat_singlelabel.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.spancat_singlelabel.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
|
||||
[components.spancat_singlelabel.suggester]
|
||||
@misc = "spacy.ngram_suggester.v1"
|
||||
sizes = [1,2,3]
|
||||
{% endif %}
|
||||
|
||||
{% if "trainable_lemmatizer" in components -%}
|
||||
[components.trainable_lemmatizer]
|
||||
factory = "trainable_lemmatizer"
|
||||
|
@ -221,10 +254,16 @@ no_output_layer = false
|
|||
|
||||
{% else -%}
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatCNN.v2"
|
||||
exclusive_classes = true
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
nO = null
|
||||
|
||||
[components.textcat.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.textcat.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
|
||||
|
@ -252,10 +291,16 @@ no_output_layer = false
|
|||
|
||||
{% else -%}
|
||||
[components.textcat_multilabel.model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatCNN.v2"
|
||||
exclusive_classes = false
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
nO = null
|
||||
|
||||
[components.textcat_multilabel.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.textcat_multilabel.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
|
||||
|
@ -286,6 +331,7 @@ maxout_pieces = 3
|
|||
{% if "morphologizer" in components %}
|
||||
[components.morphologizer]
|
||||
factory = "morphologizer"
|
||||
label_smoothing = 0.05
|
||||
|
||||
[components.morphologizer.model]
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
|
@ -299,6 +345,7 @@ width = ${components.tok2vec.model.encode.width}
|
|||
{% if "tagger" in components %}
|
||||
[components.tagger]
|
||||
factory = "tagger"
|
||||
label_smoothing = 0.05
|
||||
|
||||
[components.tagger.model]
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
|
@ -374,6 +421,33 @@ width = ${components.tok2vec.model.encode.width}
|
|||
sizes = [1,2,3]
|
||||
{% endif %}
|
||||
|
||||
{% if "spancat_singlelabel" in components %}
|
||||
[components.spancat_singlelabel]
|
||||
factory = "spancat_singlelabel"
|
||||
negative_weight = 1.0
|
||||
allow_overlap = true
|
||||
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
|
||||
spans_key = "sc"
|
||||
|
||||
[components.spancat_singlelabel.model]
|
||||
@architectures = "spacy.SpanCategorizer.v1"
|
||||
|
||||
[components.spancat_singlelabel.model.reducer]
|
||||
@layers = "spacy.mean_max_reducer.v1"
|
||||
hidden_size = 128
|
||||
|
||||
[components.spancat_singlelabel.model.scorer]
|
||||
@layers = "Softmax.v2"
|
||||
|
||||
[components.spancat_singlelabel.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
|
||||
[components.spancat_singlelabel.suggester]
|
||||
@misc = "spacy.ngram_suggester.v1"
|
||||
sizes = [1,2,3]
|
||||
{% endif %}
|
||||
|
||||
{% if "trainable_lemmatizer" in components -%}
|
||||
[components.trainable_lemmatizer]
|
||||
factory = "trainable_lemmatizer"
|
||||
|
|
|
@ -966,6 +966,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"with `displacy.serve(doc, port=port)`")
|
||||
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
|
||||
"or use `auto_select_port=True` to pick an available port automatically.")
|
||||
E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
|
||||
|
||||
|
||||
# Deprecated model shortcuts, only used in errors and warnings
|
||||
|
|
|
@ -474,18 +474,24 @@ class EntityLinker(TrainablePipe):
|
|||
|
||||
# Looping through each entity in batch (TODO: rewrite)
|
||||
for j, ent in enumerate(ent_batch):
|
||||
sent_index = sentences.index(ent.sent)
|
||||
assert sent_index >= 0
|
||||
assert hasattr(ent, "sents")
|
||||
sents = list(ent.sents)
|
||||
sent_indices = (
|
||||
sentences.index(sents[0]),
|
||||
sentences.index(sents[-1]),
|
||||
)
|
||||
assert sent_indices[1] >= sent_indices[0] >= 0
|
||||
|
||||
if self.incl_context:
|
||||
# get n_neighbour sentences, clipped to the length of the document
|
||||
start_sentence = max(0, sent_index - self.n_sents)
|
||||
start_sentence = max(0, sent_indices[0] - self.n_sents)
|
||||
end_sentence = min(
|
||||
len(sentences) - 1, sent_index + self.n_sents
|
||||
len(sentences) - 1, sent_indices[1] + self.n_sents
|
||||
)
|
||||
start_token = sentences[start_sentence].start
|
||||
end_token = sentences[end_sentence].end
|
||||
sent_doc = doc[start_token:end_token].as_doc()
|
||||
|
||||
# currently, the context is the same for each entity in a sentence (should be refined)
|
||||
sentence_encoding = self.model.predict([sent_doc])[0]
|
||||
sentence_encoding_t = sentence_encoding.T
|
||||
|
|
|
@ -52,7 +52,8 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
|
|||
@Language.factory(
|
||||
"morphologizer",
|
||||
assigns=["token.morph", "token.pos"],
|
||||
default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}},
|
||||
default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False,
|
||||
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, "label_smoothing": 0.0},
|
||||
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
|
||||
)
|
||||
def make_morphologizer(
|
||||
|
@ -61,9 +62,10 @@ def make_morphologizer(
|
|||
name: str,
|
||||
overwrite: bool,
|
||||
extend: bool,
|
||||
label_smoothing: float,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer)
|
||||
return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, label_smoothing=label_smoothing, scorer=scorer)
|
||||
|
||||
|
||||
def morphologizer_score(examples, **kwargs):
|
||||
|
@ -94,6 +96,7 @@ class Morphologizer(Tagger):
|
|||
*,
|
||||
overwrite: bool = BACKWARD_OVERWRITE,
|
||||
extend: bool = BACKWARD_EXTEND,
|
||||
label_smoothing: float = 0.0,
|
||||
scorer: Optional[Callable] = morphologizer_score,
|
||||
):
|
||||
"""Initialize a morphologizer.
|
||||
|
@ -121,6 +124,7 @@ class Morphologizer(Tagger):
|
|||
"labels_pos": {},
|
||||
"overwrite": overwrite,
|
||||
"extend": extend,
|
||||
"label_smoothing": label_smoothing,
|
||||
}
|
||||
self.cfg = dict(sorted(cfg.items()))
|
||||
self.scorer = scorer
|
||||
|
@ -270,7 +274,8 @@ class Morphologizer(Tagger):
|
|||
DOCS: https://spacy.io/api/morphologizer#get_loss
|
||||
"""
|
||||
validate_examples(examples, "Morphologizer.get_loss")
|
||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False,
|
||||
label_smoothing=self.cfg["label_smoothing"])
|
||||
truths = []
|
||||
for eg in examples:
|
||||
eg_truths = []
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any
|
||||
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast, Union
|
||||
from dataclasses import dataclass
|
||||
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
|
||||
from thinc.api import Optimizer
|
||||
from thinc.types import Ragged, Ints2d, Floats2d
|
||||
|
@ -43,7 +44,36 @@ maxout_pieces = 3
|
|||
depth = 4
|
||||
"""
|
||||
|
||||
spancat_singlelabel_default_config = """
|
||||
[model]
|
||||
@architectures = "spacy.SpanCategorizer.v1"
|
||||
scorer = {"@layers": "Softmax.v2"}
|
||||
|
||||
[model.reducer]
|
||||
@layers = spacy.mean_max_reducer.v1
|
||||
hidden_size = 128
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.Tok2Vec.v2"
|
||||
[model.tok2vec.embed]
|
||||
@architectures = "spacy.MultiHashEmbed.v1"
|
||||
width = 96
|
||||
rows = [5000, 1000, 2500, 1000]
|
||||
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
|
||||
include_static_vectors = false
|
||||
|
||||
[model.tok2vec.encode]
|
||||
@architectures = "spacy.MaxoutWindowEncoder.v2"
|
||||
width = ${model.tok2vec.embed.width}
|
||||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
depth = 4
|
||||
"""
|
||||
|
||||
DEFAULT_SPANCAT_MODEL = Config().from_str(spancat_default_config)["model"]
|
||||
DEFAULT_SPANCAT_SINGLELABEL_MODEL = Config().from_str(
|
||||
spancat_singlelabel_default_config
|
||||
)["model"]
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
|
@ -119,10 +149,14 @@ def make_spancat(
|
|||
threshold: float,
|
||||
max_positive: Optional[int],
|
||||
) -> "SpanCategorizer":
|
||||
"""Create a SpanCategorizer component. The span categorizer consists of two
|
||||
"""Create a SpanCategorizer component and configure it for multi-label
|
||||
classification to be able to assign multiple labels for each span.
|
||||
The span categorizer consists of two
|
||||
parts: a suggester function that proposes candidate spans, and a labeller
|
||||
model that predicts one or more labels for each span.
|
||||
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
|
||||
Spans are returned as a ragged array with two integer columns, for the
|
||||
start and end positions.
|
||||
|
@ -144,12 +178,80 @@ def make_spancat(
|
|||
"""
|
||||
return SpanCategorizer(
|
||||
nlp.vocab,
|
||||
suggester=suggester,
|
||||
model=model,
|
||||
spans_key=spans_key,
|
||||
threshold=threshold,
|
||||
max_positive=max_positive,
|
||||
suggester=suggester,
|
||||
name=name,
|
||||
spans_key=spans_key,
|
||||
negative_weight=None,
|
||||
allow_overlap=True,
|
||||
max_positive=max_positive,
|
||||
threshold=threshold,
|
||||
scorer=scorer,
|
||||
add_negative_label=False,
|
||||
)
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"spancat_singlelabel",
|
||||
assigns=["doc.spans"],
|
||||
default_config={
|
||||
"spans_key": "sc",
|
||||
"model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
|
||||
"negative_weight": 1.0,
|
||||
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
|
||||
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
|
||||
"allow_overlap": True,
|
||||
},
|
||||
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
|
||||
)
|
||||
def make_spancat_singlelabel(
|
||||
nlp: Language,
|
||||
name: str,
|
||||
suggester: Suggester,
|
||||
model: Model[Tuple[List[Doc], Ragged], Floats2d],
|
||||
spans_key: str,
|
||||
negative_weight: float,
|
||||
allow_overlap: bool,
|
||||
scorer: Optional[Callable],
|
||||
) -> "SpanCategorizer":
|
||||
"""Create a SpanCategorizer component and configure it for multi-class
|
||||
classification. With this configuration each span can get at most one
|
||||
label. The span categorizer consists of two
|
||||
parts: a suggester function that proposes candidate spans, and a labeller
|
||||
model that predicts at most one label for each span.
|
||||
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
|
||||
Spans are returned as a ragged array with two integer columns, for the
|
||||
start and end positions.
|
||||
model (Model[Tuple[List[Doc], Ragged], Floats2d]): A model instance that
|
||||
is given a list of documents and (start, end) indices representing
|
||||
candidate span offsets. The model predicts a probability for each category
|
||||
for each span.
|
||||
spans_key (str): Key of the doc.spans dict to save the spans under. During
|
||||
initialization and training, the component will look for spans on the
|
||||
reference document under the same key.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
|
||||
spans allowed.
|
||||
negative_weight (float): Multiplier for the loss terms.
|
||||
Can be used to downweight the negative samples if there are too many.
|
||||
allow_overlap (bool): If True the data is assumed to contain overlapping spans.
|
||||
Otherwise it produces non-overlapping spans greedily prioritizing
|
||||
higher assigned label scores.
|
||||
"""
|
||||
return SpanCategorizer(
|
||||
nlp.vocab,
|
||||
model=model,
|
||||
suggester=suggester,
|
||||
name=name,
|
||||
spans_key=spans_key,
|
||||
negative_weight=negative_weight,
|
||||
allow_overlap=allow_overlap,
|
||||
max_positive=1,
|
||||
add_negative_label=True,
|
||||
threshold=None,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
@ -172,6 +274,27 @@ def make_spancat_scorer():
|
|||
return spancat_score
|
||||
|
||||
|
||||
@dataclass
|
||||
class _Intervals:
|
||||
"""
|
||||
Helper class to avoid storing overlapping spans.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.ranges = set()
|
||||
|
||||
def add(self, i, j):
|
||||
for e in range(i, j):
|
||||
self.ranges.add(e)
|
||||
|
||||
def __contains__(self, rang):
|
||||
i, j = rang
|
||||
for e in range(i, j):
|
||||
if e in self.ranges:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
class SpanCategorizer(TrainablePipe):
|
||||
"""Pipeline component to label spans of text.
|
||||
|
||||
|
@ -185,25 +308,43 @@ class SpanCategorizer(TrainablePipe):
|
|||
suggester: Suggester,
|
||||
name: str = "spancat",
|
||||
*,
|
||||
add_negative_label: bool = False,
|
||||
spans_key: str = "spans",
|
||||
threshold: float = 0.5,
|
||||
negative_weight: Optional[float] = 1.0,
|
||||
allow_overlap: Optional[bool] = True,
|
||||
max_positive: Optional[int] = None,
|
||||
threshold: Optional[float] = 0.5,
|
||||
scorer: Optional[Callable] = spancat_score,
|
||||
) -> None:
|
||||
"""Initialize the span categorizer.
|
||||
"""Initialize the multi-label or multi-class span categorizer.
|
||||
|
||||
vocab (Vocab): The shared vocabulary.
|
||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
For multi-class classification (single label per span) we recommend
|
||||
using a Softmax classifier as the final layer, while for multi-label
|
||||
classification (multiple possible labels per span) we recommend Logistic.
|
||||
suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
|
||||
Spans are returned as a ragged array with two integer columns, for the
|
||||
start and end positions.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
spans_key (str): Key of the Doc.spans dict to save the spans under.
|
||||
During initialization and training, the component will look for
|
||||
spans on the reference document under the same key. Defaults to
|
||||
`"spans"`.
|
||||
threshold (float): Minimum probability to consider a prediction
|
||||
positive. Spans with a positive prediction will be saved on the Doc.
|
||||
Defaults to 0.5.
|
||||
add_negative_label (bool): Learn to predict a special 'negative_label'
|
||||
when a Span is not annotated.
|
||||
threshold (Optional[float]): Minimum probability to consider a prediction
|
||||
positive. Defaults to 0.5. Spans with a positive prediction will be saved
|
||||
on the Doc.
|
||||
max_positive (Optional[int]): Maximum number of labels to consider
|
||||
positive per span. Defaults to None, indicating no limit.
|
||||
negative_weight (float): Multiplier for the loss terms.
|
||||
Can be used to downweight the negative samples if there are too many
|
||||
when add_negative_label is True. Otherwise it's unused.
|
||||
allow_overlap (bool): If True the data is assumed to contain overlapping spans.
|
||||
Otherwise it produces non-overlapping spans greedily prioritizing
|
||||
higher assigned label scores. Only used when max_positive is 1.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
|
||||
spans allowed.
|
||||
|
@ -215,12 +356,17 @@ class SpanCategorizer(TrainablePipe):
|
|||
"spans_key": spans_key,
|
||||
"threshold": threshold,
|
||||
"max_positive": max_positive,
|
||||
"negative_weight": negative_weight,
|
||||
"allow_overlap": allow_overlap,
|
||||
}
|
||||
self.vocab = vocab
|
||||
self.suggester = suggester
|
||||
self.model = model
|
||||
self.name = name
|
||||
self.scorer = scorer
|
||||
self.add_negative_label = add_negative_label
|
||||
if not allow_overlap and max_positive is not None and max_positive > 1:
|
||||
raise ValueError(Errors.E1051.format(max_positive=max_positive))
|
||||
|
||||
@property
|
||||
def key(self) -> str:
|
||||
|
@ -230,6 +376,21 @@ class SpanCategorizer(TrainablePipe):
|
|||
"""
|
||||
return str(self.cfg["spans_key"])
|
||||
|
||||
def _allow_extra_label(self) -> None:
|
||||
"""Raise an error if the component can not add any more labels."""
|
||||
nO = None
|
||||
if self.model.has_dim("nO"):
|
||||
nO = self.model.get_dim("nO")
|
||||
elif self.model.has_ref("output_layer") and self.model.get_ref(
|
||||
"output_layer"
|
||||
).has_dim("nO"):
|
||||
nO = self.model.get_ref("output_layer").get_dim("nO")
|
||||
if nO is not None and nO == self._n_labels:
|
||||
if not self.is_resizable:
|
||||
raise ValueError(
|
||||
Errors.E922.format(name=self.name, nO=self.model.get_dim("nO"))
|
||||
)
|
||||
|
||||
def add_label(self, label: str) -> int:
|
||||
"""Add a new label to the pipe.
|
||||
|
||||
|
@ -263,6 +424,27 @@ class SpanCategorizer(TrainablePipe):
|
|||
"""
|
||||
return list(self.labels)
|
||||
|
||||
@property
|
||||
def _label_map(self) -> Dict[str, int]:
|
||||
"""RETURNS (Dict[str, int]): The label map."""
|
||||
return {label: i for i, label in enumerate(self.labels)}
|
||||
|
||||
@property
|
||||
def _n_labels(self) -> int:
|
||||
"""RETURNS (int): Number of labels."""
|
||||
if self.add_negative_label:
|
||||
return len(self.labels) + 1
|
||||
else:
|
||||
return len(self.labels)
|
||||
|
||||
@property
|
||||
def _negative_label_i(self) -> Union[int, None]:
|
||||
"""RETURNS (Union[int, None]): Index of the negative label."""
|
||||
if self.add_negative_label:
|
||||
return len(self.label_data)
|
||||
else:
|
||||
return None
|
||||
|
||||
def predict(self, docs: Iterable[Doc]):
|
||||
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||
|
||||
|
@ -304,14 +486,24 @@ class SpanCategorizer(TrainablePipe):
|
|||
|
||||
DOCS: https://spacy.io/api/spancategorizer#set_annotations
|
||||
"""
|
||||
labels = self.labels
|
||||
indices, scores = indices_scores
|
||||
offset = 0
|
||||
for i, doc in enumerate(docs):
|
||||
indices_i = indices[i].dataXd
|
||||
doc.spans[self.key] = self._make_span_group(
|
||||
doc, indices_i, scores[offset : offset + indices.lengths[i]], labels # type: ignore[arg-type]
|
||||
)
|
||||
allow_overlap = cast(bool, self.cfg["allow_overlap"])
|
||||
if self.cfg["max_positive"] == 1:
|
||||
doc.spans[self.key] = self._make_span_group_singlelabel(
|
||||
doc,
|
||||
indices_i,
|
||||
scores[offset : offset + indices.lengths[i]],
|
||||
allow_overlap,
|
||||
)
|
||||
else:
|
||||
doc.spans[self.key] = self._make_span_group_multilabel(
|
||||
doc,
|
||||
indices_i,
|
||||
scores[offset : offset + indices.lengths[i]],
|
||||
)
|
||||
offset += indices.lengths[i]
|
||||
|
||||
def update(
|
||||
|
@ -371,9 +563,11 @@ class SpanCategorizer(TrainablePipe):
|
|||
spans = Ragged(
|
||||
self.model.ops.to_numpy(spans.data), self.model.ops.to_numpy(spans.lengths)
|
||||
)
|
||||
label_map = {label: i for i, label in enumerate(self.labels)}
|
||||
target = numpy.zeros(scores.shape, dtype=scores.dtype)
|
||||
if self.add_negative_label:
|
||||
negative_spans = numpy.ones((scores.shape[0]))
|
||||
offset = 0
|
||||
label_map = self._label_map
|
||||
for i, eg in enumerate(examples):
|
||||
# Map (start, end) offset of spans to the row in the d_scores array,
|
||||
# so that we can adjust the gradient for predictions that were
|
||||
|
@ -390,10 +584,16 @@ class SpanCategorizer(TrainablePipe):
|
|||
row = spans_index[key]
|
||||
k = label_map[gold_span.label_]
|
||||
target[row, k] = 1.0
|
||||
if self.add_negative_label:
|
||||
# delete negative label target.
|
||||
negative_spans[row] = 0.0
|
||||
# The target is a flat array for all docs. Track the position
|
||||
# we're at within the flat array.
|
||||
offset += spans.lengths[i]
|
||||
target = self.model.ops.asarray(target, dtype="f") # type: ignore
|
||||
if self.add_negative_label:
|
||||
negative_samples = numpy.nonzero(negative_spans)[0]
|
||||
target[negative_samples, self._negative_label_i] = 1.0 # type: ignore
|
||||
# The target will have the values 0 (for untrue predictions) or 1
|
||||
# (for true predictions).
|
||||
# The scores should be in the range [0, 1].
|
||||
|
@ -402,6 +602,10 @@ class SpanCategorizer(TrainablePipe):
|
|||
# If the prediction is 0.9 and it's false, the gradient will be
|
||||
# 0.9 (0.9 - 0.0)
|
||||
d_scores = scores - target
|
||||
if self.add_negative_label:
|
||||
neg_weight = cast(float, self.cfg["negative_weight"])
|
||||
if neg_weight != 1.0:
|
||||
d_scores[negative_samples] *= neg_weight
|
||||
loss = float((d_scores**2).sum())
|
||||
return loss, d_scores
|
||||
|
||||
|
@ -438,7 +642,7 @@ class SpanCategorizer(TrainablePipe):
|
|||
if subbatch:
|
||||
docs = [eg.x for eg in subbatch]
|
||||
spans = build_ngram_suggester(sizes=[1])(docs)
|
||||
Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels))
|
||||
Y = self.model.ops.alloc2f(spans.dataXd.shape[0], self._n_labels)
|
||||
self.model.initialize(X=(docs, spans), Y=Y)
|
||||
else:
|
||||
self.model.initialize()
|
||||
|
@ -452,31 +656,96 @@ class SpanCategorizer(TrainablePipe):
|
|||
eg.reference.spans.get(self.key, []), allow_overlap=True
|
||||
)
|
||||
|
||||
def _make_span_group(
|
||||
self, doc: Doc, indices: Ints2d, scores: Floats2d, labels: List[str]
|
||||
def _make_span_group_multilabel(
|
||||
self,
|
||||
doc: Doc,
|
||||
indices: Ints2d,
|
||||
scores: Floats2d,
|
||||
) -> SpanGroup:
|
||||
"""Find the top-k labels for each span (k=max_positive)."""
|
||||
spans = SpanGroup(doc, name=self.key)
|
||||
max_positive = self.cfg["max_positive"]
|
||||
if scores.size == 0:
|
||||
return spans
|
||||
scores = self.model.ops.to_numpy(scores)
|
||||
indices = self.model.ops.to_numpy(indices)
|
||||
threshold = self.cfg["threshold"]
|
||||
max_positive = self.cfg["max_positive"]
|
||||
|
||||
keeps = scores >= threshold
|
||||
ranked = (scores * -1).argsort() # type: ignore
|
||||
if max_positive is not None:
|
||||
assert isinstance(max_positive, int)
|
||||
if self.add_negative_label:
|
||||
negative_scores = numpy.copy(scores[:, self._negative_label_i])
|
||||
scores[:, self._negative_label_i] = -numpy.inf
|
||||
ranked = (scores * -1).argsort() # type: ignore
|
||||
scores[:, self._negative_label_i] = negative_scores
|
||||
else:
|
||||
ranked = (scores * -1).argsort() # type: ignore
|
||||
span_filter = ranked[:, max_positive:]
|
||||
for i, row in enumerate(span_filter):
|
||||
keeps[i, row] = False
|
||||
spans.attrs["scores"] = scores[keeps].flatten()
|
||||
|
||||
indices = self.model.ops.to_numpy(indices)
|
||||
keeps = self.model.ops.to_numpy(keeps)
|
||||
|
||||
attrs_scores = []
|
||||
for i in range(indices.shape[0]):
|
||||
start = indices[i, 0]
|
||||
end = indices[i, 1]
|
||||
|
||||
for j, keep in enumerate(keeps[i]):
|
||||
if keep:
|
||||
spans.append(Span(doc, start, end, label=labels[j]))
|
||||
if j != self._negative_label_i:
|
||||
spans.append(Span(doc, start, end, label=self.labels[j]))
|
||||
attrs_scores.append(scores[i, j])
|
||||
spans.attrs["scores"] = numpy.array(attrs_scores)
|
||||
return spans
|
||||
|
||||
def _make_span_group_singlelabel(
    self,
    doc: Doc,
    indices: Ints2d,
    scores: Floats2d,
    allow_overlap: bool = True,
) -> SpanGroup:
    """Find the argmax label for each span.

    doc (Doc): The document the candidate spans belong to.
    indices (Ints2d): One (start, end) row per candidate span.
    scores (Floats2d): One row of label scores per candidate span.
    allow_overlap (bool): If False, spans are accepted greedily from the
        highest to the lowest winning score, and a span is skipped when it
        overlaps one that was already accepted.
    RETURNS (SpanGroup): The accepted spans. Unless there were no
        suggestions, `attrs["scores"]` holds the winning score of each
        accepted span, in the same order as the spans.
    """
    # Handle cases when there are zero suggestions
    if scores.size == 0:
        return SpanGroup(doc, name=self.key)
    scores = self.model.ops.to_numpy(scores)
    indices = self.model.ops.to_numpy(indices)
    predicted = scores.argmax(axis=1)
    # Winning score of every candidate as a 1-d array. Flattening explicitly
    # (instead of a bare squeeze) keeps the array 1-d even when there is only
    # a single candidate span.
    argmax_scores = numpy.take_along_axis(
        scores, numpy.expand_dims(predicted, 1), axis=1
    ).reshape(-1)
    keeps = numpy.ones(predicted.shape, dtype=bool)
    # Remove samples where the negative label is the argmax.
    if self.add_negative_label:
        keeps = numpy.logical_and(keeps, predicted != self._negative_label_i)
    # Filter samples according to threshold.
    threshold = self.cfg["threshold"]
    if threshold is not None:
        keeps = numpy.logical_and(keeps, argmax_scores >= threshold)
    # Sort spans according to argmax probability
    if not allow_overlap:
        sort_idx = (argmax_scores * -1).argsort()
        predicted = predicted[sort_idx]
        indices = indices[sort_idx]
        keeps = keeps[sort_idx]
        # The scores must follow the same permutation, otherwise the score
        # recorded for a span would belong to a different candidate.
        argmax_scores = argmax_scores[sort_idx]
    seen = _Intervals()
    spans = SpanGroup(doc, name=self.key)
    attrs_scores = []
    for i in range(indices.shape[0]):
        if not keeps[i]:
            continue

        label = predicted[i]
        start = indices[i, 0]
        end = indices[i, 1]

        if not allow_overlap:
            if (start, end) in seen:
                continue
            else:
                seen.add(start, end)
        attrs_scores.append(argmax_scores[i])
        spans.append(Span(doc, start, end, label=self.labels[label]))

    # Store the collected scores on the group, as the multilabel variant does.
    spans.attrs["scores"] = numpy.array(attrs_scores)
    return spans
|
||||
|
|
|
@ -45,7 +45,7 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
|
|||
@Language.factory(
|
||||
"tagger",
|
||||
assigns=["token.tag"],
|
||||
default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!"},
|
||||
default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!", "label_smoothing": 0.0},
|
||||
default_score_weights={"tag_acc": 1.0},
|
||||
)
|
||||
def make_tagger(
|
||||
|
@ -55,6 +55,7 @@ def make_tagger(
|
|||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
neg_prefix: str,
|
||||
label_smoothing: float,
|
||||
):
|
||||
"""Construct a part-of-speech tagger component.
|
||||
|
||||
|
@ -63,7 +64,7 @@ def make_tagger(
|
|||
in size, and be normalized as probabilities (all scores between 0 and 1,
|
||||
with the rows summing to 1).
|
||||
"""
|
||||
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix)
|
||||
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, label_smoothing=label_smoothing)
|
||||
|
||||
|
||||
def tagger_score(examples, **kwargs):
|
||||
|
@ -89,6 +90,7 @@ class Tagger(TrainablePipe):
|
|||
overwrite=BACKWARD_OVERWRITE,
|
||||
scorer=tagger_score,
|
||||
neg_prefix="!",
|
||||
label_smoothing=0.0,
|
||||
):
|
||||
"""Initialize a part-of-speech tagger.
|
||||
|
||||
|
@ -105,7 +107,7 @@ class Tagger(TrainablePipe):
|
|||
self.model = model
|
||||
self.name = name
|
||||
self._rehearsal_model = None
|
||||
cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix}
|
||||
cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix, "label_smoothing": label_smoothing}
|
||||
self.cfg = dict(sorted(cfg.items()))
|
||||
self.scorer = scorer
|
||||
|
||||
|
@ -256,7 +258,7 @@ class Tagger(TrainablePipe):
|
|||
DOCS: https://spacy.io/api/tagger#get_loss
|
||||
"""
|
||||
validate_examples(examples, "Tagger.get_loss")
|
||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"])
|
||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"], label_smoothing=self.cfg["label_smoothing"])
|
||||
# Convert empty tag "" to missing value None so that both misaligned
|
||||
# tokens and tokens with missing annotation have the default missing
|
||||
# value None.
|
||||
|
|
|
@ -700,3 +700,19 @@ def test_span_group_copy(doc):
|
|||
assert len(doc.spans["test"]) == 3
|
||||
# check that the copy spans were not modified and this is an isolated doc
|
||||
assert len(doc_copy.spans["test"]) == 2
|
||||
|
||||
|
||||
def test_for_partial_ent_sents():
    """Spans may be associated with multiple sentences. These .sents should always be complete, not partial, sentences,
    which this tests for.
    """
    doc = Doc(
        English().vocab,
        words=["Mahler's", "Symphony", "No.", "8", "was", "beautiful."],
        sent_starts=[1, 0, 0, 1, 0, 0],
    )
    doc.set_ents([Span(doc, 1, 4, "WORK")])
    # The specified entity is associated with both sentences in this doc, so we expect all sentences in the doc to be
    # equal to the sentences referenced in ent.sents.
    doc_sents = list(doc.sents)
    ent_sents = list(doc.ents[0].sents)
    # zip() stops at the shorter sequence, so check the counts explicitly;
    # otherwise a missing sentence in ent.sents would go unnoticed.
    assert len(doc_sents) == len(ent_sents)
    for doc_sent, ent_sent in zip(doc_sents, ent_sents):
        assert doc_sent == ent_sent
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
from typing import Callable, Iterable, Dict, Any
|
||||
from typing import Callable, Iterable, Dict, Any, Tuple
|
||||
|
||||
import pytest
|
||||
from numpy.testing import assert_equal
|
||||
|
||||
from spacy import registry, util
|
||||
from spacy import registry, util, Language
|
||||
from spacy.attrs import ENT_KB_ID
|
||||
from spacy.compat import pickle
|
||||
from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
|
||||
|
@ -108,18 +108,23 @@ def test_issue7065():
|
|||
|
||||
|
||||
@pytest.mark.issue(7065)
|
||||
def test_issue7065_b():
|
||||
@pytest.mark.parametrize("entity_in_first_sentence", [True, False])
|
||||
def test_sentence_crossing_ents(entity_in_first_sentence: bool):
|
||||
"""Tests if NEL crashes if entities cross sentence boundaries and the first associated sentence doesn't have an
|
||||
entity.
|
||||
entity_in_first_sentence (bool): Whether to include an entity in the first sentence associated with the
|
||||
sentence-crossing entity.
|
||||
"""
|
||||
# Test that the NEL doesn't crash when an entity crosses a sentence boundary
|
||||
nlp = English()
|
||||
vector_length = 3
|
||||
nlp.add_pipe("sentencizer")
|
||||
text = "Mahler 's Symphony No. 8 was beautiful."
|
||||
entities = [(0, 6, "PERSON"), (10, 24, "WORK")]
|
||||
links = {
|
||||
(0, 6): {"Q7304": 1.0, "Q270853": 0.0},
|
||||
(10, 24): {"Q7304": 0.0, "Q270853": 1.0},
|
||||
}
|
||||
sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
|
||||
entities = [(10, 24, "WORK")]
|
||||
links = {(10, 24): {"Q7304": 0.0, "Q270853": 1.0}}
|
||||
if entity_in_first_sentence:
|
||||
entities.append((0, 6, "PERSON"))
|
||||
links[(0, 6)] = {"Q7304": 1.0, "Q270853": 0.0}
|
||||
sent_starts = [1, -1, 0, 0, 0, 1, 0, 0, 0]
|
||||
doc = nlp(text)
|
||||
example = Example.from_dict(
|
||||
doc, {"entities": entities, "links": links, "sent_starts": sent_starts}
|
||||
|
@ -145,31 +150,14 @@ def test_issue7065_b():
|
|||
|
||||
# Create the Entity Linker component and add it to the pipeline
|
||||
entity_linker = nlp.add_pipe("entity_linker", last=True)
|
||||
entity_linker.set_kb(create_kb)
|
||||
entity_linker.set_kb(create_kb) # type: ignore
|
||||
# train the NEL pipe
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
for i in range(2):
|
||||
losses = {}
|
||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||
nlp.update(train_examples, sgd=optimizer)
|
||||
|
||||
# Add a custom rule-based component to mimick NER
|
||||
patterns = [
|
||||
{"label": "PERSON", "pattern": [{"LOWER": "mahler"}]},
|
||||
{
|
||||
"label": "WORK",
|
||||
"pattern": [
|
||||
{"LOWER": "symphony"},
|
||||
{"LOWER": "no"},
|
||||
{"LOWER": "."},
|
||||
{"LOWER": "8"},
|
||||
],
|
||||
},
|
||||
]
|
||||
ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
|
||||
ruler.add_patterns(patterns)
|
||||
# test the trained model - this should not throw E148
|
||||
doc = nlp(text)
|
||||
assert doc
|
||||
# This shouldn't crash.
|
||||
entity_linker.predict([example.reference]) # type: ignore
|
||||
|
||||
|
||||
def test_no_entities():
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import pytest
|
||||
from numpy.testing import assert_equal
|
||||
from numpy.testing import assert_equal, assert_almost_equal
|
||||
|
||||
from spacy import util
|
||||
from spacy.training import Example
|
||||
|
@ -19,6 +19,8 @@ def test_label_types():
|
|||
morphologizer.add_label(9)
|
||||
|
||||
|
||||
TAGS = ["Feat=N", "Feat=V", "Feat=J"]
|
||||
|
||||
TRAIN_DATA = [
|
||||
(
|
||||
"I like green eggs",
|
||||
|
@ -32,6 +34,29 @@ TRAIN_DATA = [
|
|||
]
|
||||
|
||||
|
||||
def test_label_smoothing():
    """Compare the loss gradients of a morphologizer with and without label
    smoothing on the same model output.
    """
    nlp = Language()
    morph_no_ls = nlp.add_pipe("morphologizer", "no_label_smoothing")
    morph_ls = nlp.add_pipe(
        "morphologizer", "label_smoothing", config=dict(label_smoothing=0.05)
    )
    train_examples = []
    for tag in TAGS:
        morph_no_ls.add_label(tag)
        morph_ls.add_label(tag)
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))

    nlp.initialize(get_examples=lambda: train_examples)
    # Only the forward output is needed; the backprop callback is discarded.
    tag_scores, _ = morph_ls.model.begin_update(
        [eg.predicted for eg in train_examples]
    )
    # Both components score the same predictions, so the gradients differ
    # only through the label_smoothing setting.
    no_ls_grads = morph_no_ls.get_loss(train_examples, tag_scores)[1][0]
    ls_grads = morph_ls.get_loss(train_examples, tag_scores)[1][0]
    assert_almost_equal(ls_grads / no_ls_grads, 0.94285715)
|
||||
|
||||
|
||||
def test_no_label():
|
||||
nlp = Language()
|
||||
nlp.add_pipe("morphologizer")
|
||||
|
|
|
@ -15,6 +15,8 @@ OPS = get_current_ops()
|
|||
|
||||
SPAN_KEY = "labeled_spans"
|
||||
|
||||
SPANCAT_COMPONENTS = ["spancat", "spancat_singlelabel"]
|
||||
|
||||
TRAIN_DATA = [
|
||||
("Who is Shaka Khan?", {"spans": {SPAN_KEY: [(7, 17, "PERSON")]}}),
|
||||
(
|
||||
|
@ -41,38 +43,42 @@ def make_examples(nlp, data=TRAIN_DATA):
|
|||
return train_examples
|
||||
|
||||
|
||||
def test_no_label():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_no_label(name):
|
||||
nlp = Language()
|
||||
nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
||||
nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||
with pytest.raises(ValueError):
|
||||
nlp.initialize()
|
||||
|
||||
|
||||
def test_no_resize():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_no_resize(name):
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
||||
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||
spancat.add_label("Thing")
|
||||
spancat.add_label("Phrase")
|
||||
assert spancat.labels == ("Thing", "Phrase")
|
||||
nlp.initialize()
|
||||
assert spancat.model.get_dim("nO") == 2
|
||||
assert spancat.model.get_dim("nO") == spancat._n_labels
|
||||
# this throws an error because the spancat can't be resized after initialization
|
||||
with pytest.raises(ValueError):
|
||||
spancat.add_label("Stuff")
|
||||
|
||||
|
||||
def test_implicit_labels():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_implicit_labels(name):
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
||||
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||
assert len(spancat.labels) == 0
|
||||
train_examples = make_examples(nlp)
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
assert spancat.labels == ("PERSON", "LOC")
|
||||
|
||||
|
||||
def test_explicit_labels():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_explicit_labels(name):
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
||||
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||
assert len(spancat.labels) == 0
|
||||
spancat.add_label("PERSON")
|
||||
spancat.add_label("LOC")
|
||||
|
@ -102,13 +108,13 @@ def test_doc_gc():
|
|||
# XXX This fails with length 0 sometimes
|
||||
assert len(spangroup) > 0
|
||||
with pytest.raises(RuntimeError):
|
||||
span = spangroup[0]
|
||||
spangroup[0]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"max_positive,nr_results", [(None, 4), (1, 2), (2, 3), (3, 4), (4, 4)]
|
||||
)
|
||||
def test_make_spangroup(max_positive, nr_results):
|
||||
def test_make_spangroup_multilabel(max_positive, nr_results):
|
||||
fix_random_seed(0)
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe(
|
||||
|
@ -120,10 +126,12 @@ def test_make_spangroup(max_positive, nr_results):
|
|||
indices = ngram_suggester([doc])[0].dataXd
|
||||
assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
|
||||
labels = ["Thing", "City", "Person", "GreatCity"]
|
||||
for label in labels:
|
||||
spancat.add_label(label)
|
||||
scores = numpy.asarray(
|
||||
[[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"
|
||||
)
|
||||
spangroup = spancat._make_span_group(doc, indices, scores, labels)
|
||||
spangroup = spancat._make_span_group_multilabel(doc, indices, scores)
|
||||
assert len(spangroup) == nr_results
|
||||
|
||||
# first span is always the second token "London"
|
||||
|
@ -154,6 +162,118 @@ def test_make_spangroup(max_positive, nr_results):
|
|||
assert_almost_equal(0.9, spangroup.attrs["scores"][-1], 5)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "threshold,allow_overlap,nr_results",
    [(0.05, True, 3), (0.05, False, 1), (0.5, True, 2), (0.5, False, 1)],
)
def test_make_spangroup_singlelabel(threshold, allow_overlap, nr_results):
    """Check which spans and labels the single-label span group builder keeps
    for different thresholds, with and without overlapping spans.
    """
    fix_random_seed(0)
    nlp = Language()
    spancat = nlp.add_pipe(
        "spancat",
        config={
            "spans_key": SPAN_KEY,
            "threshold": threshold,
            "max_positive": 1,
        },
    )
    doc = nlp.make_doc("Greater London")
    suggester_factory = registry.misc.get("spacy.ngram_suggester.v1")
    ngram_suggester = suggester_factory(sizes=[1, 2])
    indices = ngram_suggester([doc])[0].dataXd
    assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
    for label in ["Thing", "City", "Person", "GreatCity"]:
        spancat.add_label(label)
    scores = numpy.asarray(
        [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"
    )
    spangroup = spancat._make_span_group_singlelabel(
        doc, indices, scores, allow_overlap
    )
    assert len(spangroup) == nr_results

    if not allow_overlap:
        # Without overlaps only the best scoring candidate survives.
        assert spangroup[0].text == "Greater London"
        if threshold > 0.4:
            assert spangroup[0].label_ == "GreatCity"
        return

    if threshold > 0.4:
        expected = [("London", "City"), ("Greater London", "GreatCity")]
    else:
        expected = [
            ("Greater", "City"),
            ("London", "City"),
            ("Greater London", "GreatCity"),
        ]
    for position, (text, label) in enumerate(expected):
        assert spangroup[position].text == text
        assert spangroup[position].label_ == label
|
||||
|
||||
|
||||
def test_make_spangroup_negative_label():
    """Check that the extra negative-label score column never produces a span,
    for both the single-label and the multi-label span group builders.
    """
    fix_random_seed(0)
    nlp_single = Language()
    nlp_multi = Language()
    spancat_single = nlp_single.add_pipe(
        "spancat",
        config={
            "spans_key": SPAN_KEY,
            "threshold": 0.1,
            "max_positive": 1,
        },
    )
    spancat_multi = nlp_multi.add_pipe(
        "spancat",
        config={
            "spans_key": SPAN_KEY,
            "threshold": 0.1,
            "max_positive": 2,
        },
    )
    spancat_single.add_negative_label = True
    spancat_multi.add_negative_label = True
    doc = nlp_single.make_doc("Greater London")
    for label in ["Thing", "City", "Person", "GreatCity"]:
        spancat_multi.add_label(label)
        spancat_single.add_label(label)
    suggester_factory = registry.misc.get("spacy.ngram_suggester.v1")
    ngram_suggester = suggester_factory(sizes=[1, 2])
    indices = ngram_suggester([doc])[0].dataXd
    assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
    # The last column holds the score of the negative label.
    scores = numpy.asarray(
        [
            [0.2, 0.4, 0.3, 0.1, 0.1],
            [0.1, 0.6, 0.2, 0.4, 0.9],
            [0.8, 0.7, 0.3, 0.9, 0.1],
        ],
        dtype="f",
    )
    spangroup_multi = spancat_multi._make_span_group_multilabel(doc, indices, scores)
    spangroup_single = spancat_single._make_span_group_singlelabel(doc, indices, scores)

    expected_single = [("Greater", "City"), ("Greater London", "GreatCity")]
    assert len(spangroup_single) == 2
    for position, (text, label) in enumerate(expected_single):
        assert spangroup_single[position].text == text
        assert spangroup_single[position].label_ == label

    expected_multi = [
        ("Greater", "City"),
        ("Greater", "Person"),
        ("London", "City"),
        ("London", "GreatCity"),
        ("Greater London", "Thing"),
        ("Greater London", "GreatCity"),
    ]
    assert len(spangroup_multi) == 6
    for position, (text, label) in enumerate(expected_multi):
        assert spangroup_multi[position].text == text
        assert spangroup_multi[position].label_ == label
|
||||
|
||||
|
||||
def test_ngram_suggester(en_tokenizer):
|
||||
# test different n-gram lengths
|
||||
for size in [1, 2, 3]:
|
||||
|
@ -371,9 +491,9 @@ def test_overfitting_IO_overlapping():
|
|||
assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"}
|
||||
|
||||
|
||||
def test_zero_suggestions():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_zero_suggestions(name):
|
||||
# Test with a suggester that can return 0 suggestions
|
||||
|
||||
@registry.misc("test_mixed_zero_suggester")
|
||||
def make_mixed_zero_suggester():
|
||||
def mixed_zero_suggester(docs, *, ops=None):
|
||||
|
@ -400,7 +520,7 @@ def test_zero_suggestions():
|
|||
fix_random_seed(0)
|
||||
nlp = English()
|
||||
spancat = nlp.add_pipe(
|
||||
"spancat",
|
||||
name,
|
||||
config={
|
||||
"suggester": {"@misc": "test_mixed_zero_suggester"},
|
||||
"spans_key": SPAN_KEY,
|
||||
|
@ -408,7 +528,7 @@ def test_zero_suggestions():
|
|||
)
|
||||
train_examples = make_examples(nlp)
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
assert spancat.model.get_dim("nO") == 2
|
||||
assert spancat.model.get_dim("nO") == spancat._n_labels
|
||||
assert set(spancat.labels) == {"LOC", "PERSON"}
|
||||
|
||||
nlp.update(train_examples, sgd=optimizer)
|
||||
|
@ -424,9 +544,10 @@ def test_zero_suggestions():
|
|||
list(nlp.pipe(["", "one", "three three three"]))
|
||||
|
||||
|
||||
def test_set_candidates():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_set_candidates(name):
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
||||
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||
train_examples = make_examples(nlp)
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
texts = [
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import pytest
|
||||
from numpy.testing import assert_equal
|
||||
from numpy.testing import assert_equal, assert_almost_equal
|
||||
from spacy.attrs import TAG
|
||||
|
||||
from spacy import util
|
||||
|
@ -67,6 +67,29 @@ PARTIAL_DATA = [
|
|||
]
|
||||
|
||||
|
||||
def test_label_smoothing():
    """Compare the loss gradients of a tagger with and without label
    smoothing on the same model output.
    """
    nlp = Language()
    tagger_no_ls = nlp.add_pipe("tagger", "no_label_smoothing")
    tagger_ls = nlp.add_pipe(
        "tagger", "label_smoothing", config=dict(label_smoothing=0.05)
    )
    train_examples = []
    for tag in TAGS:
        tagger_no_ls.add_label(tag)
        tagger_ls.add_label(tag)
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))

    nlp.initialize(get_examples=lambda: train_examples)
    # Only the forward output is needed; the backprop callback is discarded.
    tag_scores, _ = tagger_ls.model.begin_update(
        [eg.predicted for eg in train_examples]
    )
    # Both components score the same predictions, so the gradients differ
    # only through the label_smoothing setting.
    no_ls_grads = tagger_no_ls.get_loss(train_examples, tag_scores)[1][0]
    ls_grads = tagger_ls.get_loss(train_examples, tag_scores)[1][0]
    assert_almost_equal(ls_grads / no_ls_grads, 0.925)
|
||||
|
||||
|
||||
def test_no_label():
|
||||
nlp = Language()
|
||||
nlp.add_pipe("tagger")
|
||||
|
|
|
@ -397,7 +397,14 @@ def test_parse_cli_overrides():
|
|||
|
||||
@pytest.mark.parametrize("lang", ["en", "nl"])
|
||||
@pytest.mark.parametrize(
|
||||
"pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]]
|
||||
"pipeline",
|
||||
[
|
||||
["tagger", "parser", "ner"],
|
||||
[],
|
||||
["ner", "textcat", "sentencizer"],
|
||||
["morphologizer", "spancat", "entity_linker"],
|
||||
["spancat_singlelabel", "textcat_multilabel"],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("optimize", ["efficiency", "accuracy"])
|
||||
@pytest.mark.parametrize("pretraining", [True, False])
|
||||
|
|
|
@ -5,10 +5,18 @@ import srsly
|
|||
from typer.testing import CliRunner
|
||||
from spacy.tokens import DocBin, Doc
|
||||
|
||||
from spacy.cli._util import app
|
||||
from spacy.cli._util import app, get_git_version
|
||||
from .util import make_tempdir, normalize_whitespace
|
||||
|
||||
|
||||
def has_git():
|
||||
try:
|
||||
get_git_version()
|
||||
return True
|
||||
except RuntimeError:
|
||||
return False
|
||||
|
||||
|
||||
def test_convert_auto():
|
||||
with make_tempdir() as d_in, make_tempdir() as d_out:
|
||||
for f in ["data1.iob", "data2.iob", "data3.iob"]:
|
||||
|
@ -181,6 +189,7 @@ def test_project_run(project_dir):
|
|||
assert "okokok" in result.stdout
|
||||
|
||||
|
||||
@pytest.mark.skipif(not has_git(), reason="git not installed")
|
||||
@pytest.mark.parametrize(
|
||||
"options",
|
||||
[
|
||||
|
|
|
@ -460,9 +460,8 @@ cdef class Span:
|
|||
start = i
|
||||
if start >= self.end:
|
||||
break
|
||||
if start < self.end:
|
||||
yield Span(self.doc, start, self.end)
|
||||
|
||||
elif i == self.doc.length - 1:
|
||||
yield Span(self.doc, start, self.doc.length)
|
||||
|
||||
@property
|
||||
def ents(self):
|
||||
|
|
|
@ -1254,19 +1254,19 @@ be provided.
|
|||
> $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
||||
| `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ |
|
||||
| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ |
|
||||
| `threshold_key` | Key of threshold attribute in component's configuration. ~~str (positional)~~ |
|
||||
| `scores_key` | Name of score to metric to optimize. ~~str (positional)~~ |
|
||||
| `--n_trials`, `-n` | Number of trials to determine optimal thresholds. ~~int (option)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
||||
| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
|
||||
| `--silent`, `-V`, `-VV` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| Name | Description |
|
||||
| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
||||
| `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ |
|
||||
| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ |
|
||||
| `threshold_key` | Key of threshold attribute in component's configuration. ~~str (positional)~~ |
|
||||
| `scores_key` | Name of the score metric to optimize. ~~str (positional)~~ |
|
||||
| `--n_trials`, `-n` | Number of trials to determine optimal thresholds. ~~int (option)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
||||
| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
|
||||
| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
|
||||
## assemble {id="assemble",tag="command"}
|
||||
|
||||
|
|
|
@ -42,12 +42,13 @@ architectures and their arguments and hyperparameters.
|
|||
> nlp.add_pipe("morphologizer", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Description |
|
||||
| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ |
|
||||
| `extend` <Tag variant="new">3.2</Tag> | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ |
|
||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
|
||||
| Setting | Description |
|
||||
| ---------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ |
|
||||
| `extend` <Tag variant="new">3.2</Tag> | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ |
|
||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
|
||||
| `label_smoothing` <Tag variant="new">3.6</Tag> | [Label smoothing](https://arxiv.org/abs/1906.02629) factor. Defaults to `0.0`. ~~float~~ |
|
||||
|
||||
```python
|
||||
%%GITHUB_SPACY/spacy/pipeline/morphologizer.pyx
|
||||
|
|
|
@ -13,6 +13,13 @@ A span categorizer consists of two parts: a [suggester function](#suggesters)
|
|||
that proposes candidate spans, which may or may not overlap, and a labeler model
|
||||
that predicts zero or more labels for each candidate.
|
||||
|
||||
This component comes in two forms: `spancat` and `spancat_singlelabel` (added in
|
||||
spaCy v3.5.1). When you need to perform multi-label classification on your
|
||||
spans, use `spancat`. The `spancat` component uses a `Logistic` layer where the
|
||||
output class probabilities are independent for each class. However, if you need
|
||||
to predict at most one true class for a span, then use `spancat_singlelabel`. It
|
||||
uses a `Softmax` layer and treats the task as a multi-class problem.
|
||||
|
||||
Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc.
|
||||
Individual span scores can be found in `spangroup.attrs["scores"]`.
|
||||
|
||||
|
@ -38,7 +45,7 @@ how the component should be configured. You can override its settings via the
|
|||
[model architectures](/api/architectures) documentation for details on the
|
||||
architectures and their arguments and hyperparameters.
|
||||
|
||||
> #### Example
|
||||
> #### Example (spancat)
|
||||
>
|
||||
> ```python
|
||||
> from spacy.pipeline.spancat import DEFAULT_SPANCAT_MODEL
|
||||
|
@ -52,14 +59,33 @@ architectures and their arguments and hyperparameters.
|
|||
> nlp.add_pipe("spancat", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
|
||||
| `model` | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
|
||||
| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
|
||||
| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ |
|
||||
| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ |
|
||||
| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
|
||||
> #### Example (spancat_singlelabel)
|
||||
>
|
||||
> ```python
|
||||
> from spacy.pipeline.spancat import DEFAULT_SPANCAT_SINGLELABEL_MODEL
|
||||
> config = {
|
||||
> "threshold": 0.5,
|
||||
> "spans_key": "labeled_spans",
|
||||
> "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
|
||||
> "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
|
||||
> # Additional spancat_singlelabel parameters
|
||||
> "negative_weight": 0.8,
|
||||
> "allow_overlap": True,
|
||||
> }
|
||||
> nlp.add_pipe("spancat_singlelabel", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Description |
|
||||
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
|
||||
| `model` | A model instance that is given a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
|
||||
| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
|
||||
| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Meant to be used in combination with the multi-label `spancat` component with a `Logistic` scoring layer. Defaults to `0.5`. ~~float~~ |
|
||||
| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. Meant to be used together with the `spancat` component and defaults to 0 with `spancat_singlelabel`. ~~Optional[int]~~ |
|
||||
| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
|
||||
| `add_negative_label` <Tag variant="new">3.5.1</Tag> | Whether to learn to predict a special negative label for each unannotated `Span`. This should be `True` when using a `Softmax` classifier layer and so it's `True` by default for `spancat_singlelabel`. Spans with negative labels and their scores are not stored as annotations. ~~bool~~ |
|
||||
| `negative_weight` <Tag variant="new">3.5.1</Tag> | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~ |
|
||||
| `allow_overlap` <Tag variant="new">3.5.1</Tag> | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~ |
|
||||
|
||||
```python
|
||||
%%GITHUB_SPACY/spacy/pipeline/spancat.py
|
||||
|
@ -71,6 +97,7 @@ architectures and their arguments and hyperparameters.
|
|||
>
|
||||
> ```python
|
||||
> # Construction via add_pipe with default model
|
||||
> # Replace 'spancat' with 'spancat_singlelabel' for exclusive classes
|
||||
> spancat = nlp.add_pipe("spancat")
|
||||
>
|
||||
> # Construction via add_pipe with custom model
|
||||
|
@ -86,16 +113,19 @@ Create a new pipeline instance. In your application, you would normally use a
|
|||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#create_pipe).
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
|
||||
| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `spans_key` | Key of the [`Doc.spans`](/api/doc#sans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
|
||||
| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ |
|
||||
| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ |
|
||||
| Name | Description |
|
||||
| --------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | A model instance that is given a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
|
||||
| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
|
||||
| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ |
|
||||
| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ |
|
||||
| `allow_overlap` <Tag variant="new">3.5.1</Tag> | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~ |
|
||||
| `add_negative_label` <Tag variant="new">3.5.1</Tag> | Whether to learn to predict a special negative label for each unannotated `Span`. This should be `True` when using a `Softmax` classifier layer and so it's `True` by default for `spancat_singlelabel`. Spans with negative labels and their scores are not stored as annotations. ~~bool~~ |
|
||||
| `negative_weight` <Tag variant="new">3.5.1</Tag> | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~ |
|
||||
|
||||
## SpanCategorizer.\_\_call\_\_ {id="call",tag="method"}
|
||||
|
||||
|
|
|
@ -40,12 +40,13 @@ architectures and their arguments and hyperparameters.
|
|||
> nlp.add_pipe("tagger", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Description |
|
||||
| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
|
||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ |
|
||||
| `neg_prefix` <Tag variant="new">3.2.1</Tag> | The prefix used to specify incorrect tags while training. The tagger will learn not to predict exactly this tag. Defaults to `!`. ~~str~~ |
|
||||
| Setting | Description |
|
||||
| ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
|
||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ |
|
||||
| `neg_prefix` <Tag variant="new">3.2.1</Tag> | The prefix used to specify incorrect tags while training. The tagger will learn not to predict exactly this tag. Defaults to `!`. ~~str~~ |
|
||||
| `label_smoothing` <Tag variant="new">3.6</Tag> | [Label smoothing](https://arxiv.org/abs/1906.02629) factor. Defaults to `0.0`. ~~float~~ |
|
||||
|
||||
```python
|
||||
%%GITHUB_SPACY/spacy/pipeline/tagger.pyx
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
"dev": "next dev",
|
||||
"build": "next build && npm run sitemap && next export",
|
||||
"prebuild": "pip install -r setup/requirements.txt && sh setup/setup.sh",
|
||||
"predev": "npm run prebuild",
|
||||
"sitemap": "next-sitemap --config next-sitemap.config.mjs",
|
||||
"start": "next start",
|
||||
"lint": "next lint",
|
||||
|
|
|
@ -111,11 +111,12 @@
|
|||
line-height: var(--line-height-xs)
|
||||
text-align: center
|
||||
|
||||
@include breakpoint(max, xs)
|
||||
.list
|
||||
@include breakpoint(max, md)
|
||||
.alert
|
||||
display: none
|
||||
|
||||
.alert
|
||||
@include breakpoint(max, xs)
|
||||
.list
|
||||
display: none
|
||||
|
||||
.has-alert
|
||||
|
|
|
@ -57,9 +57,15 @@ const AlertSpace = ({ nightly, legacy }) => {
|
|||
)
|
||||
}
|
||||
|
||||
// const navAlert = (
|
||||
// <Link to="/usage/v3-5" noLinkLayout>
|
||||
// <strong>💥 Out now:</strong> spaCy v3.5
|
||||
// </Link>
|
||||
// )
|
||||
|
||||
const navAlert = (
|
||||
<Link to="/usage/v3-5" noLinkLayout>
|
||||
<strong>💥 Out now:</strong> spaCy v3.5
|
||||
<Link to="https://form.typeform.com/to/aMel9q9f" noLinkLayout>
|
||||
<strong>💥 Take the user survey!</strong>
|
||||
</Link>
|
||||
)
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user