Mirror of https://github.com/explosion/spaCy.git, synced 2025-08-04 20:30:24 +03:00

Compare commits
50 commits:

aea4a96f92  e4bbdf7b50  f66d55fe5b  9fbb8ee912  314a7cea73
2fbd080a03  bbf232e355  0ec4dc5c29  a5406a6c45  57ee1212de
b228875600  8d064872ff  26da226a39  888332dfb2  1b4a67bc54
79dcef17f7  0ecbeff1a6  4380d750f9  2953e7b7ce  d2d9e9e139
f1a42b6fcc  f9c0220ea5  6183906a0b  bd0768c05c  be644caa13
7880da952b  545218a7d9  d00e58d1ac  9ca67dc539  ed83cafe46
9da333cbfa  8153bd573f  83056bb44c  03b320b3bd  c2810575c0
53687b5bca  5398e9f276  69ca6eb041  cbd85c9608  a1fc4ed962
6177c87539  a86ec1b2b1  e381efd936  6f1632b3e9  e325de3ff8
b3e7364551  f87919d8f0  5ed1db7ae4  18f4378a91  be673462be
.github/azure-steps.yml (27 changed lines)

@@ -57,46 +57,51 @@ steps:
       python -m spacy download ca_core_news_md
       python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
     displayName: 'Test download CLI'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')
+
+  - script: |
+      python -W error -m spacy info ca_core_news_sm | grep -q download_url
+    displayName: 'Test download_url in info CLI'
+    condition: eq(variables['python_version'], '3.9')

   - script: |
       python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
     displayName: 'Test no warnings on load (#11713)'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')

   - script: |
       python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
     displayName: 'Test convert CLI'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')

   - script: |
       python -m spacy init config -p ner -l ca ner.cfg
       python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
     displayName: 'Test debug config CLI'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')

   - script: |
       # will have errors due to sparse data, check for summary in output
       python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
     displayName: 'Test debug data CLI'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')

   - script: |
       python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
     displayName: 'Test train CLI'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')

   - script: |
       python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
       PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
     displayName: 'Test assemble CLI'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')

   - script: |
       python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
       python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
     displayName: 'Test assemble CLI vectors warning'
-    condition: eq(variables['python_version'], '3.8')
+    condition: eq(variables['python_version'], '3.9')

   - script: |
       python -m pip install -U -r requirements.txt
@@ -111,9 +116,3 @@ steps:
       python -m pytest --pyargs spacy
     displayName: "Run CPU tests with thinc-apple-ops"
     condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
-
-  - script: |
-      python .github/validate_universe_json.py website/meta/universe.json
-    displayName: 'Test website/meta/universe.json'
-    condition: eq(variables['python_version'], '3.8')
.github/workflows/autoblack.yml (deleted, 45 lines)

@@ -1,45 +0,0 @@
-# GitHub Action that uses Black to reformat all Python code and submits a PR
-# in regular intervals. Inspired by: https://github.com/cclauss/autoblack
-
-name: autoblack
-on:
-  workflow_dispatch:  # allow manual trigger
-  schedule:
-    - cron: '0 8 * * 5'  # every Friday at 8am UTC
-
-jobs:
-  autoblack:
-    if: github.repository_owner == 'explosion'
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-        with:
-          ref: ${{ github.head_ref }}
-      - uses: actions/setup-python@v4
-      - run: pip install black -c requirements.txt
-      - name: Auto-format code if needed
-        run: black spacy
-      # We can't run black --check here because that returns a non-zero exit
-      # code and makes GitHub think the action failed
-      - name: Check for modified files
-        id: git-check
-        run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
-
-      - name: Create Pull Request
-        if: steps.git-check.outputs.modified == 'true'
-        uses: peter-evans/create-pull-request@v4
-        with:
-          title: Auto-format code with black
-          labels: meta
-          commit-message: Auto-format code with black
-          committer: GitHub <noreply@github.com>
-          author: explosion-bot <explosion-bot@users.noreply.github.com>
-          body: _This PR is auto-generated._
-          branch: autoblack
-          delete-branch: true
-          draft: false
-      - name: Check outputs
-        if: steps.git-check.outputs.modified == 'true'
-        run: |
-          echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
-          echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"
.github/workflows/explosionbot.yml (1 changed line)

@@ -8,6 +8,7 @@ on:

 jobs:
   explosion-bot:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
       - name: Dump GitHub context
.github/workflows/issue-manager.yml (1 changed line)

@@ -13,6 +13,7 @@ on:

 jobs:
   issue-manager:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
       - uses: tiangolo/issue-manager@0.4.0
.github/workflows/lock.yml (1 changed line)

@@ -13,6 +13,7 @@ concurrency:

 jobs:
   action:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
       - uses: dessant/lock-threads@v4
.github/workflows/spacy_universe_alert.yml (1 changed line)

@@ -7,6 +7,7 @@ on:

 jobs:
   build:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest

     steps:
.github/workflows/tests.yml (new file, 173 lines)

@@ -0,0 +1,173 @@
+name: tests
+
+on:
+  push:
+    branches-ignore:
+      - "spacy.io"
+      - "nightly.spacy.io"
+      - "v2.spacy.io"
+    paths-ignore:
+      - "*.md"
+      - "*.mdx"
+      - "website/**"
+      - ".github/workflows/**"
+  pull_request:
+    types: [opened, synchronize, reopened, edited]
+    paths-ignore:
+      - "*.md"
+      - "*.mdx"
+      - "website/**"
+
+jobs:
+  validate:
+    name: Validate
+    if: github.repository_owner == 'explosion'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v3
+
+      - name: Configure Python version
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.7"
+          architecture: x64
+
+      - name: black
+        run: |
+          python -m pip install black -c requirements.txt
+          python -m black spacy --check
+      - name: flake8
+        run: |
+          python -m pip install flake8==5.0.4
+          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
+  tests:
+    name: Test
+    needs: Validate
+    strategy:
+      fail-fast: true
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-latest]
+        python_version: ["3.11"]
+        include:
+          - os: ubuntu-20.04
+            python_version: "3.6"
+          - os: windows-latest
+            python_version: "3.7"
+          - os: macos-latest
+            python_version: "3.8"
+          - os: ubuntu-latest
+            python_version: "3.9"
+          - os: windows-latest
+            python_version: "3.10"
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v3
+
+      - name: Configure Python version
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python_version }}
+          architecture: x64
+
+      - name: Install dependencies
+        run: |
+          python -m pip install -U build pip setuptools
+          python -m pip install -U -r requirements.txt
+
+      - name: Build sdist
+        run: |
+          python -m build --sdist
+
+      - name: Run mypy
+        run: |
+          python -m mypy spacy
+        if: matrix.python_version != '3.6'
+
+      - name: Delete source directory and .egg-info
+        run: |
+          rm -rf spacy *.egg-info
+        shell: bash
+
+      - name: Uninstall all packages
+        run: |
+          python -m pip freeze
+          python -m pip freeze --exclude pywin32 > installed.txt
+          python -m pip uninstall -y -r installed.txt
+
+      - name: Install from sdist
+        run: |
+          SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+          SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
+        shell: bash
+
+      - name: Test import
+        run: python -W error -c "import spacy"
+
+      - name: "Test download CLI"
+        run: |
+          python -m spacy download ca_core_news_sm
+          python -m spacy download ca_core_news_md
+          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+        if: matrix.python_version == '3.9'
+
+      - name: "Test download_url in info CLI"
+        run: |
+          python -W error -m spacy info ca_core_news_sm | grep -q download_url
+        if: matrix.python_version == '3.9'
+
+      - name: "Test no warnings on load (#11713)"
+        run: |
+          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+        if: matrix.python_version == '3.9'
+
+      - name: "Test convert CLI"
+        run: |
+          python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
+        if: matrix.python_version == '3.9'
+
+      - name: "Test debug config CLI"
+        run: |
+          python -m spacy init config -p ner -l ca ner.cfg
+          python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
+        if: matrix.python_version == '3.9'
+
+      - name: "Test debug data CLI"
+        run: |
+          # will have errors due to sparse data, check for summary in output
+          python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
+        if: matrix.python_version == '3.9'
+
+      - name: "Test train CLI"
+        run: |
+          python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
+        if: matrix.python_version == '3.9'
+
+      - name: "Test assemble CLI"
+        run: |
+          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+        if: matrix.python_version == '3.9'
+
+      - name: "Test assemble CLI vectors warning"
+        run: |
+          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+        if: matrix.python_version == '3.9'
+
+      - name: "Install test requirements"
+        run: |
+          python -m pip install -U -r requirements.txt
+
+      - name: "Run CPU tests"
+        run: |
+          python -m pytest --pyargs spacy -W error
+
+      - name: "Run CPU tests with thinc-apple-ops"
+        run: |
+          python -m pip install 'spacy[apple]'
+          python -m pytest --pyargs spacy
+        if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11'
.github/workflows/universe_validation.yml (new file, 33 lines)

@@ -0,0 +1,33 @@
+name: universe validation
+
+on:
+  push:
+    branches-ignore:
+      - "spacy.io"
+      - "nightly.spacy.io"
+      - "v2.spacy.io"
+    paths:
+      - "website/meta/universe.json"
+  pull_request:
+    types: [opened, synchronize, reopened, edited]
+    paths:
+      - "website/meta/universe.json"
+
+jobs:
+  validate:
+    name: Validate
+    if: github.repository_owner == 'explosion'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v3
+
+      - name: Configure Python version
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.7"
+          architecture: x64
+
+      - name: Validate website/meta/universe.json
+        run: |
+          python .github/validate_universe_json.py website/meta/universe.json
README.md

@@ -16,6 +16,9 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
 open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).

+💥 **We'd love to hear more about your experience with spaCy!**
+[Fill out our survey here.](https://form.typeform.com/to/aMel9q9f)
+
 💫 **Version 3.5 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
azure-pipelines.yml

@@ -48,6 +48,9 @@ jobs:
       pip install flake8==5.0.4
       python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
     displayName: "flake8"
+  - script: |
+      python .github/validate_universe_json.py website/meta/universe.json
+    displayName: 'Validate website/meta/universe.json'

 - job: "Test"
   dependsOn: "Validate"
pyproject.toml

@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.1.0,<8.2.0",
+    "thinc>=8.1.8,<8.2.0",
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
requirements.txt

@@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.1.0,<8.2.0
+thinc>=8.1.8,<8.2.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
setup.cfg (40 changed lines)

@@ -39,7 +39,7 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.1.0,<8.2.0
+    thinc>=8.1.8,<8.2.0
 install_requires =
     # Our libraries
     spacy-legacy>=3.0.11,<3.1.0
@@ -47,7 +47,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.1.0,<8.2.0
+    thinc>=8.1.8,<8.2.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
@@ -78,41 +78,41 @@ transformers =
 ray =
     spacy_ray>=0.1.0,<1.0.0
 cuda =
-    cupy>=5.0.0b4,<12.0.0
+    cupy>=5.0.0b4,<13.0.0
 cuda80 =
-    cupy-cuda80>=5.0.0b4,<12.0.0
+    cupy-cuda80>=5.0.0b4,<13.0.0
 cuda90 =
-    cupy-cuda90>=5.0.0b4,<12.0.0
+    cupy-cuda90>=5.0.0b4,<13.0.0
 cuda91 =
-    cupy-cuda91>=5.0.0b4,<12.0.0
+    cupy-cuda91>=5.0.0b4,<13.0.0
 cuda92 =
-    cupy-cuda92>=5.0.0b4,<12.0.0
+    cupy-cuda92>=5.0.0b4,<13.0.0
 cuda100 =
-    cupy-cuda100>=5.0.0b4,<12.0.0
+    cupy-cuda100>=5.0.0b4,<13.0.0
 cuda101 =
-    cupy-cuda101>=5.0.0b4,<12.0.0
+    cupy-cuda101>=5.0.0b4,<13.0.0
 cuda102 =
-    cupy-cuda102>=5.0.0b4,<12.0.0
+    cupy-cuda102>=5.0.0b4,<13.0.0
 cuda110 =
-    cupy-cuda110>=5.0.0b4,<12.0.0
+    cupy-cuda110>=5.0.0b4,<13.0.0
 cuda111 =
-    cupy-cuda111>=5.0.0b4,<12.0.0
+    cupy-cuda111>=5.0.0b4,<13.0.0
 cuda112 =
-    cupy-cuda112>=5.0.0b4,<12.0.0
+    cupy-cuda112>=5.0.0b4,<13.0.0
 cuda113 =
-    cupy-cuda113>=5.0.0b4,<12.0.0
+    cupy-cuda113>=5.0.0b4,<13.0.0
 cuda114 =
-    cupy-cuda114>=5.0.0b4,<12.0.0
+    cupy-cuda114>=5.0.0b4,<13.0.0
 cuda115 =
-    cupy-cuda115>=5.0.0b4,<12.0.0
+    cupy-cuda115>=5.0.0b4,<13.0.0
 cuda116 =
-    cupy-cuda116>=5.0.0b4,<12.0.0
+    cupy-cuda116>=5.0.0b4,<13.0.0
 cuda117 =
-    cupy-cuda117>=5.0.0b4,<12.0.0
+    cupy-cuda117>=5.0.0b4,<13.0.0
 cuda11x =
-    cupy-cuda11x>=11.0.0,<12.0.0
+    cupy-cuda11x>=11.0.0,<13.0.0
 cuda-autodetect =
-    cupy-wheel>=11.0.0,<12.0.0
+    cupy-wheel>=11.0.0,<13.0.0
 apple =
     thinc-apple-ops>=0.1.0.dev0,<1.0.0
 # Language tokenizers with external dependencies
spacy/about.py

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.5.0"
+__version__ = "3.5.2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
spacy/cli/find_threshold.py

@@ -35,7 +35,7 @@ def find_threshold_cli(
     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
-    verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     # fmt: on
 ):
     """
spacy/cli/info.py

@@ -1,6 +1,5 @@
 from typing import Optional, Dict, Any, Union, List
 import platform
-import pkg_resources
 import json
 from pathlib import Path
 from wasabi import Printer, MarkdownRenderer
@@ -10,6 +9,7 @@ from ._util import app, Arg, Opt, string_to_list
 from .download import get_model_filename, get_latest_version
 from .. import util
 from .. import about
+from ..compat import importlib_metadata


 @app.command("info")
@@ -137,15 +137,14 @@ def info_installed_model_url(model: str) -> Optional[str]:
     dist-info available.
     """
     try:
-        dist = pkg_resources.get_distribution(model)
-        data = json.loads(dist.get_metadata("direct_url.json"))
-        return data["url"]
-    except pkg_resources.DistributionNotFound:
-        # no such package
-        return None
+        dist = importlib_metadata.distribution(model)
+        text = dist.read_text("direct_url.json")
+        if isinstance(text, str):
+            data = json.loads(text)
+            return data["url"]
     except Exception:
-        # something else, like no file or invalid JSON
-        return None
+        pass
+    return None


 def info_model_url(model: str) -> Dict[str, Any]:
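The rewrite drops the deprecated pkg_resources lookup in favour of importlib.metadata (re-exported through spacy.compat for older Pythons), which can read the PEP 610 direct_url.json that pip writes for packages installed from a URL. A minimal standalone sketch of the same lookup, using only the standard library (package name is illustrative):

    import json
    from importlib import metadata

    def installed_url(package: str):
        """Return the direct download URL recorded at install time, if any."""
        try:
            dist = metadata.distribution(package)
            # direct_url.json only exists for packages installed from a URL/VCS
            text = dist.read_text("direct_url.json")
            if isinstance(text, str):
                return json.loads(text)["url"]
        except Exception:
            pass
        return None

    print(installed_url("ca_core_news_sm"))  # e.g. a GitHub release URL, or None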
spacy/cli/pretrain.py

@@ -23,6 +23,7 @@ def pretrain_cli(
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
     # fmt: on
 ):
     """
@@ -74,6 +75,7 @@ def pretrain_cli(
         epoch_resume=epoch_resume,
         use_gpu=use_gpu,
         silent=False,
+        skip_last=skip_last,
     )
     msg.good("Successfully finished pretrain")
spacy/cli/project/run.py

@@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
 import os.path
 from pathlib import Path

-import pkg_resources
 from wasabi import msg
 from wasabi.util import locale_escape
 import sys
@@ -331,6 +330,7 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
     RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
     exist.
     """
+    import pkg_resources

     failed_pkgs_msgs: List[str] = []
     conflicting_pkgs_msgs: List[str] = []
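Moving the import inside the function is the standard deferred-import pattern: pkg_resources scans every installed distribution at import time, so paying that cost only when requirements are actually checked speeds up all other CLI invocations. A generic sketch of the idea, not taken from the diff itself:

    def check_requirements(requirements):
        # Deferred import: pkg_resources is expensive to import, so only
        # commands that actually validate requirements pay for it.
        import pkg_resources

        problems = []
        for req in requirements:
            try:
                pkg_resources.require(req)
            except Exception as err:  # DistributionNotFound, VersionConflict, ...
                problems.append(f"{req}: {err}")
        return problems

    print(check_requirements(["wasabi>=0.9.1", "definitely-not-installed>=1.0"]))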
spacy/cli/templates/quickstart_training.jinja

@@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
 {%- set use_transformer = hardware != "cpu" and transformer_data -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
-{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
+{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
 [paths]
 train = null
 dev = null
@@ -24,8 +24,11 @@ gpu_allocator = null
 lang = "{{ lang }}"
 {%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
 {%- set with_accuracy = optimize == "accuracy" -%}
-{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
-{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%}
+{# The BOW textcat doesn't need a source of features, so it can omit the
+tok2vec/transformer. #}
+{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
+{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
+{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
 {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
 {%- else -%}
 {%- set full_pipeline = components -%}
@@ -156,6 +159,36 @@ grad_factor = 1.0
 sizes = [1,2,3]
 {% endif -%}

+{% if "spancat_singlelabel" in components %}
+[components.spancat_singlelabel]
+factory = "spancat_singlelabel"
+negative_weight = 1.0
+allow_overlap = true
+scorer = {"@scorers":"spacy.spancat_scorer.v1"}
+spans_key = "sc"
+
+[components.spancat_singlelabel.model]
+@architectures = "spacy.SpanCategorizer.v1"
+
+[components.spancat_singlelabel.model.reducer]
+@layers = "spacy.mean_max_reducer.v1"
+hidden_size = 128
+
+[components.spancat_singlelabel.model.scorer]
+@layers = "Softmax.v2"
+
+[components.spancat_singlelabel.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.spancat_singlelabel.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+
+[components.spancat_singlelabel.suggester]
+@misc = "spacy.ngram_suggester.v1"
+sizes = [1,2,3]
+{% endif %}
+
 {% if "trainable_lemmatizer" in components -%}
 [components.trainable_lemmatizer]
 factory = "trainable_lemmatizer"
@@ -221,10 +254,16 @@ no_output_layer = false

 {% else -%}
 [components.textcat.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatCNN.v2"
 exclusive_classes = true
-ngram_size = 1
-no_output_layer = false
+nO = null
+
+[components.textcat.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
 {%- endif %}
 {%- endif %}

@@ -252,10 +291,16 @@ no_output_layer = false

 {% else -%}
 [components.textcat_multilabel.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatCNN.v2"
 exclusive_classes = false
-ngram_size = 1
-no_output_layer = false
+nO = null
+
+[components.textcat_multilabel.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat_multilabel.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
 {%- endif %}
 {%- endif %}

@@ -374,6 +419,33 @@ width = ${components.tok2vec.model.encode.width}
 sizes = [1,2,3]
 {% endif %}

+{% if "spancat_singlelabel" in components %}
+[components.spancat_singlelabel]
+factory = "spancat_singlelabel"
+negative_weight = 1.0
+allow_overlap = true
+scorer = {"@scorers":"spacy.spancat_scorer.v1"}
+spans_key = "sc"
+
+[components.spancat_singlelabel.model]
+@architectures = "spacy.SpanCategorizer.v1"
+
+[components.spancat_singlelabel.model.reducer]
+@layers = "spacy.mean_max_reducer.v1"
+hidden_size = 128
+
+[components.spancat_singlelabel.model.scorer]
+@layers = "Softmax.v2"
+
+[components.spancat_singlelabel.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+
+[components.spancat_singlelabel.suggester]
+@misc = "spacy.ngram_suggester.v1"
+sizes = [1,2,3]
+{% endif %}
+
 {% if "trainable_lemmatizer" in components -%}
 [components.trainable_lemmatizer]
 factory = "trainable_lemmatizer"
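With the template extended, requesting the component via `python -m spacy init config -p spancat_singlelabel ...` should emit a working config. A minimal runtime sketch, assuming spaCy 3.5.1+ where the spancat_singlelabel factory is registered (labels are illustrative):

    import spacy

    nlp = spacy.blank("en")
    # Unlike "spancat", this component assigns exactly one label per
    # candidate span and can be configured with allow_overlap = false.
    spancat = nlp.add_pipe("spancat_singlelabel")
    spancat.add_label("PERSON")
    spancat.add_label("PLACE")
    print(nlp.pipe_names)  # ['spancat_singlelabel']
    print(spancat.labels)  # ('PERSON', 'PLACE')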
spacy/displacy/__init__.py

@@ -125,13 +125,17 @@ def app(environ, start_response):
     return [res]


-def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
+def parse_deps(
+    orig_doc: Union[Doc, Span], options: Dict[str, Any] = {}
+) -> Dict[str, Any]:
     """Generate dependency parse in {'words': [], 'arcs': []} format.

-    orig_doc (Doc): Document to parse.
+    orig_doc (Union[Doc, Span]): Document to parse.
     options (Dict[str, Any]): Dependency parse specific visualisation options.
     RETURNS (dict): Generated dependency parse keyed by words and arcs.
     """
+    if isinstance(orig_doc, Span):
+        orig_doc = orig_doc.as_doc()
     doc = Doc(orig_doc.vocab).from_bytes(
         orig_doc.to_bytes(exclude=["user_data", "user_hooks"])
     )
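In practice this means a single sentence (a Span) can be passed straight to the dependency visualiser instead of a whole Doc. A small sketch, assuming an installed en_core_web_sm model:

    import spacy
    from spacy import displacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("This is one sentence. This is another one.")

    # Render just the second sentence; doc.sents yields Span objects,
    # which parse_deps now converts internally via Span.as_doc().
    second = list(doc.sents)[1]
    html = displacy.render(second, style="dep")
    print(html[:60])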
spacy/errors.py

@@ -444,8 +444,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E133 = ("The sum of prior probabilities for alias '{alias}' should not "
             "exceed 1, but found {sum}.")
     E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
-    E139 = ("Knowledge base for component '{name}' is empty. Use the methods "
-            "`kb.add_entity` and `kb.add_alias` to add entries.")
+    E139 = ("Knowledge base for component '{name}' is empty.")
     E140 = ("The list of entities, prior probabilities and entity vectors "
             "should be of equal length.")
     E141 = ("Entity vectors should be of length {required} instead of the "
@@ -550,6 +549,8 @@ class Errors(metaclass=ErrorsWithCodes):
             "during training, make sure to include it in 'annotating components'")

     # New errors added in v3.x
+    E850 = ("The PretrainVectors objective currently only supports default or "
+            "floret vectors, not {mode} vectors.")
     E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
             "but found value of '{val}'.")
     E852 = ("The tar file pulled from the remote attempted an unsafe path "
@@ -967,7 +968,8 @@ class Errors(metaclass=ErrorsWithCodes):
     E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
              "with `displacy.serve(doc, port=port)`")
     E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
-             "or use `auto_switch_port=True` to pick an available port automatically.")
+             "or use `auto_select_port=True` to pick an available port automatically.")
+    E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")


 # Deprecated model shortcuts, only used in errors and warnings
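The E1050 fix matters because the message now names the keyword that actually exists: displacy.serve() accepts auto_select_port, not auto_switch_port. A usage sketch (serve() starts a blocking web server, so this is meant to be run interactively):

    import spacy
    from spacy import displacy

    nlp = spacy.blank("en")
    doc = nlp("A tiny example.")

    # If the default port 5000 is taken, fall back to the next free one
    # instead of raising E1050.
    displacy.serve(doc, style="ent", auto_select_port=True)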
spacy/kb/kb_in_memory.pyx

@@ -46,6 +46,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         self._alias_index = PreshMap(nr_aliases + 1)
         self._aliases_table = alias_vec(nr_aliases + 1)

+    def is_empty(self):
+        return len(self) == 0
+
     def __len__(self):
         return self.get_size_entities()
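is_empty() gives callers, like the entity linker's validation step further down, a cheap check that works on any KB implementing it, instead of assuming len() is meaningful for every KnowledgeBase subclass. A quick sketch:

    import spacy
    from spacy.kb import InMemoryLookupKB

    nlp = spacy.blank("en")
    kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)
    print(kb.is_empty())  # True

    kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
    print(kb.is_empty())  # False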
spacy/lexeme.pyi

@@ -25,7 +25,8 @@ class Lexeme:
     def orth_(self) -> str: ...
     @property
     def text(self) -> str: ...
-    lower: str
+    orth: int
+    lower: int
     norm: int
     shape: int
     prefix: int
spacy/lexeme.pyx

@@ -199,7 +199,7 @@ cdef class Lexeme:
         return self.orth_

     property lower:
-        """RETURNS (str): Lowercase form of the lexeme."""
+        """RETURNS (uint64): Lowercase form of the lexeme."""
         def __get__(self):
             return self.c.lower
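The stub and docstring fixes reflect how lexeme attributes actually behave: the bare attribute (lower) is a 64-bit hash into the StringStore, while the underscored variant (lower_) is the text. For example:

    import spacy

    nlp = spacy.blank("en")
    lex = nlp.vocab["Apple"]

    print(lex.lower)   # a uint64 hash of "apple"
    print(lex.lower_)  # "apple"
    print(nlp.vocab.strings[lex.lower])  # the hash resolves back to "apple"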
spacy/matcher/dependencymatcher.pyx

@@ -82,8 +82,12 @@ cdef class DependencyMatcher:
             "$-": self._imm_left_sib,
             "$++": self._right_sib,
             "$--": self._left_sib,
+            ">+": self._imm_right_child,
+            ">-": self._imm_left_child,
             ">++": self._right_child,
             ">--": self._left_child,
+            "<+": self._imm_right_parent,
+            "<-": self._imm_left_parent,
             "<++": self._right_parent,
             "<--": self._left_parent,
         }
@@ -427,12 +431,34 @@ cdef class DependencyMatcher:
     def _left_sib(self, doc, node):
         return [doc[child.i] for child in doc[node].head.children if child.i < node]

+    def _imm_right_child(self, doc, node):
+        for child in doc[node].children:
+            if child.i == node + 1:
+                return [doc[child.i]]
+        return []
+
+    def _imm_left_child(self, doc, node):
+        for child in doc[node].children:
+            if child.i == node - 1:
+                return [doc[child.i]]
+        return []
+
     def _right_child(self, doc, node):
         return [doc[child.i] for child in doc[node].children if child.i > node]

     def _left_child(self, doc, node):
         return [doc[child.i] for child in doc[node].children if child.i < node]

+    def _imm_right_parent(self, doc, node):
+        if doc[node].head.i == node + 1:
+            return [doc[node].head]
+        return []
+
+    def _imm_left_parent(self, doc, node):
+        if doc[node].head.i == node - 1:
+            return [doc[node].head]
+        return []
+
     def _right_parent(self, doc, node):
         if doc[node].head.i > node:
             return [doc[node].head]
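The four new operators add immediate-adjacency relations: `>+`/`>-` match a child that is exactly the next/previous token, `<+`/`<-` the analogous immediate parent. A hedged sketch of a pattern using `>-`, assuming en_core_web_sm is installed and parses "quick" as a child of "fox" (the pattern and match names are illustrative):

    import spacy
    from spacy.matcher import DependencyMatcher

    nlp = spacy.load("en_core_web_sm")
    matcher = DependencyMatcher(nlp.vocab)

    # Anchor on a noun, then require a child that is also the immediately
    # preceding token (">-"): typically its determiner or adjective.
    pattern = [
        {"RIGHT_ID": "noun", "RIGHT_ATTRS": {"POS": "NOUN"}},
        {
            "LEFT_ID": "noun",
            "REL_OP": ">-",
            "RIGHT_ID": "left_neighbor_child",
            "RIGHT_ATTRS": {},
        },
    ]
    matcher.add("ADJACENT_CHILD", [pattern])

    doc = nlp("The quick fox jumped.")
    for match_id, token_ids in matcher(doc):
        print([doc[i].text for i in token_ids])  # e.g. ['fox', 'quick']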
spacy/ml/models/entity_linker.py

@@ -89,6 +89,14 @@ def load_kb(
     return kb_from_file


+@registry.misc("spacy.EmptyKB.v2")
+def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
+    def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
+        return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
+
+    return empty_kb_factory
+
+
 @registry.misc("spacy.EmptyKB.v1")
 def empty_kb(
     entity_vector_length: int,
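EmptyKB.v2 reorders the parameters of v1 (vocab first, then vector length) so the entity linker can build its KB through a single injectable callable; a config can point generate_empty_kb at this default or at a custom @misc function. A hedged sketch, assuming spaCy 3.5.x where both the config key and the registry entry exist:

    import spacy

    nlp = spacy.blank("en")
    # generate_empty_kb is resolved through the registry, so custom KB
    # classes can be swapped in the same way.
    entity_linker = nlp.add_pipe(
        "entity_linker",
        config={"generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"}},
    )
    print(entity_linker.kb.is_empty())  # True: starts as an empty InMemoryLookupKB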
spacy/ml/models/multi_task.py

@@ -1,5 +1,5 @@
 from typing import Any, Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING, cast
-from thinc.types import Floats2d
+from thinc.types import Floats2d, Ints1d
 from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
 from thinc.api import MultiSoftmax, list2array
 from thinc.api import to_categorical, CosineDistance, L2Distance
@@ -7,7 +7,8 @@ from thinc.loss import Loss

 from ...util import registry, OOV_RANK
 from ...errors import Errors
-from ...attrs import ID
+from ...attrs import ID, ORTH
+from ...vectors import Mode as VectorsMode

 import numpy
 from functools import partial
@@ -67,14 +68,23 @@ def get_vectors_loss(ops, docs, prediction, distance):
     """Compute a loss based on a distance between the documents' vectors and
     the prediction.
     """
-    # The simplest way to implement this would be to vstack the
-    # token.vector values, but that's a bit inefficient, especially on GPU.
-    # Instead we fetch the index into the vectors table for each of our tokens,
-    # and look them up all at once. This prevents data copying.
-    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
-    target = docs[0].vocab.vectors.data[ids]
-    target[ids == OOV_RANK] = 0
-    d_target, loss = distance(prediction, target)
+    vocab = docs[0].vocab
+    if vocab.vectors.mode == VectorsMode.default:
+        # The simplest way to implement this would be to vstack the
+        # token.vector values, but that's a bit inefficient, especially on GPU.
+        # Instead we fetch the index into the vectors table for each of our
+        # tokens, and look them up all at once. This prevents data copying.
+        ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+        target = docs[0].vocab.vectors.data[ids]
+        target[ids == OOV_RANK] = 0
+        d_target, loss = distance(prediction, target)
+    elif vocab.vectors.mode == VectorsMode.floret:
+        keys = ops.flatten([cast(Ints1d, doc.to_array(ORTH)) for doc in docs])
+        target = vocab.vectors.get_batch(keys)
+        target = ops.as_contig(target)
+        d_target, loss = distance(prediction, target)
+    else:
+        raise ValueError(Errors.E850.format(mode=vocab.vectors.mode))
     return loss, d_target
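The branch exists because floret vectors have no row-per-word table: every string hashes into subword buckets, so the pretraining target must be fetched by ORTH key via get_batch() rather than indexed by row ID (hence the new E850 for any other mode). A quick way to see which branch a pipeline would take:

    import spacy
    from spacy.vectors import Mode as VectorsMode

    nlp = spacy.blank("en")
    vectors = nlp.vocab.vectors
    print(vectors.mode == VectorsMode.default)  # True for a standard table
    print(vectors.mode == VectorsMode.floret)   # True only for floret vectors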
@ -54,6 +54,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
|
||||||
"entity_vector_length": 64,
|
"entity_vector_length": 64,
|
||||||
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
||||||
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
|
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
|
||||||
|
"generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
|
||||||
"overwrite": True,
|
"overwrite": True,
|
||||||
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
|
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
|
||||||
"use_gold_ents": True,
|
"use_gold_ents": True,
|
||||||
|
@ -80,6 +81,7 @@ def make_entity_linker(
|
||||||
get_candidates_batch: Callable[
|
get_candidates_batch: Callable[
|
||||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||||
],
|
],
|
||||||
|
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
||||||
overwrite: bool,
|
overwrite: bool,
|
||||||
scorer: Optional[Callable],
|
scorer: Optional[Callable],
|
||||||
use_gold_ents: bool,
|
use_gold_ents: bool,
|
||||||
|
@ -101,6 +103,7 @@ def make_entity_linker(
|
||||||
get_candidates_batch (
|
get_candidates_batch (
|
||||||
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
|
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
|
||||||
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||||
|
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
||||||
scorer (Optional[Callable]): The scoring method.
|
scorer (Optional[Callable]): The scoring method.
|
||||||
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
||||||
component must provide entity annotations.
|
component must provide entity annotations.
|
||||||
|
@ -135,6 +138,7 @@ def make_entity_linker(
|
||||||
entity_vector_length=entity_vector_length,
|
entity_vector_length=entity_vector_length,
|
||||||
get_candidates=get_candidates,
|
get_candidates=get_candidates,
|
||||||
get_candidates_batch=get_candidates_batch,
|
get_candidates_batch=get_candidates_batch,
|
||||||
|
generate_empty_kb=generate_empty_kb,
|
||||||
overwrite=overwrite,
|
overwrite=overwrite,
|
||||||
scorer=scorer,
|
scorer=scorer,
|
||||||
use_gold_ents=use_gold_ents,
|
use_gold_ents=use_gold_ents,
|
||||||
|
@ -175,6 +179,7 @@ class EntityLinker(TrainablePipe):
|
||||||
get_candidates_batch: Callable[
|
get_candidates_batch: Callable[
|
||||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||||
],
|
],
|
||||||
|
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
||||||
overwrite: bool = BACKWARD_OVERWRITE,
|
overwrite: bool = BACKWARD_OVERWRITE,
|
||||||
scorer: Optional[Callable] = entity_linker_score,
|
scorer: Optional[Callable] = entity_linker_score,
|
||||||
use_gold_ents: bool,
|
use_gold_ents: bool,
|
||||||
|
@ -198,6 +203,7 @@ class EntityLinker(TrainablePipe):
|
||||||
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
|
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
|
||||||
Iterable[Candidate]]
|
Iterable[Candidate]]
|
||||||
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||||
|
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
||||||
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
|
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
|
||||||
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
||||||
component must provide entity annotations.
|
component must provide entity annotations.
|
||||||
|
@ -220,6 +226,7 @@ class EntityLinker(TrainablePipe):
|
||||||
self.model = model
|
self.model = model
|
||||||
self.name = name
|
self.name = name
|
||||||
self.labels_discard = list(labels_discard)
|
self.labels_discard = list(labels_discard)
|
||||||
|
# how many neighbour sentences to take into account
|
||||||
self.n_sents = n_sents
|
self.n_sents = n_sents
|
||||||
self.incl_prior = incl_prior
|
self.incl_prior = incl_prior
|
||||||
self.incl_context = incl_context
|
self.incl_context = incl_context
|
||||||
|
@ -227,9 +234,7 @@ class EntityLinker(TrainablePipe):
|
||||||
self.get_candidates_batch = get_candidates_batch
|
self.get_candidates_batch = get_candidates_batch
|
||||||
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
|
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
|
||||||
self.distance = CosineDistance(normalize=False)
|
self.distance = CosineDistance(normalize=False)
|
||||||
# how many neighbour sentences to take into account
|
self.kb = generate_empty_kb(self.vocab, entity_vector_length)
|
||||||
# create an empty KB by default
|
|
||||||
self.kb = empty_kb(entity_vector_length)(self.vocab)
|
|
||||||
self.scorer = scorer
|
self.scorer = scorer
|
||||||
self.use_gold_ents = use_gold_ents
|
self.use_gold_ents = use_gold_ents
|
||||||
self.candidates_batch_size = candidates_batch_size
|
self.candidates_batch_size = candidates_batch_size
|
||||||
|
@ -250,7 +255,7 @@ class EntityLinker(TrainablePipe):
|
||||||
# Raise an error if the knowledge base is not initialized.
|
# Raise an error if the knowledge base is not initialized.
|
||||||
if self.kb is None:
|
if self.kb is None:
|
||||||
raise ValueError(Errors.E1018.format(name=self.name))
|
raise ValueError(Errors.E1018.format(name=self.name))
|
||||||
if len(self.kb) == 0:
|
if hasattr(self.kb, "is_empty") and self.kb.is_empty():
|
||||||
raise ValueError(Errors.E139.format(name=self.name))
|
raise ValueError(Errors.E139.format(name=self.name))
|
||||||
|
|
||||||
def initialize(
|
def initialize(
|
||||||
|
@ -469,18 +474,24 @@ class EntityLinker(TrainablePipe):
|
||||||
|
|
||||||
# Looping through each entity in batch (TODO: rewrite)
|
# Looping through each entity in batch (TODO: rewrite)
|
||||||
for j, ent in enumerate(ent_batch):
|
for j, ent in enumerate(ent_batch):
|
||||||
sent_index = sentences.index(ent.sent)
|
assert hasattr(ent, "sents")
|
||||||
assert sent_index >= 0
|
sents = list(ent.sents)
|
||||||
|
sent_indices = (
|
||||||
|
sentences.index(sents[0]),
|
||||||
|
sentences.index(sents[-1]),
|
||||||
|
)
|
||||||
|
assert sent_indices[1] >= sent_indices[0] >= 0
|
||||||
|
|
||||||
if self.incl_context:
|
if self.incl_context:
|
||||||
# get n_neighbour sentences, clipped to the length of the document
|
# get n_neighbour sentences, clipped to the length of the document
|
||||||
start_sentence = max(0, sent_index - self.n_sents)
|
start_sentence = max(0, sent_indices[0] - self.n_sents)
|
||||||
end_sentence = min(
|
end_sentence = min(
|
||||||
len(sentences) - 1, sent_index + self.n_sents
|
len(sentences) - 1, sent_indices[1] + self.n_sents
|
||||||
)
|
)
|
||||||
start_token = sentences[start_sentence].start
|
start_token = sentences[start_sentence].start
|
||||||
end_token = sentences[end_sentence].end
|
end_token = sentences[end_sentence].end
|
||||||
sent_doc = doc[start_token:end_token].as_doc()
|
sent_doc = doc[start_token:end_token].as_doc()
|
||||||
|
|
||||||
# currently, the context is the same for each entity in a sentence (should be refined)
|
# currently, the context is the same for each entity in a sentence (should be refined)
|
||||||
sentence_encoding = self.model.predict([sent_doc])[0]
|
sentence_encoding = self.model.predict([sent_doc])[0]
|
||||||
sentence_encoding_t = sentence_encoding.T
|
sentence_encoding_t = sentence_encoding.T
|
||||||
|
|
|
@@ -1,4 +1,6 @@
-from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any
+from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast, Union
+from dataclasses import dataclass
+from functools import partial
 from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
 from thinc.api import Optimizer
 from thinc.types import Ragged, Ints2d, Floats2d

@@ -43,7 +45,36 @@ maxout_pieces = 3
 depth = 4
 """

+spancat_singlelabel_default_config = """
+[model]
+@architectures = "spacy.SpanCategorizer.v1"
+scorer = {"@layers": "Softmax.v2"}
+
+[model.reducer]
+@layers = spacy.mean_max_reducer.v1
+hidden_size = 128
+
+[model.tok2vec]
+@architectures = "spacy.Tok2Vec.v2"
+[model.tok2vec.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+width = 96
+rows = [5000, 1000, 2500, 1000]
+attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+include_static_vectors = false
+
+[model.tok2vec.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v2"
+width = ${model.tok2vec.embed.width}
+window_size = 1
+maxout_pieces = 3
+depth = 4
+"""
+
 DEFAULT_SPANCAT_MODEL = Config().from_str(spancat_default_config)["model"]
+DEFAULT_SPANCAT_SINGLELABEL_MODEL = Config().from_str(
+    spancat_singlelabel_default_config
+)["model"]


 @runtime_checkable
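
Note: after this change the single-label default model can be inspected like any other resolved config section. A quick sketch (assumes a spaCy build that includes this commit):

from spacy.pipeline.spancat import DEFAULT_SPANCAT_SINGLELABEL_MODEL

# the resolved section is dict-like; the top-level architecture is the one
# registered in the config string above
print(DEFAULT_SPANCAT_SINGLELABEL_MODEL["@architectures"])  # "spacy.SpanCategorizer.v1"
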
@@ -52,39 +83,42 @@ class Suggester(Protocol):
     ...


+def ngram_suggester(
+    docs: Iterable[Doc], sizes: List[int], *, ops: Optional[Ops] = None
+) -> Ragged:
+    if ops is None:
+        ops = get_current_ops()
+    spans = []
+    lengths = []
+    for doc in docs:
+        starts = ops.xp.arange(len(doc), dtype="i")
+        starts = starts.reshape((-1, 1))
+        length = 0
+        for size in sizes:
+            if size <= len(doc):
+                starts_size = starts[: len(doc) - (size - 1)]
+                spans.append(ops.xp.hstack((starts_size, starts_size + size)))
+                length += spans[-1].shape[0]
+            if spans:
+                assert spans[-1].ndim == 2, spans[-1].shape
+        lengths.append(length)
+    lengths_array = ops.asarray1i(lengths)
+    if len(spans) > 0:
+        output = Ragged(ops.xp.vstack(spans), lengths_array)
+    else:
+        output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
+
+    assert output.dataXd.ndim == 2
+    return output
+
+
 @registry.misc("spacy.ngram_suggester.v1")
 def build_ngram_suggester(sizes: List[int]) -> Suggester:
     """Suggest all spans of the given lengths. Spans are returned as a ragged
     array of integers. The array has two columns, indicating the start and end
     position."""
-
-    def ngram_suggester(docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged:
-        if ops is None:
-            ops = get_current_ops()
-        spans = []
-        lengths = []
-        for doc in docs:
-            starts = ops.xp.arange(len(doc), dtype="i")
-            starts = starts.reshape((-1, 1))
-            length = 0
-            for size in sizes:
-                if size <= len(doc):
-                    starts_size = starts[: len(doc) - (size - 1)]
-                    spans.append(ops.xp.hstack((starts_size, starts_size + size)))
-                    length += spans[-1].shape[0]
-                if spans:
-                    assert spans[-1].ndim == 2, spans[-1].shape
-            lengths.append(length)
-        lengths_array = ops.asarray1i(lengths)
-        if len(spans) > 0:
-            output = Ragged(ops.xp.vstack(spans), lengths_array)
-        else:
-            output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
-
-        assert output.dataXd.ndim == 2
-        return output
-
-    return ngram_suggester
+    return partial(ngram_suggester, sizes=sizes)


 @registry.misc("spacy.ngram_range_suggester.v1")
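
Note: hoisting ngram_suggester to module level and returning functools.partial (instead of a closure) keeps registered suggesters picklable, which the new multiprocessing test further down relies on. A stand-alone sketch with a stand-in function, not the spaCy internals:

import pickle
from functools import partial

def ngrams(docs, sizes):
    # stand-in for the module-level ngram_suggester above
    return [(start, start + size) for doc in docs for size in sizes
            for start in range(len(doc) - size + 1)]

suggester = partial(ngrams, sizes=[1, 2])
restored = pickle.loads(pickle.dumps(suggester))  # a local closure would not pickle
assert restored([["a", "b", "c"]]) == suggester([["a", "b", "c"]])
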
@@ -119,10 +153,14 @@ def make_spancat(
     threshold: float,
     max_positive: Optional[int],
 ) -> "SpanCategorizer":
-    """Create a SpanCategorizer component. The span categorizer consists of two
+    """Create a SpanCategorizer component and configure it for multi-label
+    classification to be able to assign multiple labels for each span.
+    The span categorizer consists of two
     parts: a suggester function that proposes candidate spans, and a labeller
     model that predicts one or more labels for each span.

+    name (str): The component instance name, used to add entries to the
+        losses during training.
     suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
         Spans are returned as a ragged array with two integer columns, for the
         start and end positions.

@@ -144,12 +182,80 @@ def make_spancat(
     """
     return SpanCategorizer(
         nlp.vocab,
-        suggester=suggester,
         model=model,
-        spans_key=spans_key,
-        threshold=threshold,
-        max_positive=max_positive,
+        suggester=suggester,
         name=name,
+        spans_key=spans_key,
+        negative_weight=None,
+        allow_overlap=True,
+        max_positive=max_positive,
+        threshold=threshold,
+        scorer=scorer,
+        add_negative_label=False,
+    )
+
+
+@Language.factory(
+    "spancat_singlelabel",
+    assigns=["doc.spans"],
+    default_config={
+        "spans_key": "sc",
+        "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
+        "negative_weight": 1.0,
+        "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
+        "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
+        "allow_overlap": True,
+    },
+    default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
+)
+def make_spancat_singlelabel(
+    nlp: Language,
+    name: str,
+    suggester: Suggester,
+    model: Model[Tuple[List[Doc], Ragged], Floats2d],
+    spans_key: str,
+    negative_weight: float,
+    allow_overlap: bool,
+    scorer: Optional[Callable],
+) -> "SpanCategorizer":
+    """Create a SpanCategorizer component and configure it for multi-class
+    classification. With this configuration each span can get at most one
+    label. The span categorizer consists of two
+    parts: a suggester function that proposes candidate spans, and a labeller
+    model that predicts one or more labels for each span.
+
+    name (str): The component instance name, used to add entries to the
+        losses during training.
+    suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
+        Spans are returned as a ragged array with two integer columns, for the
+        start and end positions.
+    model (Model[Tuple[List[Doc], Ragged], Floats2d]): A model instance that
+        is given a list of documents and (start, end) indices representing
+        candidate span offsets. The model predicts a probability for each category
+        for each span.
+    spans_key (str): Key of the doc.spans dict to save the spans under. During
+        initialization and training, the component will look for spans on the
+        reference document under the same key.
+    scorer (Optional[Callable]): The scoring method. Defaults to
+        Scorer.score_spans for the Doc.spans[spans_key] with overlapping
+        spans allowed.
+    negative_weight (float): Multiplier for the loss terms.
+        Can be used to downweight the negative samples if there are too many.
+    allow_overlap (bool): If True the data is assumed to contain overlapping spans.
+        Otherwise it produces non-overlapping spans greedily prioritizing
+        higher assigned label scores.
+    """
+    return SpanCategorizer(
+        nlp.vocab,
+        model=model,
+        suggester=suggester,
+        name=name,
+        spans_key=spans_key,
+        negative_weight=negative_weight,
+        allow_overlap=allow_overlap,
+        max_positive=1,
+        add_negative_label=True,
+        threshold=None,
         scorer=scorer,
     )

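
Note: a hedged usage sketch for the new spancat_singlelabel factory (assumes a spaCy build with this commit; "sc" is the default spans key from the factory config above; the pipeline is untrained, so the spans themselves are arbitrary):

import spacy

nlp = spacy.blank("en")
spancat = nlp.add_pipe("spancat_singlelabel")
spancat.add_label("CITY")
nlp.initialize()
doc = nlp("Berlin is nice")
# at most one label per suggested span; scores live on the span group
print(doc.spans["sc"], doc.spans["sc"].attrs.get("scores"))
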
@@ -172,6 +278,27 @@ def make_spancat_scorer():
     return spancat_score


+@dataclass
+class _Intervals:
+    """
+    Helper class to avoid storing overlapping spans.
+    """
+
+    def __init__(self):
+        self.ranges = set()
+
+    def add(self, i, j):
+        for e in range(i, j):
+            self.ranges.add(e)
+
+    def __contains__(self, rang):
+        i, j = rang
+        for e in range(i, j):
+            if e in self.ranges:
+                return True
+        return False
+
+
 class SpanCategorizer(TrainablePipe):
     """Pipeline component to label spans of text.

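
Note: behaviour sketch of the _Intervals helper added above; it marks every token position covered by an accepted span, so any later span sharing a position counts as overlapping (plain-set reimplementation for illustration):

ranges = set()
ranges.update(range(2, 5))                            # add(2, 5): positions 2, 3, 4
assert any(pos in ranges for pos in range(4, 6))      # (4, 6) overlaps at position 4
assert not any(pos in ranges for pos in range(5, 7))  # (5, 7) is still free
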
@@ -185,25 +312,43 @@ class SpanCategorizer(TrainablePipe):
         suggester: Suggester,
         name: str = "spancat",
         *,
+        add_negative_label: bool = False,
         spans_key: str = "spans",
-        threshold: float = 0.5,
+        negative_weight: Optional[float] = 1.0,
+        allow_overlap: Optional[bool] = True,
         max_positive: Optional[int] = None,
+        threshold: Optional[float] = 0.5,
         scorer: Optional[Callable] = spancat_score,
     ) -> None:
-        """Initialize the span categorizer.
+        """Initialize the multi-label or multi-class span categorizer.

         vocab (Vocab): The shared vocabulary.
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
+            For multi-class classification (single label per span) we recommend
+            using a Softmax classifier as the final layer, while for multi-label
+            classification (multiple possible labels per span) we recommend Logistic.
+        suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
+            Spans are returned as a ragged array with two integer columns, for the
+            start and end positions.
         name (str): The component instance name, used to add entries to the
             losses during training.
         spans_key (str): Key of the Doc.spans dict to save the spans under.
             During initialization and training, the component will look for
             spans on the reference document under the same key. Defaults to
             `"spans"`.
-        threshold (float): Minimum probability to consider a prediction
-            positive. Spans with a positive prediction will be saved on the Doc.
-            Defaults to 0.5.
+        add_negative_label (bool): Learn to predict a special 'negative_label'
+            when a Span is not annotated.
+        threshold (Optional[float]): Minimum probability to consider a prediction
+            positive. Defaults to 0.5. Spans with a positive prediction will be saved
+            on the Doc.
         max_positive (Optional[int]): Maximum number of labels to consider
             positive per span. Defaults to None, indicating no limit.
+        negative_weight (float): Multiplier for the loss terms.
+            Can be used to downweight the negative samples if there are too many
+            when add_negative_label is True. Otherwise it's unused.
+        allow_overlap (bool): If True the data is assumed to contain overlapping spans.
+            Otherwise it produces non-overlapping spans greedily prioritizing
+            higher assigned label scores. Only used when max_positive is 1.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_spans for the Doc.spans[spans_key] with overlapping
             spans allowed.

@@ -215,12 +360,17 @@ class SpanCategorizer(TrainablePipe):
             "spans_key": spans_key,
             "threshold": threshold,
             "max_positive": max_positive,
+            "negative_weight": negative_weight,
+            "allow_overlap": allow_overlap,
         }
         self.vocab = vocab
         self.suggester = suggester
         self.model = model
         self.name = name
         self.scorer = scorer
+        self.add_negative_label = add_negative_label
+        if not allow_overlap and max_positive is not None and max_positive > 1:
+            raise ValueError(Errors.E1051.format(max_positive=max_positive))

     @property
     def key(self) -> str:

@@ -230,6 +380,21 @@ class SpanCategorizer(TrainablePipe):
         """
         return str(self.cfg["spans_key"])

+    def _allow_extra_label(self) -> None:
+        """Raise an error if the component can not add any more labels."""
+        nO = None
+        if self.model.has_dim("nO"):
+            nO = self.model.get_dim("nO")
+        elif self.model.has_ref("output_layer") and self.model.get_ref(
+            "output_layer"
+        ).has_dim("nO"):
+            nO = self.model.get_ref("output_layer").get_dim("nO")
+        if nO is not None and nO == self._n_labels:
+            if not self.is_resizable:
+                raise ValueError(
+                    Errors.E922.format(name=self.name, nO=self.model.get_dim("nO"))
+                )
+
     def add_label(self, label: str) -> int:
         """Add a new label to the pipe.

@@ -263,6 +428,27 @@ class SpanCategorizer(TrainablePipe):
         """
         return list(self.labels)

+    @property
+    def _label_map(self) -> Dict[str, int]:
+        """RETURNS (Dict[str, int]): The label map."""
+        return {label: i for i, label in enumerate(self.labels)}
+
+    @property
+    def _n_labels(self) -> int:
+        """RETURNS (int): Number of labels."""
+        if self.add_negative_label:
+            return len(self.labels) + 1
+        else:
+            return len(self.labels)
+
+    @property
+    def _negative_label_i(self) -> Union[int, None]:
+        """RETURNS (Union[int, None]): Index of the negative label."""
+        if self.add_negative_label:
+            return len(self.label_data)
+        else:
+            return None
+
     def predict(self, docs: Iterable[Doc]):
         """Apply the pipeline's model to a batch of docs, without modifying them.

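
Note: index bookkeeping implied by the new properties above, as a tiny sketch: with labels ["PERSON", "LOC"] and add_negative_label=True, the output layer has three columns and the negative class takes the last index.

labels = ["PERSON", "LOC"]
add_negative_label = True
n_labels = len(labels) + 1 if add_negative_label else len(labels)
negative_label_i = len(labels) if add_negative_label else None
assert (n_labels, negative_label_i) == (3, 2)
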
@@ -304,14 +490,24 @@ class SpanCategorizer(TrainablePipe):

         DOCS: https://spacy.io/api/spancategorizer#set_annotations
         """
-        labels = self.labels
         indices, scores = indices_scores
         offset = 0
         for i, doc in enumerate(docs):
             indices_i = indices[i].dataXd
-            doc.spans[self.key] = self._make_span_group(
-                doc, indices_i, scores[offset : offset + indices.lengths[i]], labels  # type: ignore[arg-type]
-            )
+            allow_overlap = cast(bool, self.cfg["allow_overlap"])
+            if self.cfg["max_positive"] == 1:
+                doc.spans[self.key] = self._make_span_group_singlelabel(
+                    doc,
+                    indices_i,
+                    scores[offset : offset + indices.lengths[i]],
+                    allow_overlap,
+                )
+            else:
+                doc.spans[self.key] = self._make_span_group_multilabel(
+                    doc,
+                    indices_i,
+                    scores[offset : offset + indices.lengths[i]],
+                )
             offset += indices.lengths[i]

     def update(
@@ -371,9 +567,11 @@ class SpanCategorizer(TrainablePipe):
         spans = Ragged(
             self.model.ops.to_numpy(spans.data), self.model.ops.to_numpy(spans.lengths)
         )
-        label_map = {label: i for i, label in enumerate(self.labels)}
         target = numpy.zeros(scores.shape, dtype=scores.dtype)
+        if self.add_negative_label:
+            negative_spans = numpy.ones((scores.shape[0]))
         offset = 0
+        label_map = self._label_map
         for i, eg in enumerate(examples):
             # Map (start, end) offset of spans to the row in the d_scores array,
             # so that we can adjust the gradient for predictions that were

@@ -390,10 +588,16 @@ class SpanCategorizer(TrainablePipe):
                     row = spans_index[key]
                     k = label_map[gold_span.label_]
                     target[row, k] = 1.0
+                    if self.add_negative_label:
+                        # delete negative label target.
+                        negative_spans[row] = 0.0
             # The target is a flat array for all docs. Track the position
             # we're at within the flat array.
             offset += spans.lengths[i]
         target = self.model.ops.asarray(target, dtype="f")  # type: ignore
+        if self.add_negative_label:
+            negative_samples = numpy.nonzero(negative_spans)[0]
+            target[negative_samples, self._negative_label_i] = 1.0  # type: ignore
         # The target will have the values 0 (for untrue predictions) or 1
         # (for true predictions).
         # The scores should be in the range [0, 1].

@@ -402,6 +606,10 @@ class SpanCategorizer(TrainablePipe):
         # If the prediction is 0.9 and it's false, the gradient will be
         # 0.9 (0.9 - 0.0)
         d_scores = scores - target
+        if self.add_negative_label:
+            neg_weight = cast(float, self.cfg["negative_weight"])
+            if neg_weight != 1.0:
+                d_scores[negative_samples] *= neg_weight
         loss = float((d_scores**2).sum())
         return loss, d_scores

@@ -438,7 +646,7 @@ class SpanCategorizer(TrainablePipe):
         if subbatch:
             docs = [eg.x for eg in subbatch]
             spans = build_ngram_suggester(sizes=[1])(docs)
-            Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels))
+            Y = self.model.ops.alloc2f(spans.dataXd.shape[0], self._n_labels)
             self.model.initialize(X=(docs, spans), Y=Y)
         else:
             self.model.initialize()
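
Note: sketch of the negative-label targets built in get_loss above. Every suggested span starts out flagged negative, gold spans clear the flag, and the rest get a 1.0 target in the (last) negative column, optionally reweighted. Three suggested spans, 2 labels plus 1 negative column; the 0.5 weight is illustrative:

import numpy

scores = numpy.asarray([[0.2, 0.1, 0.9], [0.8, 0.1, 0.3], [0.4, 0.2, 0.6]], dtype="f")
target = numpy.zeros(scores.shape, dtype=scores.dtype)
negative_spans = numpy.ones(scores.shape[0])
target[1, 0] = 1.0                      # row 1 matches a gold span with label 0
negative_spans[1] = 0.0                 # so it is no longer a negative sample
negative_samples = numpy.nonzero(negative_spans)[0]
target[negative_samples, -1] = 1.0      # negative class sits in the last column
d_scores = scores - target
d_scores[negative_samples] *= 0.5       # negative_weight downweights those rows
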
@@ -452,31 +660,98 @@ class SpanCategorizer(TrainablePipe):
             eg.reference.spans.get(self.key, []), allow_overlap=True
         )

-    def _make_span_group(
-        self, doc: Doc, indices: Ints2d, scores: Floats2d, labels: List[str]
+    def _make_span_group_multilabel(
+        self,
+        doc: Doc,
+        indices: Ints2d,
+        scores: Floats2d,
     ) -> SpanGroup:
+        """Find the top-k labels for each span (k=max_positive)."""
         spans = SpanGroup(doc, name=self.key)
-        max_positive = self.cfg["max_positive"]
+        if scores.size == 0:
+            return spans
+        scores = self.model.ops.to_numpy(scores)
+        indices = self.model.ops.to_numpy(indices)
         threshold = self.cfg["threshold"]
+        max_positive = self.cfg["max_positive"]
+
         keeps = scores >= threshold
-        ranked = (scores * -1).argsort()  # type: ignore
         if max_positive is not None:
             assert isinstance(max_positive, int)
+            if self.add_negative_label:
+                negative_scores = numpy.copy(scores[:, self._negative_label_i])
+                scores[:, self._negative_label_i] = -numpy.inf
+                ranked = (scores * -1).argsort()  # type: ignore
+                scores[:, self._negative_label_i] = negative_scores
+            else:
+                ranked = (scores * -1).argsort()  # type: ignore
             span_filter = ranked[:, max_positive:]
             for i, row in enumerate(span_filter):
                 keeps[i, row] = False
-        spans.attrs["scores"] = scores[keeps].flatten()
-
-        indices = self.model.ops.to_numpy(indices)
-        keeps = self.model.ops.to_numpy(keeps)

+        attrs_scores = []
         for i in range(indices.shape[0]):
             start = indices[i, 0]
             end = indices[i, 1]

             for j, keep in enumerate(keeps[i]):
                 if keep:
-                    spans.append(Span(doc, start, end, label=labels[j]))
+                    if j != self._negative_label_i:
+                        spans.append(Span(doc, start, end, label=self.labels[j]))
+                        attrs_scores.append(scores[i, j])
+        spans.attrs["scores"] = numpy.array(attrs_scores)
+        return spans
+
+    def _make_span_group_singlelabel(
+        self,
+        doc: Doc,
+        indices: Ints2d,
+        scores: Floats2d,
+        allow_overlap: bool = True,
+    ) -> SpanGroup:
+        """Find the argmax label for each span."""
+        # Handle cases when there are zero suggestions
+        if scores.size == 0:
+            return SpanGroup(doc, name=self.key)
+        scores = self.model.ops.to_numpy(scores)
+        indices = self.model.ops.to_numpy(indices)
+        predicted = scores.argmax(axis=1)
+        argmax_scores = numpy.take_along_axis(
+            scores, numpy.expand_dims(predicted, 1), axis=1
+        )
+        keeps = numpy.ones(predicted.shape, dtype=bool)
+        # Remove samples where the negative label is the argmax.
+        if self.add_negative_label:
+            keeps = numpy.logical_and(keeps, predicted != self._negative_label_i)
+        # Filter samples according to threshold.
+        threshold = self.cfg["threshold"]
+        if threshold is not None:
+            keeps = numpy.logical_and(keeps, (argmax_scores >= threshold).squeeze())
+        # Sort spans according to argmax probability
+        if not allow_overlap:
+            # Get the probabilities
+            sort_idx = (argmax_scores.squeeze() * -1).argsort()
+            argmax_scores = argmax_scores[sort_idx]
+            predicted = predicted[sort_idx]
+            indices = indices[sort_idx]
+            keeps = keeps[sort_idx]
+        seen = _Intervals()
+        spans = SpanGroup(doc, name=self.key)
+        attrs_scores = []
+        for i in range(indices.shape[0]):
+            if not keeps[i]:
+                continue
+
+            label = predicted[i]
+            start = indices[i, 0]
+            end = indices[i, 1]
+
+            if not allow_overlap:
+                if (start, end) in seen:
+                    continue
+                else:
+                    seen.add(start, end)
+            attrs_scores.append(argmax_scores[i])
+            spans.append(Span(doc, start, end, label=self.labels[label]))
+
+        spans.attrs["scores"] = numpy.array(attrs_scores)
         return spans
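
Note: the single-label path above reduces to an argmax plus two filters. A numpy-only sketch, with the third column playing the negative class and a 0.5 threshold:

import numpy

scores = numpy.asarray([[0.2, 0.4, 0.4], [0.1, 0.6, 0.3], [0.8, 0.1, 0.1]], dtype="f")
predicted = scores.argmax(axis=1)                      # [1, 1, 0]
argmax_scores = numpy.take_along_axis(scores, numpy.expand_dims(predicted, 1), axis=1)
keeps = predicted != 2                                 # drop negative-class argmaxes
keeps &= (argmax_scores >= 0.5).squeeze()              # then apply the threshold
assert keeps.tolist() == [False, True, True]
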
@@ -700,3 +700,34 @@ def test_span_group_copy(doc):
     assert len(doc.spans["test"]) == 3
     # check that the copy spans were not modified and this is an isolated doc
     assert len(doc_copy.spans["test"]) == 2
+
+
+def test_for_partial_ent_sents():
+    """Spans may be associated with multiple sentences. These .sents should always be complete, not partial, sentences,
+    which this tests for.
+    """
+    doc = Doc(
+        English().vocab,
+        words=["Mahler's", "Symphony", "No.", "8", "was", "beautiful."],
+        sent_starts=[1, 0, 0, 1, 0, 0],
+    )
+    doc.set_ents([Span(doc, 1, 4, "WORK")])
+    # The specified entity is associated with both sentences in this doc, so we expect all sentences in the doc to be
+    # equal to the sentences referenced in ent.sents.
+    for doc_sent, ent_sent in zip(doc.sents, doc.ents[0].sents):
+        assert doc_sent == ent_sent
+
+
+def test_for_no_ent_sents():
+    """Span.sents() should set .sents correctly, even if Span in question is trailing and doesn't form a full
+    sentence.
+    """
+    doc = Doc(
+        English().vocab,
+        words=["This", "is", "a", "test.", "ENTITY"],
+        sent_starts=[1, 0, 0, 0, 1],
+    )
+    doc.set_ents([Span(doc, 4, 5, "WORK")])
+    sents = list(doc.ents[0].sents)
+    assert len(sents) == 1
+    assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY"
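
Note: usage sketch for Span.sents as pinned down by the tests above (assumes a spaCy build with this fix installed):

import spacy
from spacy.tokens import Doc, Span

nlp = spacy.blank("en")
doc = Doc(nlp.vocab, words=["Hello", "world", "Bye", "now"], sent_starts=[1, 0, 1, 0])
doc.set_ents([Span(doc, 1, 3, "X")])       # the entity crosses both sentences
assert len(list(doc.ents[0].sents)) == 2   # .sents yields both complete sentences
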
@@ -316,16 +316,32 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
         ("the", "brown", "$--", 0),
         ("brown", "the", "$--", 1),
         ("brown", "brown", "$--", 0),
+        ("over", "jumped", "<+", 0),
+        ("quick", "fox", "<+", 0),
+        ("the", "quick", "<+", 0),
+        ("brown", "fox", "<+", 1),
         ("quick", "fox", "<++", 1),
         ("quick", "over", "<++", 0),
         ("over", "jumped", "<++", 0),
         ("the", "fox", "<++", 2),
+        ("brown", "fox", "<-", 0),
+        ("fox", "over", "<-", 0),
+        ("the", "over", "<-", 0),
+        ("over", "jumped", "<-", 1),
         ("brown", "fox", "<--", 0),
         ("fox", "jumped", "<--", 0),
         ("fox", "over", "<--", 1),
+        ("fox", "brown", ">+", 0),
+        ("over", "fox", ">+", 0),
+        ("over", "the", ">+", 0),
+        ("jumped", "over", ">+", 1),
         ("jumped", "over", ">++", 1),
         ("fox", "lazy", ">++", 0),
         ("over", "the", ">++", 0),
+        ("jumped", "over", ">-", 0),
+        ("fox", "quick", ">-", 0),
+        ("brown", "quick", ">-", 0),
+        ("fox", "brown", ">-", 1),
         ("brown", "fox", ">--", 0),
         ("fox", "brown", ">--", 1),
         ("jumped", "fox", ">--", 1),
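
Note: the new cases exercise the immediate-precedence operators; judging from the expectations above, A >- B matches when B is a child of A and the token directly before A. A hedged DependencyMatcher sketch (requires the en_core_web_sm model to be installed; the pattern and match key are illustrative):

import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")
pattern = [
    {"RIGHT_ID": "noun", "RIGHT_ATTRS": {"POS": "NOUN"}},
    {"LEFT_ID": "noun", "REL_OP": ">-", "RIGHT_ID": "mod",
     "RIGHT_ATTRS": {"POS": "ADJ"}},
]
matcher = DependencyMatcher(nlp.vocab)
matcher.add("ADJ_DIRECTLY_BEFORE_NOUN", [pattern])
matches = matcher(nlp("the quick brown fox jumped"))   # expects "brown fox"
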
@@ -9,6 +9,8 @@ from spacy.lang.en import English
 from spacy.lang.it import Italian
 from spacy.language import Language
 from spacy.lookups import Lookups
+from spacy.pipeline import EntityRecognizer
+from spacy.pipeline.ner import DEFAULT_NER_MODEL
 from spacy.pipeline._parser_internals.ner import BiluoPushDown
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.tokens import Doc, Span

@@ -16,8 +18,6 @@ from spacy.vocab import Vocab
 import logging

 from ..util import make_tempdir
-from ...pipeline import EntityRecognizer
-from ...pipeline.ner import DEFAULT_NER_MODEL

 TRAIN_DATA = [
     ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
@@ -8,11 +8,11 @@ from spacy.lang.en import English
 from spacy.tokens import Doc
 from spacy.training import Example
 from spacy.vocab import Vocab
+from spacy.pipeline import DependencyParser
+from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
+from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL

-from ...pipeline import DependencyParser
-from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL
 from ..util import apply_transition_sequence, make_tempdir
-from ...pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL

 TRAIN_DATA = [
     (
@@ -1,9 +1,9 @@
-from typing import Callable, Iterable, Dict, Any
+from typing import Callable, Iterable, Dict, Any, Tuple

 import pytest
 from numpy.testing import assert_equal

-from spacy import registry, util
+from spacy import registry, util, Language
 from spacy.attrs import ENT_KB_ID
 from spacy.compat import pickle
 from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase

@@ -108,18 +108,23 @@ def test_issue7065():


 @pytest.mark.issue(7065)
-def test_issue7065_b():
+@pytest.mark.parametrize("entity_in_first_sentence", [True, False])
+def test_sentence_crossing_ents(entity_in_first_sentence: bool):
+    """Tests if NEL crashes if entities cross sentence boundaries and the first associated sentence doesn't have an
+    entity.
+    entity_in_prior_sentence (bool): Whether to include an entity in the first sentence associated with the
+        sentence-crossing entity.
+    """
     # Test that the NEL doesn't crash when an entity crosses a sentence boundary
     nlp = English()
     vector_length = 3
-    nlp.add_pipe("sentencizer")
     text = "Mahler 's Symphony No. 8 was beautiful."
-    entities = [(0, 6, "PERSON"), (10, 24, "WORK")]
-    links = {
-        (0, 6): {"Q7304": 1.0, "Q270853": 0.0},
-        (10, 24): {"Q7304": 0.0, "Q270853": 1.0},
-    }
-    sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
+    entities = [(10, 24, "WORK")]
+    links = {(10, 24): {"Q7304": 0.0, "Q270853": 1.0}}
+    if entity_in_first_sentence:
+        entities.append((0, 6, "PERSON"))
+        links[(0, 6)] = {"Q7304": 1.0, "Q270853": 0.0}
+    sent_starts = [1, -1, 0, 0, 0, 1, 0, 0, 0]
     doc = nlp(text)
     example = Example.from_dict(
         doc, {"entities": entities, "links": links, "sent_starts": sent_starts}

@@ -145,31 +150,14 @@ def test_issue7065_b():

     # Create the Entity Linker component and add it to the pipeline
     entity_linker = nlp.add_pipe("entity_linker", last=True)
-    entity_linker.set_kb(create_kb)
+    entity_linker.set_kb(create_kb)  # type: ignore
     # train the NEL pipe
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
     for i in range(2):
-        losses = {}
-        nlp.update(train_examples, sgd=optimizer, losses=losses)
+        nlp.update(train_examples, sgd=optimizer)

-    # Add a custom rule-based component to mimick NER
-    patterns = [
-        {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]},
-        {
-            "label": "WORK",
-            "pattern": [
-                {"LOWER": "symphony"},
-                {"LOWER": "no"},
-                {"LOWER": "."},
-                {"LOWER": "8"},
-            ],
-        },
-    ]
-    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
-    ruler.add_patterns(patterns)
-    # test the trained model - this should not throw E148
-    doc = nlp(text)
-    assert doc
+    # This shouldn't crash.
+    entity_linker.predict([example.reference])  # type: ignore


 def test_no_entities():

@@ -353,6 +341,9 @@ def test_kb_default(nlp):
     """Test that the default (empty) KB is loaded upon construction"""
     entity_linker = nlp.add_pipe("entity_linker", config={})
     assert len(entity_linker.kb) == 0
+    with pytest.raises(ValueError, match="E139"):
+        # this raises an error because the KB is empty
+        entity_linker.validate_kb()
     assert entity_linker.kb.get_size_entities() == 0
     assert entity_linker.kb.get_size_aliases() == 0
     # 64 is the default value from pipeline.entity_linker
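
Note: these tests rely on a create_kb fixture defined outside this hunk. A minimal equivalent matching the vector_length = 3 used here might look like the following (entity and alias values are illustrative):

from spacy.kb import InMemoryLookupKB

def create_kb(vocab):
    kb = InMemoryLookupKB(vocab, entity_vector_length=3)
    kb.add_entity(entity="Q270853", freq=1, entity_vector=[0, 0, 1])
    kb.add_alias(alias="No. 8", entities=["Q270853"], probabilities=[1.0])
    return kb
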
@@ -1,7 +1,7 @@
 import pytest
 import numpy
 from numpy.testing import assert_array_equal, assert_almost_equal
-from thinc.api import get_current_ops, Ragged
+from thinc.api import get_current_ops, NumpyOps, Ragged

 from spacy import util
 from spacy.lang.en import English

@@ -15,6 +15,8 @@ OPS = get_current_ops()

 SPAN_KEY = "labeled_spans"

+SPANCAT_COMPONENTS = ["spancat", "spancat_singlelabel"]
+
 TRAIN_DATA = [
     ("Who is Shaka Khan?", {"spans": {SPAN_KEY: [(7, 17, "PERSON")]}}),
     (

@@ -41,38 +43,42 @@ def make_examples(nlp, data=TRAIN_DATA):
     return train_examples


-def test_no_label():
+@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
+def test_no_label(name):
     nlp = Language()
-    nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
+    nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
     with pytest.raises(ValueError):
         nlp.initialize()


-def test_no_resize():
+@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
+def test_no_resize(name):
     nlp = Language()
-    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
+    spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
     spancat.add_label("Thing")
     spancat.add_label("Phrase")
     assert spancat.labels == ("Thing", "Phrase")
     nlp.initialize()
-    assert spancat.model.get_dim("nO") == 2
+    assert spancat.model.get_dim("nO") == spancat._n_labels
     # this throws an error because the spancat can't be resized after initialization
     with pytest.raises(ValueError):
         spancat.add_label("Stuff")


-def test_implicit_labels():
+@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
+def test_implicit_labels(name):
     nlp = Language()
-    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
+    spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
     assert len(spancat.labels) == 0
     train_examples = make_examples(nlp)
     nlp.initialize(get_examples=lambda: train_examples)
     assert spancat.labels == ("PERSON", "LOC")


-def test_explicit_labels():
+@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
+def test_explicit_labels(name):
     nlp = Language()
-    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
+    spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
     assert len(spancat.labels) == 0
     spancat.add_label("PERSON")
     spancat.add_label("LOC")

@@ -102,13 +108,13 @@ def test_doc_gc():
     # XXX This fails with length 0 sometimes
     assert len(spangroup) > 0
     with pytest.raises(RuntimeError):
-        span = spangroup[0]
+        spangroup[0]


 @pytest.mark.parametrize(
     "max_positive,nr_results", [(None, 4), (1, 2), (2, 3), (3, 4), (4, 4)]
 )
-def test_make_spangroup(max_positive, nr_results):
+def test_make_spangroup_multilabel(max_positive, nr_results):
     fix_random_seed(0)
     nlp = Language()
     spancat = nlp.add_pipe(

@@ -120,10 +126,12 @@ def test_make_spangroup(max_positive, nr_results):
     indices = ngram_suggester([doc])[0].dataXd
     assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
     labels = ["Thing", "City", "Person", "GreatCity"]
+    for label in labels:
+        spancat.add_label(label)
     scores = numpy.asarray(
         [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"
     )
-    spangroup = spancat._make_span_group(doc, indices, scores, labels)
+    spangroup = spancat._make_span_group_multilabel(doc, indices, scores)
     assert len(spangroup) == nr_results

     # first span is always the second token "London"

@@ -154,6 +162,130 @@ def test_make_spangroup(max_positive, nr_results):
     assert_almost_equal(0.9, spangroup.attrs["scores"][-1], 5)


+@pytest.mark.parametrize(
+    "threshold,allow_overlap,nr_results",
+    [(0.05, True, 3), (0.05, False, 1), (0.5, True, 2), (0.5, False, 1)],
+)
+def test_make_spangroup_singlelabel(threshold, allow_overlap, nr_results):
+    fix_random_seed(0)
+    nlp = Language()
+    spancat = nlp.add_pipe(
+        "spancat",
+        config={
+            "spans_key": SPAN_KEY,
+            "threshold": threshold,
+            "max_positive": 1,
+        },
+    )
+    doc = nlp.make_doc("Greater London")
+    ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2])
+    indices = ngram_suggester([doc])[0].dataXd
+    assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
+    labels = ["Thing", "City", "Person", "GreatCity"]
+    for label in labels:
+        spancat.add_label(label)
+    scores = numpy.asarray(
+        [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"
+    )
+    spangroup = spancat._make_span_group_singlelabel(
+        doc, indices, scores, allow_overlap
+    )
+    if threshold > 0.4:
+        if allow_overlap:
+            assert spangroup[0].text == "London"
+            assert spangroup[0].label_ == "City"
+            assert_almost_equal(0.6, spangroup.attrs["scores"][0], 5)
+            assert spangroup[1].text == "Greater London"
+            assert spangroup[1].label_ == "GreatCity"
+            assert spangroup.attrs["scores"][1] == 0.9
+            assert_almost_equal(0.9, spangroup.attrs["scores"][1], 5)
+        else:
+            assert spangroup[0].text == "Greater London"
+            assert spangroup[0].label_ == "GreatCity"
+            assert spangroup.attrs["scores"][0] == 0.9
+    else:
+        if allow_overlap:
+            assert spangroup[0].text == "Greater"
+            assert spangroup[0].label_ == "City"
+            assert spangroup[1].text == "London"
+            assert spangroup[1].label_ == "City"
+            assert spangroup[2].text == "Greater London"
+            assert spangroup[2].label_ == "GreatCity"
+        else:
+            assert spangroup[0].text == "Greater London"
+
+
+def test_make_spangroup_negative_label():
+    fix_random_seed(0)
+    nlp_single = Language()
+    nlp_multi = Language()
+    spancat_single = nlp_single.add_pipe(
+        "spancat",
+        config={
+            "spans_key": SPAN_KEY,
+            "threshold": 0.1,
+            "max_positive": 1,
+        },
+    )
+    spancat_multi = nlp_multi.add_pipe(
+        "spancat",
+        config={
+            "spans_key": SPAN_KEY,
+            "threshold": 0.1,
+            "max_positive": 2,
+        },
+    )
+    spancat_single.add_negative_label = True
+    spancat_multi.add_negative_label = True
+    doc = nlp_single.make_doc("Greater London")
+    labels = ["Thing", "City", "Person", "GreatCity"]
+    for label in labels:
+        spancat_multi.add_label(label)
+        spancat_single.add_label(label)
+    ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2])
+    indices = ngram_suggester([doc])[0].dataXd
+    assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
+    scores = numpy.asarray(
+        [
+            [0.2, 0.4, 0.3, 0.1, 0.1],
+            [0.1, 0.6, 0.2, 0.4, 0.9],
+            [0.8, 0.7, 0.3, 0.9, 0.1],
+        ],
+        dtype="f",
+    )
+    spangroup_multi = spancat_multi._make_span_group_multilabel(doc, indices, scores)
+    spangroup_single = spancat_single._make_span_group_singlelabel(doc, indices, scores)
+    assert len(spangroup_single) == 2
+    assert spangroup_single[0].text == "Greater"
+    assert spangroup_single[0].label_ == "City"
+    assert_almost_equal(0.4, spangroup_single.attrs["scores"][0], 5)
+    assert spangroup_single[1].text == "Greater London"
+    assert spangroup_single[1].label_ == "GreatCity"
+    assert spangroup_single.attrs["scores"][1] == 0.9
+    assert_almost_equal(0.9, spangroup_single.attrs["scores"][1], 5)
+
+    assert len(spangroup_multi) == 6
+    assert spangroup_multi[0].text == "Greater"
+    assert spangroup_multi[0].label_ == "City"
+    assert_almost_equal(0.4, spangroup_multi.attrs["scores"][0], 5)
+    assert spangroup_multi[1].text == "Greater"
+    assert spangroup_multi[1].label_ == "Person"
+    assert_almost_equal(0.3, spangroup_multi.attrs["scores"][1], 5)
+    assert spangroup_multi[2].text == "London"
+    assert spangroup_multi[2].label_ == "City"
+    assert_almost_equal(0.6, spangroup_multi.attrs["scores"][2], 5)
+    assert spangroup_multi[3].text == "London"
+    assert spangroup_multi[3].label_ == "GreatCity"
+    assert_almost_equal(0.4, spangroup_multi.attrs["scores"][3], 5)
+    assert spangroup_multi[4].text == "Greater London"
+    assert spangroup_multi[4].label_ == "Thing"
+    assert spangroup_multi[4].text == "Greater London"
+    assert_almost_equal(0.8, spangroup_multi.attrs["scores"][4], 5)
+    assert spangroup_multi[5].text == "Greater London"
+    assert spangroup_multi[5].label_ == "GreatCity"
+    assert_almost_equal(0.9, spangroup_multi.attrs["scores"][5], 5)
+
+
 def test_ngram_suggester(en_tokenizer):
     # test different n-gram lengths
     for size in [1, 2, 3]:

@@ -371,9 +503,9 @@ def test_overfitting_IO_overlapping():
     assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"}


-def test_zero_suggestions():
+@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
+def test_zero_suggestions(name):
     # Test with a suggester that can return 0 suggestions

     @registry.misc("test_mixed_zero_suggester")
     def make_mixed_zero_suggester():
         def mixed_zero_suggester(docs, *, ops=None):

@@ -400,7 +532,7 @@ def test_zero_suggestions():
     fix_random_seed(0)
     nlp = English()
     spancat = nlp.add_pipe(
-        "spancat",
+        name,
         config={
             "suggester": {"@misc": "test_mixed_zero_suggester"},
             "spans_key": SPAN_KEY,

@@ -408,7 +540,7 @@ def test_zero_suggestions():
     )
     train_examples = make_examples(nlp)
     optimizer = nlp.initialize(get_examples=lambda: train_examples)
-    assert spancat.model.get_dim("nO") == 2
+    assert spancat.model.get_dim("nO") == spancat._n_labels
     assert set(spancat.labels) == {"LOC", "PERSON"}

     nlp.update(train_examples, sgd=optimizer)

@@ -424,9 +556,10 @@ def test_zero_suggestions():
     list(nlp.pipe(["", "one", "three three three"]))


-def test_set_candidates():
+@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
+def test_set_candidates(name):
     nlp = Language()
-    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
+    spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
     train_examples = make_examples(nlp)
     nlp.initialize(get_examples=lambda: train_examples)
     texts = [

@@ -444,3 +577,21 @@ def test_set_candidates():
     assert len(docs[0].spans["candidates"]) == 9
     assert docs[0].spans["candidates"][0].text == "Just"
     assert docs[0].spans["candidates"][4].text == "Just a"
+
+
+@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
+@pytest.mark.parametrize("n_process", [1, 2])
+def test_spancat_multiprocessing(name, n_process):
+    if isinstance(get_current_ops, NumpyOps) or n_process < 2:
+        nlp = Language()
+        spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
+        train_examples = make_examples(nlp)
+        nlp.initialize(get_examples=lambda: train_examples)
+        texts = [
+            "Just a sentence.",
+            "I like London and Berlin",
+            "I like Berlin",
+            "I eat ham.",
+        ]
+        docs = list(nlp.pipe(texts, n_process=n_process))
+        assert len(docs) == len(texts)
@@ -213,6 +213,13 @@ def test_serialize_doc_exclude(en_vocab):

 def test_serialize_doc_span_groups(en_vocab):
     doc = Doc(en_vocab, words=["hello", "world", "!"])
-    doc.spans["content"] = [doc[0:2]]
+    span = doc[0:2]
+    span.label_ = "test_serialize_doc_span_groups_label"
+    span.id_ = "test_serialize_doc_span_groups_id"
+    span.kb_id_ = "test_serialize_doc_span_groups_kb_id"
+    doc.spans["content"] = [span]
     new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
     assert len(new_doc.spans["content"]) == 1
+    assert new_doc.spans["content"][0].label_ == "test_serialize_doc_span_groups_label"
+    assert new_doc.spans["content"][0].id_ == "test_serialize_doc_span_groups_id"
+    assert new_doc.spans["content"][0].kb_id_ == "test_serialize_doc_span_groups_kb_id"
@@ -49,7 +49,11 @@ def test_serialize_doc_bin():
     nlp = English()
     for doc in nlp.pipe(texts):
         doc.cats = cats
-        doc.spans["start"] = [doc[0:2]]
+        span = doc[0:2]
+        span.label_ = "UNUSUAL_SPAN_LABEL"
+        span.id_ = "UNUSUAL_SPAN_ID"
+        span.kb_id_ = "UNUSUAL_SPAN_KB_ID"
+        doc.spans["start"] = [span]
         doc[0].norm_ = "UNUSUAL_TOKEN_NORM"
         doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID"
         doc_bin.add(doc)

@@ -63,6 +67,9 @@ def test_serialize_doc_bin():
         assert doc.text == texts[i]
         assert doc.cats == cats
         assert len(doc.spans) == 1
+        assert doc.spans["start"][0].label_ == "UNUSUAL_SPAN_LABEL"
+        assert doc.spans["start"][0].id_ == "UNUSUAL_SPAN_ID"
+        assert doc.spans["start"][0].kb_id_ == "UNUSUAL_SPAN_KB_ID"
        assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM"
        assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID"
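
Note: round-trip sketch for the span attributes these serialization tests now cover (assumes a spaCy version where label, id, and kb_id survive DocBin round-trips, which is what the change asserts):

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc = nlp("hello world !")
span = doc[0:2]
span.label_, span.id_, span.kb_id_ = "GREETING", "id1", "Q1"
doc.spans["start"] = [span]
doc_bin = DocBin()
doc_bin.add(doc)
(restored,) = doc_bin.get_docs(nlp.vocab)
assert restored.spans["start"][0].kb_id_ == "Q1"
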
@ -1,7 +1,10 @@
|
||||||
from typing import Callable
|
from pathlib import Path
|
||||||
|
from typing import Callable, Iterable, Any, Dict
|
||||||
|
|
||||||
from spacy import util
|
import srsly
|
||||||
from spacy.util import ensure_path, registry, load_model_from_config
|
|
||||||
|
from spacy import util, Errors
|
||||||
|
from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList
|
||||||
from spacy.kb.kb_in_memory import InMemoryLookupKB
|
from spacy.kb.kb_in_memory import InMemoryLookupKB
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from thinc.api import Config
|
from thinc.api import Config
|
||||||
|
@ -91,7 +94,10 @@ def test_serialize_subclassed_kb():
|
||||||
|
|
||||||
[components.entity_linker]
|
[components.entity_linker]
|
||||||
factory = "entity_linker"
|
factory = "entity_linker"
|
||||||
|
|
||||||
|
[components.entity_linker.generate_empty_kb]
|
||||||
|
@misc = "kb_test.CustomEmptyKB.v1"
|
||||||
|
|
||||||
[initialize]
|
[initialize]
|
||||||
|
|
||||||
[initialize.components]
|
[initialize.components]
|
||||||
|
@@ -99,7 +105,7 @@ def test_serialize_subclassed_kb():
     [initialize.components.entity_linker]
 
     [initialize.components.entity_linker.kb_loader]
-    @misc = "spacy.CustomKB.v1"
+    @misc = "kb_test.CustomKB.v1"
     entity_vector_length = 342
     custom_field = 666
     """
@@ -109,10 +115,57 @@ def test_serialize_subclassed_kb():
             super().__init__(vocab, entity_vector_length)
             self.custom_field = custom_field
 
-    @registry.misc("spacy.CustomKB.v1")
+        def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
+            """We overwrite InMemoryLookupKB.to_disk() to ensure that self.custom_field is stored as well."""
+            path = ensure_path(path)
+            if not path.exists():
+                path.mkdir(parents=True)
+            if not path.is_dir():
+                raise ValueError(Errors.E928.format(loc=path))
+
+            def serialize_custom_fields(file_path: Path) -> None:
+                srsly.write_json(file_path, {"custom_field": self.custom_field})
+
+            serialize = {
+                "contents": lambda p: self.write_contents(p),
+                "strings.json": lambda p: self.vocab.strings.to_disk(p),
+                "custom_fields": lambda p: serialize_custom_fields(p),
+            }
+            util.to_disk(path, serialize, exclude)
+
+        def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
+            """We overwrite InMemoryLookupKB.from_disk() to ensure that self.custom_field is loaded as well."""
+            path = ensure_path(path)
+            if not path.exists():
+                raise ValueError(Errors.E929.format(loc=path))
+            if not path.is_dir():
+                raise ValueError(Errors.E928.format(loc=path))
+
+            def deserialize_custom_fields(file_path: Path) -> None:
+                self.custom_field = srsly.read_json(file_path)["custom_field"]
+
+            deserialize: Dict[str, Callable[[Any], Any]] = {
+                "contents": lambda p: self.read_contents(p),
+                "strings.json": lambda p: self.vocab.strings.from_disk(p),
+                "custom_fields": lambda p: deserialize_custom_fields(p),
+            }
+            util.from_disk(path, deserialize, exclude)
+
+    @registry.misc("kb_test.CustomEmptyKB.v1")
+    def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]:
+        def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
+            return SubInMemoryLookupKB(
+                vocab=vocab,
+                entity_vector_length=entity_vector_length,
+                custom_field=0,
+            )
+
+        return empty_kb_factory
+
+    @registry.misc("kb_test.CustomKB.v1")
     def custom_kb(
         entity_vector_length: int, custom_field: int
-    ) -> Callable[[Vocab], InMemoryLookupKB]:
+    ) -> Callable[[Vocab], SubInMemoryLookupKB]:
         def custom_kb_factory(vocab):
             kb = SubInMemoryLookupKB(
                 vocab=vocab,
@@ -139,6 +192,6 @@ def test_serialize_subclassed_kb():
     nlp2 = util.load_model_from_path(tmp_dir)
     entity_linker2 = nlp2.get_pipe("entity_linker")
     # After IO, the KB is the standard one
-    assert type(entity_linker2.kb) == InMemoryLookupKB
+    assert type(entity_linker2.kb) == SubInMemoryLookupKB
     assert entity_linker2.kb.entity_vector_length == 342
-    assert not hasattr(entity_linker2.kb, "custom_field")
+    assert entity_linker2.kb.custom_field == 666
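Taken together, these changes let a subclassed knowledge base survive a full save/load cycle instead of being silently downgraded to the stock `InMemoryLookupKB`. A minimal sketch of the registration side (the `my_project.*` names are illustrative, not part of spaCy):

```python
from typing import Callable

from spacy.kb.kb_in_memory import InMemoryLookupKB
from spacy.util import registry
from spacy.vocab import Vocab


class MyCustomKB(InMemoryLookupKB):
    # Illustrative subclass; real code would also override to_disk/from_disk
    # so extra fields survive serialization, as in the test above.
    def __init__(self, vocab, entity_vector_length, custom_field):
        super().__init__(vocab, entity_vector_length)
        self.custom_field = custom_field


@registry.misc("my_project.CustomEmptyKB.v1")
def empty_custom_kb() -> Callable[[Vocab, int], MyCustomKB]:
    def factory(vocab: Vocab, entity_vector_length: int):
        return MyCustomKB(vocab, entity_vector_length, custom_field=0)

    return factory
```

The registered function is then referenced from the pipeline config under `[components.entity_linker.generate_empty_kb]`, so deserialization constructs the subclass rather than the default KB.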
@@ -2,7 +2,6 @@ import os
 import math
 from collections import Counter
 from typing import Tuple, List, Dict, Any
-import pkg_resources
 import time
 from pathlib import Path
 
@@ -29,6 +28,7 @@ from spacy.cli.debug_data import _print_span_characteristics
 from spacy.cli.debug_data import _get_spans_length_freq_dist
 from spacy.cli.download import get_compatibility, get_version
 from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
+from spacy.cli.init_pipeline import _init_labels
 from spacy.cli.package import get_third_party_dependencies
 from spacy.cli.package import _is_permitted_package_name
 from spacy.cli.project.remote_storage import RemoteStorage
@@ -47,7 +47,6 @@ from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
 from spacy.training.converters import iob_to_docs
 from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config
 
-from ..cli.init_pipeline import _init_labels
 from .util import make_tempdir
 
 
@@ -553,7 +552,14 @@ def test_parse_cli_overrides():
 
 @pytest.mark.parametrize("lang", ["en", "nl"])
 @pytest.mark.parametrize(
-    "pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]]
+    "pipeline",
+    [
+        ["tagger", "parser", "ner"],
+        [],
+        ["ner", "textcat", "sentencizer"],
+        ["morphologizer", "spancat", "entity_linker"],
+        ["spancat_singlelabel", "textcat_multilabel"],
+    ],
 )
 @pytest.mark.parametrize("optimize", ["efficiency", "accuracy"])
 @pytest.mark.parametrize("pretraining", [True, False])
@@ -1126,6 +1132,7 @@ def test_cli_find_threshold(capsys):
     )
 
 
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
 @pytest.mark.parametrize(
     "reqs,output",
     [
@@ -1158,6 +1165,8 @@ def test_cli_find_threshold(capsys):
     ],
 )
 def test_project_check_requirements(reqs, output):
+    import pkg_resources
+
     # excessive guard against unlikely package name
     try:
         pkg_resources.require("spacyunknowndoesnotexist12345")
@@ -5,10 +5,18 @@ import srsly
 from typer.testing import CliRunner
 from spacy.tokens import DocBin, Doc
 
-from spacy.cli._util import app
+from spacy.cli._util import app, get_git_version
 from .util import make_tempdir, normalize_whitespace
 
 
+def has_git():
+    try:
+        get_git_version()
+        return True
+    except RuntimeError:
+        return False
+
+
 def test_convert_auto():
     with make_tempdir() as d_in, make_tempdir() as d_out:
         for f in ["data1.iob", "data2.iob", "data3.iob"]:
@@ -181,6 +189,7 @@ def test_project_run(project_dir):
     assert "okokok" in result.stdout
 
 
+@pytest.mark.skipif(not has_git(), reason="git not installed")
 @pytest.mark.parametrize(
     "options",
     [
@@ -275,6 +275,20 @@ def test_displacy_parse_deps(en_vocab):
         {"start": 2, "end": 3, "label": "det", "dir": "left"},
         {"start": 1, "end": 3, "label": "attr", "dir": "right"},
     ]
+    # Test that displacy.parse_deps converts Span to Doc
+    deps = displacy.parse_deps(doc[:])
+    assert isinstance(deps, dict)
+    assert deps["words"] == [
+        {"lemma": None, "text": words[0], "tag": pos[0]},
+        {"lemma": None, "text": words[1], "tag": pos[1]},
+        {"lemma": None, "text": words[2], "tag": pos[2]},
+        {"lemma": None, "text": words[3], "tag": pos[3]},
+    ]
+    assert deps["arcs"] == [
+        {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
+        {"start": 2, "end": 3, "label": "det", "dir": "left"},
+        {"start": 1, "end": 3, "label": "attr", "dir": "right"},
+    ]
 
 
 def test_displacy_invalid_arcs():
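In other words, `displacy.parse_deps` now accepts a `Span` as well as a `Doc`. A quick sketch of the new call (assuming an installed `en_core_web_sm` pipeline):

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")  # assumes this trained pipeline is installed
doc = nlp("This is a sentence")
# A Span such as doc[:] is converted to a Doc internally
deps = displacy.parse_deps(doc[:])
print(deps["words"])
print(deps["arcs"])
```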
@@ -2,17 +2,19 @@ from pathlib import Path
 import numpy as np
 import pytest
 import srsly
-from spacy.vocab import Vocab
-from thinc.api import Config
+from thinc.api import Config, get_current_ops
 
+from spacy import util
+from spacy.lang.en import English
+from spacy.training.initialize import init_nlp
+from spacy.training.loop import train
+from spacy.training.pretrain import pretrain
+from spacy.tokens import Doc, DocBin
+from spacy.language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH
+from spacy.ml.models.multi_task import create_pretrain_vectors
+from spacy.vectors import Vectors
+from spacy.vocab import Vocab
 from ..util import make_tempdir
-from ... import util
-from ...lang.en import English
-from ...training.initialize import init_nlp
-from ...training.loop import train
-from ...training.pretrain import pretrain
-from ...tokens import Doc, DocBin
-from ...language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH
 
 pretrain_string_listener = """
 [nlp]
@@ -163,7 +165,8 @@ def test_pretraining_default():
 
 
 @pytest.mark.parametrize("objective", CHAR_OBJECTIVES)
-def test_pretraining_tok2vec_characters(objective):
+@pytest.mark.parametrize("skip_last", (True, False))
+def test_pretraining_tok2vec_characters(objective, skip_last):
     """Test that pretraining works with the character objective"""
     config = Config().from_str(pretrain_string_listener)
     config["pretraining"]["objective"] = objective
@@ -176,10 +179,14 @@ def test_pretraining_tok2vec_characters(objective):
         filled["paths"]["raw_text"] = file_path
         filled = filled.interpolate()
         assert filled["pretraining"]["component"] == "tok2vec"
-        pretrain(filled, tmp_dir)
+        pretrain(filled, tmp_dir, skip_last=skip_last)
         assert Path(tmp_dir / "model0.bin").exists()
         assert Path(tmp_dir / "model4.bin").exists()
         assert not Path(tmp_dir / "model5.bin").exists()
+        if skip_last:
+            assert not Path(tmp_dir / "model-last.bin").exists()
+        else:
+            assert Path(tmp_dir / "model-last.bin").exists()
 
 
 @pytest.mark.parametrize("objective", VECTOR_OBJECTIVES)
@@ -235,6 +242,7 @@ def test_pretraining_tagger_tok2vec(config):
         pretrain(filled, tmp_dir)
         assert Path(tmp_dir / "model0.bin").exists()
         assert Path(tmp_dir / "model4.bin").exists()
+        assert Path(tmp_dir / "model-last.bin").exists()
         assert not Path(tmp_dir / "model5.bin").exists()
 
 
@@ -346,3 +354,26 @@ def write_vectors_model(tmp_dir):
     nlp = English(vocab)
     nlp.to_disk(nlp_path)
     return str(nlp_path)
+
+
+def test_pretrain_default_vectors():
+    nlp = English()
+    nlp.add_pipe("tok2vec")
+    nlp.initialize()
+
+    # default vectors are supported
+    nlp.vocab.vectors = Vectors(shape=(10, 10))
+    create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model)
+
+    # floret vectors are supported
+    nlp.vocab.vectors = Vectors(
+        data=get_current_ops().xp.zeros((10, 10)), mode="floret", hash_count=1
+    )
+    create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model)
+
+    # error for no vectors
+    with pytest.raises(ValueError, match="E875"):
+        nlp.vocab.vectors = Vectors()
+        create_pretrain_vectors(1, 1, "cosine")(
+            nlp.vocab, nlp.get_pipe("tok2vec").model
+        )
@@ -124,6 +124,10 @@ class DocBin:
         for key, group in doc.spans.items():
             for span in group:
                 self.strings.add(span.label_)
+                if span.kb_id in span.doc.vocab.strings:
+                    self.strings.add(span.kb_id_)
+                if span.id in span.doc.vocab.strings:
+                    self.strings.add(span.id_)
 
     def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
         """Recover Doc objects from the annotations, using the given vocab.
@@ -544,10 +544,6 @@ cdef class Doc:
 
         DOCS: https://spacy.io/api/doc#char_span
         """
-        if not isinstance(label, int):
-            label = self.vocab.strings.add(label)
-        if not isinstance(kb_id, int):
-            kb_id = self.vocab.strings.add(kb_id)
         alignment_modes = ("strict", "contract", "expand")
         if alignment_mode not in alignment_modes:
             raise ValueError(
@@ -1350,6 +1346,10 @@ cdef class Doc:
         for group in self.spans.values():
             for span in group:
                 strings.add(span.label_)
+                if span.kb_id in span.doc.vocab.strings:
+                    strings.add(span.kb_id_)
+                if span.id in span.doc.vocab.strings:
+                    strings.add(span.id_)
         # Msgpack doesn't distinguish between lists and tuples, which is
         # vexing for user data. As a best guess, we *know* that within
         # keys, we must have tuples. In values we just have to hope
@@ -460,9 +460,12 @@ cdef class Span:
                 start = i
             if start >= self.end:
                 break
-        if start < self.end:
-            yield Span(self.doc, start, self.end)
+            elif i == self.doc.length - 1:
+                yield Span(self.doc, start, self.doc.length)
+
+        # Ensure that trailing parts of the Span instance are included in last element of .sents.
+        if start == self.doc.length - 1:
+            yield Span(self.doc, start, self.doc.length)
 
     @property
     def ents(self):
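The practical effect: `Span.sents` no longer drops trailing tokens that sit after the last detected sentence start. A minimal sketch of the fixed behavior (assuming sentence boundaries come from the `sentencizer`):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc = nlp("First sentence. Second sentence. Trailing fragment")
# With this fix, the final fragment is yielded as part of the last sentence
sentence_texts = [sent.text for sent in doc[:].sents]
print(sentence_texts)
```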
@@ -24,6 +24,7 @@ def pretrain(
     epoch_resume: Optional[int] = None,
     use_gpu: int = -1,
     silent: bool = True,
+    skip_last: bool = False,
 ):
     msg = Printer(no_print=silent)
     if config["training"]["seed"] is not None:
@@ -60,10 +61,14 @@ def pretrain(
     row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
     msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
 
-    def _save_model(epoch, is_temp=False):
+    def _save_model(epoch, is_temp=False, is_last=False):
         is_temp_str = ".temp" if is_temp else ""
         with model.use_params(optimizer.averages):
-            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
+            if is_last:
+                save_path = output_dir / f"model-last.bin"
+            else:
+                save_path = output_dir / f"model{epoch}{is_temp_str}.bin"
+            with (save_path).open("wb") as file_:
                 file_.write(model.get_ref("tok2vec").to_bytes())
         log = {
             "nr_word": tracker.nr_word,
@@ -76,22 +81,26 @@ def pretrain(
 
     # TODO: I think we probably want this to look more like the
     # 'create_train_batches' function?
-    for epoch in range(epoch_resume, P["max_epochs"]):
-        for batch_id, batch in enumerate(batcher(corpus(nlp))):
-            docs = ensure_docs(batch)
-            loss = make_update(model, docs, optimizer, objective)
-            progress = tracker.update(epoch, loss, docs)
-            if progress:
-                msg.row(progress, **row_settings)
-            if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
-                _save_model(epoch, is_temp=True)
-
-        if P["n_save_epoch"]:
-            if epoch % P["n_save_epoch"] == 0 or epoch == P["max_epochs"] - 1:
-                _save_model(epoch)
-        else:
-            _save_model(epoch)
-        tracker.epoch_loss = 0.0
+    try:
+        for epoch in range(epoch_resume, P["max_epochs"]):
+            for batch_id, batch in enumerate(batcher(corpus(nlp))):
+                docs = ensure_docs(batch)
+                loss = make_update(model, docs, optimizer, objective)
+                progress = tracker.update(epoch, loss, docs)
+                if progress:
+                    msg.row(progress, **row_settings)
+                if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
+                    _save_model(epoch, is_temp=True)
+
+            if P["n_save_epoch"]:
+                if epoch % P["n_save_epoch"] == 0 or epoch == P["max_epochs"] - 1:
+                    _save_model(epoch)
+            else:
+                _save_model(epoch)
+            tracker.epoch_loss = 0.0
+    finally:
+        if not skip_last:
+            _save_model(P["max_epochs"], is_last=True)
 
 
 def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
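With this change, every pretraining run also writes `model-last.bin` on exit (even after an interrupt), unless `skip_last` is set. A sketch of calling `pretrain()` directly with the new flag (the config path and output directory are illustrative; the config must contain a filled `[pretraining]` block):

```python
from pathlib import Path

from thinc.api import Config
from spacy.training.pretrain import pretrain

config = Config().from_disk("pretrain.cfg").interpolate()  # illustrative path
output_dir = Path("pretrain_output")
output_dir.mkdir(exist_ok=True)
# skip_last=True suppresses the extra model-last.bin written on exit
pretrain(config, output_dir, skip_last=True)
```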
@@ -899,15 +899,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a
 | `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
 
-### spacy.EmptyKB.v1 {id="EmptyKB"}
+### spacy.EmptyKB.v1 {id="EmptyKB.v1"}
 
 A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
-instance. This is the default when a new entity linker component is created.
+instance.
 
 | Name | Description |
 | ---------------------- | ----------------------------------------------------------------------------------- |
 | `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ |
 
+### spacy.EmptyKB.v2 {id="EmptyKB"}
+
+A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
+instance. This is the default when a new entity linker component is created. It
+returns a `Callable[[Vocab, int], InMemoryLookupKB]`.
+
 ### spacy.KBFromFile.v1 {id="KBFromFile"}
 
 A function that reads an existing `KnowledgeBase` from file.
@@ -924,6 +930,15 @@ plausible [`Candidate`](/api/kb/#candidate) objects. The default
 `CandidateGenerator` uses the text of a mention to find its potential aliases in
 the `KnowledgeBase`. Note that this function is case-dependent.
 
+### spacy.CandidateBatchGenerator.v1 {id="CandidateBatchGenerator"}
+
+A function that takes as input a [`KnowledgeBase`](/api/kb) and an `Iterable` of
+[`Span`](/api/span) objects denoting named entities, and returns a list of
+plausible [`Candidate`](/api/kb/#candidate) objects per specified
+[`Span`](/api/span). The default `CandidateBatchGenerator` uses the text of a
+mention to find its potential aliases in the `KnowledgeBase`. Note that this
+function is case-dependent.
+
 ## Coreference {id="coref-architectures",tag="experimental"}
 
 A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to
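Since `spacy.EmptyKB.v2` is the new default for the entity linker's `generate_empty_kb` setting, the equivalent explicit configuration looks roughly like this (a sketch, assuming spaCy v3.6+):

```python
import spacy

nlp = spacy.blank("en")
entity_linker = nlp.add_pipe(
    "entity_linker",
    config={"generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"}},
)
```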
@@ -1122,17 +1122,18 @@ auto-generated by setting `--pretraining` on
 $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [--gpu-id] [overrides]
 ```
 
 | Name | Description |
-| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
 | `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
 | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
 | `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
 | `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
 | `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
+| `--skip-last`, `-L` <Tag variant="new">3.5.2</Tag> | Skip saving `model-last.bin`. Defaults to `False`. ~~bool (flag)~~ |
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
 | **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |
 
 ## evaluate {id="evaluate",version="2",tag="command"}
|
@ -1254,19 +1255,19 @@ be provided.
|
||||||
> $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f
|
> $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
||||||
| `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ |
|
| `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ |
|
||||||
| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ |
|
| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ |
|
||||||
| `threshold_key` | Key of threshold attribute in component's configuration. ~~str (positional)~~ |
|
| `threshold_key` | Key of threshold attribute in component's configuration. ~~str (positional)~~ |
|
||||||
| `scores_key` | Name of score to metric to optimize. ~~str (positional)~~ |
|
| `scores_key` | Name of score to metric to optimize. ~~str (positional)~~ |
|
||||||
| `--n_trials`, `-n` | Number of trials to determine optimal thresholds. ~~int (option)~~ |
|
| `--n_trials`, `-n` | Number of trials to determine optimal thresholds. ~~int (option)~~ |
|
||||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||||
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
||||||
| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
|
| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
|
||||||
| `--silent`, `-V`, `-VV` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~ |
|
||||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||||
|
|
||||||
## assemble {id="assemble",tag="command"}
|
## assemble {id="assemble",tag="command"}
|
||||||
|
|
||||||
|
|
|
@@ -64,7 +64,7 @@ details on the architectures and their arguments and hyperparameters.
 > config={
 >     "model": DEFAULT_COREF_MODEL,
 >     "span_cluster_prefix": DEFAULT_CLUSTER_PREFIX,
-> },
+> }
 > nlp.add_pipe("experimental_coref", config=config)
 > ```
 
@@ -68,24 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
 come directly from
 [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
 
 | Symbol | Description |
-| --------- | -------------------------------------------------------------------------------------------------------------------- |
+| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
 | `A < B` | `A` is the immediate dependent of `B`. |
 | `A > B` | `A` is the immediate head of `B`. |
 | `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
 | `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
 | `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
 | `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
 | `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
 | `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
 | `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
 | `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
 | `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
 | `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
+| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
+| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
 | `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
 | `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
+| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
+| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
 | `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
 | `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
 
 ## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"}
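A runnable sketch using the new `>+` (right immediate child) operator (assumes a parser-enabled English pipeline such as `en_core_web_sm`; exact matches depend on the parse):

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")
matcher = DependencyMatcher(nlp.vocab)
pattern = [
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
    # ">+": the object must be the verb's right immediate child (new in v3.5.1)
    {"LEFT_ID": "verb", "REL_OP": ">+", "RIGHT_ID": "obj", "RIGHT_ATTRS": {"DEP": "dobj"}},
]
matcher.add("VERB_OBJ", [pattern])
doc = nlp("She petted cats.")
matches = matcher(doc)
print(matches)
```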
@@ -53,19 +53,21 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("entity_linker", config=config)
 > ```
 
 | Setting | Description |
-| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
 | `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
 | `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
 | `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
 | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
 | `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
 | `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
 | `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
+| `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
+| `generate_empty_kb` <Tag variant="new">3.6</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
 | `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
 | `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
 | `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/entity_linker.py
@@ -13,8 +13,16 @@ A span categorizer consists of two parts: a [suggester function](#suggesters)
 that proposes candidate spans, which may or may not overlap, and a labeler model
 that predicts zero or more labels for each candidate.
 
-Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc.
-Individual span scores can be found in `spangroup.attrs["scores"]`.
+This component comes in two forms: `spancat` and `spancat_singlelabel` (added in
+spaCy v3.5.1). When you need to perform multi-label classification on your
+spans, use `spancat`. The `spancat` component uses a `Logistic` layer where the
+output class probabilities are independent for each class. However, if you need
+to predict at most one true class for a span, then use `spancat_singlelabel`. It
+uses a `Softmax` layer and treats the task as a multi-class problem.
+
+Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc
+under `doc.spans[spans_key]`, where `spans_key` is a component config setting.
+Individual span scores are stored in `doc.spans[spans_key].attrs["scores"]`.
 
 ## Assigned Attributes {id="assigned-attributes"}
 
@@ -22,7 +30,9 @@ Predictions will be saved to `Doc.spans[spans_key]` as a
 [`SpanGroup`](/api/spangroup). The scores for the spans in the `SpanGroup` will
 be saved in `SpanGroup.attrs["scores"]`.
 
-`spans_key` defaults to `"sc"`, but can be passed as a parameter.
+`spans_key` defaults to `"sc"`, but can be passed as a parameter. The `spancat`
+component will overwrite any existing spans under the spans key
+`doc.spans[spans_key]`.
 
 | Location | Value |
 | -------------------------------------- | -------------------------------------------------------- |
@@ -38,7 +48,7 @@ how the component should be configured. You can override its settings via the
 [model architectures](/api/architectures) documentation for details on the
 architectures and their arguments and hyperparameters.
 
-> #### Example
+> #### Example (spancat)
 >
 > ```python
 > from spacy.pipeline.spancat import DEFAULT_SPANCAT_MODEL
@@ -52,14 +62,33 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("spancat", config=config)
 > ```
 
+> #### Example (spancat_singlelabel)
+>
+> ```python
+> from spacy.pipeline.spancat import DEFAULT_SPANCAT_SINGLELABEL_MODEL
+> config = {
+>     "threshold": 0.5,
+>     "spans_key": "labeled_spans",
+>     "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
+>     "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
+>     # Additional spancat_singlelabel parameters
+>     "negative_weight": 0.8,
+>     "allow_overlap": True,
+> }
+> nlp.add_pipe("spancat_singlelabel", config=config)
+> ```
+
 | Setting | Description |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
 | `model` | A model instance that is given a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
 | `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
-| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ |
-| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ |
+| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Meant to be used in combination with the multi-class `spancat` component with a `Logistic` scoring layer. Defaults to `0.5`. ~~float~~ |
+| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. Meant to be used together with the `spancat` component and defaults to 0 with `spancat_singlelabel`. ~~Optional[int]~~ |
 | `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
+| `add_negative_label` <Tag variant="new">3.5.1</Tag> | Whether to learn to predict a special negative label for each unannotated `Span`. This should be `True` when using a `Softmax` classifier layer, and so it's `True` by default for `spancat_singlelabel`. Spans with negative labels and their scores are not stored as annotations. ~~bool~~ |
+| `negative_weight` <Tag variant="new">3.5.1</Tag> | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~ |
+| `allow_overlap` <Tag variant="new">3.5.1</Tag> | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~ |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/spancat.py
@@ -71,6 +100,7 @@ architectures and their arguments and hyperparameters.
 >
 > ```python
 > # Construction via add_pipe with default model
+> # Replace 'spancat' with 'spancat_singlelabel' for exclusive classes
 > spancat = nlp.add_pipe("spancat")
 >
 > # Construction via add_pipe with custom model
@@ -86,16 +116,19 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#create_pipe).
 
 | Name | Description |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| --------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `vocab` | The shared vocabulary. ~~Vocab~~ |
 | `model` | A model instance that is given a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
 | `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
 | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
 | _keyword-only_ | |
 | `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
 | `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ |
 | `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ |
+| `allow_overlap` <Tag variant="new">3.5.1</Tag> | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~ |
+| `add_negative_label` <Tag variant="new">3.5.1</Tag> | Whether to learn to predict a special negative label for each unannotated `Span`. This should be `True` when using a `Softmax` classifier layer, and so it's `True` by default for `spancat_singlelabel`. Spans with negative labels and their scores are not stored as annotations. ~~bool~~ |
+| `negative_weight` <Tag variant="new">3.5.1</Tag> | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~ |
 
 ## SpanCategorizer.\_\_call\_\_ {id="call",tag="method"}
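A runnable sketch of constructing the single-label variant with these settings (spaCy v3.5.1+; the label is illustrative):

```python
import spacy

nlp = spacy.blank("en")
# Softmax-based component for mutually exclusive span labels
spancat = nlp.add_pipe(
    "spancat_singlelabel",
    config={"spans_key": "sc", "negative_weight": 1.0, "allow_overlap": True},
)
spancat.add_label("CITY")  # illustrative label
```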
@@ -8,6 +8,13 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of
 integer IDs. This ensures that strings always map to the same ID, even from
 different `StringStores`.
 
+<Infobox variant="warning">
+
+Note that a `StringStore` instance is not static. It increases in size as texts
+with new tokens are processed.
+
+</Infobox>
+
 ## StringStore.\_\_init\_\_ {id="init",tag="method"}
 
 Create the `StringStore`.
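A short sketch illustrating that growth:

```python
import spacy

nlp = spacy.blank("en")
n_before = len(nlp.vocab.strings)
nlp("a text containing previously unseen tokens")
# Tokenizing new text adds the new token strings to the shared StringStore
assert len(nlp.vocab.strings) > n_before
```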
@@ -25,7 +25,10 @@ and call the package's own `load()` method. If a pipeline is loaded from a path,
 spaCy will assume it's a data directory, load its
 [`config.cfg`](/api/data-formats#config) and use the language and pipeline
 information to construct the `Language` class. The data will be loaded in via
-[`Language.from_disk`](/api/language#from_disk).
+[`Language.from_disk`](/api/language#from_disk). Loading a pipeline from a
+package will also import any custom code, if present, whereas loading from a
+directory does not. For these cases, you need to manually import your custom
+code.
 
 <Infobox variant="warning" title="Changed in v3.0">
 
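A minimal sketch of the manual import described above, assuming the custom factories live in a hypothetical module `my_components` and the pipeline was saved to `./output/model-best`:

```python
import spacy

# Importing the module runs its @Language.factory / @Language.component
# registrations, so the factories can be resolved when the config is loaded.
import my_components  # noqa: F401  (hypothetical module with custom code)

nlp = spacy.load("./output/model-best")  # unpackaged pipeline directory
```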
@@ -291,7 +294,7 @@ the `manual=True` argument in `displacy.render`.
 
 | Name        | Description |
 | ----------- | -------------------------------------------------------------------- |
-| `orig_doc`  | Doc to parse dependencies. ~~Doc~~ |
+| `orig_doc`  | Doc or span to parse dependencies. ~~Union[Doc, Span]~~ |
 | `options`   | Dependency parse specific visualisation options. ~~Dict[str, Any]~~ |
 | **RETURNS** | Generated dependency parse keyed by words and arcs. ~~dict~~ |
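With the `~~Union[Doc, Span]~~` type, a single sentence can now be passed straight to `parse_deps` without copying it into its own `Doc`. A minimal sketch (assuming the `en_core_web_sm` pipeline is installed):

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence. This is another sentence.")

# parse_deps now accepts a Span as well as a Doc.
parsed = displacy.parse_deps(next(doc.sents))
print(parsed["words"], parsed["arcs"])  # input for manual rendering
```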
@@ -354,22 +357,22 @@ If a setting is not present in the options, the default value will be used.
 > displacy.serve(doc, style="dep", options=options)
 > ```
 
 | Name               | Description |
-| ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- |
+| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `fine_grained`     | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
 | `add_lemma`        | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
 | `collapse_punct`   | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs from attaching to punctuation. Defaults to `True`. ~~bool~~ |
 | `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
 | `compact`          | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
-| `color`            | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
-| `bg`               | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
+| `color`            | Text color. Can be provided in any CSS-legal format as a string, e.g. `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
+| `bg`               | Background color. Can be provided in any CSS-legal format as a string, e.g. `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
 | `font`             | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
 | `offset_x`         | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ |
 | `arrow_stroke`     | Width of arrow path in px. Defaults to `2`. ~~int~~ |
 | `arrow_width`      | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ |
 | `arrow_spacing`    | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ |
 | `word_spacing`     | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ |
 | `distance`         | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ |
 
 #### Named Entity Visualizer options {id="displacy_options-ent"}
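A short sketch of passing the updated `color`/`bg` options; per the table above, any CSS-legal color string should work (`en_core_web_sm` assumed installed):

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")

options = {
    "compact": True,                 # square arrows, less horizontal space
    "color": "hsl(120, 100%, 25%)",  # dark green text
    "bg": "rgb(240, 240, 240)",      # light grey background
}
html = displacy.render(doc, style="dep", options=options)
```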
@@ -577,7 +580,7 @@ start decreasing across epochs.
 > ```ini
 > [training.logger]
 > @loggers = "spacy.ConsoleLogger.v3"
-> progress_bar = "all_steps"
+> progress_bar = "eval"
 > console_output = true
 > output_file = "training_log.jsonl"
 > ```
@@ -10,6 +10,13 @@ The `Vocab` object provides a lookup table that allows you to access
 [`StringStore`](/api/stringstore). It also owns underlying C-data that is shared
 between `Doc` objects.
 
+<Infobox variant="warning">
+
+Note that a `Vocab` instance is not static. It increases in size as texts with
+new tokens are processed.
+
+</Infobox>
+
 ## Vocab.\_\_init\_\_ {id="init",tag="method"}
 
 Create the vocabulary.
@@ -746,13 +746,16 @@ this by setting `initialize.init_tok2vec` to the filename of the `.bin` file
 that you want to use from pretraining.
 
 A pretraining step that runs for 5 epochs with an output path of `pretrain/`, as
-an example, produces `pretrain/model0.bin` through `pretrain/model4.bin`. To
-make use of the final output, you could fill in this value in your config file:
+an example, produces `pretrain/model0.bin` through `pretrain/model4.bin` plus a
+copy of the last iteration as `pretrain/model-last.bin`. Additionally, you can
+configure `n_save_epoch` to tell pretraining in which epoch interval it should
+save the current training progress. To use the final output to initialize your
+`tok2vec` layer, you could fill in this value in your config file:
 
 ```ini {title="config.cfg"}
 [paths]
-init_tok2vec = "pretrain/model4.bin"
+init_tok2vec = "pretrain/model-last.bin"
 
 [initialize]
 init_tok2vec = ${paths.init_tok2vec}
 ```
@@ -1096,20 +1096,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
 come directly from
 [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
 
 | Symbol                                  | Description |
-| --------- | -------------------------------------------------------------------------------------------------------------------- |
+| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
 | `A < B`                                 | `A` is the immediate dependent of `B`. |
 | `A > B`                                 | `A` is the immediate head of `B`. |
 | `A << B`                                | `A` is the dependent in a chain to `B` following dep → head paths. |
 | `A >> B`                                | `A` is the head in a chain to `B` following head → dep paths. |
 | `A . B`                                 | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
 | `A .* B`                                | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
 | `A ; B`                                 | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
 | `A ;* B`                                | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
 | `A $+ B`                                | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
 | `A $- B`                                | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
 | `A $++ B`                               | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
 | `A $-- B`                               | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
+| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
+| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
+| `A >++ B`                               | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
+| `A >-- B`                               | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
+| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
+| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
+| `A <++ B`                               | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
+| `A <-- B`                               | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
 
 ### Designing dependency matcher patterns {id="dependencymatcher-patterns"}
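For context, a minimal sketch of one of the new operators (`>++`, "B is a right child of A"), assuming spaCy v3.5.1+ and the `en_core_web_sm` pipeline:

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")
matcher = DependencyMatcher(nlp.vocab)

pattern = [
    # Anchor token: any verb.
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
    # ">++" requires "object" to be a child of "verb" that sits to its right.
    {
        "LEFT_ID": "verb",
        "REL_OP": ">++",
        "RIGHT_ID": "object",
        "RIGHT_ATTRS": {"DEP": "dobj"},
    },
]
matcher.add("VERB_OBJECT", [pattern])

doc = nlp("She found a solution")
for match_id, token_ids in matcher(doc):
    print([doc[i].text for i in token_ids])  # e.g. ['found', 'solution']
```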
@@ -1674,6 +1682,8 @@ def expand_person_entities(doc):
             if prev_token.text in ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms."):
                 new_ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
                 new_ents.append(new_ent)
+            else:
+                new_ents.append(ent)
         else:
             new_ents.append(ent)
     doc.ents = new_ents
@@ -758,6 +758,15 @@ any custom architectures, functions or
 your pipeline and registered when it's loaded. See the documentation on
 [saving and loading pipelines](/usage/saving-loading#models-custom) for details.
 
+<Infobox variant="warning">
+
+Note that the unpackaged models produced by `spacy train` are data directories
+that **do not include custom code**. You need to import the code in your script
+before loading in unpackaged models. For more details, see
+[`spacy.load`](/api/top-level#spacy.load).
+
+</Infobox>
+
 #### Example: Modifying the nlp object {id="custom-code-nlp-callbacks"}
 
 For many use cases, you don't necessarily want to implement the whole `Language`
@@ -58,12 +58,12 @@ arcs.
 
 </Infobox>
 
 | Argument  | Description |
-| --------- | ------------------------------------------------------------------------------------------ |
+| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
-| `color`   | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
-| `bg`      | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
+| `color`   | Text color. Can be provided in any CSS-legal format as a string, e.g. `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
+| `bg`      | Background color. Can be provided in any CSS-legal format as a string, e.g. `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
 | `font`    | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
 
 For a list of all available options, see the
 [`displacy` API documentation](/api/top-level#displacy_options).
@@ -3215,6 +3215,51 @@
             "category": ["pipeline"],
             "tags": ["syllables", "multilingual"]
         },
+        {
+            "id": "sentimental-onix",
+            "title": "Sentimental Onix",
+            "slogan": "Use onnx for sentiment models",
+            "description": "spaCy pipeline component for sentiment analysis using onnx",
+            "github": "sloev/sentimental-onix",
+            "pip": "sentimental-onix",
+            "code_example": [
+                "# Download model:",
+                "# python -m sentimental_onix download en",
+                "import spacy",
+                "from sentimental_onix import pipeline",
+                "",
+                "nlp = spacy.load(\"en_core_web_sm\")",
+                "nlp.add_pipe(\"sentencizer\")",
+                "nlp.add_pipe(\"sentimental_onix\", after=\"sentencizer\")",
+                "",
+                "sentences = [",
+                "    (sent.text, sent._.sentiment)",
+                "    for doc in nlp.pipe(",
+                "        [",
+                "            \"i hate pasta on tuesdays\",",
+                "            \"i like movies on wednesdays\",",
+                "            \"i find your argument ridiculous\",",
+                "            \"soda with straws are my favorite\",",
+                "        ]",
+                "    )",
+                "    for sent in doc.sents",
+                "]",
+                "",
+                "assert sentences == [",
+                "    (\"i hate pasta on tuesdays\", \"Negative\"),",
+                "    (\"i like movies on wednesdays\", \"Positive\"),",
+                "    (\"i find your argument ridiculous\", \"Negative\"),",
+                "    (\"soda with straws are my favorite\", \"Positive\"),",
+                "]"
+            ],
+            "thumb": "https://raw.githubusercontent.com/sloev/sentimental-onix/master/.github/onix.webp",
+            "author": "Johannes Valbjørn",
+            "author_links": {
+                "github": "sloev"
+            },
+            "category": ["pipeline"],
+            "tags": ["sentiment", "english"]
+        },
         {
             "id": "gobbli",
             "title": "gobbli",
@@ -6,6 +6,7 @@
         "dev": "next dev",
         "build": "next build && npm run sitemap && next export",
         "prebuild": "pip install -r setup/requirements.txt && sh setup/setup.sh",
+        "predev": "npm run prebuild",
         "sitemap": "next-sitemap --config next-sitemap.config.mjs",
         "start": "next start",
         "lint": "next lint",
@@ -111,11 +111,12 @@
     line-height: var(--line-height-xs)
     text-align: center
 
-@include breakpoint(max, xs)
-    .list
+@include breakpoint(max, md)
+    .alert
         display: none
 
-    .alert
+@include breakpoint(max, xs)
+    .list
         display: none
 
 .has-alert
@@ -57,9 +57,15 @@ const AlertSpace = ({ nightly, legacy }) => {
   )
 }
 
+// const navAlert = (
+//   <Link to="/usage/v3-5" noLinkLayout>
+//     <strong>💥 Out now:</strong> spaCy v3.5
+//   </Link>
+// )
+
 const navAlert = (
-  <Link to="/usage/v3-5" noLinkLayout>
-    <strong>💥 Out now:</strong> spaCy v3.5
+  <Link to="https://form.typeform.com/to/aMel9q9f" noLinkLayout>
+    <strong>💥 Take the user survey!</strong>
   </Link>
 )
 