Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-26 18:06:29 +03:00)

Commit 50c5e9a2dd — Merge remote-tracking branch 'upstream/master' into sync-v4-master-20230612

.github/azure-steps.yml (vendored) — file removed (129 lines)

@@ -1,129 +0,0 @@
parameters:
  python_version: ''
  architecture: 'x64'
  num_build_jobs: 2

steps:
  - task: UsePythonVersion@0
    inputs:
      versionSpec: ${{ parameters.python_version }}
      architecture: ${{ parameters.architecture }}
      allowUnstable: true

  - bash: |
      echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
    displayName: 'Set variables'

  - script: |
      python -m pip install -U build pip setuptools
      python -m pip install -U -r requirements.txt
    displayName: "Install dependencies"

  - script: |
      python -m build --sdist
    displayName: "Build sdist"

  - script: |
      python -m mypy spacy
    displayName: 'Run mypy'
    condition: ne(variables['python_version'], '3.6')

  - task: DeleteFiles@1
    inputs:
      contents: "spacy"
    displayName: "Delete source directory"

  - task: DeleteFiles@1
    inputs:
      contents: "*.egg-info"
    displayName: "Delete egg-info directory"

  - script: |
      python -m pip freeze > installed.txt
      python -m pip uninstall -y -r installed.txt
    displayName: "Uninstall all packages"

  - bash: |
      SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
      SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
    displayName: "Install from sdist"

  - script: |
      python -W error -c "import spacy"
    displayName: "Test import"

  # - script: |
  #     python -m spacy download ca_core_news_sm
  #     python -m spacy download ca_core_news_md
  #     python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
  #   displayName: 'Test download CLI'
  #   condition: eq(variables['python_version'], '3.8')
  #
  # - script: |
  #     python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
  #   displayName: 'Test no warnings on load (#11713)'
  #   condition: eq(variables['python_version'], '3.8')
  #
  # - script: |
  #     python -m spacy download ca_core_news_sm 2>&1 | grep -q skipping
  #   displayName: 'Test skip re-download (#12188)'
  #   condition: eq(variables['python_version'], '3.8')

  # - script: |
  #     python -W error -m spacy info ca_core_news_sm | grep -q download_url
  #   displayName: 'Test download_url in info CLI'
  #   condition: eq(variables['python_version'] '3.8')

  - script: |
      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
    displayName: 'Test convert CLI'
    condition: eq(variables['python_version'], '3.8')

  - script: |
      python -m spacy init config -p ner -l ca ner.cfg
      python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
    displayName: 'Test debug config CLI'
    condition: eq(variables['python_version'], '3.8')

  - script: |
      # will have errors due to sparse data, check for summary in output
      python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
    displayName: 'Test debug data CLI'
    condition: eq(variables['python_version'], '3.8')

  - script: |
      python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
    displayName: 'Test train CLI'
    condition: eq(variables['python_version'], '3.8')

  # - script: |
  #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
  #     PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
  #   displayName: 'Test assemble CLI'
  #   condition: eq(variables['python_version'], '3.8')
  #
  # - script: |
  #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
  #     python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
  #   displayName: 'Test assemble CLI vectors warning'
  #   condition: eq(variables['python_version'], '3.8')

  - script: |
      python -m pip install -U -r requirements.txt
    displayName: "Install test requirements"

  - script: |
      python -m pytest --pyargs spacy -W error
    displayName: "Run CPU tests"

  - script: |
      python -m pip install 'spacy[apple]'
      python -m pytest --pyargs spacy
    displayName: "Run CPU tests with thinc-apple-ops"
    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))

  - script: |
      python .github/validate_universe_json.py website/meta/universe.json
    displayName: 'Test website/meta/universe.json'
    condition: eq(variables['python_version'], '3.8')
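
The "Install from sdist" step above relies on picking the single archive that `python -m build --sdist` wrote into ./dist. A minimal Python sketch of that step, assuming exactly one archive is present (the sorted() call is an addition for determinism, not part of the original step):

    import os
    import subprocess

    # pick the archive produced by `python -m build --sdist`; the CI step above
    # uses os.listdir("./dist")[-1] directly
    sdist = sorted(os.listdir("./dist"))[-1]
    subprocess.run(["python", "-m", "pip", "install", f"dist/{sdist}"], check=True)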

.github/workflows/autoblack.yml (vendored) — file removed (45 lines)

@@ -1,45 +0,0 @@
# GitHub Action that uses Black to reformat all Python code and submits a PR
# in regular intervals. Inspired by: https://github.com/cclauss/autoblack

name: autoblack
on:
  workflow_dispatch: # allow manual trigger
  schedule:
    - cron: '0 8 * * 5' # every Friday at 8am UTC

jobs:
  autoblack:
    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          ref: ${{ github.head_ref }}
      - uses: actions/setup-python@v4
      - run: pip install black -c requirements.txt
      - name: Auto-format code if needed
        run: black spacy
      # We can't run black --check here because that returns a non-zero excit
      # code and makes GitHub think the action failed
      - name: Check for modified files
        id: git-check
        run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT

      - name: Create Pull Request
        if: steps.git-check.outputs.modified == 'true'
        uses: peter-evans/create-pull-request@v4
        with:
          title: Auto-format code with black
          labels: meta
          commit-message: Auto-format code with black
          committer: GitHub <noreply@github.com>
          author: explosion-bot <explosion-bot@users.noreply.github.com>
          body: _This PR is auto-generated._
          branch: autoblack
          delete-branch: true
          draft: false
      - name: Check outputs
        if: steps.git-check.outputs.modified == 'true'
        run: |
          echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
          echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"

.github/workflows/explosionbot.yml (vendored) — 1 line added

@@ -8,6 +8,7 @@ on:
 jobs:
   explosion-bot:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
       - name: Dump GitHub context

.github/workflows/issue-manager.yml (vendored) — 1 line added

@@ -13,6 +13,7 @@ on:
 jobs:
   issue-manager:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
       - uses: tiangolo/issue-manager@0.4.0

.github/workflows/lock.yml (vendored) — 1 line added

@@ -13,6 +13,7 @@ concurrency:
 jobs:
   action:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
       - uses: dessant/lock-threads@v4

.github/workflows/spacy_universe_alert.yml (vendored) — 1 line added

@@ -7,6 +7,7 @@ on:
 jobs:
   build:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest

     steps:

.github/workflows/tests.yml (vendored) — new file (174 lines)

@@ -0,0 +1,174 @@
name: tests

on:
  push:
    branches-ignore:
      - "spacy.io"
      - "nightly.spacy.io"
      - "v2.spacy.io"
    paths-ignore:
      - "*.md"
      - "*.mdx"
      - "website/**"
      - ".github/workflows/**"
  pull_request:
    types: [opened, synchronize, reopened, edited]
    paths-ignore:
      - "*.md"
      - "*.mdx"
      - "website/**"

jobs:
  validate:
    name: Validate
    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    steps:
      - name: Check out repo
        uses: actions/checkout@v3

      - name: Configure Python version
        uses: actions/setup-python@v4
        with:
          python-version: "3.7"
          architecture: x64

      - name: black
        run: |
          python -m pip install black -c requirements.txt
          python -m black spacy --check
      - name: flake8
        run: |
          python -m pip install flake8==5.0.4
          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics

  tests:
    name: Test
    needs: Validate
    strategy:
      fail-fast: true
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]
        python_version: ["3.11"]
        include:
          - os: ubuntu-20.04
            python_version: "3.6"
          - os: windows-latest
            python_version: "3.7"
          - os: macos-latest
            python_version: "3.8"
          - os: ubuntu-latest
            python_version: "3.9"
          - os: windows-latest
            python_version: "3.10"

    runs-on: ${{ matrix.os }}

    steps:
      - name: Check out repo
        uses: actions/checkout@v3

      - name: Configure Python version
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python_version }}
          architecture: x64

      - name: Install dependencies
        run: |
          python -m pip install -U build pip setuptools
          python -m pip install -U -r requirements.txt

      - name: Build sdist
        run: |
          python -m build --sdist

      - name: Run mypy
        run: |
          python -m mypy spacy
        if: matrix.python_version != '3.6'

      - name: Delete source directory and .egg-info
        run: |
          rm -rf spacy *.egg-info
        shell: bash

      - name: Uninstall all packages
        run: |
          python -m pip freeze
          python -m pip freeze --exclude pywin32 > installed.txt
          python -m pip uninstall -y -r installed.txt

      - name: Install from sdist
        run: |
          SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
          SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
        shell: bash

      - name: Test import
        run: python -W error -c "import spacy"

      # - name: "Test download CLI"
      #   run: |
      #     python -m spacy download ca_core_news_sm
      #     python -m spacy download ca_core_news_md
      #     python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
      #   if: matrix.python_version == '3.9'
      #
      # - name: "Test download_url in info CLI"
      #   run: |
      #     python -W error -m spacy info ca_core_news_sm | grep -q download_url
      #   if: matrix.python_version == '3.9'
      #
      # - name: "Test no warnings on load (#11713)"
      #   run: |
      #     python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
      #   if: matrix.python_version == '3.9'

      - name: "Test convert CLI"
        run: |
          python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
        if: matrix.python_version == '3.9'

      - name: "Test debug config CLI"
        run: |
          python -m spacy init config -p ner -l ca ner.cfg
          python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
        if: matrix.python_version == '3.9'

      - name: "Test debug data CLI"
        run: |
          # will have errors due to sparse data, check for summary in output
          python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
        if: matrix.python_version == '3.9'

      - name: "Test train CLI"
        run: |
          python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
        if: matrix.python_version == '3.9'

      # - name: "Test assemble CLI"
      #   run: |
      #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
      #     PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
      #   if: matrix.python_version == '3.9'
      #
      # - name: "Test assemble CLI vectors warning"
      #   run: |
      #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
      #     python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
      #   if: matrix.python_version == '3.9'

      - name: "Install test requirements"
        run: |
          python -m pip install -U -r requirements.txt

      - name: "Run CPU tests"
        run: |
          python -m pytest --pyargs spacy -W error
        if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.11')"

      - name: "Run CPU tests with thinc-apple-ops"
        run: |
          python -m pip install 'spacy[apple]'
          python -m pytest --pyargs spacy
        if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11'

.github/workflows/universe_validation.yml (vendored) — new file (33 lines)

@@ -0,0 +1,33 @@
name: universe validation

on:
  push:
    branches-ignore:
      - "spacy.io"
      - "nightly.spacy.io"
      - "v2.spacy.io"
    paths:
      - "website/meta/universe.json"
  pull_request:
    types: [opened, synchronize, reopened, edited]
    paths:
      - "website/meta/universe.json"

jobs:
  validate:
    name: Validate
    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    steps:
      - name: Check out repo
        uses: actions/checkout@v3

      - name: Configure Python version
        uses: actions/setup-python@v4
        with:
          python-version: "3.7"
          architecture: x64

      - name: Validate website/meta/universe.json
        run: |
          python .github/validate_universe_json.py website/meta/universe.json

README.md (32 changed lines)

@@ -16,6 +16,9 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
 open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
 
+💥 **We'd love to hear more about your experience with spaCy!**
+[Fill out our survey here.](https://form.typeform.com/to/aMel9q9f)
+
 💫 **Version 3.5 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
 
@@ -32,19 +35,20 @@ open-source software, released under the [MIT license](https://github.com/explos
 
 ## 📖 Documentation
 
 | Documentation | |
-| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ----------------------------- | ---------------------------------------------------------------------- |
 | ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
 | 📚 **[Usage Guides]** | How to use spaCy and its features. |
 | 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
 | 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
 | 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
 | 📦 **[Models]** | Download trained pipelines for spaCy. |
 | 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
+| ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
 | 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
 | 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
 | 🛠 **[Changelog]** | Changes and version history. |
 | 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
 | <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
 | <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |
 
@@ -54,13 +58,13 @@ open-source software, released under the [MIT license](https://github.com/explos
 [api reference]: https://spacy.io/api/
 [models]: https://spacy.io/models
 [universe]: https://spacy.io/universe
+[spaCy VS Code Extension]: https://github.com/explosion/spacy-vscode
 [videos]: https://www.youtube.com/c/ExplosionAI
 [online course]: https://course.spacy.io
 [project templates]: https://github.com/explosion/projects
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
 
 
 ## 💬 Where to ask questions
 
 The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).

(file header not captured in the mirror) — file removed (99 lines); the content is the Azure Pipelines CI configuration

@@ -1,99 +0,0 @@
trigger:
  batch: true
  branches:
    include:
      - "*"
    exclude:
      - "spacy.io"
      - "nightly.spacy.io"
      - "v2.spacy.io"
  paths:
    exclude:
      - "website/*"
      - "*.md"
      - "*.mdx"
      - ".github/workflows/*"
pr:
  paths:
    exclude:
      - "*.md"
      - "*.mdx"
      - "website/docs/*"
      - "website/src/*"
      - "website/meta/*.tsx"
      - "website/meta/*.mjs"
      - "website/meta/languages.json"
      - "website/meta/site.json"
      - "website/meta/sidebars.json"
      - "website/meta/type-annotations.json"
      - "website/pages/*"
      - ".github/workflows/*"

jobs:
  # Check formatting and linting. Perform basic checks for most important errors
  # (syntax etc.) Uses the config defined in setup.cfg and overwrites the
  # selected codes.
  - job: "Validate"
    pool:
      vmImage: "ubuntu-latest"
    steps:
      - task: UsePythonVersion@0
        inputs:
          versionSpec: "3.8"
      - script: |
          pip install black -c requirements.txt
          python -m black spacy --check
        displayName: "black"
      - script: |
          pip install flake8==5.0.4
          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
        displayName: "flake8"

  - job: "Test"
    dependsOn: "Validate"
    strategy:
      matrix:
        # We're only running one platform per Python version to speed up builds
        # Python38Linux:
        #   imageName: "ubuntu-latest"
        #   python.version: "3.8"
        # Python38Windows:
        #   imageName: "windows-latest"
        #   python.version: "3.8"
        Python38Mac:
          imageName: "macos-latest"
          python.version: "3.8"
        Python39Linux:
          imageName: "ubuntu-latest"
          python.version: "3.9"
        # Python39Windows:
        #   imageName: "windows-latest"
        #   python.version: "3.9"
        # Python39Mac:
        #   imageName: "macos-latest"
        #   python.version: "3.9"
        # Python310Linux:
        #   imageName: "ubuntu-latest"
        #   python.version: "3.10"
        Python310Windows:
          imageName: "windows-latest"
          python.version: "3.10"
        # Python310Mac:
        #   imageName: "macos-latest"
        #   python.version: "3.10"
        Python311Linux:
          imageName: 'ubuntu-latest'
          python.version: '3.11'
        Python311Windows:
          imageName: 'windows-latest'
          python.version: '3.11'
        Python311Mac:
          imageName: 'macos-latest'
          python.version: '3.11'
      maxParallel: 4
    pool:
      vmImage: $(imageName)
    steps:
      - template: .github/azure-steps.yml
        parameters:
          python_version: '$(python.version)'

(file header not captured in the mirror; the hunk is from the pinned dependency list)

@@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
-typer>=0.3.0,<0.8.0
+typer>=0.3.0,<0.10.0
 pathy>=0.10.0
 smart-open>=5.2.1,<7.0.0
 # Third party dependencies

setup.cfg (46 changed lines)

@@ -30,6 +30,14 @@ project_urls =
 zip_safe = false
 include_package_data = true
 python_requires = >=3.8
+setup_requires =
+    cython>=0.25,<3.0
+    numpy>=1.15.0
+    # We also need our Cython packages here to compile against
+    cymem>=2.0.2,<2.1.0
+    preshed>=3.0.2,<3.1.0
+    murmurhash>=0.28.0,<1.1.0
+    thinc>=9.0.0.dev2,<9.1.0
 install_requires =
     # Our libraries
     spacy-legacy>=4.0.0.dev0,<4.1.0
@@ -42,7 +50,7 @@ install_requires =
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     # Third-party dependencies
-    typer>=0.3.0,<0.8.0
+    typer>=0.3.0,<0.10.0
     pathy>=0.10.0
     smart-open>=5.2.1,<7.0.0
     tqdm>=4.38.0,<5.0.0
@@ -67,41 +75,41 @@ transformers =
 ray =
     spacy_ray>=0.1.0,<1.0.0
 cuda =
-    cupy>=5.0.0b4,<12.0.0
+    cupy>=5.0.0b4,<13.0.0
 cuda80 =
-    cupy-cuda80>=5.0.0b4,<12.0.0
+    cupy-cuda80>=5.0.0b4,<13.0.0
 cuda90 =
-    cupy-cuda90>=5.0.0b4,<12.0.0
+    cupy-cuda90>=5.0.0b4,<13.0.0
 cuda91 =
-    cupy-cuda91>=5.0.0b4,<12.0.0
+    cupy-cuda91>=5.0.0b4,<13.0.0
 cuda92 =
-    cupy-cuda92>=5.0.0b4,<12.0.0
+    cupy-cuda92>=5.0.0b4,<13.0.0
 cuda100 =
-    cupy-cuda100>=5.0.0b4,<12.0.0
+    cupy-cuda100>=5.0.0b4,<13.0.0
 cuda101 =
-    cupy-cuda101>=5.0.0b4,<12.0.0
+    cupy-cuda101>=5.0.0b4,<13.0.0
 cuda102 =
-    cupy-cuda102>=5.0.0b4,<12.0.0
+    cupy-cuda102>=5.0.0b4,<13.0.0
 cuda110 =
-    cupy-cuda110>=5.0.0b4,<12.0.0
+    cupy-cuda110>=5.0.0b4,<13.0.0
 cuda111 =
-    cupy-cuda111>=5.0.0b4,<12.0.0
+    cupy-cuda111>=5.0.0b4,<13.0.0
 cuda112 =
-    cupy-cuda112>=5.0.0b4,<12.0.0
+    cupy-cuda112>=5.0.0b4,<13.0.0
 cuda113 =
-    cupy-cuda113>=5.0.0b4,<12.0.0
+    cupy-cuda113>=5.0.0b4,<13.0.0
 cuda114 =
-    cupy-cuda114>=5.0.0b4,<12.0.0
+    cupy-cuda114>=5.0.0b4,<13.0.0
 cuda115 =
-    cupy-cuda115>=5.0.0b4,<12.0.0
+    cupy-cuda115>=5.0.0b4,<13.0.0
 cuda116 =
-    cupy-cuda116>=5.0.0b4,<12.0.0
+    cupy-cuda116>=5.0.0b4,<13.0.0
 cuda117 =
-    cupy-cuda117>=5.0.0b4,<12.0.0
+    cupy-cuda117>=5.0.0b4,<13.0.0
 cuda11x =
-    cupy-cuda11x>=11.0.0,<12.0.0
+    cupy-cuda11x>=11.0.0,<13.0.0
 cuda-autodetect =
-    cupy-wheel>=11.0.0,<12.0.0
+    cupy-wheel>=11.0.0,<13.0.0
 apple =
     thinc-apple-ops>=0.1.0.dev0,<1.0.0
 # Language tokenizers with external dependencies

(file header not captured in the mirror; the hunks are from the `debug data` CLI)

@@ -7,6 +7,7 @@ import srsly
 from wasabi import Printer, MESSAGES, msg
 import typer
 import math
+import numpy
 
 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
 from ._util import import_code, debug_cli, _format_number
@@ -335,7 +336,7 @@ def debug_data(
             show=verbose,
         )
     else:
-        msg.good("Examples without ocurrences available for all labels")
+        msg.good("Examples without occurrences available for all labels")
 
     if "ner" in factory_names:
         # Get all unique NER labels present in the data
@@ -520,9 +521,13 @@ def debug_data(
 
     if "tagger" in factory_names:
         msg.divider("Part-of-speech Tagging")
-        label_list = [label for label in gold_train_data["tags"]]
-        model_labels = _get_labels_from_model(nlp, "tagger")
+        label_list, counts = zip(*gold_train_data["tags"].items())
         msg.info(f"{len(label_list)} label(s) in train data")
+        p = numpy.array(counts)
+        p = p / p.sum()
+        norm_entropy = (-p * numpy.log2(p)).sum() / numpy.log2(len(label_list))
+        msg.info(f"{norm_entropy} is the normalised label entropy")
+        model_labels = _get_labels_from_model(nlp, "tagger")
         labels = set(label_list)
         missing_labels = model_labels - labels
         if missing_labels:
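
The normalised label entropy reported above is the Shannon entropy of the tag distribution divided by log2 of the number of labels, so 1.0 means a perfectly uniform distribution and values near 0 mean a heavily skewed one. A worked example with hypothetical counts:

    import numpy

    counts = numpy.array([900, 80, 20])  # hypothetical occurrences of three tags
    p = counts / counts.sum()
    norm_entropy = (-p * numpy.log2(p)).sum() / numpy.log2(len(counts))
    print(norm_entropy)  # ~0.34: well below 1.0, so the tag distribution is very skewed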

(file header not captured in the mirror; the hunk is from the `download` CLI)

@@ -83,11 +83,8 @@
 
 def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
     dl_tpl = "{m}-{v}/{m}-{v}{s}"
-    egg_tpl = "#egg={m}=={v}"
     suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
     filename = dl_tpl.format(m=model_name, v=version, s=suffix)
-    if sdist:
-        filename += egg_tpl.format(m=model_name, v=version)
     return filename
 
 
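
With the egg fragment removed, `get_model_filename` now just fills in the `dl_tpl` template with the model name, version and a suffix. A worked example; the suffix strings are assumptions standing in for SDIST_SUFFIX and WHEEL_SUFFIX, and the model name/version are placeholders:

    dl_tpl = "{m}-{v}/{m}-{v}{s}"

    # sdist download path (assuming SDIST_SUFFIX is ".tar.gz")
    print(dl_tpl.format(m="en_core_web_sm", v="3.5.0", s=".tar.gz"))
    # -> en_core_web_sm-3.5.0/en_core_web_sm-3.5.0.tar.gz

    # wheel download path (assuming WHEEL_SUFFIX is "-py3-none-any.whl")
    print(dl_tpl.format(m="en_core_web_sm", v="3.5.0", s="-py3-none-any.whl"))
    # -> en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl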

(file header not captured in the mirror; the hunks are from the `evaluate` CLI)

@@ -27,6 +27,7 @@ def evaluate_cli(
     gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
     displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
+    per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
     # fmt: on
 ):
     """
@@ -50,6 +51,7 @@ def evaluate_cli(
         gold_preproc=gold_preproc,
         displacy_path=displacy_path,
         displacy_limit=displacy_limit,
+        per_component=per_component,
         silent=False,
     )
 
@@ -64,6 +66,7 @@ def evaluate(
     displacy_limit: int = 25,
     silent: bool = True,
     spans_key: str = "sc",
+    per_component: bool = False,
 ) -> Dict[str, Any]:
     msg = Printer(no_print=silent, pretty=not silent)
     fix_random_seed()
@@ -78,50 +81,61 @@ def evaluate(
     corpus = Corpus(data_path, gold_preproc=gold_preproc)
     nlp = util.load_model(model)
     dev_dataset = list(corpus(nlp))
-    scores = nlp.evaluate(dev_dataset)
-    metrics = {
-        "TOK": "token_acc",
-        "TAG": "tag_acc",
-        "POS": "pos_acc",
-        "MORPH": "morph_acc",
-        "LEMMA": "lemma_acc",
-        "UAS": "dep_uas",
-        "LAS": "dep_las",
-        "NER P": "ents_p",
-        "NER R": "ents_r",
-        "NER F": "ents_f",
-        "TEXTCAT": "cats_score",
-        "SENT P": "sents_p",
-        "SENT R": "sents_r",
-        "SENT F": "sents_f",
-        "SPAN P": f"spans_{spans_key}_p",
-        "SPAN R": f"spans_{spans_key}_r",
-        "SPAN F": f"spans_{spans_key}_f",
-        "SPEED": "speed",
-    }
-    results = {}
-    data = {}
-    for metric, key in metrics.items():
-        if key in scores:
-            if key == "cats_score":
-                metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
-            if isinstance(scores[key], (int, float)):
-                if key == "speed":
-                    results[metric] = f"{scores[key]:.0f}"
-                else:
-                    results[metric] = f"{scores[key]*100:.2f}"
-            else:
-                results[metric] = "-"
-            data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
+    scores = nlp.evaluate(dev_dataset, per_component=per_component)
+    if per_component:
+        data = scores
+        if output is None:
+            msg.warn(
+                "The per-component option is enabled but there is no output JSON file provided to save the scores to."
+            )
+        else:
+            msg.info("Per-component scores will be saved to output JSON file.")
+    else:
+        metrics = {
+            "TOK": "token_acc",
+            "TAG": "tag_acc",
+            "POS": "pos_acc",
+            "MORPH": "morph_acc",
+            "LEMMA": "lemma_acc",
+            "UAS": "dep_uas",
+            "LAS": "dep_las",
+            "NER P": "ents_p",
+            "NER R": "ents_r",
+            "NER F": "ents_f",
+            "TEXTCAT": "cats_score",
+            "SENT P": "sents_p",
+            "SENT R": "sents_r",
+            "SENT F": "sents_f",
+            "SPAN P": f"spans_{spans_key}_p",
+            "SPAN R": f"spans_{spans_key}_r",
+            "SPAN F": f"spans_{spans_key}_f",
+            "SPEED": "speed",
+        }
+        results = {}
+        data = {}
+        for metric, key in metrics.items():
+            if key in scores:
+                if key == "cats_score":
+                    metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
+                if isinstance(scores[key], (int, float)):
+                    if key == "speed":
+                        results[metric] = f"{scores[key]:.0f}"
+                    else:
+                        results[metric] = f"{scores[key]*100:.2f}"
+                else:
+                    results[metric] = "-"
+                data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
 
     msg.table(results, title="Results")
     data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
 
     if displacy_path:
         factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
         docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit]))
         render_deps = "parser" in factory_names
         render_ents = "ner" in factory_names
+        render_spans = "spancat" in factory_names
 
         render_parses(
             docs,
             displacy_path,
@@ -129,6 +143,7 @@ def evaluate(
             limit=displacy_limit,
             deps=render_deps,
             ents=render_ents,
+            spans=render_spans,
         )
         msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
 
@@ -182,6 +197,7 @@ def render_parses(
     limit: int = 250,
     deps: bool = True,
     ents: bool = True,
+    spans: bool = True,
 ):
     docs[0].user_data["title"] = model_name
     if ents:
@@ -195,6 +211,11 @@ def render_parses(
         with (output_path / "parses.html").open("w", encoding="utf8") as file_:
             file_.write(html)
 
+    if spans:
+        html = displacy.render(docs[:limit], style="span", page=True)
+        with (output_path / "spans.html").open("w", encoding="utf8") as file_:
+            file_.write(html)
+
 
 def print_prf_per_type(
     msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str
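
The new `--per-component` / `-P` flag skips the aggregated metrics table and instead keeps the raw per-component scores returned by `nlp.evaluate(..., per_component=True)`, which is only useful together with an output JSON file. A hedged sketch of the equivalent Python call; the model name and data path are placeholders:

    import spacy
    from spacy.training import Corpus

    nlp = spacy.load("en_core_web_sm")                # placeholder pipeline
    corpus = Corpus("dev.spacy", gold_preproc=False)  # placeholder dev data
    examples = list(corpus(nlp))

    # with per_component=True the result is keyed by component name rather than
    # by flat metric names, per the change above
    scores = nlp.evaluate(examples, per_component=True)
    print(sorted(scores.keys()))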

(file header not captured in the mirror; the hunk is from the `find-threshold` CLI)

@@ -35,7 +35,7 @@ def find_threshold_cli(
     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
-    verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     # fmt: on
 ):
     """

(file header not captured in the mirror; the hunks are from the `pretrain` CLI)

@@ -23,6 +23,7 @@ def pretrain_cli(
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
     # fmt: on
 ):
     """
@@ -74,6 +75,7 @@ def pretrain_cli(
         epoch_resume=epoch_resume,
         use_gpu=use_gpu,
         silent=False,
+        skip_last=skip_last,
     )
     msg.good("Successfully finished pretrain")
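
Usage sketch for the new flag: `--skip-last` suppresses writing `model-last.bin` at the end of pretraining. The config path and output directory here are placeholders:

    import subprocess

    subprocess.run(
        ["python", "-m", "spacy", "pretrain", "config.cfg", "./pretrain_output", "--skip-last"],
        check=True,
    )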

(file header not captured in the mirror; the hunks are from the quickstart training config template)

@@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
 {%- set use_transformer = hardware != "cpu" and transformer_data -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
-{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
+{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
 [paths]
 train = null
 dev = null
@@ -24,8 +24,11 @@ gpu_allocator = null
 lang = "{{ lang }}"
 {%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
 {%- set with_accuracy = optimize == "accuracy" -%}
-{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
-{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%}
+{# The BOW textcat doesn't need a source of features, so it can omit the
+tok2vec/transformer. #}
+{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
+{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
+{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "span_finder" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
 {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
 {%- else -%}
 {%- set full_pipeline = components -%}
@@ -122,6 +125,30 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"
 {% endif -%}
 
+{% if "span_finder" in components -%}
+[components.span_finder]
+factory = "span_finder"
+max_length = null
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.span_finder.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{% endif -%}
+
 {% if "spancat" in components -%}
 [components.spancat]
 factory = "spancat"
@@ -154,6 +181,36 @@ grad_factor = 1.0
 sizes = [1,2,3]
 {% endif -%}
 
+{% if "spancat_singlelabel" in components %}
+[components.spancat_singlelabel]
+factory = "spancat_singlelabel"
+negative_weight = 1.0
+allow_overlap = true
+scorer = {"@scorers":"spacy.spancat_scorer.v1"}
+spans_key = "sc"
+
+[components.spancat_singlelabel.model]
+@architectures = "spacy.SpanCategorizer.v1"
+
+[components.spancat_singlelabel.model.reducer]
+@layers = "spacy.mean_max_reducer.v1"
+hidden_size = 128
+
+[components.spancat_singlelabel.model.scorer]
+@layers = "Softmax.v2"
+
+[components.spancat_singlelabel.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.spancat_singlelabel.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+
+[components.spancat_singlelabel.suggester]
+@misc = "spacy.ngram_suggester.v1"
+sizes = [1,2,3]
+{% endif %}
+
 {% if "trainable_lemmatizer" in components -%}
 [components.trainable_lemmatizer]
 factory = "trainable_lemmatizer"
@@ -219,10 +276,16 @@ no_output_layer = false
 
 {% else -%}
 [components.textcat.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatCNN.v2"
 exclusive_classes = true
-ngram_size = 1
-no_output_layer = false
+nO = null
+
+[components.textcat.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
 {%- endif %}
 {%- endif %}
 
@@ -250,10 +313,16 @@ no_output_layer = false
 
 {% else -%}
 [components.textcat_multilabel.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatCNN.v2"
 exclusive_classes = false
-ngram_size = 1
-no_output_layer = false
+nO = null
+
+[components.textcat_multilabel.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat_multilabel.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
 {%- endif %}
 {%- endif %}
 
@@ -284,6 +353,7 @@ maxout_pieces = 3
 {% if "morphologizer" in components %}
 [components.morphologizer]
 factory = "morphologizer"
+label_smoothing = 0.05
 
 [components.morphologizer.model]
 @architectures = "spacy.Tagger.v2"
@@ -297,6 +367,7 @@ width = ${components.tok2vec.model.encode.width}
 {% if "tagger" in components %}
 [components.tagger]
 factory = "tagger"
+label_smoothing = 0.05
 
 [components.tagger.model]
 @architectures = "spacy.Tagger.v2"
@@ -341,6 +412,27 @@ nO = null
 width = ${components.tok2vec.model.encode.width}
 {% endif %}
 
+{% if "span_finder" in components %}
+[components.span_finder]
+factory = "span_finder"
+max_length = null
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+{% endif %}
+
 {% if "spancat" in components %}
 [components.spancat]
 factory = "spancat"
@@ -370,6 +462,33 @@ width = ${components.tok2vec.model.encode.width}
 sizes = [1,2,3]
 {% endif %}
 
+{% if "spancat_singlelabel" in components %}
+[components.spancat_singlelabel]
+factory = "spancat_singlelabel"
+negative_weight = 1.0
+allow_overlap = true
+scorer = {"@scorers":"spacy.spancat_scorer.v1"}
+spans_key = "sc"
+
+[components.spancat_singlelabel.model]
+@architectures = "spacy.SpanCategorizer.v1"
+
+[components.spancat_singlelabel.model.reducer]
+@layers = "spacy.mean_max_reducer.v1"
+hidden_size = 128
+
+[components.spancat_singlelabel.model.scorer]
+@layers = "Softmax.v2"
+
+[components.spancat_singlelabel.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+
+[components.spancat_singlelabel.suggester]
+@misc = "spacy.ngram_suggester.v1"
+sizes = [1,2,3]
+{% endif %}
+
 {% if "trainable_lemmatizer" in components -%}
 [components.trainable_lemmatizer]
 factory = "trainable_lemmatizer"
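
Beyond the template, the two factories the new blocks configure can also be added to a pipeline directly. A hedged sketch, assuming a spaCy version in which the `span_finder` and `spancat_singlelabel` factories are registered; the config keys mirror the template above:

    import spacy

    nlp = spacy.blank("en")
    # both components share the "sc" spans key, as in the generated config
    nlp.add_pipe("span_finder", config={"spans_key": "sc"})
    nlp.add_pipe("spancat_singlelabel", config={"spans_key": "sc"})
    print(nlp.pipe_names)  # ['span_finder', 'spancat_singlelabel']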

(file header not captured in the mirror; the hunk is from the displaCy visualizer)

@@ -125,13 +125,17 @@ def app(environ, start_response):
     return [res]
 
 
-def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
+def parse_deps(
+    orig_doc: Union[Doc, Span], options: Dict[str, Any] = {}
+) -> Dict[str, Any]:
     """Generate dependency parse in {'words': [], 'arcs': []} format.
 
-    orig_doc (Doc): Document to parse.
+    orig_doc (Union[Doc, Span]): Document to parse.
     options (Dict[str, Any]): Dependency parse specific visualisation options.
     RETURNS (dict): Generated dependency parse keyed by words and arcs.
     """
+    if isinstance(orig_doc, Span):
+        orig_doc = orig_doc.as_doc()
     doc = Doc(orig_doc.vocab).from_bytes(
         orig_doc.to_bytes(exclude=["user_data", "user_hooks"])
     )
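
With this change a sentence Span can be passed straight to the dependency visualiser; `parse_deps` converts it to a Doc internally via `Span.as_doc()`. A small sketch; the model name is a placeholder and must be installed:

    import spacy
    from spacy import displacy
    from spacy.displacy import parse_deps

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("Autonomous cars shift insurance liability toward manufacturers. This is a second sentence.")
    sent = next(doc.sents)                     # a Span, not a Doc

    parsed = parse_deps(sent)                  # {'words': [...], 'arcs': [...]}
    html = displacy.render(sent, style="dep")  # renders just that sentence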

(file header not captured in the mirror; the hunks are from the Errors catalogue)

@@ -546,6 +546,8 @@ class Errors(metaclass=ErrorsWithCodes):
             "during training, make sure to include it in 'annotating components'")
 
     # New errors added in v3.x
+    E850 = ("The PretrainVectors objective currently only supports default or "
+            "floret vectors, not {mode} vectors.")
     E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
             "but found value of '{val}'.")
     E852 = ("The tar file pulled from the remote attempted an unsafe path "
@@ -955,6 +957,14 @@ class Errors(metaclass=ErrorsWithCodes):
             "with `displacy.serve(doc, port=port)`")
     E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
             "or use `auto_select_port=True` to pick an available port automatically.")
+    E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
+    E1052 = ("Unable to copy spans: the character offsets for the span at "
+             "index {i} in the span group do not align with the tokenization "
+             "in the target doc.")
+    E1053 = ("Both 'min_length' and 'max_length' should be larger than 0, but found"
+             " 'min_length': {min_length}, 'max_length': {max_length}")
+    E1054 = ("The text, including whitespace, must match between reference and "
+             "predicted docs when training {component}.")
 
     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")
@@ -2,12 +2,14 @@ from ...language import Language, BaseDefaults
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
+from .syntax_iterators import SYNTAX_ITERATORS


 class LatinDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     stop_words = STOP_WORDS
     lex_attr_getters = LEX_ATTRS
+    syntax_iterators = SYNTAX_ITERATORS


 class Latin(Language):
spacy/lang/la/examples.py (new file)
@@ -0,0 +1,22 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.la.examples import sentences
>>> docs = nlp.pipe(sentences)
"""

# > Caes. BG 1.1
# > Cic. De Amic. 1
# > V. Georg. 1.1-5
# > Gen. 1:1
# > Galileo, Sid. Nunc.
# > van Schurman, Opusc. arg. 1

sentences = [
    "Gallia est omnis divisa in partes tres, quarum unam incolunt Belgae, aliam Aquitani, tertiam qui ipsorum lingua Celtae, nostra Galli appellantur.",
    "Q. Mucius augur multa narrare de C. Laelio socero suo memoriter et iucunde solebat nec dubitare illum in omni sermone appellare sapientem.",
    "Quid faciat laetas segetes, quo sidere terram uertere, Maecenas, ulmisque adiungere uitis conueniat, quae cura boum, qui cultus habendo sit pecori, apibus quanta experientia parcis, hinc canere incipiam",
    "In principio creavit Deus caelum et terram.",
    "Quo sumpto, intelligatur lunaris globus, cuius maximus circulus CAF, centrum vero E, dimetiens CF, qui ad Terre diametrum est ut duo ad septem.",
    "Cuicunque natura indita sunt principia, seu potentiae principiorum omnium artium, ac scientiarum, ei conveniunt omnes artes ac scientiae.",
]
@@ -6,17 +6,16 @@ roman_numerals_compile = re.compile(
     r"(?i)^(?=[MDCLXVI])M*(C[MD]|D?C{0,4})(X[CL]|L?X{0,4})(I[XV]|V?I{0,4})$"
 )

-_num_words = set(
-    """
-unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem
-""".split()
-)
+_num_words = """unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem undecim duodecim tredecim quattuordecim quindecim sedecim septendecim duodeviginti undeviginti viginti triginta quadraginta quinquaginta sexaginta septuaginta octoginta nonaginta centum ducenti ducentae ducenta trecenti trecentae trecenta quadringenti quadringentae quadringenta quingenti quingentae quingenta sescenti sescentae sescenta septingenti septingentae septingenta octingenti octingentae octingenta nongenti nongentae nongenta mille
+""".split()
+
+_num_words += [item.replace("v", "u") for item in _num_words]
+_num_words = set(_num_words)

-_ordinal_words = set(
-    """
-primus prima primum secundus secunda secundum tertius tertia tertium
-""".split()
-)
+_ordinal_words = """primus prima primum secundus secunda secundum tertius tertia tertium quartus quarta quartum quintus quinta quintum sextus sexta sextum septimus septima septimum octavus octava octavum nonus nona nonum decimus decima decimum undecimus undecima undecimum duodecimus duodecima duodecimum duodevicesimus duodevicesima duodevicesimum undevicesimus undevicesima undevicesimum vicesimus vicesima vicesimum tricesimus tricesima tricesimum quadragesimus quadragesima quadragesimum quinquagesimus quinquagesima quinquagesimum sexagesimus sexagesima sexagesimum septuagesimus septuagesima septuagesimum octogesimus octogesima octogesimum nonagesimus nonagesima nonagesimum centesimus centesima centesimum ducentesimus ducentesima ducentesimum trecentesimus trecentesima trecentesimum quadringentesimus quadringentesima quadringentesimum quingentesimus quingentesima quingentesimum sescentesimus sescentesima sescentesimum septingentesimus septingentesima septingentesimum octingentesimus octingentesima octingentesimum nongentesimus nongentesima nongentesimum millesimus millesima millesimum""".split()
+
+_ordinal_words += [item.replace("v", "u") for item in _ordinal_words]
+_ordinal_words = set(_ordinal_words)


 def like_num(text):
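A minimal sketch of the effect of the extended number lists (assuming the Latin language data above is available): the u/v spelling variants are now recognised as number-like as well.

    import spacy

    nlp = spacy.blank("la")
    doc = nlp("partes tres , nouem , novem")
    print([(t.text, t.like_num) for t in doc])  # "tres", "nouem" and "novem" -> True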
spacy/lang/la/syntax_iterators.py (new file)
@@ -0,0 +1,85 @@
from typing import Union, Iterator, Tuple
from ...tokens import Doc, Span
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
from ...errors import Errors

# NB: Modified from da on suggestion from https://github.com/explosion/spaCy/issues/7457#issuecomment-800349751 [PJB]


def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    def is_verb_token(tok):
        return tok.pos in [VERB, AUX]

    def get_left_bound(root):
        left_bound = root
        for tok in reversed(list(root.lefts)):
            if tok.dep in np_left_deps:
                left_bound = tok
        return left_bound

    def get_right_bound(doc, root):
        right_bound = root
        for tok in root.rights:
            if tok.dep in np_right_deps:
                right = get_right_bound(doc, tok)
                if list(
                    filter(
                        lambda t: is_verb_token(t) or t.dep in stop_deps,
                        doc[root.i : right.i],
                    )
                ):
                    break
                else:
                    right_bound = right
        return right_bound

    def get_bounds(doc, root):
        return get_left_bound(root), get_right_bound(doc, root)

    doc = doclike.doc  # Ensure works on both Doc and Span.

    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)

    if not len(doc):
        return

    left_labels = [
        "det",
        "fixed",
        "nmod:poss",
        "amod",
        "flat",
        "goeswith",
        "nummod",
        "appos",
    ]
    right_labels = [
        "fixed",
        "nmod:poss",
        "amod",
        "flat",
        "goeswith",
        "nummod",
        "appos",
        "nmod",
        "det",
    ]
    stop_labels = ["punct"]

    np_label = doc.vocab.strings.add("NP")
    np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
    np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
    stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]

    prev_right = -1
    for token in doclike:
        if token.pos in [PROPN, NOUN, PRON]:
            left, right = get_bounds(doc, token)
            if left.i <= prev_right:
                continue
            yield left.i, right.i + 1, np_label
            prev_right = right.i


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
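A small sketch of the new noun_chunks iterator (hand-annotated toy parse, so no trained parser is needed; the POS and dependency values are illustrative):

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("la")
    doc = Doc(nlp.vocab, words=["Gallia", "est", "divisa"],
              pos=["PROPN", "AUX", "VERB"], heads=[2, 2, 2],
              deps=["nsubj", "aux", "ROOT"])
    print([chunk.text for chunk in doc.noun_chunks])  # ['Gallia']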
@@ -12,65 +12,15 @@ _exc = {
     "uobiscum": [{ORTH: "uobis"}, {ORTH: "cum"}],
 }

-for orth in [
-    "A.",
-    "Agr.",
-    "Ap.",
-    "C.",
-    "Cn.",
-    "D.",
-    "F.",
-    "K.",
-    "L.",
-    "M'.",
-    "M.",
-    "Mam.",
-    "N.",
-    "Oct.",
-    "Opet.",
-    "P.",
-    "Paul.",
-    "Post.",
-    "Pro.",
-    "Q.",
-    "S.",
-    "Ser.",
-    "Sert.",
-    "Sex.",
-    "St.",
-    "Sta.",
-    "T.",
-    "Ti.",
-    "V.",
-    "Vol.",
-    "Vop.",
-    "U.",
-    "Uol.",
-    "Uop.",
-    "Ian.",
-    "Febr.",
-    "Mart.",
-    "Apr.",
-    "Mai.",
-    "Iun.",
-    "Iul.",
-    "Aug.",
-    "Sept.",
-    "Oct.",
-    "Nov.",
-    "Nou.",
-    "Dec.",
-    "Non.",
-    "Id.",
-    "A.D.",
-    "Coll.",
-    "Cos.",
-    "Ord.",
-    "Pl.",
-    "S.C.",
-    "Suff.",
-    "Trib.",
-]:
+_abbrev_exc = """A. A.D. Aa. Aaa. Acc. Agr. Ap. Apr. April. A.U.C. Aug. C. Caes. Caess. Cc. Cn. Coll. Cons. Conss. Cos. Coss. D. D.N. Dat. Dd. Dec. Decemb. Decembr. F. Feb. Febr. Februar. Ian. Id. Imp. Impp. Imppp. Iul. Iun. K. Kal. L. M'. M. Mai. Mam. Mar. Mart. Med. N. Nn. Nob. Non. Nov. Novemb. Oct. Octob. Opet. Ord. P. Paul. Pf. Pl. Plur. Post. Pp. Prid. Pro. Procos. Q. Quint. S. S.C. Scr. Sept. Septemb. Ser. Sert. Sex. Sext. St. Sta. Suff. T. Ti. Trib. V. Vol. Vop. Vv.""".split()
+
+_abbrev_exc += [item.lower() for item in _abbrev_exc]
+_abbrev_exc += [item.upper() for item in _abbrev_exc]
+_abbrev_exc += [item.replace("v", "u").replace("V", "U") for item in _abbrev_exc]
+
+_abbrev_exc += ["d.N."]
+
+for orth in set(_abbrev_exc):
     _exc[orth] = [{ORTH: orth}]

 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
spacy/lang/ms/__init__.py (new file)
@@ -0,0 +1,24 @@
from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language, BaseDefaults


class MalayDefaults(BaseDefaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES
    syntax_iterators = SYNTAX_ITERATORS
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS


class Malay(Language):
    lang = "ms"
    Defaults = MalayDefaults


__all__ = ["Malay"]
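Assuming the "ms" entry point is registered (as it is once this module ships), a blank Malay pipeline can be created the usual way; this is only a smoke-test sketch.

    import spacy

    nlp = spacy.blank("ms")
    doc = nlp("Kuala Lumpur merupakan ibu negara Malaysia.")
    print([t.text for t in doc])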
spacy/lang/ms/_tokenizer_exceptions_list.py (new file, 1943 lines; diff too large to show)
spacy/lang/ms/examples.py (new file)
@@ -0,0 +1,17 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.ms.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Malaysia ialah sebuah negara yang terletak di Asia Tenggara.",
    "Berapa banyak pelajar yang akan menghadiri majlis perpisahan sekolah?",
    "Pengeluaran makanan berasal dari beberapa lokasi termasuk Cameron Highlands, Johor Bahru, dan Kuching.",
    "Syarikat XYZ telah menghasilkan 20,000 unit produk baharu dalam setahun terakhir",
    "Kuala Lumpur merupakan ibu negara Malaysia." "Kau berada di mana semalam?",
    "Siapa yang akan memimpin projek itu?",
    "Siapa perdana menteri Malaysia sekarang?",
]
spacy/lang/ms/lex_attrs.py (new file)
@@ -0,0 +1,66 @@
import unicodedata

from .punctuation import LIST_CURRENCY
from ...attrs import IS_CURRENCY, LIKE_NUM


_num_words = [
    "kosong",
    "satu",
    "dua",
    "tiga",
    "empat",
    "lima",
    "enam",
    "tujuh",
    "lapan",
    "sembilan",
    "sepuluh",
    "sebelas",
    "belas",
    "puluh",
    "ratus",
    "ribu",
    "juta",
    "billion",
    "trillion",
    "kuadrilion",
    "kuintilion",
    "sekstilion",
    "septilion",
    "oktilion",
    "nonilion",
    "desilion",
]


def like_num(text):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text.lower() in _num_words:
        return True
    if text.count("-") == 1:
        _, num = text.split("-")
        if num.isdigit() or num in _num_words:
            return True
    return False


def is_currency(text):
    if text in LIST_CURRENCY:
        return True

    for char in text:
        if unicodedata.category(char) != "Sc":
            return False
    return True


LEX_ATTRS = {IS_CURRENCY: is_currency, LIKE_NUM: like_num}
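A hedged sketch of the lexical attributes above in action (blank Malay pipeline; it is assumed the tokenizer keeps "RM" as its own token in this string):

    import spacy

    nlp = spacy.blank("ms")
    doc = nlp("RM 20,000 untuk dua puluh unit")
    print([(t.text, t.like_num, t.is_currency) for t in doc])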
spacy/lang/ms/punctuation.py (new file)
@@ -0,0 +1,61 @@
from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ..char_classes import ALPHA, merge_chars, split_chars, _currency, _units


_units = (
    _units + "s bit Gbps Mbps mbps Kbps kbps ƒ ppi px "
    "Hz kHz MHz GHz mAh "
    "ratus rb ribu ribuan "
    "juta jt jutaan mill?iar million bil[l]?iun bilyun billion "
)
_currency = _currency + r" USD RM MYR Rp IDR RMB SGD S\$"
_months = (
    "Januari Februari Mac April Mei Jun Julai Ogos September "
    "Oktober November Disember Januari Februari Mac Mei Jun "
    "Julai Ogos Oktober Disember Jan Feb Mac Jun Julai Ogos Sept "
    "Okt Nov Dis"
)


UNITS = merge_chars(_units)
CURRENCY = merge_chars(_currency)
HTML_PREFIX = r"<(b|strong|i|em|p|span|div|br)\s?/>|<a([^>]+)>"
HTML_SUFFIX = r"</(b|strong|i|em|p|span|div|a)>"
MONTHS = merge_chars(_months)
LIST_CURRENCY = split_chars(_currency)

_prefixes = list(TOKENIZER_PREFIXES)
_prefixes.remove("#")  # hashtag
_prefixes = _prefixes + LIST_CURRENCY + [HTML_PREFIX] + ["/", "—"]

_suffixes = (
    TOKENIZER_SUFFIXES
    + [r"\-[Nn]ya", "-[KkMm]u", "[—-]"]
    + [
        # disabled: variable width currency variable
        # r"(?<={c})(?:[0-9]+)".format(c=CURRENCY),
        r"(?<=[0-9])(?:{u})".format(u=UNITS),
        r"(?<=[0-9])%",
        # disabled: variable width HTML_SUFFIX variable
        # r"(?<=[0-9{a}]{h})(?:[\.,:-])".format(a=ALPHA, h=HTML_SUFFIX),
        r"(?<=[0-9{a}])(?:{h})".format(a=ALPHA, h=HTML_SUFFIX),
    ]
)

_infixes = TOKENIZER_INFIXES + [
    r"(?<=[0-9])[\\/](?=[0-9%-])",
    r"(?<=[0-9])%(?=[{a}0-9/])".format(a=ALPHA),
    # disabled: variable width units variable
    # r"(?<={u})[\/-](?=[0-9])".format(u=UNITS),
    # disabled: variable width months variable
    # r"(?<={m})[\/-](?=[0-9])".format(m=MONTHS),
    r'(?<=[0-9)][.,])"(?=[0-9])',
    r'(?<=[{a})][.,\'])["—](?=[{a}])'.format(a=ALPHA),
    r"(?<=[{a}])-(?=[0-9])".format(a=ALPHA),
    r"(?<=[0-9])-(?=[{a}])".format(a=ALPHA),
    r"(?<=[{a}])[\/-](?={c}|[{a}])".format(a=ALPHA, c=CURRENCY),
]


TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes
spacy/lang/ms/stop_words.py (new file)
@@ -0,0 +1,118 @@
STOP_WORDS = set(
    """
ada adalah adanya adapun agak agaknya agar akan akankah akhir akhiri akhirnya
aku akulah amat amatlah anda andalah antar antara antaranya apa apaan apabila
apakah apalagi apatah artinya asal asalkan atas atau ataukah ataupun awal
awalnya

bagai bagaikan bagaimana bagaimanakah bagaimanapun bagi bagian bahkan bahwa
bahwasanya baik bakal bakalan balik banyak bapak baru bawah beberapa begini
beginian beginikah beginilah begitu begitukah begitulah begitupun bekerja
belakang belakangan belum belumlah benar benarkah benarlah berada berakhir
berakhirlah berakhirnya berapa berapakah berapalah berapapun berarti berawal
berbagai berdatangan beri berikan berikut berikutnya berjumlah berkali-kali
berkata berkehendak berkeinginan berkenaan berlainan berlalu berlangsung
berlebihan bermacam bermacam-macam bermaksud bermula bersama bersama-sama
bersiap bersiap-siap bertanya bertanya-tanya berturut berturut-turut bertutur
berujar berupa besar betul betulkah biasa biasanya bila bilakah bisa bisakah
boleh bolehkah bolehlah buat bukan bukankah bukanlah bukannya bulan bung

cara caranya cukup cukupkah cukuplah cuma

dahulu dalam dan dapat dari daripada datang dekat demi demikian demikianlah
dengan depan di dia diakhiri diakhirinya dialah diantara diantaranya diberi
diberikan diberikannya dibuat dibuatnya didapat didatangkan digunakan
diibaratkan diibaratkannya diingat diingatkan diinginkan dijawab dijelaskan
dijelaskannya dikarenakan dikatakan dikatakannya dikerjakan diketahui
diketahuinya dikira dilakukan dilalui dilihat dimaksud dimaksudkan
dimaksudkannya dimaksudnya diminta dimintai dimisalkan dimulai dimulailah
dimulainya dimungkinkan dini dipastikan diperbuat diperbuatnya dipergunakan
diperkirakan diperlihatkan diperlukan diperlukannya dipersoalkan dipertanyakan
dipunyai diri dirinya disampaikan disebut disebutkan disebutkannya disini
disinilah ditambahkan ditandaskan ditanya ditanyai ditanyakan ditegaskan
ditujukan ditunjuk ditunjuki ditunjukkan ditunjukkannya ditunjuknya dituturkan
dituturkannya diucapkan diucapkannya diungkapkan dong dua dulu

empat enggak enggaknya entah entahlah

guna gunakan

hal hampir hanya hanyalah hari harus haruslah harusnya hendak hendaklah
hendaknya hingga

ia ialah ibarat ibaratkan ibaratnya ibu ikut ingat ingat-ingat ingin inginkah
inginkan ini inikah inilah itu itukah itulah

jadi jadilah jadinya jangan jangankan janganlah jauh jawab jawaban jawabnya
jelas jelaskan jelaslah jelasnya jika jikalau juga jumlah jumlahnya justru

kala kalau kalaulah kalaupun kalian kami kamilah kamu kamulah kan kapan
kapankah kapanpun karena karenanya kasus kata katakan katakanlah katanya ke
keadaan kebetulan kecil kedua keduanya keinginan kelamaan kelihatan
kelihatannya kelima keluar kembali kemudian kemungkinan kemungkinannya kenapa
kepada kepadanya kesampaian keseluruhan keseluruhannya keterlaluan ketika
khususnya kini kinilah kira kira-kira kiranya kita kitalah kok kurang

lagi lagian lah lain lainnya lalu lama lamanya lanjut lanjutnya lebih lewat
lima luar

macam maka makanya makin malah malahan mampu mampukah mana manakala manalagi
masa masalah masalahnya masih masihkah masing masing-masing mau maupun
melainkan melakukan melalui melihat melihatnya memang memastikan memberi
memberikan membuat memerlukan memihak meminta memintakan memisalkan memperbuat
mempergunakan memperkirakan memperlihatkan mempersiapkan mempersoalkan
mempertanyakan mempunyai memulai memungkinkan menaiki menambahkan menandaskan
menanti menanti-nanti menantikan menanya menanyai menanyakan mendapat
mendapatkan mendatang mendatangi mendatangkan menegaskan mengakhiri mengapa
mengatakan mengatakannya mengenai mengerjakan mengetahui menggunakan
menghendaki mengibaratkan mengibaratkannya mengingat mengingatkan menginginkan
mengira mengucapkan mengucapkannya mengungkapkan menjadi menjawab menjelaskan
menuju menunjuk menunjuki menunjukkan menunjuknya menurut menuturkan
menyampaikan menyangkut menyatakan menyebutkan menyeluruh menyiapkan merasa
mereka merekalah merupakan meski meskipun meyakini meyakinkan minta mirip
misal misalkan misalnya mula mulai mulailah mulanya mungkin mungkinkah

nah naik namun nanti nantinya nyaris nyatanya

oleh olehnya

pada padahal padanya pak paling panjang pantas para pasti pastilah penting
pentingnya per percuma perlu perlukah perlunya pernah persoalan pertama
pertama-tama pertanyaan pertanyakan pihak pihaknya pukul pula pun punya

rasa rasanya rata rupanya

saat saatnya saja sajalah saling sama sama-sama sambil sampai sampai-sampai
sampaikan sana sangat sangatlah satu saya sayalah se sebab sebabnya sebagai
sebagaimana sebagainya sebagian sebaik sebaik-baiknya sebaiknya sebaliknya
sebanyak sebegini sebegitu sebelum sebelumnya sebenarnya seberapa sebesar
sebetulnya sebisanya sebuah sebut sebutlah sebutnya secara secukupnya sedang
sedangkan sedemikian sedikit sedikitnya seenaknya segala segalanya segera
seharusnya sehingga seingat sejak sejauh sejenak sejumlah sekadar sekadarnya
sekali sekali-kali sekalian sekaligus sekalipun sekarang sekarang sekecil
seketika sekiranya sekitar sekitarnya sekurang-kurangnya sekurangnya sela
selain selaku selalu selama selama-lamanya selamanya selanjutnya seluruh
seluruhnya semacam semakin semampu semampunya semasa semasih semata semata-mata
semaunya sementara semisal semisalnya sempat semua semuanya semula sendiri
sendirian sendirinya seolah seolah-olah seorang sepanjang sepantasnya
sepantasnyalah seperlunya seperti sepertinya sepihak sering seringnya serta
serupa sesaat sesama sesampai sesegera sesekali seseorang sesuatu sesuatunya
sesudah sesudahnya setelah setempat setengah seterusnya setiap setiba setibanya
setidak-tidaknya setidaknya setinggi seusai sewaktu siap siapa siapakah
siapapun sini sinilah soal soalnya suatu sudah sudahkah sudahlah supaya

tadi tadinya tahu tahun tak tambah tambahnya tampak tampaknya tandas tandasnya
tanpa tanya tanyakan tanyanya tapi tegas tegasnya telah tempat tengah tentang
tentu tentulah tentunya tepat terakhir terasa terbanyak terdahulu terdapat
terdiri terhadap terhadapnya teringat teringat-ingat terjadi terjadilah
terjadinya terkira terlalu terlebih terlihat termasuk ternyata tersampaikan
tersebut tersebutlah tertentu tertuju terus terutama tetap tetapi tiap tiba
tiba-tiba tidak tidakkah tidaklah tiga tinggi toh tunjuk turut tutur tuturnya

ucap ucapnya ujar ujarnya umum umumnya ungkap ungkapnya untuk usah usai

waduh wah wahai waktu waktunya walau walaupun wong

yaitu yakin yakni yang
""".split()
)
spacy/lang/ms/syntax_iterators.py (new file)
@@ -0,0 +1,41 @@
from typing import Union, Iterator, Tuple

from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span


def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """
    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
    """
    # fmt: off
    labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = [doc.vocab.strings[label] for label in labels]
    conj = doc.vocab.strings.add("conj")
    np_label = doc.vocab.strings.add("NP")
    prev_end = -1
    for i, word in enumerate(doclike):
        if word.pos not in (NOUN, PROPN, PRON):
            continue
        # Prevent nested chunks from being produced
        if word.left_edge.i <= prev_end:
            continue
        if word.dep in np_deps:
            prev_end = word.right_edge.i
            yield word.left_edge.i, word.right_edge.i + 1, np_label
        elif word.dep == conj:
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            if head.dep in np_deps:
                prev_end = word.right_edge.i
                yield word.left_edge.i, word.right_edge.i + 1, np_label


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
spacy/lang/ms/tokenizer_exceptions.py (new file, 1533 lines; diff too large to show)
@@ -1,11 +1,14 @@
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from ...language import Language, BaseDefaults


 class SerbianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
spacy/lang/sr/punctuation.py (new file)
@@ -0,0 +1,36 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
from ..char_classes import CURRENCY, UNITS, PUNCT
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER


_infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    ]
)

_suffixes = (
    LIST_PUNCT
    + LIST_ELLIPSES
    + LIST_QUOTES
    + LIST_ICONS
    + [
        r"(?<=[0-9])\+",
        r"(?<=°[FfCcKk])\.",
        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
        r"(?<=[0-9])(?:{u})".format(u=UNITS),
        r"(?<=[{a}{e}{p}(?:{q})])\.".format(
            a=ALPHA, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
        ),
    ]
)

TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes
@@ -1,6 +1,6 @@
 from typing import Iterator, Optional, Any, Dict, Callable, Iterable, Literal
 from typing import Union, Tuple, List, Set, Pattern, Sequence
-from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload
+from typing import NoReturn, TypeVar, cast, overload

 from dataclasses import dataclass
 import random
@@ -1383,7 +1383,10 @@ class Language:
                 "No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
             )
             doc = Doc(self.vocab, words=["x", "y", "z"])
-            get_examples = lambda: [Example.from_dict(doc, {})]
+
+            def get_examples():
+                return [Example.from_dict(doc, {})]
+
         if not hasattr(get_examples, "__call__"):
             err = Errors.E930.format(
                 method="Language.initialize", obj=type(get_examples)
@@ -1488,6 +1491,7 @@ class Language:
         scorer: Optional[Scorer] = None,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         scorer_cfg: Optional[Dict[str, Any]] = None,
+        per_component: bool = False,
     ) -> Dict[str, Any]:
         """Evaluate a model's pipeline components.

@@ -1499,6 +1503,8 @@ class Language:
             arguments for specific components.
         scorer_cfg (dict): An optional dictionary with extra keyword arguments
             for the scorer.
+        per_component (bool): Whether to return the scores keyed by component
+            name. Defaults to False.

         RETURNS (Scorer): The scorer containing the evaluation results.

@@ -1531,7 +1537,7 @@ class Language:
         for eg, doc in zip(examples, docs):
             eg.predicted = doc
         end_time = timer()
-        results = scorer.score(examples)
+        results = scorer.score(examples, per_component=per_component)
         n_words = sum(len(eg.predicted) for eg in examples)
         results["speed"] = n_words / (end_time - start_time)
         return results
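A short sketch of the new per_component flag (assuming nlp and a list of dev Example objects are already available): instead of one flat score dict, the results are keyed by pipe name.

    flat = nlp.evaluate(dev_examples)                          # {"tag_acc": ..., "speed": ...}
    per_pipe = nlp.evaluate(dev_examples, per_component=True)
    print(per_pipe.keys())                                     # e.g. dict_keys(['tagger', 'parser', ...])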
@@ -24,7 +24,8 @@ class Lexeme:
    def orth_(self) -> str: ...
    @property
    def text(self) -> str: ...
-    lower: str
+    orth: int
+    lower: int
    norm: int
    shape: int
    prefix: int
@@ -186,7 +186,7 @@ cdef class Lexeme:
        return self.orth_

    property lower:
-        """RETURNS (str): Lowercase form of the lexeme."""
+        """RETURNS (uint64): Lowercase form of the lexeme."""
        def __get__(self):
            return self.c.lower
@@ -432,22 +432,22 @@ cdef class DependencyMatcher:
        return [doc[child.i] for child in doc[node].head.children if child.i < node]

    def _imm_right_child(self, doc, node):
-        for child in doc[node].children:
+        for child in doc[node].rights:
            if child.i == node + 1:
                return [doc[child.i]]
        return []

    def _imm_left_child(self, doc, node):
-        for child in doc[node].children:
+        for child in doc[node].lefts:
            if child.i == node - 1:
                return [doc[child.i]]
        return []

    def _right_child(self, doc, node):
-        return [doc[child.i] for child in doc[node].children if child.i > node]
+        return [child for child in doc[node].rights]

    def _left_child(self, doc, node):
-        return [doc[child.i] for child in doc[node].children if child.i < node]
+        return [child for child in doc[node].lefts]

    def _imm_right_parent(self, doc, node):
        if doc[node].head.i == node + 1:
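The helpers above now rely on Token.lefts/Token.rights rather than filtering Token.children; a toy hand-annotated parse illustrates what those attributes yield (the words and dependencies here are hypothetical):

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    doc = Doc(Vocab(), words=["I", "like", "black", "cats"],
              heads=[1, 1, 3, 1], deps=["nsubj", "ROOT", "amod", "dobj"])
    root = doc[1]
    print([t.text for t in root.lefts])    # ['I']
    print([t.text for t in root.rights])   # ['cats']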
@@ -1,4 +1,4 @@
-from typing import Tuple, Callable
+from typing import List, Tuple, Callable
 from thinc.api import Model, to_numpy
 from thinc.types import Ragged, Ints1d

@@ -52,14 +52,14 @@ def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:
     indices will be [5, 6, 7, 8, 8, 9].
     """
     spans, lengths = _ensure_cpu(spans, lengths)
-    indices = []
+    indices: List[int] = []
     offset = 0
     for i, length in enumerate(lengths):
         spans_i = spans[i].dataXd + offset
         for j in range(spans_i.shape[0]):
-            indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1]))  # type: ignore[call-overload, index]
+            indices.extend(range(spans_i[j, 0], spans_i[j, 1]))  # type: ignore[arg-type, call-overload]
         offset += length
-    return ops.flatten(indices, dtype="i", ndim_if_empty=1)
+    return ops.asarray1i(indices)


 def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:
@@ -1,6 +1,7 @@
 from .entity_linker import *  # noqa
 from .multi_task import *  # noqa
 from .parser import *  # noqa
+from .span_finder import *  # noqa
 from .spancat import *  # noqa
 from .tagger import *  # noqa
 from .textcat import *  # noqa
@@ -1,5 +1,5 @@
 from typing import Any, Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING, cast
-from thinc.types import Floats2d
+from thinc.types import Floats2d, Ints1d
 from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
 from thinc.api import MultiSoftmax, list2array
 from thinc.api import to_categorical, CosineDistance, L2Distance
@@ -7,7 +7,8 @@ from thinc.loss import Loss

 from ...util import registry, OOV_RANK
 from ...errors import Errors
-from ...attrs import ID
+from ...attrs import ID, ORTH
+from ...vectors import Mode as VectorsMode

 import numpy
 from functools import partial
@@ -67,14 +68,23 @@ def get_vectors_loss(ops, docs, prediction, distance):
     """Compute a loss based on a distance between the documents' vectors and
     the prediction.
     """
-    # The simplest way to implement this would be to vstack the
-    # token.vector values, but that's a bit inefficient, especially on GPU.
-    # Instead we fetch the index into the vectors table for each of our tokens,
-    # and look them up all at once. This prevents data copying.
-    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
-    target = docs[0].vocab.vectors.data[ids]
-    target[ids == OOV_RANK] = 0
-    d_target, loss = distance(prediction, target)
+    vocab = docs[0].vocab
+    if vocab.vectors.mode == VectorsMode.default:
+        # The simplest way to implement this would be to vstack the
+        # token.vector values, but that's a bit inefficient, especially on GPU.
+        # Instead we fetch the index into the vectors table for each of our
+        # tokens, and look them up all at once. This prevents data copying.
+        ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+        target = docs[0].vocab.vectors.data[ids]
+        target[ids == OOV_RANK] = 0
+        d_target, loss = distance(prediction, target)
+    elif vocab.vectors.mode == VectorsMode.floret:
+        keys = ops.flatten([cast(Ints1d, doc.to_array(ORTH)) for doc in docs])
+        target = vocab.vectors.get_batch(keys)
+        target = ops.as_contig(target)
+        d_target, loss = distance(prediction, target)
+    else:
+        raise ValueError(Errors.E850.format(mode=vocab.vectors.mode))
     return loss, d_target
spacy/ml/models/span_finder.py (new file)
@@ -0,0 +1,42 @@
from typing import Callable, List, Tuple

from thinc.api import Model, chain, with_array
from thinc.types import Floats1d, Floats2d

from ...tokens import Doc

from ...util import registry

InT = List[Doc]
OutT = Floats2d


@registry.architectures("spacy.SpanFinder.v1")
def build_finder_model(
    tok2vec: Model[InT, List[Floats2d]], scorer: Model[OutT, OutT]
) -> Model[InT, OutT]:

    logistic_layer: Model[List[Floats2d], List[Floats2d]] = with_array(scorer)
    model: Model[InT, OutT] = chain(tok2vec, logistic_layer, flattener())
    model.set_ref("tok2vec", tok2vec)
    model.set_ref("scorer", scorer)
    model.set_ref("logistic_layer", logistic_layer)

    return model


def flattener() -> Model[List[Floats2d], Floats2d]:
    """Flattens the input to a 1-dimensional list of scores"""

    def forward(
        model: Model[Floats1d, Floats1d], X: List[Floats2d], is_train: bool
    ) -> Tuple[Floats2d, Callable[[Floats2d], List[Floats2d]]]:
        lens = model.ops.asarray1i([len(doc) for doc in X])
        Y = model.ops.flatten(X)

        def backprop(dY: Floats2d) -> List[Floats2d]:
            return model.ops.unflatten(dY, lens)

        return Y, backprop

    return Model("Flattener", forward=forward)
@@ -2,20 +2,21 @@ from .attribute_ruler import AttributeRuler
 from .dep_parser import DependencyParser
 from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
-from .ner import EntityRecognizer
+from .functions import merge_entities, merge_noun_chunks, merge_subtokens
 from .lemmatizer import Lemmatizer
 from .morphologizer import Morphologizer
+from .ner import EntityRecognizer
 from .pipe import Pipe
-from .trainable_pipe import TrainablePipe
-from .senter import SentenceRecognizer
 from .sentencizer import Sentencizer
+from .senter import SentenceRecognizer
+from .span_finder import SpanFinder
+from .span_ruler import SpanRuler
+from .spancat import SpanCategorizer
 from .tagger import Tagger
 from .textcat import TextCategorizer
-from .spancat import SpanCategorizer
-from .span_ruler import SpanRuler
 from .textcat_multilabel import MultiLabel_TextCategorizer
 from .tok2vec import Tok2Vec
-from .functions import merge_entities, merge_noun_chunks, merge_subtokens
+from .trainable_pipe import TrainablePipe

 __all__ = [
     "AttributeRuler",
@@ -29,6 +30,7 @@ __all__ = [
     "SentenceRecognizer",
     "Sentencizer",
     "SpanCategorizer",
+    "SpanFinder",
     "SpanRuler",
     "Tagger",
     "TextCategorizer",
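Since SpanFinder is now exported and registered under the "span_finder" factory, here is a hedged sketch of wiring it into a pipeline (untrained; the config values mirror the defaults added in this diff):

    import spacy

    nlp = spacy.blank("en")
    span_finder = nlp.add_pipe("span_finder", config={"spans_key": "sc", "threshold": 0.5})
    print(span_finder.cfg["spans_key"])   # "sc"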
@@ -487,18 +487,24 @@ class EntityLinker(TrainablePipe):

                    # Looping through each entity in batch (TODO: rewrite)
                    for j, ent in enumerate(ent_batch):
-                        sent_index = sentences.index(ent.sent)
-                        assert sent_index >= 0
+                        assert hasattr(ent, "sents")
+                        sents = list(ent.sents)
+                        sent_indices = (
+                            sentences.index(sents[0]),
+                            sentences.index(sents[-1]),
+                        )
+                        assert sent_indices[1] >= sent_indices[0] >= 0

                        if self.incl_context:
                            # get n_neighbour sentences, clipped to the length of the document
-                            start_sentence = max(0, sent_index - self.n_sents)
+                            start_sentence = max(0, sent_indices[0] - self.n_sents)
                            end_sentence = min(
-                                len(sentences) - 1, sent_index + self.n_sents
+                                len(sentences) - 1, sent_indices[1] + self.n_sents
                            )
                            start_token = sentences[start_sentence].start
                            end_token = sentences[end_sentence].end
                            sent_doc = doc[start_token:end_token].as_doc()

                            # currently, the context is the same for each entity in a sentence (should be refined)
                            sentence_encoding = self.model.predict([sent_doc])[0]
                            sentence_encoding_t = sentence_encoding.T
@@ -21,6 +21,10 @@ from ..scorer import Scorer
 from ..training import validate_examples, validate_get_examples
 from ..util import registry

+# See #9050
+BACKWARD_OVERWRITE = True
+BACKWARD_EXTEND = False
+
 default_model_config = """
 [model]
 @architectures = "spacy.Tagger.v2"
@@ -55,6 +59,7 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
         "overwrite": True,
         "extend": False,
         "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
+        "label_smoothing": 0.0,
         "save_activations": False,
     },
     default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
@@ -65,10 +70,11 @@ def make_morphologizer(
     name: str,
     overwrite: bool,
     extend: bool,
+    label_smoothing: float,
     scorer: Optional[Callable],
     save_activations: bool,
 ):
-    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer,
+    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, label_smoothing=label_smoothing, scorer=scorer,
                          save_activations=save_activations)


@@ -98,8 +104,9 @@ class Morphologizer(Tagger):
         model: Model,
         name: str = "morphologizer",
         *,
-        overwrite: bool = False,
-        extend: bool = False,
+        overwrite: bool = BACKWARD_OVERWRITE,
+        extend: bool = BACKWARD_EXTEND,
+        label_smoothing: float = 0.0,
         scorer: Optional[Callable] = morphologizer_score,
         save_activations: bool = False,
     ):
@@ -131,6 +138,7 @@ class Morphologizer(Tagger):
             "labels_pos": {},
             "overwrite": overwrite,
             "extend": extend,
+            "label_smoothing": label_smoothing,
         }
         self.cfg = dict(sorted(cfg.items()))
         self.scorer = scorer
@@ -139,7 +147,7 @@ class Morphologizer(Tagger):
     @property
     def labels(self):
         """RETURNS (Iterable[str]): The labels currently added to the component."""
-        return self.cfg["labels_morph"].keys()
+        return tuple(self.cfg["labels_morph"].keys())

     @property
     def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]:
@@ -289,7 +297,8 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#get_loss
         """
         validate_examples(examples, "Morphologizer.get_loss")
-        loss_func = LegacySequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
+        loss_func = LegacySequenceCategoricalCrossentropy(names=self.labels, normalize=False,
+                                                          label_smoothing=self.cfg["label_smoothing"])
         truths = []
         for eg in examples:
             eg_truths = []
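A hedged sketch of the new label_smoothing setting (blank pipeline; 0.05 is an illustrative value, not a recommendation):

    import spacy

    nlp = spacy.blank("en")
    morphologizer = nlp.add_pipe("morphologizer", config={"label_smoothing": 0.05})
    print(morphologizer.cfg["label_smoothing"])   # 0.05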
spacy/pipeline/span_finder.py (new file)
@@ -0,0 +1,336 @@
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple

from thinc.api import Config, Model, Optimizer, set_dropout_rate
from thinc.types import Floats2d

from ..language import Language
from .trainable_pipe import TrainablePipe
from ..scorer import Scorer
from ..tokens import Doc, Span
from ..training import Example
from ..errors import Errors

from ..util import registry
from .spancat import DEFAULT_SPANS_KEY

span_finder_default_config = """
[model]
@architectures = "spacy.SpanFinder.v1"

[model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = 2

[model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"

[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 96
rows = [5000, 1000, 2500, 1000]
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false

[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
depth = 4
"""

DEFAULT_SPAN_FINDER_MODEL = Config().from_str(span_finder_default_config)["model"]


@Language.factory(
    "span_finder",
    assigns=["doc.spans"],
    default_config={
        "threshold": 0.5,
        "model": DEFAULT_SPAN_FINDER_MODEL,
        "spans_key": DEFAULT_SPANS_KEY,
        "max_length": None,
        "min_length": None,
        "scorer": {"@scorers": "spacy.span_finder_scorer.v1"},
    },
    default_score_weights={
        f"span_finder_{DEFAULT_SPANS_KEY}_f": 1.0,
        f"span_finder_{DEFAULT_SPANS_KEY}_p": 0.0,
        f"span_finder_{DEFAULT_SPANS_KEY}_r": 0.0,
    },
)
def make_span_finder(
    nlp: Language,
    name: str,
    model: Model[Iterable[Doc], Floats2d],
    spans_key: str,
    threshold: float,
    max_length: Optional[int],
    min_length: Optional[int],
    scorer: Optional[Callable],
) -> "SpanFinder":
    """Create a SpanFinder component. The component predicts whether a token is
    the start or the end of a potential span.

    model (Model[List[Doc], Floats2d]): A model instance that
        is given a list of documents and predicts a probability for each token.
    spans_key (str): Key of the doc.spans dict to save the spans under. During
        initialization and training, the component will look for spans on the
        reference document under the same key.
    threshold (float): Minimum probability to consider a prediction positive.
    max_length (Optional[int]): Maximum length of the produced spans, defaults
        to None meaning unlimited length.
    min_length (Optional[int]): Minimum length of the produced spans, defaults
        to None meaning shortest span length is 1.
    scorer (Optional[Callable]): The scoring method. Defaults to
        Scorer.score_spans for the Doc.spans[spans_key] with overlapping
        spans allowed.
    """
    return SpanFinder(
        nlp,
        model=model,
        threshold=threshold,
        name=name,
        scorer=scorer,
        max_length=max_length,
        min_length=min_length,
        spans_key=spans_key,
    )


@registry.scorers("spacy.span_finder_scorer.v1")
def make_span_finder_scorer():
    return span_finder_score


def span_finder_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
    kwargs = dict(kwargs)
    attr_prefix = "span_finder_"
    key = kwargs["spans_key"]
    kwargs.setdefault("attr", f"{attr_prefix}{key}")
    kwargs.setdefault(
        "getter", lambda doc, key: doc.spans.get(key[len(attr_prefix) :], [])
    )
    kwargs.setdefault("has_annotation", lambda doc: key in doc.spans)
    kwargs.setdefault("allow_overlap", True)
    kwargs.setdefault("labeled", False)
    scores = Scorer.score_spans(examples, **kwargs)
    scores.pop(f"{kwargs['attr']}_per_type", None)
    return scores


def _char_indices(span: Span) -> Tuple[int, int]:
    start = span[0].idx
    end = span[-1].idx + len(span[-1])
    return start, end


class SpanFinder(TrainablePipe):
    """Pipeline that learns span boundaries.

    DOCS: https://spacy.io/api/spanfinder
    """

    def __init__(
        self,
        nlp: Language,
        model: Model[Iterable[Doc], Floats2d],
        name: str = "span_finder",
        *,
        spans_key: str = DEFAULT_SPANS_KEY,
        threshold: float = 0.5,
        max_length: Optional[int] = None,
        min_length: Optional[int] = None,
        scorer: Optional[Callable] = span_finder_score,
    ) -> None:
        """Initialize the span finder.
        model (thinc.api.Model): The Thinc Model powering the pipeline
            component.
        name (str): The component instance name, used to add entries to the
            losses during training.
        threshold (float): Minimum probability to consider a prediction
            positive.
        scorer (Optional[Callable]): The scoring method.
        spans_key (str): Key of the doc.spans dict to save the spans under.
            During initialization and training, the component will look for
            spans on the reference document under the same key.
        max_length (Optional[int]): Maximum length of the produced spans,
            defaults to None meaning unlimited length.
        min_length (Optional[int]): Minimum length of the produced spans,
            defaults to None meaning shortest span length is 1.

        DOCS: https://spacy.io/api/spanfinder#init
        """
        self.vocab = nlp.vocab
        if (max_length is not None and max_length < 1) or (
            min_length is not None and min_length < 1
        ):
            raise ValueError(
                Errors.E1053.format(min_length=min_length, max_length=max_length)
            )
        self.model = model
        self.name = name
        self.scorer = scorer
        self.cfg: Dict[str, Any] = {
            "min_length": min_length,
            "max_length": max_length,
            "threshold": threshold,
            "spans_key": spans_key,
        }

    def predict(self, docs: Iterable[Doc]):
        """Apply the pipeline's model to a batch of docs, without modifying
        them.

        docs (Iterable[Doc]): The documents to predict.
        RETURNS: The models prediction for each document.

        DOCS: https://spacy.io/api/spanfinder#predict
        """
        scores = self.model.predict(docs)
        return scores

    def set_annotations(self, docs: Iterable[Doc], scores: Floats2d) -> None:
        """Modify a batch of Doc objects, using pre-computed scores.
        docs (Iterable[Doc]): The documents to modify.
        scores: The scores to set, produced by SpanFinder predict method.

        DOCS: https://spacy.io/api/spanfinder#set_annotations
        """
        offset = 0
        for i, doc in enumerate(docs):
            doc.spans[self.cfg["spans_key"]] = []
            starts = []
            ends = []
            doc_scores = scores[offset : offset + len(doc)]

            for token, token_score in zip(doc, doc_scores):
                if token_score[0] >= self.cfg["threshold"]:
                    starts.append(token.i)
                if token_score[1] >= self.cfg["threshold"]:
                    ends.append(token.i)

            for start in starts:
                for end in ends:
                    span_length = end + 1 - start
                    if span_length < 1:
                        continue
                    if (
                        self.cfg["min_length"] is None
                        or self.cfg["min_length"] <= span_length
                    ) and (
                        self.cfg["max_length"] is None
                        or span_length <= self.cfg["max_length"]
                    ):
                        doc.spans[self.cfg["spans_key"]].append(doc[start : end + 1])
            offset += len(doc)

    def update(
        self,
        examples: Iterable[Example],
        *,
        drop: float = 0.0,
        sgd: Optional[Optimizer] = None,
        losses: Optional[Dict[str, float]] = None,
    ) -> Dict[str, float]:
        """Learn from a batch of documents and gold-standard information,
        updating the pipe's model. Delegates to predict and get_loss.
        examples (Iterable[Example]): A batch of Example objects.
        drop (float): The dropout rate.
        sgd (Optional[thinc.api.Optimizer]): The optimizer.
        losses (Optional[Dict[str, float]]): Optional record of the loss during
            training. Updated using the component name as the key.
        RETURNS (Dict[str, float]): The updated losses dictionary.

        DOCS: https://spacy.io/api/spanfinder#update
        """
        if losses is None:
            losses = {}
        losses.setdefault(self.name, 0.0)
        predicted = [eg.predicted for eg in examples]
        set_dropout_rate(self.model, drop)
        scores, backprop_scores = self.model.begin_update(predicted)
        loss, d_scores = self.get_loss(examples, scores)
        backprop_scores(d_scores)
        if sgd is not None:
            self.finish_update(sgd)
        losses[self.name] += loss
        return losses

    def get_loss(self, examples, scores) -> Tuple[float, Floats2d]:
        """Find the loss and gradient of loss for the batch of documents and
        their predicted scores.
        examples (Iterable[Examples]): The batch of examples.
        scores: Scores representing the model's predictions.
|
||||||
|
RETURNS (Tuple[float, Floats2d]): The loss and the gradient.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/spanfinder#get_loss
|
||||||
|
"""
|
||||||
|
truths, masks = self._get_aligned_truth_scores(examples, self.model.ops)
|
||||||
|
d_scores = scores - self.model.ops.asarray2f(truths)
|
||||||
|
d_scores *= masks
|
||||||
|
loss = float((d_scores**2).sum())
|
||||||
|
return loss, d_scores
|
||||||
|
|
||||||
|
def _get_aligned_truth_scores(self, examples, ops) -> Tuple[Floats2d, Floats2d]:
|
||||||
|
"""Align scores of the predictions to the references for calculating
|
||||||
|
the loss.
|
||||||
|
"""
|
||||||
|
truths = []
|
||||||
|
masks = []
|
||||||
|
for eg in examples:
|
||||||
|
if eg.x.text != eg.y.text:
|
||||||
|
raise ValueError(Errors.E1054.format(component="span_finder"))
|
||||||
|
n_tokens = len(eg.predicted)
|
||||||
|
truth = ops.xp.zeros((n_tokens, 2), dtype="float32")
|
||||||
|
mask = ops.xp.ones((n_tokens, 2), dtype="float32")
|
||||||
|
if self.cfg["spans_key"] in eg.reference.spans:
|
||||||
|
for span in eg.reference.spans[self.cfg["spans_key"]]:
|
||||||
|
ref_start_char, ref_end_char = _char_indices(span)
|
||||||
|
pred_span = eg.predicted.char_span(
|
||||||
|
ref_start_char, ref_end_char, alignment_mode="expand"
|
||||||
|
)
|
||||||
|
pred_start_char, pred_end_char = _char_indices(pred_span)
|
||||||
|
start_match = pred_start_char == ref_start_char
|
||||||
|
end_match = pred_end_char == ref_end_char
|
||||||
|
if start_match:
|
||||||
|
truth[pred_span[0].i, 0] = 1
|
||||||
|
else:
|
||||||
|
mask[pred_span[0].i, 0] = 0
|
||||||
|
if end_match:
|
||||||
|
truth[pred_span[-1].i, 1] = 1
|
||||||
|
else:
|
||||||
|
mask[pred_span[-1].i, 1] = 0
|
||||||
|
truths.append(truth)
|
||||||
|
masks.append(mask)
|
||||||
|
truths = ops.xp.concatenate(truths, axis=0)
|
||||||
|
masks = ops.xp.concatenate(masks, axis=0)
|
||||||
|
return truths, masks
|
||||||
|
|
||||||
|
def initialize(
|
||||||
|
self,
|
||||||
|
get_examples: Callable[[], Iterable[Example]],
|
||||||
|
*,
|
||||||
|
nlp: Optional[Language] = None,
|
||||||
|
) -> None:
|
||||||
|
"""Initialize the pipe for training, using a representative set
|
||||||
|
of data examples.
|
||||||
|
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||||
|
returns a representative sample of gold-standard Example objects.
|
||||||
|
nlp (Optional[Language]): The current nlp object the component is part
|
||||||
|
of.
|
||||||
|
|
||||||
|
DOCS: https://spacy.io/api/spanfinder#initialize
|
||||||
|
"""
|
||||||
|
subbatch: List[Example] = []
|
||||||
|
|
||||||
|
for eg in get_examples():
|
||||||
|
if len(subbatch) < 10:
|
||||||
|
subbatch.append(eg)
|
||||||
|
|
||||||
|
if subbatch:
|
||||||
|
docs = [eg.reference for eg in subbatch]
|
||||||
|
Y, _ = self._get_aligned_truth_scores(subbatch, self.model.ops)
|
||||||
|
self.model.initialize(X=docs, Y=Y)
|
||||||
|
else:
|
||||||
|
self.model.initialize()
|
|
@ -1,20 +1,23 @@
|
||||||
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
|
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast
|
||||||
from typing import Union, Protocol, runtime_checkable
|
from typing import Union, Protocol, runtime_checkable
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from functools import partial
|
||||||
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
|
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
|
||||||
from thinc.api import Optimizer
|
from thinc.api import Optimizer
|
||||||
from thinc.types import Ragged, Ints2d, Floats2d
|
from thinc.types import Ragged, Ints2d, Floats2d
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
|
from thinc.api import Config, Model, Ops, Optimizer, get_current_ops, set_dropout_rate
|
||||||
|
from thinc.types import Floats2d, Ints1d, Ints2d, Ragged
|
||||||
|
|
||||||
from ..scorer import Scorer
|
|
||||||
from ..language import Language
|
|
||||||
from .trainable_pipe import TrainablePipe
|
|
||||||
from ..tokens import Doc, SpanGroup, Span
|
|
||||||
from ..vocab import Vocab
|
|
||||||
from ..training import Example, validate_examples
|
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
|
from ..language import Language
|
||||||
|
from ..scorer import Scorer
|
||||||
|
from ..tokens import Doc, Span, SpanGroup
|
||||||
|
from ..training import Example, validate_examples
|
||||||
from ..util import registry
|
from ..util import registry
|
||||||
|
from ..vocab import Vocab
|
||||||
|
from .trainable_pipe import TrainablePipe
|
||||||
|
|
||||||
ActivationsT = Dict[str, Union[Floats2d, Ragged]]
|
ActivationsT = Dict[str, Union[Floats2d, Ragged]]
|
||||||
|
|
||||||
|
@ -34,8 +37,8 @@ hidden_size = 128
|
||||||
[model.tok2vec.embed]
|
[model.tok2vec.embed]
|
||||||
@architectures = "spacy.MultiHashEmbed.v2"
|
@architectures = "spacy.MultiHashEmbed.v2"
|
||||||
width = 96
|
width = 96
|
||||||
rows = [5000, 2000, 1000, 1000]
|
rows = [5000, 1000, 2500, 1000]
|
||||||
attrs = ["ORTH", "PREFIX", "SUFFIX", "SHAPE"]
|
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
|
||||||
include_static_vectors = false
|
include_static_vectors = false
|
||||||
|
|
||||||
[model.tok2vec.encode]
|
[model.tok2vec.encode]
|
||||||
|
@ -46,7 +49,37 @@ maxout_pieces = 3
|
||||||
depth = 4
|
depth = 4
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
spancat_singlelabel_default_config = """
|
||||||
|
[model]
|
||||||
|
@architectures = "spacy.SpanCategorizer.v1"
|
||||||
|
scorer = {"@layers": "Softmax.v2"}
|
||||||
|
|
||||||
|
[model.reducer]
|
||||||
|
@layers = spacy.mean_max_reducer.v1
|
||||||
|
hidden_size = 128
|
||||||
|
|
||||||
|
[model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2Vec.v2"
|
||||||
|
[model.tok2vec.embed]
|
||||||
|
@architectures = "spacy.MultiHashEmbed.v1"
|
||||||
|
width = 96
|
||||||
|
rows = [5000, 1000, 2500, 1000]
|
||||||
|
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
|
||||||
|
include_static_vectors = false
|
||||||
|
|
||||||
|
[model.tok2vec.encode]
|
||||||
|
@architectures = "spacy.MaxoutWindowEncoder.v2"
|
||||||
|
width = ${model.tok2vec.embed.width}
|
||||||
|
window_size = 1
|
||||||
|
maxout_pieces = 3
|
||||||
|
depth = 4
|
||||||
|
"""
|
||||||
|
|
||||||
|
DEFAULT_SPANS_KEY = "sc"
|
||||||
DEFAULT_SPANCAT_MODEL = Config().from_str(spancat_default_config)["model"]
|
DEFAULT_SPANCAT_MODEL = Config().from_str(spancat_default_config)["model"]
|
||||||
|
DEFAULT_SPANCAT_SINGLELABEL_MODEL = Config().from_str(
|
||||||
|
spancat_singlelabel_default_config
|
||||||
|
)["model"]
|
||||||
|
|
||||||
|
|
||||||
@runtime_checkable
|
@runtime_checkable
|
||||||
|
@ -55,39 +88,65 @@ class Suggester(Protocol):
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
|
def ngram_suggester(
|
||||||
|
docs: Iterable[Doc], sizes: List[int], *, ops: Optional[Ops] = None
|
||||||
|
) -> Ragged:
|
||||||
|
if ops is None:
|
||||||
|
ops = get_current_ops()
|
||||||
|
spans = []
|
||||||
|
lengths = []
|
||||||
|
for doc in docs:
|
||||||
|
starts = ops.xp.arange(len(doc), dtype="i")
|
||||||
|
starts = starts.reshape((-1, 1))
|
||||||
|
length = 0
|
||||||
|
for size in sizes:
|
||||||
|
if size <= len(doc):
|
||||||
|
starts_size = starts[: len(doc) - (size - 1)]
|
||||||
|
spans.append(ops.xp.hstack((starts_size, starts_size + size)))
|
||||||
|
length += spans[-1].shape[0]
|
||||||
|
if spans:
|
||||||
|
assert spans[-1].ndim == 2, spans[-1].shape
|
||||||
|
lengths.append(length)
|
||||||
|
lengths_array = ops.asarray1i(lengths)
|
||||||
|
if len(spans) > 0:
|
||||||
|
output = Ragged(ops.xp.vstack(spans), lengths_array)
|
||||||
|
else:
|
||||||
|
output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
|
||||||
|
|
||||||
|
assert output.dataXd.ndim == 2
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
def preset_spans_suggester(
|
||||||
|
docs: Iterable[Doc], spans_key: str, *, ops: Optional[Ops] = None
|
||||||
|
) -> Ragged:
|
||||||
|
if ops is None:
|
||||||
|
ops = get_current_ops()
|
||||||
|
spans = []
|
||||||
|
lengths = []
|
||||||
|
for doc in docs:
|
||||||
|
length = 0
|
||||||
|
if doc.spans[spans_key]:
|
||||||
|
for span in doc.spans[spans_key]:
|
||||||
|
spans.append([span.start, span.end])
|
||||||
|
length += 1
|
||||||
|
|
||||||
|
lengths.append(length)
|
||||||
|
lengths_array = cast(Ints1d, ops.asarray(lengths, dtype="i"))
|
||||||
|
if len(spans) > 0:
|
||||||
|
output = Ragged(ops.asarray(spans, dtype="i"), lengths_array)
|
||||||
|
else:
|
||||||
|
output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
@registry.misc("spacy.ngram_suggester.v1")
|
@registry.misc("spacy.ngram_suggester.v1")
|
||||||
def build_ngram_suggester(sizes: List[int]) -> Suggester:
|
def build_ngram_suggester(sizes: List[int]) -> Suggester:
|
||||||
"""Suggest all spans of the given lengths. Spans are returned as a ragged
|
"""Suggest all spans of the given lengths. Spans are returned as a ragged
|
||||||
array of integers. The array has two columns, indicating the start and end
|
array of integers. The array has two columns, indicating the start and end
|
||||||
position."""
|
position."""
|
||||||
|
|
||||||
def ngram_suggester(docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged:
|
return partial(ngram_suggester, sizes=sizes)
|
||||||
if ops is None:
|
|
||||||
ops = get_current_ops()
|
|
||||||
spans = []
|
|
||||||
lengths = []
|
|
||||||
for doc in docs:
|
|
||||||
starts = ops.xp.arange(len(doc), dtype="i")
|
|
||||||
starts = starts.reshape((-1, 1))
|
|
||||||
length = 0
|
|
||||||
for size in sizes:
|
|
||||||
if size <= len(doc):
|
|
||||||
starts_size = starts[: len(doc) - (size - 1)]
|
|
||||||
spans.append(ops.xp.hstack((starts_size, starts_size + size)))
|
|
||||||
length += spans[-1].shape[0]
|
|
||||||
if spans:
|
|
||||||
assert spans[-1].ndim == 2, spans[-1].shape
|
|
||||||
lengths.append(length)
|
|
||||||
lengths_array = ops.asarray1i(lengths)
|
|
||||||
if len(spans) > 0:
|
|
||||||
output = Ragged(ops.xp.vstack(spans), lengths_array)
|
|
||||||
else:
|
|
||||||
output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
|
|
||||||
|
|
||||||
assert output.dataXd.ndim == 2
|
|
||||||
return output
|
|
||||||
|
|
||||||
return ngram_suggester
|
|
||||||
|
|
||||||
|
|
||||||
@registry.misc("spacy.ngram_range_suggester.v1")
|
@registry.misc("spacy.ngram_range_suggester.v1")
|
||||||
|
@ -99,12 +158,20 @@ def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester:
|
||||||
return build_ngram_suggester(sizes)
|
return build_ngram_suggester(sizes)
|
||||||
|
|
||||||
|
|
||||||
|
@registry.misc("spacy.preset_spans_suggester.v1")
|
||||||
|
def build_preset_spans_suggester(spans_key: str) -> Suggester:
|
||||||
|
"""Suggest all spans that are already stored in doc.spans[spans_key].
|
||||||
|
This is useful when an upstream component is used to set the spans
|
||||||
|
on the Doc such as a SpanRuler or SpanFinder."""
|
||||||
|
return partial(preset_spans_suggester, spans_key=spans_key)
|
||||||
|
|
||||||
|
|
||||||
@Language.factory(
|
@Language.factory(
|
||||||
"spancat",
|
"spancat",
|
||||||
assigns=["doc.spans"],
|
assigns=["doc.spans"],
|
||||||
default_config={
|
default_config={
|
||||||
"threshold": 0.5,
|
"threshold": 0.5,
|
||||||
"spans_key": "sc",
|
"spans_key": DEFAULT_SPANS_KEY,
|
||||||
"max_positive": None,
|
"max_positive": None,
|
||||||
"model": DEFAULT_SPANCAT_MODEL,
|
"model": DEFAULT_SPANCAT_MODEL,
|
||||||
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
|
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
|
||||||
|
@ -124,10 +191,14 @@ def make_spancat(
|
||||||
max_positive: Optional[int],
|
max_positive: Optional[int],
|
||||||
save_activations: bool,
|
save_activations: bool,
|
||||||
) -> "SpanCategorizer":
|
) -> "SpanCategorizer":
|
||||||
"""Create a SpanCategorizer component. The span categorizer consists of two
|
"""Create a SpanCategorizer component and configure it for multi-label
|
||||||
|
classification to be able to assign multiple labels for each span.
|
||||||
|
The span categorizer consists of two
|
||||||
parts: a suggester function that proposes candidate spans, and a labeller
|
parts: a suggester function that proposes candidate spans, and a labeller
|
||||||
model that predicts one or more labels for each span.
|
model that predicts one or more labels for each span.
|
||||||
|
|
||||||
|
name (str): The component instance name, used to add entries to the
|
||||||
|
losses during training.
|
||||||
suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
|
suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
|
||||||
Spans are returned as a ragged array with two integer columns, for the
|
Spans are returned as a ragged array with two integer columns, for the
|
||||||
start and end positions.
|
start and end positions.
|
||||||
|
@ -146,16 +217,88 @@ def make_spancat(
|
||||||
0.5.
|
0.5.
|
||||||
max_positive (Optional[int]): Maximum number of labels to consider positive
|
max_positive (Optional[int]): Maximum number of labels to consider positive
|
||||||
per span. Defaults to None, indicating no limit.
|
per span. Defaults to None, indicating no limit.
|
||||||
save_activations (bool): save model activations in Doc when annotating.
|
save_activations (bool): save model activations in Doc when annotating.
|
||||||
"""
|
"""
|
||||||
return SpanCategorizer(
|
return SpanCategorizer(
|
||||||
nlp.vocab,
|
nlp.vocab,
|
||||||
suggester=suggester,
|
|
||||||
model=model,
|
model=model,
|
||||||
spans_key=spans_key,
|
suggester=suggester,
|
||||||
threshold=threshold,
|
|
||||||
max_positive=max_positive,
|
|
||||||
name=name,
|
name=name,
|
||||||
|
spans_key=spans_key,
|
||||||
|
negative_weight=None,
|
||||||
|
allow_overlap=True,
|
||||||
|
max_positive=max_positive,
|
||||||
|
threshold=threshold,
|
||||||
|
scorer=scorer,
|
||||||
|
add_negative_label=False,
|
||||||
|
save_activations=save_activations,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@Language.factory(
|
||||||
|
"spancat_singlelabel",
|
||||||
|
assigns=["doc.spans"],
|
||||||
|
default_config={
|
||||||
|
"spans_key": DEFAULT_SPANS_KEY,
|
||||||
|
"model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
|
||||||
|
"negative_weight": 1.0,
|
||||||
|
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
|
||||||
|
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
|
||||||
|
"allow_overlap": True,
|
||||||
|
"save_activations": False,
|
||||||
|
},
|
||||||
|
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
|
||||||
|
)
|
||||||
|
def make_spancat_singlelabel(
|
||||||
|
nlp: Language,
|
||||||
|
name: str,
|
||||||
|
suggester: Suggester,
|
||||||
|
model: Model[Tuple[List[Doc], Ragged], Floats2d],
|
||||||
|
spans_key: str,
|
||||||
|
negative_weight: float,
|
||||||
|
allow_overlap: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
|
save_activations: bool,
|
||||||
|
) -> "SpanCategorizer":
|
||||||
|
"""Create a SpanCategorizer component and configure it for multi-class
|
||||||
|
classification. With this configuration each span can get at most one
|
||||||
|
label. The span categorizer consists of two
|
||||||
|
parts: a suggester function that proposes candidate spans, and a labeller
|
||||||
|
model that predicts one or more labels for each span.
|
||||||
|
|
||||||
|
name (str): The component instance name, used to add entries to the
|
||||||
|
losses during training.
|
||||||
|
suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
|
||||||
|
Spans are returned as a ragged array with two integer columns, for the
|
||||||
|
start and end positions.
|
||||||
|
model (Model[Tuple[List[Doc], Ragged], Floats2d]): A model instance that
|
||||||
|
is given a list of documents and (start, end) indices representing
|
||||||
|
candidate span offsets. The model predicts a probability for each category
|
||||||
|
for each span.
|
||||||
|
spans_key (str): Key of the doc.spans dict to save the spans under. During
|
||||||
|
initialization and training, the component will look for spans on the
|
||||||
|
reference document under the same key.
|
||||||
|
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||||
|
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
|
||||||
|
spans allowed.
|
||||||
|
negative_weight (float): Multiplier for the loss terms.
|
||||||
|
Can be used to downweight the negative samples if there are too many.
|
||||||
|
allow_overlap (bool): If True the data is assumed to contain overlapping spans.
|
||||||
|
Otherwise it produces non-overlapping spans greedily prioritizing
|
||||||
|
higher assigned label scores.
|
||||||
|
save_activations (bool): save model activations in Doc when annotating.
|
||||||
|
"""
|
||||||
|
return SpanCategorizer(
|
||||||
|
nlp.vocab,
|
||||||
|
model=model,
|
||||||
|
suggester=suggester,
|
||||||
|
name=name,
|
||||||
|
spans_key=spans_key,
|
||||||
|
negative_weight=negative_weight,
|
||||||
|
allow_overlap=allow_overlap,
|
||||||
|
max_positive=1,
|
||||||
|
add_negative_label=True,
|
||||||
|
threshold=None,
|
||||||
scorer=scorer,
|
scorer=scorer,
|
||||||
save_activations=save_activations,
|
save_activations=save_activations,
|
||||||
)
|
)
|
||||||
|
@ -179,6 +322,27 @@ def make_spancat_scorer():
|
||||||
return spancat_score
|
return spancat_score
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class _Intervals:
|
||||||
|
"""
|
||||||
|
Helper class to avoid storing overlapping spans.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.ranges = set()
|
||||||
|
|
||||||
|
def add(self, i, j):
|
||||||
|
for e in range(i, j):
|
||||||
|
self.ranges.add(e)
|
||||||
|
|
||||||
|
def __contains__(self, rang):
|
||||||
|
i, j = rang
|
||||||
|
for e in range(i, j):
|
||||||
|
if e in self.ranges:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
class SpanCategorizer(TrainablePipe):
|
class SpanCategorizer(TrainablePipe):
|
||||||
"""Pipeline component to label spans of text.
|
"""Pipeline component to label spans of text.
|
||||||
|
|
||||||
|
@ -192,26 +356,44 @@ class SpanCategorizer(TrainablePipe):
|
||||||
suggester: Suggester,
|
suggester: Suggester,
|
||||||
name: str = "spancat",
|
name: str = "spancat",
|
||||||
*,
|
*,
|
||||||
|
add_negative_label: bool = False,
|
||||||
spans_key: str = "spans",
|
spans_key: str = "spans",
|
||||||
threshold: float = 0.5,
|
negative_weight: Optional[float] = 1.0,
|
||||||
|
allow_overlap: Optional[bool] = True,
|
||||||
max_positive: Optional[int] = None,
|
max_positive: Optional[int] = None,
|
||||||
|
threshold: Optional[float] = 0.5,
|
||||||
scorer: Optional[Callable] = spancat_score,
|
scorer: Optional[Callable] = spancat_score,
|
||||||
save_activations: bool = False,
|
save_activations: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize the span categorizer.
|
"""Initialize the multi-label or multi-class span categorizer.
|
||||||
|
|
||||||
vocab (Vocab): The shared vocabulary.
|
vocab (Vocab): The shared vocabulary.
|
||||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||||
|
For multi-class classification (single label per span) we recommend
|
||||||
|
using a Softmax classifier as a the final layer, while for multi-label
|
||||||
|
classification (multiple possible labels per span) we recommend Logistic.
|
||||||
|
suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
|
||||||
|
Spans are returned as a ragged array with two integer columns, for the
|
||||||
|
start and end positions.
|
||||||
name (str): The component instance name, used to add entries to the
|
name (str): The component instance name, used to add entries to the
|
||||||
losses during training.
|
losses during training.
|
||||||
spans_key (str): Key of the Doc.spans dict to save the spans under.
|
spans_key (str): Key of the Doc.spans dict to save the spans under.
|
||||||
During initialization and training, the component will look for
|
During initialization and training, the component will look for
|
||||||
spans on the reference document under the same key. Defaults to
|
spans on the reference document under the same key. Defaults to
|
||||||
`"spans"`.
|
`"spans"`.
|
||||||
threshold (float): Minimum probability to consider a prediction
|
add_negative_label (bool): Learn to predict a special 'negative_label'
|
||||||
positive. Spans with a positive prediction will be saved on the Doc.
|
when a Span is not annotated.
|
||||||
Defaults to 0.5.
|
threshold (Optional[float]): Minimum probability to consider a prediction
|
||||||
|
positive. Defaults to 0.5. Spans with a positive prediction will be saved
|
||||||
|
on the Doc.
|
||||||
max_positive (Optional[int]): Maximum number of labels to consider
|
max_positive (Optional[int]): Maximum number of labels to consider
|
||||||
positive per span. Defaults to None, indicating no limit.
|
positive per span. Defaults to None, indicating no limit.
|
||||||
|
negative_weight (float): Multiplier for the loss terms.
|
||||||
|
Can be used to downweight the negative samples if there are too many
|
||||||
|
when add_negative_label is True. Otherwise its unused.
|
||||||
|
allow_overlap (bool): If True the data is assumed to contain overlapping spans.
|
||||||
|
Otherwise it produces non-overlapping spans greedily prioritizing
|
||||||
|
higher assigned label scores. Only used when max_positive is 1.
|
||||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||||
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
|
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
|
||||||
spans allowed.
|
spans allowed.
|
||||||
|
@ -223,6 +405,8 @@ class SpanCategorizer(TrainablePipe):
|
||||||
"spans_key": spans_key,
|
"spans_key": spans_key,
|
||||||
"threshold": threshold,
|
"threshold": threshold,
|
||||||
"max_positive": max_positive,
|
"max_positive": max_positive,
|
||||||
|
"negative_weight": negative_weight,
|
||||||
|
"allow_overlap": allow_overlap,
|
||||||
}
|
}
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.suggester = suggester
|
self.suggester = suggester
|
||||||
|
@ -230,6 +414,9 @@ class SpanCategorizer(TrainablePipe):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.scorer = scorer
|
self.scorer = scorer
|
||||||
self.save_activations = save_activations
|
self.save_activations = save_activations
|
||||||
|
self.add_negative_label = add_negative_label
|
||||||
|
if not allow_overlap and max_positive is not None and max_positive > 1:
|
||||||
|
raise ValueError(Errors.E1051.format(max_positive=max_positive))
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def key(self) -> str:
|
def key(self) -> str:
|
||||||
|
@ -239,6 +426,21 @@ class SpanCategorizer(TrainablePipe):
|
||||||
"""
|
"""
|
||||||
return str(self.cfg["spans_key"])
|
return str(self.cfg["spans_key"])
|
||||||
|
|
||||||
|
def _allow_extra_label(self) -> None:
|
||||||
|
"""Raise an error if the component can not add any more labels."""
|
||||||
|
nO = None
|
||||||
|
if self.model.has_dim("nO"):
|
||||||
|
nO = self.model.get_dim("nO")
|
||||||
|
elif self.model.has_ref("output_layer") and self.model.get_ref(
|
||||||
|
"output_layer"
|
||||||
|
).has_dim("nO"):
|
||||||
|
nO = self.model.get_ref("output_layer").get_dim("nO")
|
||||||
|
if nO is not None and nO == self._n_labels:
|
||||||
|
if not self.is_resizable:
|
||||||
|
raise ValueError(
|
||||||
|
Errors.E922.format(name=self.name, nO=self.model.get_dim("nO"))
|
||||||
|
)
|
||||||
|
|
||||||
def add_label(self, label: str) -> int:
|
def add_label(self, label: str) -> int:
|
||||||
"""Add a new label to the pipe.
|
"""Add a new label to the pipe.
|
||||||
|
|
||||||
|
@ -272,6 +474,27 @@ class SpanCategorizer(TrainablePipe):
|
||||||
"""
|
"""
|
||||||
return list(self.labels)
|
return list(self.labels)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def _label_map(self) -> Dict[str, int]:
|
||||||
|
"""RETURNS (Dict[str, int]): The label map."""
|
||||||
|
return {label: i for i, label in enumerate(self.labels)}
|
||||||
|
|
||||||
|
@property
|
||||||
|
def _n_labels(self) -> int:
|
||||||
|
"""RETURNS (int): Number of labels."""
|
||||||
|
if self.add_negative_label:
|
||||||
|
return len(self.labels) + 1
|
||||||
|
else:
|
||||||
|
return len(self.labels)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def _negative_label_i(self) -> Union[int, None]:
|
||||||
|
"""RETURNS (Union[int, None]): Index of the negative label."""
|
||||||
|
if self.add_negative_label:
|
||||||
|
return len(self.label_data)
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
|
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
|
||||||
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||||
|
|
||||||
|
@ -313,12 +536,8 @@ class SpanCategorizer(TrainablePipe):
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/spancategorizer#set_annotations
|
DOCS: https://spacy.io/api/spancategorizer#set_annotations
|
||||||
"""
|
"""
|
||||||
labels = self.labels
|
|
||||||
|
|
||||||
indices = activations["indices"]
|
indices = activations["indices"]
|
||||||
assert isinstance(indices, Ragged)
|
scores = activations["scores"]
|
||||||
scores = cast(Floats2d, activations["scores"])
|
|
||||||
|
|
||||||
offset = 0
|
offset = 0
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
indices_i = indices[i].dataXd
|
indices_i = indices[i].dataXd
|
||||||
|
@ -328,9 +547,21 @@ class SpanCategorizer(TrainablePipe):
|
||||||
doc.activations[self.name]["scores"] = scores[
|
doc.activations[self.name]["scores"] = scores[
|
||||||
offset : offset + indices.lengths[i]
|
offset : offset + indices.lengths[i]
|
||||||
]
|
]
|
||||||
doc.spans[self.key] = self._make_span_group(
|
|
||||||
doc, indices_i, scores[offset : offset + indices.lengths[i]], labels # type: ignore[arg-type]
|
allow_overlap = cast(bool, self.cfg["allow_overlap"])
|
||||||
)
|
if self.cfg["max_positive"] == 1:
|
||||||
|
doc.spans[self.key] = self._make_span_group_singlelabel(
|
||||||
|
doc,
|
||||||
|
indices_i,
|
||||||
|
scores[offset : offset + indices.lengths[i]],
|
||||||
|
allow_overlap,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
doc.spans[self.key] = self._make_span_group_multilabel(
|
||||||
|
doc,
|
||||||
|
indices_i,
|
||||||
|
scores[offset : offset + indices.lengths[i]],
|
||||||
|
)
|
||||||
offset += indices.lengths[i]
|
offset += indices.lengths[i]
|
||||||
|
|
||||||
def update(
|
def update(
|
||||||
|
@ -390,9 +621,11 @@ class SpanCategorizer(TrainablePipe):
|
||||||
spans = Ragged(
|
spans = Ragged(
|
||||||
self.model.ops.to_numpy(spans.data), self.model.ops.to_numpy(spans.lengths)
|
self.model.ops.to_numpy(spans.data), self.model.ops.to_numpy(spans.lengths)
|
||||||
)
|
)
|
||||||
label_map = {label: i for i, label in enumerate(self.labels)}
|
|
||||||
target = numpy.zeros(scores.shape, dtype=scores.dtype)
|
target = numpy.zeros(scores.shape, dtype=scores.dtype)
|
||||||
|
if self.add_negative_label:
|
||||||
|
negative_spans = numpy.ones((scores.shape[0]))
|
||||||
offset = 0
|
offset = 0
|
||||||
|
label_map = self._label_map
|
||||||
for i, eg in enumerate(examples):
|
for i, eg in enumerate(examples):
|
||||||
# Map (start, end) offset of spans to the row in the d_scores array,
|
# Map (start, end) offset of spans to the row in the d_scores array,
|
||||||
# so that we can adjust the gradient for predictions that were
|
# so that we can adjust the gradient for predictions that were
|
||||||
|
@ -409,10 +642,16 @@ class SpanCategorizer(TrainablePipe):
|
||||||
row = spans_index[key]
|
row = spans_index[key]
|
||||||
k = label_map[gold_span.label_]
|
k = label_map[gold_span.label_]
|
||||||
target[row, k] = 1.0
|
target[row, k] = 1.0
|
||||||
|
if self.add_negative_label:
|
||||||
|
# delete negative label target.
|
||||||
|
negative_spans[row] = 0.0
|
||||||
# The target is a flat array for all docs. Track the position
|
# The target is a flat array for all docs. Track the position
|
||||||
# we're at within the flat array.
|
# we're at within the flat array.
|
||||||
offset += spans.lengths[i]
|
offset += spans.lengths[i]
|
||||||
target = self.model.ops.asarray(target, dtype="f") # type: ignore
|
target = self.model.ops.asarray(target, dtype="f") # type: ignore
|
||||||
|
if self.add_negative_label:
|
||||||
|
negative_samples = numpy.nonzero(negative_spans)[0]
|
||||||
|
target[negative_samples, self._negative_label_i] = 1.0 # type: ignore
|
||||||
# The target will have the values 0 (for untrue predictions) or 1
|
# The target will have the values 0 (for untrue predictions) or 1
|
||||||
# (for true predictions).
|
# (for true predictions).
|
||||||
# The scores should be in the range [0, 1].
|
# The scores should be in the range [0, 1].
|
||||||
|
@ -421,6 +660,10 @@ class SpanCategorizer(TrainablePipe):
|
||||||
# If the prediction is 0.9 and it's false, the gradient will be
|
# If the prediction is 0.9 and it's false, the gradient will be
|
||||||
# 0.9 (0.9 - 0.0)
|
# 0.9 (0.9 - 0.0)
|
||||||
d_scores = scores - target
|
d_scores = scores - target
|
||||||
|
if self.add_negative_label:
|
||||||
|
neg_weight = cast(float, self.cfg["negative_weight"])
|
||||||
|
if neg_weight != 1.0:
|
||||||
|
d_scores[negative_samples] *= neg_weight
|
||||||
loss = float((d_scores**2).sum())
|
loss = float((d_scores**2).sum())
|
||||||
return loss, d_scores
|
return loss, d_scores
|
||||||
|
|
||||||
|
@ -457,7 +700,7 @@ class SpanCategorizer(TrainablePipe):
|
||||||
if subbatch:
|
if subbatch:
|
||||||
docs = [eg.x for eg in subbatch]
|
docs = [eg.x for eg in subbatch]
|
||||||
spans = build_ngram_suggester(sizes=[1])(docs)
|
spans = build_ngram_suggester(sizes=[1])(docs)
|
||||||
Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels))
|
Y = self.model.ops.alloc2f(spans.dataXd.shape[0], self._n_labels)
|
||||||
self.model.initialize(X=(docs, spans), Y=Y)
|
self.model.initialize(X=(docs, spans), Y=Y)
|
||||||
else:
|
else:
|
||||||
self.model.initialize()
|
self.model.initialize()
|
||||||
|
@ -471,31 +714,98 @@ class SpanCategorizer(TrainablePipe):
|
||||||
eg.reference.spans.get(self.key, []), allow_overlap=True
|
eg.reference.spans.get(self.key, []), allow_overlap=True
|
||||||
)
|
)
|
||||||
|
|
||||||
def _make_span_group(
|
def _make_span_group_multilabel(
|
||||||
self, doc: Doc, indices: Ints2d, scores: Floats2d, labels: List[str]
|
self,
|
||||||
|
doc: Doc,
|
||||||
|
indices: Ints2d,
|
||||||
|
scores: Floats2d,
|
||||||
) -> SpanGroup:
|
) -> SpanGroup:
|
||||||
|
"""Find the top-k labels for each span (k=max_positive)."""
|
||||||
spans = SpanGroup(doc, name=self.key)
|
spans = SpanGroup(doc, name=self.key)
|
||||||
max_positive = self.cfg["max_positive"]
|
if scores.size == 0:
|
||||||
|
return spans
|
||||||
|
scores = self.model.ops.to_numpy(scores)
|
||||||
|
indices = self.model.ops.to_numpy(indices)
|
||||||
threshold = self.cfg["threshold"]
|
threshold = self.cfg["threshold"]
|
||||||
|
max_positive = self.cfg["max_positive"]
|
||||||
|
|
||||||
keeps = scores >= threshold
|
keeps = scores >= threshold
|
||||||
ranked = (scores * -1).argsort() # type: ignore
|
|
||||||
if max_positive is not None:
|
if max_positive is not None:
|
||||||
assert isinstance(max_positive, int)
|
assert isinstance(max_positive, int)
|
||||||
|
if self.add_negative_label:
|
||||||
|
negative_scores = numpy.copy(scores[:, self._negative_label_i])
|
||||||
|
scores[:, self._negative_label_i] = -numpy.inf
|
||||||
|
ranked = (scores * -1).argsort() # type: ignore
|
||||||
|
scores[:, self._negative_label_i] = negative_scores
|
||||||
|
else:
|
||||||
|
ranked = (scores * -1).argsort() # type: ignore
|
||||||
span_filter = ranked[:, max_positive:]
|
span_filter = ranked[:, max_positive:]
|
||||||
for i, row in enumerate(span_filter):
|
for i, row in enumerate(span_filter):
|
||||||
keeps[i, row] = False
|
keeps[i, row] = False
|
||||||
spans.attrs["scores"] = scores[keeps].flatten()
|
|
||||||
|
|
||||||
indices = self.model.ops.to_numpy(indices)
|
|
||||||
keeps = self.model.ops.to_numpy(keeps)
|
|
||||||
|
|
||||||
|
attrs_scores = []
|
||||||
for i in range(indices.shape[0]):
|
for i in range(indices.shape[0]):
|
||||||
start = indices[i, 0]
|
start = indices[i, 0]
|
||||||
end = indices[i, 1]
|
end = indices[i, 1]
|
||||||
|
|
||||||
for j, keep in enumerate(keeps[i]):
|
for j, keep in enumerate(keeps[i]):
|
||||||
if keep:
|
if keep:
|
||||||
spans.append(Span(doc, start, end, label=labels[j]))
|
if j != self._negative_label_i:
|
||||||
|
spans.append(Span(doc, start, end, label=self.labels[j]))
|
||||||
|
attrs_scores.append(scores[i, j])
|
||||||
|
spans.attrs["scores"] = numpy.array(attrs_scores)
|
||||||
|
return spans
|
||||||
|
|
||||||
|
def _make_span_group_singlelabel(
|
||||||
|
self,
|
||||||
|
doc: Doc,
|
||||||
|
indices: Ints2d,
|
||||||
|
scores: Floats2d,
|
||||||
|
allow_overlap: bool = True,
|
||||||
|
) -> SpanGroup:
|
||||||
|
"""Find the argmax label for each span."""
|
||||||
|
# Handle cases when there are zero suggestions
|
||||||
|
if scores.size == 0:
|
||||||
|
return SpanGroup(doc, name=self.key)
|
||||||
|
scores = self.model.ops.to_numpy(scores)
|
||||||
|
indices = self.model.ops.to_numpy(indices)
|
||||||
|
predicted = scores.argmax(axis=1)
|
||||||
|
argmax_scores = numpy.take_along_axis(
|
||||||
|
scores, numpy.expand_dims(predicted, 1), axis=1
|
||||||
|
)
|
||||||
|
keeps = numpy.ones(predicted.shape, dtype=bool)
|
||||||
|
# Remove samples where the negative label is the argmax.
|
||||||
|
if self.add_negative_label:
|
||||||
|
keeps = numpy.logical_and(keeps, predicted != self._negative_label_i)
|
||||||
|
# Filter samples according to threshold.
|
||||||
|
threshold = self.cfg["threshold"]
|
||||||
|
if threshold is not None:
|
||||||
|
keeps = numpy.logical_and(keeps, (argmax_scores >= threshold).squeeze())
|
||||||
|
# Sort spans according to argmax probability
|
||||||
|
if not allow_overlap:
|
||||||
|
# Get the probabilities
|
||||||
|
sort_idx = (argmax_scores.squeeze() * -1).argsort()
|
||||||
|
argmax_scores = argmax_scores[sort_idx]
|
||||||
|
predicted = predicted[sort_idx]
|
||||||
|
indices = indices[sort_idx]
|
||||||
|
keeps = keeps[sort_idx]
|
||||||
|
seen = _Intervals()
|
||||||
|
spans = SpanGroup(doc, name=self.key)
|
||||||
|
attrs_scores = []
|
||||||
|
for i in range(indices.shape[0]):
|
||||||
|
if not keeps[i]:
|
||||||
|
continue
|
||||||
|
|
||||||
|
label = predicted[i]
|
||||||
|
start = indices[i, 0]
|
||||||
|
end = indices[i, 1]
|
||||||
|
|
||||||
|
if not allow_overlap:
|
||||||
|
if (start, end) in seen:
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
seen.add(start, end)
|
||||||
|
attrs_scores.append(argmax_scores[i])
|
||||||
|
spans.append(Span(doc, start, end, label=self.labels[label]))
|
||||||
|
|
||||||
|
spans.attrs["scores"] = numpy.array(attrs_scores)
|
||||||
return spans
|
return spans
|
||||||
|
|
|
@ -52,6 +52,7 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
|
||||||
"overwrite": False,
|
"overwrite": False,
|
||||||
"scorer": {"@scorers": "spacy.tagger_scorer.v1"},
|
"scorer": {"@scorers": "spacy.tagger_scorer.v1"},
|
||||||
"neg_prefix": "!",
|
"neg_prefix": "!",
|
||||||
|
"label_smoothing": 0.0,
|
||||||
"save_activations": False,
|
"save_activations": False,
|
||||||
},
|
},
|
||||||
default_score_weights={"tag_acc": 1.0},
|
default_score_weights={"tag_acc": 1.0},
|
||||||
|
@ -63,6 +64,7 @@ def make_tagger(
|
||||||
overwrite: bool,
|
overwrite: bool,
|
||||||
scorer: Optional[Callable],
|
scorer: Optional[Callable],
|
||||||
neg_prefix: str,
|
neg_prefix: str,
|
||||||
|
label_smoothing: float,
|
||||||
save_activations: bool,
|
save_activations: bool,
|
||||||
):
|
):
|
||||||
"""Construct a part-of-speech tagger component.
|
"""Construct a part-of-speech tagger component.
|
||||||
|
@ -73,7 +75,7 @@ def make_tagger(
|
||||||
with the rows summing to 1).
|
with the rows summing to 1).
|
||||||
"""
|
"""
|
||||||
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix,
|
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix,
|
||||||
save_activations=save_activations)
|
label_smoothing=label_smoothing, save_activations=save_activations)
|
||||||
|
|
||||||
|
|
||||||
def tagger_score(examples, **kwargs):
|
def tagger_score(examples, **kwargs):
|
||||||
|
@ -99,6 +101,7 @@ class Tagger(TrainablePipe):
|
||||||
overwrite=False,
|
overwrite=False,
|
||||||
scorer=tagger_score,
|
scorer=tagger_score,
|
||||||
neg_prefix="!",
|
neg_prefix="!",
|
||||||
|
label_smoothing=0.0,
|
||||||
save_activations: bool = False,
|
save_activations: bool = False,
|
||||||
):
|
):
|
||||||
"""Initialize a part-of-speech tagger.
|
"""Initialize a part-of-speech tagger.
|
||||||
|
@ -118,7 +121,12 @@ class Tagger(TrainablePipe):
|
||||||
self.model = model
|
self.model = model
|
||||||
self.name = name
|
self.name = name
|
||||||
self._rehearsal_model = None
|
self._rehearsal_model = None
|
||||||
cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix}
|
cfg = {
|
||||||
|
"labels": [],
|
||||||
|
"overwrite": overwrite,
|
||||||
|
"neg_prefix": neg_prefix,
|
||||||
|
"label_smoothing": label_smoothing
|
||||||
|
}
|
||||||
self.cfg = dict(sorted(cfg.items()))
|
self.cfg = dict(sorted(cfg.items()))
|
||||||
self.scorer = scorer
|
self.scorer = scorer
|
||||||
self.save_activations = save_activations
|
self.save_activations = save_activations
|
||||||
|
@ -294,7 +302,12 @@ class Tagger(TrainablePipe):
|
||||||
DOCS: https://spacy.io/api/tagger#get_loss
|
DOCS: https://spacy.io/api/tagger#get_loss
|
||||||
"""
|
"""
|
||||||
validate_examples(examples, "Tagger.get_loss")
|
validate_examples(examples, "Tagger.get_loss")
|
||||||
loss_func = LegacySequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"])
|
loss_func = LegacySequenceCategoricalCrossentropy(
|
||||||
|
names=self.labels,
|
||||||
|
normalize=False,
|
||||||
|
neg_prefix=self.cfg["neg_prefix"],
|
||||||
|
label_smoothing=self.cfg["label_smoothing"]
|
||||||
|
)
|
||||||
# Convert empty tag "" to missing value None so that both misaligned
|
# Convert empty tag "" to missing value None so that both misaligned
|
||||||
# tokens and tokens with missing annotation have the default missing
|
# tokens and tokens with missing annotation have the default missing
|
||||||
# value None.
|
# value None.
|
||||||
|
|
|
@ -121,20 +121,30 @@ class Scorer:
|
||||||
nlp.add_pipe(pipe)
|
nlp.add_pipe(pipe)
|
||||||
self.nlp = nlp
|
self.nlp = nlp
|
||||||
|
|
||||||
def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
|
def score(
|
||||||
|
self, examples: Iterable[Example], *, per_component: bool = False
|
||||||
|
) -> Dict[str, Any]:
|
||||||
"""Evaluate a list of Examples.
|
"""Evaluate a list of Examples.
|
||||||
|
|
||||||
examples (Iterable[Example]): The predicted annotations + correct annotations.
|
examples (Iterable[Example]): The predicted annotations + correct annotations.
|
||||||
|
per_component (bool): Whether to return the scores keyed by component
|
||||||
|
name. Defaults to False.
|
||||||
RETURNS (Dict): A dictionary of scores.
|
RETURNS (Dict): A dictionary of scores.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/scorer#score
|
DOCS: https://spacy.io/api/scorer#score
|
||||||
"""
|
"""
|
||||||
scores = {}
|
scores = {}
|
||||||
if hasattr(self.nlp.tokenizer, "score"):
|
if hasattr(self.nlp.tokenizer, "score"):
|
||||||
scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) # type: ignore
|
if per_component:
|
||||||
|
scores["tokenizer"] = self.nlp.tokenizer.score(examples, **self.cfg)
|
||||||
|
else:
|
||||||
|
scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) # type: ignore
|
||||||
for name, component in self.nlp.pipeline:
|
for name, component in self.nlp.pipeline:
|
||||||
if hasattr(component, "score"):
|
if hasattr(component, "score"):
|
||||||
scores.update(component.score(examples, **self.cfg))
|
if per_component:
|
||||||
|
scores[name] = component.score(examples, **self.cfg)
|
||||||
|
else:
|
||||||
|
scores.update(component.score(examples, **self.cfg))
|
||||||
return scores
|
return scores
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
|
@ -336,6 +336,11 @@ def ml_tokenizer():
|
||||||
return get_lang_class("ml")().tokenizer
|
return get_lang_class("ml")().tokenizer
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def ms_tokenizer():
|
||||||
|
return get_lang_class("ms")().tokenizer
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def nb_tokenizer():
|
def nb_tokenizer():
|
||||||
return get_lang_class("nb")().tokenizer
|
return get_lang_class("nb")().tokenizer
|
||||||
|
|
|
@ -33,6 +33,8 @@ def test_token_morph_key(i_has):
|
||||||
def test_morph_props(i_has):
|
def test_morph_props(i_has):
|
||||||
assert i_has[0].morph.get("PronType") == ["prs"]
|
assert i_has[0].morph.get("PronType") == ["prs"]
|
||||||
assert i_has[1].morph.get("PronType") == []
|
assert i_has[1].morph.get("PronType") == []
|
||||||
|
assert i_has[1].morph.get("AsdfType", ["asdf"]) == ["asdf"]
|
||||||
|
assert i_has[1].morph.get("AsdfType", default=["asdf", "qwer"]) == ["asdf", "qwer"]
|
||||||
|
|
||||||
|
|
||||||
def test_morph_iter(i_has):
|
def test_morph_iter(i_has):
|
||||||
|
|
|
@ -250,7 +250,6 @@ def test_spans_span_sent(doc, doc_not_parsed):
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_spans_span_sent_user_hooks(doc, start, end, expected_sentence):
|
def test_spans_span_sent_user_hooks(doc, start, end, expected_sentence):
|
||||||
|
|
||||||
# Doc-level sents hook
|
# Doc-level sents hook
|
||||||
def user_hook(doc):
|
def user_hook(doc):
|
||||||
return [doc[ii : ii + 2] for ii in range(0, len(doc), 2)]
|
return [doc[ii : ii + 2] for ii in range(0, len(doc), 2)]
|
||||||
|
@ -655,7 +654,6 @@ def test_span_comparison(doc):
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_span_sents(doc, start, end, expected_sentences, expected_sentences_with_hook):
|
def test_span_sents(doc, start, end, expected_sentences, expected_sentences_with_hook):
|
||||||
|
|
||||||
assert len(list(doc[start:end].sents)) == expected_sentences
|
assert len(list(doc[start:end].sents)) == expected_sentences
|
||||||
|
|
||||||
def user_hook(doc):
|
def user_hook(doc):
|
||||||
|
@ -754,3 +752,34 @@ def test_span_start_end_sync(en_tokenizer):
|
||||||
span.start_char = 9
|
span.start_char = 9
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
span.end_char = 1
|
span.end_char = 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_for_partial_ent_sents():
|
||||||
|
"""Spans may be associated with multiple sentences. These .sents should always be complete, not partial, sentences,
|
||||||
|
which this tests for.
|
||||||
|
"""
|
||||||
|
doc = Doc(
|
||||||
|
English().vocab,
|
||||||
|
words=["Mahler's", "Symphony", "No.", "8", "was", "beautiful."],
|
||||||
|
sent_starts=[1, 0, 0, 1, 0, 0],
|
||||||
|
)
|
||||||
|
doc.set_ents([Span(doc, 1, 4, "WORK")])
|
||||||
|
# The specified entity is associated with both sentences in this doc, so we expect all sentences in the doc to be
|
||||||
|
# equal to the sentences referenced in ent.sents.
|
||||||
|
for doc_sent, ent_sent in zip(doc.sents, doc.ents[0].sents):
|
||||||
|
assert doc_sent == ent_sent
|
||||||
|
|
||||||
|
|
||||||
|
def test_for_no_ent_sents():
|
||||||
|
"""Span.sents() should set .sents correctly, even if Span in question is trailing and doesn't form a full
|
||||||
|
sentence.
|
||||||
|
"""
|
||||||
|
doc = Doc(
|
||||||
|
English().vocab,
|
||||||
|
words=["This", "is", "a", "test.", "ENTITY"],
|
||||||
|
sent_starts=[1, 0, 0, 0, 1],
|
||||||
|
)
|
||||||
|
doc.set_ents([Span(doc, 4, 5, "WORK")])
|
||||||
|
sents = list(doc.ents[0].sents)
|
||||||
|
assert len(sents) == 1
|
||||||
|
assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY"
|
||||||
|
|
|
@ -93,6 +93,21 @@ def test_span_group_copy(doc):
|
||||||
assert span_group.attrs["key"] == "value"
|
assert span_group.attrs["key"] == "value"
|
||||||
assert list(span_group) != list(clone)
|
assert list(span_group) != list(clone)
|
||||||
|
|
||||||
|
# can't copy if the character offsets don't align to tokens
|
||||||
|
doc2 = Doc(doc.vocab, words=[t.text + "x" for t in doc])
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
span_group.copy(doc=doc2)
|
||||||
|
|
||||||
|
# can copy with valid character offsets despite different tokenization
|
||||||
|
doc3 = doc.copy()
|
||||||
|
with doc3.retokenize() as retokenizer:
|
||||||
|
retokenizer.merge(doc3[0:2])
|
||||||
|
retokenizer.merge(doc3[3:6])
|
||||||
|
span_group = SpanGroup(doc, spans=[doc[0:6], doc[3:6]])
|
||||||
|
for span1, span2 in zip(span_group, span_group.copy(doc=doc3)):
|
||||||
|
assert span1.start_char == span2.start_char
|
||||||
|
assert span1.end_char == span2.end_char
|
||||||
|
|
||||||
|
|
||||||
def test_span_group_set_item(doc, other_doc):
|
def test_span_group_set_item(doc, other_doc):
|
||||||
span_group = doc.spans["SPANS"]
|
span_group = doc.spans["SPANS"]
|
||||||
|
@ -255,3 +270,12 @@ def test_span_group_typing(doc: Doc):
|
||||||
for i, span in enumerate(span_group):
|
for i, span in enumerate(span_group):
|
||||||
assert span == span_group[i] == spans[i]
|
assert span == span_group[i] == spans[i]
|
||||||
filter_spans(span_group)
|
filter_spans(span_group)
|
||||||
|
|
||||||
|
|
||||||
|
def test_span_group_init_doc(en_tokenizer):
|
||||||
|
"""Test that all spans must come from the specified doc."""
|
||||||
|
doc1 = en_tokenizer("a b c")
|
||||||
|
doc2 = en_tokenizer("a b c")
|
||||||
|
span_group = SpanGroup(doc1, spans=[doc1[0:1], doc1[1:2]])
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
span_group = SpanGroup(doc1, spans=[doc1[0:1], doc2[1:2]])
|
||||||
|
|
52
spacy/tests/lang/la/test_noun_chunks.py
Normal file
52
spacy/tests/lang/la/test_noun_chunks.py
Normal file
|
@ -0,0 +1,52 @@
|
||||||
|
import pytest
|
||||||
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
|
def test_noun_chunks_is_parsed(la_tokenizer):
|
||||||
|
"""Test that noun_chunks raises Value Error for 'la' language if Doc is not parsed.
|
||||||
|
To check this test, we're constructing a Doc
|
||||||
|
with a new Vocab here and forcing is_parsed to 'False'
|
||||||
|
to make sure the noun chunks don't run.
|
||||||
|
"""
|
||||||
|
doc = la_tokenizer("Haec est sententia.")
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
list(doc.noun_chunks)
|
||||||
|
|
||||||
|
|
||||||
|
LA_NP_TEST_EXAMPLES = [
|
||||||
|
(
|
||||||
|
"Haec narrantur a poetis de Perseo.",
|
||||||
|
["DET", "VERB", "ADP", "NOUN", "ADP", "PROPN", "PUNCT"],
|
||||||
|
["nsubj:pass", "ROOT", "case", "obl", "case", "obl", "punct"],
|
||||||
|
[1, 0, -1, -1, -3, -1, -5],
|
||||||
|
["poetis", "Perseo"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Perseus autem in sinu matris dormiebat.",
|
||||||
|
["NOUN", "ADV", "ADP", "NOUN", "NOUN", "VERB", "PUNCT"],
|
||||||
|
["nsubj", "discourse", "case", "obl", "nmod", "ROOT", "punct"],
|
||||||
|
[5, 4, 3, -1, -1, 0, -1],
|
||||||
|
["Perseus", "sinu matris"],
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"text,pos,deps,heads,expected_noun_chunks", LA_NP_TEST_EXAMPLES
|
||||||
|
)
|
||||||
|
def test_la_noun_chunks(la_tokenizer, text, pos, deps, heads, expected_noun_chunks):
|
||||||
|
tokens = la_tokenizer(text)
|
||||||
|
|
||||||
|
assert len(heads) == len(pos)
|
||||||
|
doc = Doc(
|
||||||
|
tokens.vocab,
|
||||||
|
words=[t.text for t in tokens],
|
||||||
|
heads=[head + i for i, head in enumerate(heads)],
|
||||||
|
deps=deps,
|
||||||
|
pos=pos,
|
||||||
|
)
|
||||||
|
|
||||||
|
noun_chunks = list(doc.noun_chunks)
|
||||||
|
assert len(noun_chunks) == len(expected_noun_chunks)
|
||||||
|
for i, np in enumerate(noun_chunks):
|
||||||
|
assert np.text == expected_noun_chunks[i]
|
0
spacy/tests/lang/ms/__init__.py
Normal file
0
spacy/tests/lang/ms/__init__.py
Normal file
8
spacy/tests/lang/ms/test_noun_chunks.py
Normal file
8
spacy/tests/lang/ms/test_noun_chunks.py
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def test_noun_chunks_is_parsed_ms(ms_tokenizer):
|
||||||
|
"""Test that noun_chunks raises Value Error for 'ms' language if Doc is not parsed."""
|
||||||
|
doc = ms_tokenizer("sebelas")
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
list(doc.noun_chunks)
|
112
spacy/tests/lang/ms/test_prefix_suffix_infix.py
Normal file
112
spacy/tests/lang/ms/test_prefix_suffix_infix.py
Normal file
|
@ -0,0 +1,112 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text", ["(Ma'arif)"])
|
||||||
|
def test_ms_tokenizer_splits_no_special(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text", ["Ma'arif"])
|
||||||
|
def test_ms_tokenizer_splits_no_punct(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 1
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text", ["(Ma'arif"])
|
||||||
|
def test_ms_tokenizer_splits_prefix_punct(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text", ["Ma'arif)"])
|
||||||
|
def test_ms_tokenizer_splits_suffix_punct(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text", ["(Ma'arif)"])
|
||||||
|
def test_ms_tokenizer_splits_even_wrap(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text", ["(Ma'arif?)"])
|
||||||
|
def test_tokenizer_splits_uneven_wrap(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 4
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text,length", [("S.Kom.", 1), ("SKom.", 2), ("(S.Kom.", 2)])
|
||||||
|
def test_ms_tokenizer_splits_prefix_interact(id_tokenizer, text, length):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == length
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text", ["S.Kom.)"])
|
||||||
|
def test_ms_tokenizer_splits_suffix_interact(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text", ["(S.Kom.)"])
|
||||||
|
def test_ms_tokenizer_splits_even_wrap_interact(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text", ["(S.Kom.?)"])
|
||||||
|
def test_ms_tokenizer_splits_uneven_wrap_interact(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 4
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"text,length",
|
||||||
|
[("kerana", 1), ("Mahathir-Anwar", 3), ("Tun Dr. Ismail-Abdul Rahman", 6)],
|
||||||
|
)
|
||||||
|
def test_my_tokenizer_splits_hyphens(ms_tokenizer, text, length):
|
||||||
|
tokens = ms_tokenizer(text)
|
||||||
|
assert len(tokens) == length
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text", ["0.1-13.5", "0.0-0.1", "103.27-300"])
|
||||||
|
def test_ms_tokenizer_splits_numeric_range(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text", ["ini.Sani", "Halo.Malaysia"])
|
||||||
|
def test_ms_tokenizer_splits_period_infix(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text", ["Halo,Malaysia", "satu,dua"])
|
||||||
|
def test_ms_tokenizer_splits_comma_infix(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
assert tokens[0].text == text.split(",")[0]
|
||||||
|
assert tokens[1].text == ","
|
||||||
|
assert tokens[2].text == text.split(",")[1]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text", ["halo...Malaysia", "dia...pergi"])
|
||||||
|
def test_ms_tokenizer_splits_ellipsis_infix(id_tokenizer, text):
|
||||||
|
tokens = id_tokenizer(text)
|
||||||
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_ms_tokenizer_splits_double_hyphen_infix(id_tokenizer):
|
||||||
|
tokens = id_tokenizer("Arsene Wenger--pengurus Arsenal--mengadakan sidang media.")
|
||||||
|
assert len(tokens) == 10
|
||||||
|
assert tokens[0].text == "Arsene"
|
||||||
|
assert tokens[1].text == "Wenger"
|
||||||
|
assert tokens[2].text == "--"
|
||||||
|
assert tokens[3].text == "pengurus"
|
||||||
|
assert tokens[4].text == "Arsenal"
|
||||||
|
assert tokens[5].text == "--"
|
||||||
|
assert tokens[6].text == "mengadakan"
|
||||||
|
assert tokens[7].text == "sidang"
|
||||||
|
assert tokens[8].text == "media"
|
||||||
|
assert tokens[9].text == "."
|
8
spacy/tests/lang/ms/test_text.py
Normal file
8
spacy/tests/lang/ms/test_text.py
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
import pytest
|
||||||
|
from spacy.lang.ms.lex_attrs import like_num
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("word", ["sebelas"])
|
||||||
|
def test_ms_lex_attrs_capitals(word):
|
||||||
|
assert like_num(word)
|
||||||
|
assert like_num(word.upper())
|
|
@ -5,7 +5,6 @@ from pathlib import Path
|
||||||
def test_build_dependencies():
|
def test_build_dependencies():
|
||||||
# Check that library requirements are pinned exactly the same across different setup files.
|
# Check that library requirements are pinned exactly the same across different setup files.
|
||||||
libs_ignore_requirements = [
|
libs_ignore_requirements = [
|
||||||
"cython",
|
|
||||||
"pytest",
|
"pytest",
|
||||||
"pytest-timeout",
|
"pytest-timeout",
|
||||||
"mock",
|
"mock",
|
||||||
|
|
|
@ -9,6 +9,8 @@ from spacy.lang.en import English
|
||||||
from spacy.lang.it import Italian
|
from spacy.lang.it import Italian
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
from spacy.lookups import Lookups
|
from spacy.lookups import Lookups
|
||||||
|
from spacy.pipeline import EntityRecognizer
|
||||||
|
from spacy.pipeline.ner import DEFAULT_NER_MODEL
|
||||||
from spacy.pipeline._parser_internals.ner import BiluoPushDown
|
from spacy.pipeline._parser_internals.ner import BiluoPushDown
|
||||||
from spacy.training import Example, iob_to_biluo, split_bilu_label
|
from spacy.training import Example, iob_to_biluo, split_bilu_label
|
||||||
from spacy.tokens import Doc, Span
|
from spacy.tokens import Doc, Span
|
||||||
|
@ -17,8 +19,6 @@ from thinc.api import fix_random_seed
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from ..util import make_tempdir
|
from ..util import make_tempdir
|
||||||
from ...pipeline import EntityRecognizer
|
|
||||||
from ...pipeline.ner import DEFAULT_NER_MODEL
|
|
||||||
|
|
||||||
TRAIN_DATA = [
|
TRAIN_DATA = [
|
||||||
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
|
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
|
||||||
|
@ -777,9 +777,9 @@ def test_neg_annotation(neg_key):
|
||||||
ner.add_label("ORG")
|
ner.add_label("ORG")
|
||||||
example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]})
|
example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]})
|
||||||
example.reference.spans[neg_key] = [
|
example.reference.spans[neg_key] = [
|
||||||
Span(neg_doc, 2, 4, "ORG"),
|
Span(example.reference, 2, 4, "ORG"),
|
||||||
Span(neg_doc, 2, 3, "PERSON"),
|
Span(example.reference, 2, 3, "PERSON"),
|
||||||
Span(neg_doc, 1, 4, "PERSON"),
|
Span(example.reference, 1, 4, "PERSON"),
|
||||||
]
|
]
|
||||||
|
|
||||||
optimizer = nlp.initialize()
|
optimizer = nlp.initialize()
|
||||||
|
@ -804,7 +804,7 @@ def test_neg_annotation_conflict(neg_key):
|
||||||
ner.add_label("PERSON")
|
ner.add_label("PERSON")
|
||||||
ner.add_label("LOC")
|
ner.add_label("LOC")
|
||||||
example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]})
|
example = Example.from_dict(neg_doc, {"entities": [(7, 17, "PERSON")]})
|
||||||
example.reference.spans[neg_key] = [Span(neg_doc, 2, 4, "PERSON")]
|
example.reference.spans[neg_key] = [Span(example.reference, 2, 4, "PERSON")]
|
||||||
assert len(example.reference.ents) == 1
|
assert len(example.reference.ents) == 1
|
||||||
assert example.reference.ents[0].text == "Shaka Khan"
|
assert example.reference.ents[0].text == "Shaka Khan"
|
||||||
assert example.reference.ents[0].label_ == "PERSON"
|
assert example.reference.ents[0].label_ == "PERSON"
|
||||||
|
@ -837,7 +837,7 @@ def test_beam_valid_parse(neg_key):
|
||||||
|
|
||||||
doc = Doc(nlp.vocab, words=tokens)
|
doc = Doc(nlp.vocab, words=tokens)
|
||||||
example = Example.from_dict(doc, {"ner": iob})
|
example = Example.from_dict(doc, {"ner": iob})
|
||||||
neg_span = Span(doc, 50, 53, "ORG")
|
neg_span = Span(example.reference, 50, 53, "ORG")
|
||||||
example.reference.spans[neg_key] = [neg_span]
|
example.reference.spans[neg_key] = [neg_span]
|
||||||
|
|
||||||
optimizer = nlp.initialize()
|
optimizer = nlp.initialize()
|
||||||
|
|
|
@ -10,13 +10,12 @@ from spacy.lang.en import English
|
||||||
from spacy.training import Example
|
from spacy.training import Example
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from spacy import util, registry
|
from spacy.pipeline import DependencyParser
|
||||||
|
from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
|
||||||
|
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
||||||
from thinc.api import fix_random_seed
|
from thinc.api import fix_random_seed
|
||||||
|
|
||||||
from ...pipeline import DependencyParser
|
|
||||||
from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL
|
|
||||||
from ..util import apply_transition_sequence, make_tempdir
|
from ..util import apply_transition_sequence, make_tempdir
|
||||||
from ...pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
|
||||||
|
|
||||||
TRAIN_DATA = [
|
TRAIN_DATA = [
|
||||||
(
|
(
|
||||||
|
@ -210,7 +209,7 @@ def test_parser_apply_actions(en_vocab, en_parser):
|
||||||
|
|
||||||
assert len(active_states) == 0
|
assert len(active_states) == 0
|
||||||
|
|
||||||
for (state, doc) in zip(states, docs):
|
for state, doc in zip(states, docs):
|
||||||
moves.set_annotations(state, doc)
|
moves.set_annotations(state, doc)
|
||||||
|
|
||||||
assert docs[0][0].head.i == 1
|
assert docs[0][0].head.i == 1
|
||||||
|
|
|
@ -4,7 +4,7 @@ import pytest
|
||||||
from numpy.testing import assert_equal
|
from numpy.testing import assert_equal
|
||||||
from thinc.types import Ragged
|
from thinc.types import Ragged
|
||||||
|
|
||||||
from spacy import registry, util
|
from spacy import registry, util, Language
|
||||||
from spacy.attrs import ENT_KB_ID
|
from spacy.attrs import ENT_KB_ID
|
||||||
from spacy.compat import pickle
|
from spacy.compat import pickle
|
||||||
from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase
|
from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase
|
||||||
|
@ -108,18 +108,23 @@ def test_issue7065():
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.issue(7065)
|
@pytest.mark.issue(7065)
|
||||||
def test_issue7065_b():
|
@pytest.mark.parametrize("entity_in_first_sentence", [True, False])
|
||||||
|
def test_sentence_crossing_ents(entity_in_first_sentence: bool):
|
||||||
|
"""Tests if NEL crashes if entities cross sentence boundaries and the first associated sentence doesn't have an
|
||||||
|
entity.
|
||||||
|
entity_in_prior_sentence (bool): Whether to include an entity in the first sentence associated with the
|
||||||
|
sentence-crossing entity.
|
||||||
|
"""
|
||||||
# Test that the NEL doesn't crash when an entity crosses a sentence boundary
|
# Test that the NEL doesn't crash when an entity crosses a sentence boundary
|
||||||
nlp = English()
|
nlp = English()
|
||||||
vector_length = 3
|
vector_length = 3
|
||||||
nlp.add_pipe("sentencizer")
|
|
||||||
text = "Mahler 's Symphony No. 8 was beautiful."
|
text = "Mahler 's Symphony No. 8 was beautiful."
|
||||||
entities = [(0, 6, "PERSON"), (10, 24, "WORK")]
|
entities = [(10, 24, "WORK")]
|
||||||
links = {
|
links = {(10, 24): {"Q7304": 0.0, "Q270853": 1.0}}
|
||||||
(0, 6): {"Q7304": 1.0, "Q270853": 0.0},
|
if entity_in_first_sentence:
|
||||||
(10, 24): {"Q7304": 0.0, "Q270853": 1.0},
|
entities.append((0, 6, "PERSON"))
|
||||||
}
|
links[(0, 6)] = {"Q7304": 1.0, "Q270853": 0.0}
|
||||||
sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
|
sent_starts = [1, -1, 0, 0, 0, 1, 0, 0, 0]
|
||||||
doc = nlp(text)
|
doc = nlp(text)
|
||||||
example = Example.from_dict(
|
example = Example.from_dict(
|
||||||
doc, {"entities": entities, "links": links, "sent_starts": sent_starts}
|
doc, {"entities": entities, "links": links, "sent_starts": sent_starts}
|
||||||
|
@ -145,31 +150,14 @@ def test_issue7065_b():
|
||||||
|
|
||||||
# Create the Entity Linker component and add it to the pipeline
|
# Create the Entity Linker component and add it to the pipeline
|
||||||
entity_linker = nlp.add_pipe("entity_linker", last=True)
|
entity_linker = nlp.add_pipe("entity_linker", last=True)
|
||||||
entity_linker.set_kb(create_kb)
|
entity_linker.set_kb(create_kb) # type: ignore
|
||||||
# train the NEL pipe
|
# train the NEL pipe
|
||||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||||
for i in range(2):
|
for i in range(2):
|
||||||
losses = {}
|
nlp.update(train_examples, sgd=optimizer)
|
||||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
|
||||||
|
|
||||||
# Add a custom rule-based component to mimick NER
|
# This shouldn't crash.
|
||||||
patterns = [
|
entity_linker.predict([example.reference]) # type: ignore
|
||||||
{"label": "PERSON", "pattern": [{"LOWER": "mahler"}]},
|
|
||||||
{
|
|
||||||
"label": "WORK",
|
|
||||||
"pattern": [
|
|
||||||
{"LOWER": "symphony"},
|
|
||||||
{"LOWER": "no"},
|
|
||||||
{"LOWER": "."},
|
|
||||||
{"LOWER": "8"},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
]
|
|
||||||
ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
|
|
||||||
ruler.add_patterns(patterns)
|
|
||||||
# test the trained model - this should not throw E148
|
|
||||||
doc = nlp(text)
|
|
||||||
assert doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_no_entities():
|
def test_no_entities():
|
||||||
|
@ -509,15 +497,15 @@ def test_el_pipe_configuration(nlp):
|
||||||
return [get_lowercased_candidates(kb, span) for span in spans]
|
return [get_lowercased_candidates(kb, span) for span in spans]
|
||||||
|
|
||||||
@registry.misc("spacy.LowercaseCandidateGenerator.v1")
|
@registry.misc("spacy.LowercaseCandidateGenerator.v1")
|
||||||
def create_candidates() -> Callable[
|
def create_candidates() -> (
|
||||||
[InMemoryLookupKB, "Span"], Iterable[Candidate]
|
Callable[[InMemoryLookupKB, "Span"], Iterable[Candidate]]
|
||||||
]:
|
):
|
||||||
return get_lowercased_candidates
|
return get_lowercased_candidates
|
||||||
|
|
||||||
@registry.misc("spacy.LowercaseCandidateBatchGenerator.v1")
|
@registry.misc("spacy.LowercaseCandidateBatchGenerator.v1")
|
||||||
def create_candidates_batch() -> Callable[
|
def create_candidates_batch() -> (
|
||||||
[InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]]
|
Callable[[InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]]]
|
||||||
]:
|
):
|
||||||
return get_lowercased_candidates_batch
|
return get_lowercased_candidates_batch
|
||||||
|
|
||||||
# replace the pipe with a new one with with a different candidate generator
|
# replace the pipe with a new one with with a different candidate generator
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
from typing import cast
|
from typing import cast
|
||||||
import pytest
|
import pytest
|
||||||
from numpy.testing import assert_equal
|
from numpy.testing import assert_equal, assert_almost_equal
|
||||||
|
|
||||||
|
from thinc.api import get_current_ops
|
||||||
|
|
||||||
from spacy import util
|
from spacy import util
|
||||||
from spacy.training import Example
|
from spacy.training import Example
|
||||||
|
@ -21,6 +23,8 @@ def test_label_types():
|
||||||
morphologizer.add_label(9)
|
morphologizer.add_label(9)
|
||||||
|
|
||||||
|
|
||||||
|
TAGS = ["Feat=N", "Feat=V", "Feat=J"]
|
||||||
|
|
||||||
TRAIN_DATA = [
|
TRAIN_DATA = [
|
||||||
(
|
(
|
||||||
"I like green eggs",
|
"I like green eggs",
|
||||||
|
@ -34,6 +38,30 @@ TRAIN_DATA = [
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_label_smoothing():
|
||||||
|
nlp = Language()
|
||||||
|
morph_no_ls = nlp.add_pipe("morphologizer", "no_label_smoothing")
|
||||||
|
morph_ls = nlp.add_pipe(
|
||||||
|
"morphologizer", "label_smoothing", config=dict(label_smoothing=0.05)
|
||||||
|
)
|
||||||
|
train_examples = []
|
||||||
|
losses = {}
|
||||||
|
for tag in TAGS:
|
||||||
|
morph_no_ls.add_label(tag)
|
||||||
|
morph_ls.add_label(tag)
|
||||||
|
for t in TRAIN_DATA:
|
||||||
|
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||||
|
|
||||||
|
nlp.initialize(get_examples=lambda: train_examples)
|
||||||
|
tag_scores, bp_tag_scores = morph_ls.model.begin_update(
|
||||||
|
[eg.predicted for eg in train_examples]
|
||||||
|
)
|
||||||
|
ops = get_current_ops()
|
||||||
|
no_ls_grads = ops.to_numpy(morph_no_ls.get_loss(train_examples, tag_scores)[1][0])
|
||||||
|
ls_grads = ops.to_numpy(morph_ls.get_loss(train_examples, tag_scores)[1][0])
|
||||||
|
assert_almost_equal(ls_grads / no_ls_grads, 0.94285715)
|
||||||
|
|
||||||
|
|
||||||
def test_no_label():
|
def test_no_label():
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
nlp.add_pipe("morphologizer")
|
nlp.add_pipe("morphologizer")
|
||||||
|
|
242
spacy/tests/pipeline/test_span_finder.py
Normal file
242
spacy/tests/pipeline/test_span_finder.py
Normal file
|
@ -0,0 +1,242 @@
|
||||||
|
import pytest
|
||||||
|
from thinc.api import Config
|
||||||
|
|
||||||
|
from spacy.language import Language
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.pipeline.span_finder import span_finder_default_config
|
||||||
|
from spacy.tokens import Doc
|
||||||
|
from spacy.training import Example
|
||||||
|
from spacy import util
|
||||||
|
from spacy.util import registry
|
||||||
|
from spacy.util import fix_random_seed, make_tempdir
|
||||||
|
|
||||||
|
|
||||||
|
SPANS_KEY = "pytest"
|
||||||
|
TRAIN_DATA = [
|
||||||
|
("Who is Shaka Khan?", {"spans": {SPANS_KEY: [(7, 17)]}}),
|
||||||
|
(
|
||||||
|
"I like London and Berlin.",
|
||||||
|
{"spans": {SPANS_KEY: [(7, 13), (18, 24)]}},
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
TRAIN_DATA_OVERLAPPING = [
|
||||||
|
("Who is Shaka Khan?", {"spans": {SPANS_KEY: [(7, 17)]}}),
|
||||||
|
(
|
||||||
|
"I like London and Berlin",
|
||||||
|
{"spans": {SPANS_KEY: [(7, 13), (18, 24), (7, 24)]}},
|
||||||
|
),
|
||||||
|
("", {"spans": {SPANS_KEY: []}}),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def make_examples(nlp, data=TRAIN_DATA):
|
||||||
|
train_examples = []
|
||||||
|
for t in data:
|
||||||
|
eg = Example.from_dict(nlp.make_doc(t[0]), t[1])
|
||||||
|
train_examples.append(eg)
|
||||||
|
return train_examples
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"tokens_predicted, tokens_reference, reference_truths",
|
||||||
|
[
|
||||||
|
(
|
||||||
|
["Mon", ".", "-", "June", "16"],
|
||||||
|
["Mon.", "-", "June", "16"],
|
||||||
|
[(0, 0), (0, 0), (0, 0), (1, 1), (0, 0)],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
["Mon.", "-", "J", "une", "16"],
|
||||||
|
["Mon.", "-", "June", "16"],
|
||||||
|
[(0, 0), (0, 0), (1, 0), (0, 1), (0, 0)],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
["Mon", ".", "-", "June", "16"],
|
||||||
|
["Mon.", "-", "June", "1", "6"],
|
||||||
|
[(0, 0), (0, 0), (0, 0), (1, 1), (0, 0)],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
["Mon.", "-J", "un", "e 16"],
|
||||||
|
["Mon.", "-", "June", "16"],
|
||||||
|
[(0, 0), (0, 0), (0, 0), (0, 0)],
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
["Mon.-June", "16"],
|
||||||
|
["Mon.", "-", "June", "16"],
|
||||||
|
[(0, 1), (0, 0)],
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
["Mon.-", "June", "16"],
|
||||||
|
["Mon.", "-", "J", "une", "16"],
|
||||||
|
[(0, 0), (1, 1), (0, 0)],
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
["Mon.-", "June 16"],
|
||||||
|
["Mon.", "-", "June", "16"],
|
||||||
|
[(0, 0), (1, 0)],
|
||||||
|
),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_loss_alignment_example(tokens_predicted, tokens_reference, reference_truths):
|
||||||
|
nlp = Language()
|
||||||
|
predicted = Doc(
|
||||||
|
nlp.vocab, words=tokens_predicted, spaces=[False] * len(tokens_predicted)
|
||||||
|
)
|
||||||
|
reference = Doc(
|
||||||
|
nlp.vocab, words=tokens_reference, spaces=[False] * len(tokens_reference)
|
||||||
|
)
|
||||||
|
example = Example(predicted, reference)
|
||||||
|
example.reference.spans[SPANS_KEY] = [example.reference.char_span(5, 9)]
|
||||||
|
span_finder = nlp.add_pipe("span_finder", config={"spans_key": SPANS_KEY})
|
||||||
|
nlp.initialize()
|
||||||
|
ops = span_finder.model.ops
|
||||||
|
if predicted.text != reference.text:
|
||||||
|
with pytest.raises(
|
||||||
|
ValueError, match="must match between reference and predicted"
|
||||||
|
):
|
||||||
|
span_finder._get_aligned_truth_scores([example], ops)
|
||||||
|
return
|
||||||
|
truth_scores, masks = span_finder._get_aligned_truth_scores([example], ops)
|
||||||
|
assert len(truth_scores) == len(tokens_predicted)
|
||||||
|
ops.xp.testing.assert_array_equal(truth_scores, ops.xp.asarray(reference_truths))
|
||||||
|
|
||||||
|
|
||||||
|
def test_span_finder_model():
|
||||||
|
nlp = Language()
|
||||||
|
|
||||||
|
docs = [nlp("This is an example."), nlp("This is the second example.")]
|
||||||
|
docs[0].spans[SPANS_KEY] = [docs[0][3:4]]
|
||||||
|
docs[1].spans[SPANS_KEY] = [docs[1][3:5]]
|
||||||
|
|
||||||
|
total_tokens = 0
|
||||||
|
for doc in docs:
|
||||||
|
total_tokens += len(doc)
|
||||||
|
|
||||||
|
config = Config().from_str(span_finder_default_config).interpolate()
|
||||||
|
model = registry.resolve(config)["model"]
|
||||||
|
|
||||||
|
model.initialize(X=docs)
|
||||||
|
predictions = model.predict(docs)
|
||||||
|
|
||||||
|
assert len(predictions) == total_tokens
|
||||||
|
assert len(predictions[0]) == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_span_finder_component():
|
||||||
|
nlp = Language()
|
||||||
|
|
||||||
|
docs = [nlp("This is an example."), nlp("This is the second example.")]
|
||||||
|
docs[0].spans[SPANS_KEY] = [docs[0][3:4]]
|
||||||
|
docs[1].spans[SPANS_KEY] = [docs[1][3:5]]
|
||||||
|
|
||||||
|
span_finder = nlp.add_pipe("span_finder", config={"spans_key": SPANS_KEY})
|
||||||
|
nlp.initialize()
|
||||||
|
docs = list(span_finder.pipe(docs))
|
||||||
|
|
||||||
|
assert SPANS_KEY in docs[0].spans
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"min_length, max_length, span_count",
|
||||||
|
[(0, 0, 0), (None, None, 8), (2, None, 6), (None, 1, 2), (2, 3, 2)],
|
||||||
|
)
|
||||||
|
def test_set_annotations_span_lengths(min_length, max_length, span_count):
|
||||||
|
nlp = Language()
|
||||||
|
doc = nlp("Me and Jenny goes together like peas and carrots.")
|
||||||
|
if min_length == 0 and max_length == 0:
|
||||||
|
with pytest.raises(ValueError, match="Both 'min_length' and 'max_length'"):
|
||||||
|
span_finder = nlp.add_pipe(
|
||||||
|
"span_finder",
|
||||||
|
config={
|
||||||
|
"max_length": max_length,
|
||||||
|
"min_length": min_length,
|
||||||
|
"spans_key": SPANS_KEY,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
return
|
||||||
|
span_finder = nlp.add_pipe(
|
||||||
|
"span_finder",
|
||||||
|
config={
|
||||||
|
"max_length": max_length,
|
||||||
|
"min_length": min_length,
|
||||||
|
"spans_key": SPANS_KEY,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
nlp.initialize()
|
||||||
|
# Starts [Me, Jenny, peas]
|
||||||
|
# Ends [Jenny, peas, carrots]
|
||||||
|
scores = [
|
||||||
|
(1, 0),
|
||||||
|
(0, 0),
|
||||||
|
(1, 1),
|
||||||
|
(0, 0),
|
||||||
|
(0, 0),
|
||||||
|
(0, 0),
|
||||||
|
(1, 1),
|
||||||
|
(0, 0),
|
||||||
|
(0, 1),
|
||||||
|
(0, 0),
|
||||||
|
]
|
||||||
|
span_finder.set_annotations([doc], scores)
|
||||||
|
|
||||||
|
assert doc.spans[SPANS_KEY]
|
||||||
|
assert len(doc.spans[SPANS_KEY]) == span_count
|
||||||
|
|
||||||
|
# Assert below will fail when max_length is set to 0
|
||||||
|
if max_length is None:
|
||||||
|
max_length = float("inf")
|
||||||
|
if min_length is None:
|
||||||
|
min_length = 1
|
||||||
|
|
||||||
|
assert all(min_length <= len(span) <= max_length for span in doc.spans[SPANS_KEY])
|
||||||
|
|
||||||
|
|
||||||
|
def test_overfitting_IO():
|
||||||
|
# Simple test to try and quickly overfit the span_finder component - ensuring the ML models work correctly
|
||||||
|
fix_random_seed(0)
|
||||||
|
nlp = English()
|
||||||
|
span_finder = nlp.add_pipe("span_finder", config={"spans_key": SPANS_KEY})
|
||||||
|
train_examples = make_examples(nlp)
|
||||||
|
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||||
|
assert span_finder.model.get_dim("nO") == 2
|
||||||
|
|
||||||
|
for i in range(50):
|
||||||
|
losses = {}
|
||||||
|
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||||
|
assert losses["span_finder"] < 0.001
|
||||||
|
|
||||||
|
# test the trained model
|
||||||
|
test_text = "I like London and Berlin"
|
||||||
|
doc = nlp(test_text)
|
||||||
|
spans = doc.spans[SPANS_KEY]
|
||||||
|
assert len(spans) == 3
|
||||||
|
assert set([span.text for span in spans]) == {
|
||||||
|
"London",
|
||||||
|
"Berlin",
|
||||||
|
"London and Berlin",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Also test the results are still the same after IO
|
||||||
|
with make_tempdir() as tmp_dir:
|
||||||
|
nlp.to_disk(tmp_dir)
|
||||||
|
nlp2 = util.load_model_from_path(tmp_dir)
|
||||||
|
doc2 = nlp2(test_text)
|
||||||
|
spans2 = doc2.spans[SPANS_KEY]
|
||||||
|
assert len(spans2) == 3
|
||||||
|
assert set([span.text for span in spans2]) == {
|
||||||
|
"London",
|
||||||
|
"Berlin",
|
||||||
|
"London and Berlin",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Test scoring
|
||||||
|
scores = nlp.evaluate(train_examples)
|
||||||
|
assert f"span_finder_{SPANS_KEY}_f" in scores
|
||||||
|
# It's not perfect 1.0 F1 because it's designed to overgenerate for now.
|
||||||
|
assert scores[f"span_finder_{SPANS_KEY}_p"] == 0.75
|
||||||
|
assert scores[f"span_finder_{SPANS_KEY}_r"] == 1.0
|
||||||
|
|
||||||
|
# also test that the spancat works for just a single entity in a sentence
|
||||||
|
doc = nlp("London")
|
||||||
|
assert len(doc.spans[SPANS_KEY]) == 1
|
|
@ -1,7 +1,7 @@
|
||||||
import pytest
|
import pytest
|
||||||
import numpy
|
import numpy
|
||||||
from numpy.testing import assert_array_equal, assert_almost_equal
|
from numpy.testing import assert_array_equal, assert_almost_equal
|
||||||
from thinc.api import get_current_ops, Ragged, fix_random_seed
|
from thinc.api import get_current_ops, NumpyOps, Ragged, fix_random_seed
|
||||||
|
|
||||||
from spacy import util
|
from spacy import util
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
|
@ -15,6 +15,8 @@ OPS = get_current_ops()
|
||||||
|
|
||||||
SPAN_KEY = "labeled_spans"
|
SPAN_KEY = "labeled_spans"
|
||||||
|
|
||||||
|
SPANCAT_COMPONENTS = ["spancat", "spancat_singlelabel"]
|
||||||
|
|
||||||
TRAIN_DATA = [
|
TRAIN_DATA = [
|
||||||
("Who is Shaka Khan?", {"spans": {SPAN_KEY: [(7, 17, "PERSON")]}}),
|
("Who is Shaka Khan?", {"spans": {SPAN_KEY: [(7, 17, "PERSON")]}}),
|
||||||
(
|
(
|
||||||
|
@ -41,38 +43,42 @@ def make_examples(nlp, data=TRAIN_DATA):
|
||||||
return train_examples
|
return train_examples
|
||||||
|
|
||||||
|
|
||||||
def test_no_label():
|
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||||
|
def test_no_label(name):
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
nlp.initialize()
|
nlp.initialize()
|
||||||
|
|
||||||
|
|
||||||
def test_no_resize():
|
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||||
|
def test_no_resize(name):
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||||
spancat.add_label("Thing")
|
spancat.add_label("Thing")
|
||||||
spancat.add_label("Phrase")
|
spancat.add_label("Phrase")
|
||||||
assert spancat.labels == ("Thing", "Phrase")
|
assert spancat.labels == ("Thing", "Phrase")
|
||||||
nlp.initialize()
|
nlp.initialize()
|
||||||
assert spancat.model.get_dim("nO") == 2
|
assert spancat.model.get_dim("nO") == spancat._n_labels
|
||||||
# this throws an error because the spancat can't be resized after initialization
|
# this throws an error because the spancat can't be resized after initialization
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
spancat.add_label("Stuff")
|
spancat.add_label("Stuff")
|
||||||
|
|
||||||
|
|
||||||
def test_implicit_labels():
|
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||||
|
def test_implicit_labels(name):
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||||
assert len(spancat.labels) == 0
|
assert len(spancat.labels) == 0
|
||||||
train_examples = make_examples(nlp)
|
train_examples = make_examples(nlp)
|
||||||
nlp.initialize(get_examples=lambda: train_examples)
|
nlp.initialize(get_examples=lambda: train_examples)
|
||||||
assert spancat.labels == ("PERSON", "LOC")
|
assert spancat.labels == ("PERSON", "LOC")
|
||||||
|
|
||||||
|
|
||||||
def test_explicit_labels():
|
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||||
|
def test_explicit_labels(name):
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||||
assert len(spancat.labels) == 0
|
assert len(spancat.labels) == 0
|
||||||
spancat.add_label("PERSON")
|
spancat.add_label("PERSON")
|
||||||
spancat.add_label("LOC")
|
spancat.add_label("LOC")
|
||||||
|
@ -102,13 +108,13 @@ def test_doc_gc():
|
||||||
# XXX This fails with length 0 sometimes
|
# XXX This fails with length 0 sometimes
|
||||||
assert len(spangroup) > 0
|
assert len(spangroup) > 0
|
||||||
with pytest.raises(RuntimeError):
|
with pytest.raises(RuntimeError):
|
||||||
span = spangroup[0]
|
spangroup[0]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"max_positive,nr_results", [(None, 4), (1, 2), (2, 3), (3, 4), (4, 4)]
|
"max_positive,nr_results", [(None, 4), (1, 2), (2, 3), (3, 4), (4, 4)]
|
||||||
)
|
)
|
||||||
def test_make_spangroup(max_positive, nr_results):
|
def test_make_spangroup_multilabel(max_positive, nr_results):
|
||||||
fix_random_seed(0)
|
fix_random_seed(0)
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
spancat = nlp.add_pipe(
|
spancat = nlp.add_pipe(
|
||||||
|
@ -120,10 +126,12 @@ def test_make_spangroup(max_positive, nr_results):
|
||||||
indices = ngram_suggester([doc])[0].dataXd
|
indices = ngram_suggester([doc])[0].dataXd
|
||||||
assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
|
assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
|
||||||
labels = ["Thing", "City", "Person", "GreatCity"]
|
labels = ["Thing", "City", "Person", "GreatCity"]
|
||||||
|
for label in labels:
|
||||||
|
spancat.add_label(label)
|
||||||
scores = numpy.asarray(
|
scores = numpy.asarray(
|
||||||
[[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"
|
[[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"
|
||||||
)
|
)
|
||||||
spangroup = spancat._make_span_group(doc, indices, scores, labels)
|
spangroup = spancat._make_span_group_multilabel(doc, indices, scores)
|
||||||
assert len(spangroup) == nr_results
|
assert len(spangroup) == nr_results
|
||||||
|
|
||||||
# first span is always the second token "London"
|
# first span is always the second token "London"
|
||||||
|
@ -154,6 +162,130 @@ def test_make_spangroup(max_positive, nr_results):
|
||||||
assert_almost_equal(0.9, spangroup.attrs["scores"][-1], 5)
|
assert_almost_equal(0.9, spangroup.attrs["scores"][-1], 5)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"threshold,allow_overlap,nr_results",
|
||||||
|
[(0.05, True, 3), (0.05, False, 1), (0.5, True, 2), (0.5, False, 1)],
|
||||||
|
)
|
||||||
|
def test_make_spangroup_singlelabel(threshold, allow_overlap, nr_results):
|
||||||
|
fix_random_seed(0)
|
||||||
|
nlp = Language()
|
||||||
|
spancat = nlp.add_pipe(
|
||||||
|
"spancat",
|
||||||
|
config={
|
||||||
|
"spans_key": SPAN_KEY,
|
||||||
|
"threshold": threshold,
|
||||||
|
"max_positive": 1,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
doc = nlp.make_doc("Greater London")
|
||||||
|
ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2])
|
||||||
|
indices = ngram_suggester([doc])[0].dataXd
|
||||||
|
assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
|
||||||
|
labels = ["Thing", "City", "Person", "GreatCity"]
|
||||||
|
for label in labels:
|
||||||
|
spancat.add_label(label)
|
||||||
|
scores = numpy.asarray(
|
||||||
|
[[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"
|
||||||
|
)
|
||||||
|
spangroup = spancat._make_span_group_singlelabel(
|
||||||
|
doc, indices, scores, allow_overlap
|
||||||
|
)
|
||||||
|
if threshold > 0.4:
|
||||||
|
if allow_overlap:
|
||||||
|
assert spangroup[0].text == "London"
|
||||||
|
assert spangroup[0].label_ == "City"
|
||||||
|
assert_almost_equal(0.6, spangroup.attrs["scores"][0], 5)
|
||||||
|
assert spangroup[1].text == "Greater London"
|
||||||
|
assert spangroup[1].label_ == "GreatCity"
|
||||||
|
assert spangroup.attrs["scores"][1] == 0.9
|
||||||
|
assert_almost_equal(0.9, spangroup.attrs["scores"][1], 5)
|
||||||
|
else:
|
||||||
|
assert spangroup[0].text == "Greater London"
|
||||||
|
assert spangroup[0].label_ == "GreatCity"
|
||||||
|
assert spangroup.attrs["scores"][0] == 0.9
|
||||||
|
else:
|
||||||
|
if allow_overlap:
|
||||||
|
assert spangroup[0].text == "Greater"
|
||||||
|
assert spangroup[0].label_ == "City"
|
||||||
|
assert spangroup[1].text == "London"
|
||||||
|
assert spangroup[1].label_ == "City"
|
||||||
|
assert spangroup[2].text == "Greater London"
|
||||||
|
assert spangroup[2].label_ == "GreatCity"
|
||||||
|
else:
|
||||||
|
assert spangroup[0].text == "Greater London"
|
||||||
|
|
||||||
|
|
||||||
|
def test_make_spangroup_negative_label():
|
||||||
|
fix_random_seed(0)
|
||||||
|
nlp_single = Language()
|
||||||
|
nlp_multi = Language()
|
||||||
|
spancat_single = nlp_single.add_pipe(
|
||||||
|
"spancat",
|
||||||
|
config={
|
||||||
|
"spans_key": SPAN_KEY,
|
||||||
|
"threshold": 0.1,
|
||||||
|
"max_positive": 1,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
spancat_multi = nlp_multi.add_pipe(
|
||||||
|
"spancat",
|
||||||
|
config={
|
||||||
|
"spans_key": SPAN_KEY,
|
||||||
|
"threshold": 0.1,
|
||||||
|
"max_positive": 2,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
spancat_single.add_negative_label = True
|
||||||
|
spancat_multi.add_negative_label = True
|
||||||
|
doc = nlp_single.make_doc("Greater London")
|
||||||
|
labels = ["Thing", "City", "Person", "GreatCity"]
|
||||||
|
for label in labels:
|
||||||
|
spancat_multi.add_label(label)
|
||||||
|
spancat_single.add_label(label)
|
||||||
|
ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2])
|
||||||
|
indices = ngram_suggester([doc])[0].dataXd
|
||||||
|
assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
|
||||||
|
scores = numpy.asarray(
|
||||||
|
[
|
||||||
|
[0.2, 0.4, 0.3, 0.1, 0.1],
|
||||||
|
[0.1, 0.6, 0.2, 0.4, 0.9],
|
||||||
|
[0.8, 0.7, 0.3, 0.9, 0.1],
|
||||||
|
],
|
||||||
|
dtype="f",
|
||||||
|
)
|
||||||
|
spangroup_multi = spancat_multi._make_span_group_multilabel(doc, indices, scores)
|
||||||
|
spangroup_single = spancat_single._make_span_group_singlelabel(doc, indices, scores)
|
||||||
|
assert len(spangroup_single) == 2
|
||||||
|
assert spangroup_single[0].text == "Greater"
|
||||||
|
assert spangroup_single[0].label_ == "City"
|
||||||
|
assert_almost_equal(0.4, spangroup_single.attrs["scores"][0], 5)
|
||||||
|
assert spangroup_single[1].text == "Greater London"
|
||||||
|
assert spangroup_single[1].label_ == "GreatCity"
|
||||||
|
assert spangroup_single.attrs["scores"][1] == 0.9
|
||||||
|
assert_almost_equal(0.9, spangroup_single.attrs["scores"][1], 5)
|
||||||
|
|
||||||
|
assert len(spangroup_multi) == 6
|
||||||
|
assert spangroup_multi[0].text == "Greater"
|
||||||
|
assert spangroup_multi[0].label_ == "City"
|
||||||
|
assert_almost_equal(0.4, spangroup_multi.attrs["scores"][0], 5)
|
||||||
|
assert spangroup_multi[1].text == "Greater"
|
||||||
|
assert spangroup_multi[1].label_ == "Person"
|
||||||
|
assert_almost_equal(0.3, spangroup_multi.attrs["scores"][1], 5)
|
||||||
|
assert spangroup_multi[2].text == "London"
|
||||||
|
assert spangroup_multi[2].label_ == "City"
|
||||||
|
assert_almost_equal(0.6, spangroup_multi.attrs["scores"][2], 5)
|
||||||
|
assert spangroup_multi[3].text == "London"
|
||||||
|
assert spangroup_multi[3].label_ == "GreatCity"
|
||||||
|
assert_almost_equal(0.4, spangroup_multi.attrs["scores"][3], 5)
|
||||||
|
assert spangroup_multi[4].text == "Greater London"
|
||||||
|
assert spangroup_multi[4].label_ == "Thing"
|
||||||
|
assert spangroup_multi[4].text == "Greater London"
|
||||||
|
assert_almost_equal(0.8, spangroup_multi.attrs["scores"][4], 5)
|
||||||
|
assert spangroup_multi[5].text == "Greater London"
|
||||||
|
assert spangroup_multi[5].label_ == "GreatCity"
|
||||||
|
assert_almost_equal(0.9, spangroup_multi.attrs["scores"][5], 5)
|
||||||
|
|
||||||
|
|
||||||
def test_ngram_suggester(en_tokenizer):
|
def test_ngram_suggester(en_tokenizer):
|
||||||
# test different n-gram lengths
|
# test different n-gram lengths
|
||||||
for size in [1, 2, 3]:
|
for size in [1, 2, 3]:
|
||||||
|
@ -274,6 +406,21 @@ def test_ngram_sizes(en_tokenizer):
|
||||||
assert_array_equal(OPS.to_numpy(ngrams_3.lengths), [0, 1, 3, 6, 9])
|
assert_array_equal(OPS.to_numpy(ngrams_3.lengths), [0, 1, 3, 6, 9])
|
||||||
|
|
||||||
|
|
||||||
|
def test_preset_spans_suggester():
|
||||||
|
nlp = Language()
|
||||||
|
docs = [nlp("This is an example."), nlp("This is the second example.")]
|
||||||
|
docs[0].spans[SPAN_KEY] = [docs[0][3:4]]
|
||||||
|
docs[1].spans[SPAN_KEY] = [docs[1][0:4], docs[1][3:5]]
|
||||||
|
suggester = registry.misc.get("spacy.preset_spans_suggester.v1")(spans_key=SPAN_KEY)
|
||||||
|
candidates = suggester(docs)
|
||||||
|
assert type(candidates) == Ragged
|
||||||
|
assert len(candidates) == 2
|
||||||
|
assert list(candidates.dataXd[0]) == [3, 4]
|
||||||
|
assert list(candidates.dataXd[1]) == [0, 4]
|
||||||
|
assert list(candidates.dataXd[2]) == [3, 5]
|
||||||
|
assert list(candidates.lengths) == [1, 2]
|
||||||
|
|
||||||
|
|
||||||
def test_overfitting_IO():
|
def test_overfitting_IO():
|
||||||
# Simple test to try and quickly overfit the spancat component - ensuring the ML models work correctly
|
# Simple test to try and quickly overfit the spancat component - ensuring the ML models work correctly
|
||||||
fix_random_seed(0)
|
fix_random_seed(0)
|
||||||
|
@ -296,7 +443,7 @@ def test_overfitting_IO():
|
||||||
spans = doc.spans[SPAN_KEY]
|
spans = doc.spans[SPAN_KEY]
|
||||||
assert len(spans) == 2
|
assert len(spans) == 2
|
||||||
assert len(spans.attrs["scores"]) == 2
|
assert len(spans.attrs["scores"]) == 2
|
||||||
assert min(spans.attrs["scores"]) > 0.9
|
assert min(spans.attrs["scores"]) > 0.8
|
||||||
assert set([span.text for span in spans]) == {"London", "Berlin"}
|
assert set([span.text for span in spans]) == {"London", "Berlin"}
|
||||||
assert set([span.label_ for span in spans]) == {"LOC"}
|
assert set([span.label_ for span in spans]) == {"LOC"}
|
||||||
|
|
||||||
|
@ -308,7 +455,7 @@ def test_overfitting_IO():
|
||||||
spans2 = doc2.spans[SPAN_KEY]
|
spans2 = doc2.spans[SPAN_KEY]
|
||||||
assert len(spans2) == 2
|
assert len(spans2) == 2
|
||||||
assert len(spans2.attrs["scores"]) == 2
|
assert len(spans2.attrs["scores"]) == 2
|
||||||
assert min(spans2.attrs["scores"]) > 0.9
|
assert min(spans2.attrs["scores"]) > 0.8
|
||||||
assert set([span.text for span in spans2]) == {"London", "Berlin"}
|
assert set([span.text for span in spans2]) == {"London", "Berlin"}
|
||||||
assert set([span.label_ for span in spans2]) == {"LOC"}
|
assert set([span.label_ for span in spans2]) == {"LOC"}
|
||||||
|
|
||||||
|
@ -371,9 +518,9 @@ def test_overfitting_IO_overlapping():
|
||||||
assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"}
|
assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"}
|
||||||
|
|
||||||
|
|
||||||
def test_zero_suggestions():
|
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||||
|
def test_zero_suggestions(name):
|
||||||
# Test with a suggester that can return 0 suggestions
|
# Test with a suggester that can return 0 suggestions
|
||||||
|
|
||||||
@registry.misc("test_mixed_zero_suggester")
|
@registry.misc("test_mixed_zero_suggester")
|
||||||
def make_mixed_zero_suggester():
|
def make_mixed_zero_suggester():
|
||||||
def mixed_zero_suggester(docs, *, ops=None):
|
def mixed_zero_suggester(docs, *, ops=None):
|
||||||
|
@ -400,7 +547,7 @@ def test_zero_suggestions():
|
||||||
fix_random_seed(0)
|
fix_random_seed(0)
|
||||||
nlp = English()
|
nlp = English()
|
||||||
spancat = nlp.add_pipe(
|
spancat = nlp.add_pipe(
|
||||||
"spancat",
|
name,
|
||||||
config={
|
config={
|
||||||
"suggester": {"@misc": "test_mixed_zero_suggester"},
|
"suggester": {"@misc": "test_mixed_zero_suggester"},
|
||||||
"spans_key": SPAN_KEY,
|
"spans_key": SPAN_KEY,
|
||||||
|
@ -408,7 +555,7 @@ def test_zero_suggestions():
|
||||||
)
|
)
|
||||||
train_examples = make_examples(nlp)
|
train_examples = make_examples(nlp)
|
||||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||||
assert spancat.model.get_dim("nO") == 2
|
assert spancat.model.get_dim("nO") == spancat._n_labels
|
||||||
assert set(spancat.labels) == {"LOC", "PERSON"}
|
assert set(spancat.labels) == {"LOC", "PERSON"}
|
||||||
|
|
||||||
nlp.update(train_examples, sgd=optimizer)
|
nlp.update(train_examples, sgd=optimizer)
|
||||||
|
@ -424,9 +571,10 @@ def test_zero_suggestions():
|
||||||
list(nlp.pipe(["", "one", "three three three"]))
|
list(nlp.pipe(["", "one", "three three three"]))
|
||||||
|
|
||||||
|
|
||||||
def test_set_candidates():
|
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||||
|
def test_set_candidates(name):
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||||
train_examples = make_examples(nlp)
|
train_examples = make_examples(nlp)
|
||||||
nlp.initialize(get_examples=lambda: train_examples)
|
nlp.initialize(get_examples=lambda: train_examples)
|
||||||
texts = [
|
texts = [
|
||||||
|
@ -464,3 +612,21 @@ def test_save_activations():
|
||||||
assert set(doc.activations["spancat"].keys()) == {"indices", "scores"}
|
assert set(doc.activations["spancat"].keys()) == {"indices", "scores"}
|
||||||
assert doc.activations["spancat"]["indices"].shape == (12, 2)
|
assert doc.activations["spancat"]["indices"].shape == (12, 2)
|
||||||
assert doc.activations["spancat"]["scores"].shape == (12, nO)
|
assert doc.activations["spancat"]["scores"].shape == (12, nO)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||||
|
@pytest.mark.parametrize("n_process", [1, 2])
|
||||||
|
def test_spancat_multiprocessing(name, n_process):
|
||||||
|
if isinstance(get_current_ops, NumpyOps) or n_process < 2:
|
||||||
|
nlp = Language()
|
||||||
|
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||||
|
train_examples = make_examples(nlp)
|
||||||
|
nlp.initialize(get_examples=lambda: train_examples)
|
||||||
|
texts = [
|
||||||
|
"Just a sentence.",
|
||||||
|
"I like London and Berlin",
|
||||||
|
"I like Berlin",
|
||||||
|
"I eat ham.",
|
||||||
|
]
|
||||||
|
docs = list(nlp.pipe(texts, n_process=n_process))
|
||||||
|
assert len(docs) == len(texts)
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
from typing import cast
|
from typing import cast
|
||||||
import pytest
|
import pytest
|
||||||
from numpy.testing import assert_equal
|
from numpy.testing import assert_equal, assert_almost_equal
|
||||||
from spacy.attrs import TAG
|
from spacy.attrs import TAG
|
||||||
|
|
||||||
from spacy import util
|
from spacy import util
|
||||||
|
@ -8,7 +8,7 @@ from spacy.training import Example
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
from spacy.pipeline import TrainablePipe
|
from spacy.pipeline import TrainablePipe
|
||||||
from thinc.api import compounding
|
from thinc.api import compounding, get_current_ops
|
||||||
|
|
||||||
from ..util import make_tempdir
|
from ..util import make_tempdir
|
||||||
|
|
||||||
|
@ -71,6 +71,30 @@ PARTIAL_DATA = [
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_label_smoothing():
|
||||||
|
nlp = Language()
|
||||||
|
tagger_no_ls = nlp.add_pipe("tagger", "no_label_smoothing")
|
||||||
|
tagger_ls = nlp.add_pipe(
|
||||||
|
"tagger", "label_smoothing", config=dict(label_smoothing=0.05)
|
||||||
|
)
|
||||||
|
train_examples = []
|
||||||
|
losses = {}
|
||||||
|
for tag in TAGS:
|
||||||
|
tagger_no_ls.add_label(tag)
|
||||||
|
tagger_ls.add_label(tag)
|
||||||
|
for t in TRAIN_DATA:
|
||||||
|
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||||
|
|
||||||
|
nlp.initialize(get_examples=lambda: train_examples)
|
||||||
|
tag_scores, bp_tag_scores = tagger_ls.model.begin_update(
|
||||||
|
[eg.predicted for eg in train_examples]
|
||||||
|
)
|
||||||
|
ops = get_current_ops()
|
||||||
|
no_ls_grads = ops.to_numpy(tagger_no_ls.get_loss(train_examples, tag_scores)[1][0])
|
||||||
|
ls_grads = ops.to_numpy(tagger_ls.get_loss(train_examples, tag_scores)[1][0])
|
||||||
|
assert_almost_equal(ls_grads / no_ls_grads, 0.925)
|
||||||
|
|
||||||
|
|
||||||
def test_no_label():
|
def test_no_label():
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
nlp.add_pipe("tagger")
|
nlp.add_pipe("tagger")
|
||||||
|
|
|
@ -72,7 +72,7 @@ def entity_linker():
|
||||||
|
|
||||||
def create_kb(vocab):
|
def create_kb(vocab):
|
||||||
kb = InMemoryLookupKB(vocab, entity_vector_length=1)
|
kb = InMemoryLookupKB(vocab, entity_vector_length=1)
|
||||||
kb.add_entity("test", 0.0, zeros((1, 1), dtype="f"))
|
kb.add_entity("test", 0.0, zeros((1,), dtype="f"))
|
||||||
return kb
|
return kb
|
||||||
|
|
||||||
entity_linker = nlp.add_pipe("entity_linker")
|
entity_linker = nlp.add_pipe("entity_linker")
|
||||||
|
|
|
@ -213,6 +213,13 @@ def test_serialize_doc_exclude(en_vocab):
|
||||||
|
|
||||||
def test_serialize_doc_span_groups(en_vocab):
|
def test_serialize_doc_span_groups(en_vocab):
|
||||||
doc = Doc(en_vocab, words=["hello", "world", "!"])
|
doc = Doc(en_vocab, words=["hello", "world", "!"])
|
||||||
doc.spans["content"] = [doc[0:2]]
|
span = doc[0:2]
|
||||||
|
span.label_ = "test_serialize_doc_span_groups_label"
|
||||||
|
span.id_ = "test_serialize_doc_span_groups_id"
|
||||||
|
span.kb_id_ = "test_serialize_doc_span_groups_kb_id"
|
||||||
|
doc.spans["content"] = [span]
|
||||||
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
|
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
|
||||||
assert len(new_doc.spans["content"]) == 1
|
assert len(new_doc.spans["content"]) == 1
|
||||||
|
assert new_doc.spans["content"][0].label_ == "test_serialize_doc_span_groups_label"
|
||||||
|
assert new_doc.spans["content"][0].id_ == "test_serialize_doc_span_groups_id"
|
||||||
|
assert new_doc.spans["content"][0].kb_id_ == "test_serialize_doc_span_groups_kb_id"
|
||||||
|
|
|
@ -49,7 +49,11 @@ def test_serialize_doc_bin():
|
||||||
nlp = English()
|
nlp = English()
|
||||||
for doc in nlp.pipe(texts):
|
for doc in nlp.pipe(texts):
|
||||||
doc.cats = cats
|
doc.cats = cats
|
||||||
doc.spans["start"] = [doc[0:2]]
|
span = doc[0:2]
|
||||||
|
span.label_ = "UNUSUAL_SPAN_LABEL"
|
||||||
|
span.id_ = "UNUSUAL_SPAN_ID"
|
||||||
|
span.kb_id_ = "UNUSUAL_SPAN_KB_ID"
|
||||||
|
doc.spans["start"] = [span]
|
||||||
doc[0].norm_ = "UNUSUAL_TOKEN_NORM"
|
doc[0].norm_ = "UNUSUAL_TOKEN_NORM"
|
||||||
doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID"
|
doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID"
|
||||||
doc_bin.add(doc)
|
doc_bin.add(doc)
|
||||||
|
@ -63,6 +67,9 @@ def test_serialize_doc_bin():
|
||||||
assert doc.text == texts[i]
|
assert doc.text == texts[i]
|
||||||
assert doc.cats == cats
|
assert doc.cats == cats
|
||||||
assert len(doc.spans) == 1
|
assert len(doc.spans) == 1
|
||||||
|
assert doc.spans["start"][0].label_ == "UNUSUAL_SPAN_LABEL"
|
||||||
|
assert doc.spans["start"][0].id_ == "UNUSUAL_SPAN_ID"
|
||||||
|
assert doc.spans["start"][0].kb_id_ == "UNUSUAL_SPAN_KB_ID"
|
||||||
assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM"
|
assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM"
|
||||||
assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID"
|
assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID"
|
||||||
|
|
||||||
|
|
|
@ -12,6 +12,7 @@ import srsly
|
||||||
from click import NoSuchOption
|
from click import NoSuchOption
|
||||||
from packaging.specifiers import SpecifierSet
|
from packaging.specifiers import SpecifierSet
|
||||||
from thinc.api import Config, ConfigValidationError
|
from thinc.api import Config, ConfigValidationError
|
||||||
|
from spacy.tokens import DocBin
|
||||||
|
|
||||||
from spacy import about
|
from spacy import about
|
||||||
from spacy.cli import info
|
from spacy.cli import info
|
||||||
|
@ -27,7 +28,9 @@ from spacy.cli.debug_data import _get_span_characteristics
|
||||||
from spacy.cli.debug_data import _print_span_characteristics
|
from spacy.cli.debug_data import _print_span_characteristics
|
||||||
from spacy.cli.debug_data import _get_spans_length_freq_dist
|
from spacy.cli.debug_data import _get_spans_length_freq_dist
|
||||||
from spacy.cli.download import get_compatibility, get_version
|
from spacy.cli.download import get_compatibility, get_version
|
||||||
|
from spacy.cli.evaluate import render_parses
|
||||||
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
|
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
|
||||||
|
from spacy.cli.init_pipeline import _init_labels
|
||||||
from spacy.cli.package import get_third_party_dependencies
|
from spacy.cli.package import get_third_party_dependencies
|
||||||
from spacy.cli.package import _is_permitted_package_name
|
from spacy.cli.package import _is_permitted_package_name
|
||||||
from spacy.cli.project.remote_storage import RemoteStorage
|
from spacy.cli.project.remote_storage import RemoteStorage
|
||||||
|
@ -46,7 +49,6 @@ from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
|
||||||
from spacy.training.converters import iob_to_docs
|
from spacy.training.converters import iob_to_docs
|
||||||
from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config
|
from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config
|
||||||
|
|
||||||
from ..cli.init_pipeline import _init_labels
|
|
||||||
from .util import make_tempdir
|
from .util import make_tempdir
|
||||||
|
|
||||||
|
|
||||||
|
@ -144,6 +146,70 @@ def test_issue11235():
|
||||||
assert cfg["commands"][0]["script"][0] == f"hello {lang_var}"
|
assert cfg["commands"][0]["script"][0] == f"hello {lang_var}"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.issue(12566)
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"factory,output_file",
|
||||||
|
[("deps", "parses.html"), ("ents", "entities.html"), ("spans", "spans.html")],
|
||||||
|
)
|
||||||
|
def test_issue12566(factory: str, output_file: str):
|
||||||
|
"""
|
||||||
|
Test if all displaCy types (ents, dep, spans) produce an HTML file
|
||||||
|
"""
|
||||||
|
with make_tempdir() as tmp_dir:
|
||||||
|
# Create sample spaCy file
|
||||||
|
doc_json = {
|
||||||
|
"ents": [
|
||||||
|
{"end": 54, "label": "nam_adj_country", "start": 44},
|
||||||
|
{"end": 83, "label": "nam_liv_person", "start": 69},
|
||||||
|
{"end": 100, "label": "nam_pro_title_book", "start": 86},
|
||||||
|
],
|
||||||
|
"spans": {
|
||||||
|
"sc": [
|
||||||
|
{"end": 54, "kb_id": "", "label": "nam_adj_country", "start": 44},
|
||||||
|
{"end": 83, "kb_id": "", "label": "nam_liv_person", "start": 69},
|
||||||
|
{
|
||||||
|
"end": 100,
|
||||||
|
"kb_id": "",
|
||||||
|
"label": "nam_pro_title_book",
|
||||||
|
"start": 86,
|
||||||
|
},
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"text": "Niedawno czytał em nową książkę znakomitego szkockiego medioznawcy , "
|
||||||
|
"Briana McNaira - Cultural Chaos .",
|
||||||
|
"tokens": [
|
||||||
|
# fmt: off
|
||||||
|
{"id": 0, "start": 0, "end": 8, "tag": "ADV", "pos": "ADV", "morph": "Degree=Pos", "lemma": "niedawno", "dep": "advmod", "head": 1, },
|
||||||
|
{"id": 1, "start": 9, "end": 15, "tag": "PRAET", "pos": "VERB", "morph": "Animacy=Hum|Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act", "lemma": "czytać", "dep": "ROOT", "head": 1, },
|
||||||
|
{"id": 2, "start": 16, "end": 18, "tag": "AGLT", "pos": "NOUN", "morph": "Animacy=Inan|Case=Ins|Gender=Masc|Number=Sing", "lemma": "em", "dep": "iobj", "head": 1, },
|
||||||
|
{"id": 3, "start": 19, "end": 23, "tag": "ADJ", "pos": "ADJ", "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing", "lemma": "nowy", "dep": "amod", "head": 4, },
|
||||||
|
{"id": 4, "start": 24, "end": 31, "tag": "SUBST", "pos": "NOUN", "morph": "Case=Acc|Gender=Fem|Number=Sing", "lemma": "książka", "dep": "obj", "head": 1, },
|
||||||
|
{"id": 5, "start": 32, "end": 43, "tag": "ADJ", "pos": "ADJ", "morph": "Animacy=Nhum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", "lemma": "znakomit", "dep": "acl", "head": 4, },
|
||||||
|
{"id": 6, "start": 44, "end": 54, "tag": "ADJ", "pos": "ADJ", "morph": "Animacy=Hum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", "lemma": "szkockiy", "dep": "amod", "head": 7, },
|
||||||
|
{"id": 7, "start": 55, "end": 66, "tag": "SUBST", "pos": "NOUN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "medioznawca", "dep": "iobj", "head": 5, },
|
||||||
|
{"id": 8, "start": 67, "end": 68, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Comm", "lemma": ",", "dep": "punct", "head": 9, },
|
||||||
|
{"id": 9, "start": 69, "end": 75, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "Brian", "dep": "nmod", "head": 4, },
|
||||||
|
{"id": 10, "start": 76, "end": 83, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "McNair", "dep": "flat", "head": 9, },
|
||||||
|
{"id": 11, "start": 84, "end": 85, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Dash", "lemma": "-", "dep": "punct", "head": 12, },
|
||||||
|
{"id": 12, "start": 86, "end": 94, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", "lemma": "Cultural", "dep": "conj", "head": 4, },
|
||||||
|
{"id": 13, "start": 95, "end": 100, "tag": "SUBST", "pos": "NOUN", "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", "lemma": "Chaos", "dep": "flat", "head": 12, },
|
||||||
|
{"id": 14, "start": 101, "end": 102, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Peri", "lemma": ".", "dep": "punct", "head": 1, },
|
||||||
|
# fmt: on
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
# Create a .spacy file
|
||||||
|
nlp = spacy.blank("pl")
|
||||||
|
doc = Doc(nlp.vocab).from_json(doc_json)
|
||||||
|
|
||||||
|
# Run the evaluate command and check if the html files exist
|
||||||
|
render_parses(
|
||||||
|
docs=[doc], output_path=tmp_dir, model_name="", limit=1, **{factory: True}
|
||||||
|
)
|
||||||
|
|
||||||
|
assert (tmp_dir / output_file).is_file()
|
||||||
|
|
||||||
|
|
||||||
def test_cli_info():
|
def test_cli_info():
|
||||||
nlp = Dutch()
|
nlp = Dutch()
|
||||||
nlp.add_pipe("textcat")
|
nlp.add_pipe("textcat")
|
||||||
|
@ -552,7 +618,14 @@ def test_parse_cli_overrides():
|
||||||
|
|
||||||
@pytest.mark.parametrize("lang", ["en", "nl"])
|
@pytest.mark.parametrize("lang", ["en", "nl"])
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]]
|
"pipeline",
|
||||||
|
[
|
||||||
|
["tagger", "parser", "ner"],
|
||||||
|
[],
|
||||||
|
["ner", "textcat", "sentencizer"],
|
||||||
|
["morphologizer", "spancat", "entity_linker"],
|
||||||
|
["spancat_singlelabel", "textcat_multilabel"],
|
||||||
|
],
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize("optimize", ["efficiency", "accuracy"])
|
@pytest.mark.parametrize("optimize", ["efficiency", "accuracy"])
|
||||||
@pytest.mark.parametrize("pretraining", [True, False])
|
@pytest.mark.parametrize("pretraining", [True, False])
|
||||||
|
|
|
@@ -5,10 +5,18 @@ import srsly
 from typer.testing import CliRunner
 from spacy.tokens import DocBin, Doc
-from spacy.cli._util import app
+from spacy.cli._util import app, get_git_version
 from .util import make_tempdir, normalize_whitespace


+def has_git():
+    try:
+        get_git_version()
+        return True
+    except RuntimeError:
+        return False
+
+
 def test_convert_auto():
     with make_tempdir() as d_in, make_tempdir() as d_out:
         for f in ["data1.iob", "data2.iob", "data3.iob"]:

@@ -95,6 +103,8 @@ def test_debug_data_trainable_lemmatizer_cli(en_vocab):

 # project tests

+CFG_FILE = "myconfig.cfg"
+
 SAMPLE_PROJECT = {
     "title": "Sample project",
     "description": "This is a project for testing",

@@ -120,13 +130,8 @@ SAMPLE_PROJECT = {
         {
             "name": "create",
             "help": "make a file",
-            "script": ["touch abc.txt"],
-            "outputs": ["abc.txt"],
-        },
-        {
-            "name": "clean",
-            "help": "remove test file",
-            "script": ["rm abc.txt"],
+            "script": [f"python -m spacy init config {CFG_FILE}"],
+            "outputs": [f"{CFG_FILE}"],
         },
     ],
 }

@@ -167,7 +172,7 @@ def test_project_assets(project_dir):

 def test_project_run(project_dir):
     # make sure dry run works
-    test_file = project_dir / "abc.txt"
+    test_file = project_dir / CFG_FILE
     result = CliRunner().invoke(
         app, ["project", "run", "--dry", "create", str(project_dir)]
     )

@@ -181,6 +186,7 @@ def test_project_run(project_dir):
     assert "okokok" in result.stdout


+@pytest.mark.skipif(not has_git(), reason="git not installed")
 @pytest.mark.parametrize(
     "options",
     [

@@ -214,14 +220,13 @@ def test_project_push_pull(project_dir):
     proj_text = srsly.yaml_dumps(proj)
     (project_dir / "project.yml").write_text(proj_text)

-    test_file = project_dir / "abc.txt"
+    test_file = project_dir / CFG_FILE
     result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
     assert result.exit_code == 0
     assert test_file.is_file()
     result = CliRunner().invoke(app, ["project", "push", remote, str(project_dir)])
     assert result.exit_code == 0
-    result = CliRunner().invoke(app, ["project", "run", "clean", str(project_dir)])
-    assert result.exit_code == 0
+    test_file.unlink()
     assert not test_file.exists()
     result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)])
     assert result.exit_code == 0
@@ -275,6 +275,20 @@ def test_displacy_parse_deps(en_vocab):
         {"start": 2, "end": 3, "label": "det", "dir": "left"},
         {"start": 1, "end": 3, "label": "attr", "dir": "right"},
     ]
+    # Test that displacy.parse_deps converts Span to Doc
+    deps = displacy.parse_deps(doc[:])
+    assert isinstance(deps, dict)
+    assert deps["words"] == [
+        {"lemma": None, "text": words[0], "tag": pos[0]},
+        {"lemma": None, "text": words[1], "tag": pos[1]},
+        {"lemma": None, "text": words[2], "tag": pos[2]},
+        {"lemma": None, "text": words[3], "tag": pos[3]},
+    ]
+    assert deps["arcs"] == [
+        {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
+        {"start": 2, "end": 3, "label": "det", "dir": "left"},
+        {"start": 1, "end": 3, "label": "attr", "dir": "right"},
+    ]


 def test_displacy_invalid_arcs():
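For reference, a minimal standalone sketch of the behaviour the test above exercises: `displacy.parse_deps` also accepts a `Span` and converts it to a `Doc` internally. The words, heads and deps below are illustrative only, not taken from the test fixtures.

```python
import spacy
from spacy import displacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
words = ["This", "is", "a", "sentence"]
# Hand-annotated heads/deps/POS so no trained parser is needed
doc = Doc(
    nlp.vocab,
    words=words,
    heads=[1, 1, 3, 1],
    deps=["nsubj", "ROOT", "det", "attr"],
    pos=["DET", "VERB", "DET", "NOUN"],
)
# Passing a Span (here, the full doc slice) works the same as passing a Doc
parsed = displacy.parse_deps(doc[:])
print(parsed["words"])
print(parsed["arcs"])
```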
@@ -115,6 +115,14 @@ def test_tokenization(sented_doc):
     assert scores["token_r"] == approx(0.33333333)
     assert scores["token_f"] == 0.4

+    # per-component scoring
+    scorer = Scorer()
+    scores = scorer.score([example], per_component=True)
+    assert scores["tokenizer"]["token_acc"] == 0.5
+    assert scores["tokenizer"]["token_p"] == 0.5
+    assert scores["tokenizer"]["token_r"] == approx(0.33333333)
+    assert scores["tokenizer"]["token_f"] == 0.4
+

 def test_sents(sented_doc):
     scorer = Scorer()

@@ -278,6 +286,13 @@ def test_tag_score(tagged_doc):
     assert results["morph_per_feat"]["Poss"]["f"] == 0.0
     assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272)

+    # per-component scoring
+    scorer = Scorer()
+    results = scorer.score([example], per_component=True)
+    assert results["tagger"]["tag_acc"] == 0.9
+    assert results["morphologizer"]["pos_acc"] == 0.9
+    assert results["morphologizer"]["morph_acc"] == approx(0.8)
+

 def test_partial_annotation(en_tokenizer):
     pred_doc = en_tokenizer("a b c d e")

@@ -423,14 +438,14 @@ def test_score_spans():
         return doc.spans[span_key]

     # Predict exactly the same, but overlapping spans will be discarded
-    pred.spans[key] = spans
+    pred.spans[key] = gold.spans[key].copy(doc=pred)
     eg = Example(pred, gold)
     scores = Scorer.score_spans([eg], attr=key, getter=span_getter)
     assert scores[f"{key}_p"] == 1.0
     assert scores[f"{key}_r"] < 1.0

     # Allow overlapping, now both precision and recall should be 100%
-    pred.spans[key] = spans
+    pred.spans[key] = gold.spans[key].copy(doc=pred)
     eg = Example(pred, gold)
     scores = Scorer.score_spans([eg], attr=key, getter=span_getter, allow_overlap=True)
     assert scores[f"{key}_p"] == 1.0
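A hedged, self-contained sketch of the `per_component=True` behaviour these tests rely on (it assumes a spaCy version that includes this keyword, i.e. 3.6+); the words and the printed key are illustrative.

```python
import spacy
from spacy.scorer import Scorer
from spacy.tokens import Doc
from spacy.training import Example

nlp = spacy.blank("en")
words = ["This", "is", "a", "sentence", "."]
sent_starts = [True, False, False, False, False]
gold = Doc(nlp.vocab, words=words, sent_starts=sent_starts)
pred = Doc(nlp.vocab, words=words, sent_starts=sent_starts)
example = Example(pred, gold)

# With per_component=True the flat score names are nested under component names
scores = Scorer().score([example], per_component=True)
print(scores["tokenizer"]["token_acc"])  # 1.0 for identical tokenization
```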
@@ -2,17 +2,19 @@ from pathlib import Path
 import numpy as np
 import pytest
 import srsly
-from spacy.vocab import Vocab
-from thinc.api import Config
+from thinc.api import Config, get_current_ops
+
+from spacy import util
+from spacy.lang.en import English
+from spacy.training.initialize import init_nlp
+from spacy.training.loop import train
+from spacy.training.pretrain import pretrain
+from spacy.tokens import Doc, DocBin
+from spacy.language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH
+from spacy.ml.models.multi_task import create_pretrain_vectors
+from spacy.vectors import Vectors
+from spacy.vocab import Vocab
+
 from ..util import make_tempdir
-from ... import util
-from ...lang.en import English
-from ...training.initialize import init_nlp
-from ...training.loop import train
-from ...training.pretrain import pretrain
-from ...tokens import Doc, DocBin
-from ...language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH

 pretrain_string_listener = """
 [nlp]

@@ -163,7 +165,8 @@ def test_pretraining_default():


 @pytest.mark.parametrize("objective", CHAR_OBJECTIVES)
-def test_pretraining_tok2vec_characters(objective):
+@pytest.mark.parametrize("skip_last", (True, False))
+def test_pretraining_tok2vec_characters(objective, skip_last):
     """Test that pretraining works with the character objective"""
     config = Config().from_str(pretrain_string_listener)
     config["pretraining"]["objective"] = objective

@@ -176,10 +179,14 @@ def test_pretraining_tok2vec_characters(objective):
     filled["paths"]["raw_text"] = file_path
     filled = filled.interpolate()
     assert filled["pretraining"]["component"] == "tok2vec"
-    pretrain(filled, tmp_dir)
+    pretrain(filled, tmp_dir, skip_last=skip_last)
     assert Path(tmp_dir / "model0.bin").exists()
     assert Path(tmp_dir / "model4.bin").exists()
     assert not Path(tmp_dir / "model5.bin").exists()
+    if skip_last:
+        assert not Path(tmp_dir / "model-last.bin").exists()
+    else:
+        assert Path(tmp_dir / "model-last.bin").exists()


 @pytest.mark.parametrize("objective", VECTOR_OBJECTIVES)

@@ -235,6 +242,7 @@ def test_pretraining_tagger_tok2vec(config):
     pretrain(filled, tmp_dir)
     assert Path(tmp_dir / "model0.bin").exists()
     assert Path(tmp_dir / "model4.bin").exists()
+    assert Path(tmp_dir / "model-last.bin").exists()
     assert not Path(tmp_dir / "model5.bin").exists()

@@ -346,3 +354,26 @@ def write_vectors_model(tmp_dir):
     nlp = English(vocab)
     nlp.to_disk(nlp_path)
     return str(nlp_path)
+
+
+def test_pretrain_default_vectors():
+    nlp = English()
+    nlp.add_pipe("tok2vec")
+    nlp.initialize()
+
+    # default vectors are supported
+    nlp.vocab.vectors = Vectors(shape=(10, 10))
+    create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model)
+
+    # floret vectors are supported
+    nlp.vocab.vectors = Vectors(
+        data=get_current_ops().xp.zeros((10, 10)), mode="floret", hash_count=1
+    )
+    create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model)
+
+    # error for no vectors
+    with pytest.raises(ValueError, match="E875"):
+        nlp.vocab.vectors = Vectors()
+        create_pretrain_vectors(1, 1, "cosine")(
+            nlp.vocab, nlp.get_pipe("tok2vec").model
+        )
@@ -833,10 +833,12 @@ cdef class Tokenizer:
             self.token_match = re.compile(data["token_match"]).match
         if "url_match" in data and isinstance(data["url_match"], str):
             self.url_match = re.compile(data["url_match"]).match
-        if "rules" in data and isinstance(data["rules"], dict):
-            self.rules = data["rules"]
         if "faster_heuristics" in data:
             self.faster_heuristics = data["faster_heuristics"]
+        # always load rules last so that all other settings are set before the
+        # internal tokenization for the phrase matcher
+        if "rules" in data and isinstance(data["rules"], dict):
+            self.rules = data["rules"]
         return self
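Not part of the diff, but a small round-trip sketch of why the deserialization order matters: special-case rules are tokenized internally, so they should only be applied once the other tokenizer settings have been restored.

```python
import spacy

nlp = spacy.blank("en")
nlp.tokenizer.add_special_case("dont", [{"ORTH": "do"}, {"ORTH": "nt"}])
data = nlp.tokenizer.to_bytes()

nlp2 = spacy.blank("en")
# from_bytes restores prefixes, suffixes, etc. first and the rules last
nlp2.tokenizer.from_bytes(data)
print([t.text for t in nlp2("dont")])  # ['do', 'nt']
```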
@@ -544,10 +544,6 @@ cdef class Doc:

         DOCS: https://spacy.io/api/doc#char_span
         """
-        if not isinstance(label, int):
-            label = self.vocab.strings.add(label)
-        if not isinstance(kb_id, int):
-            kb_id = self.vocab.strings.add(kb_id)
         alignment_modes = ("strict", "contract", "expand")
         if alignment_mode not in alignment_modes:
             raise ValueError(

@@ -1282,12 +1278,14 @@ cdef class Doc:
         other.user_span_hooks = dict(self.user_span_hooks)
         other.length = self.length
         other.max_length = self.max_length
-        other.spans = self.spans.copy(doc=other)
         buff_size = other.max_length + (PADDING*2)
         assert buff_size > 0
         tokens = <TokenC*>other.mem.alloc(buff_size, sizeof(TokenC))
         memcpy(tokens, self.c - PADDING, buff_size * sizeof(TokenC))
         other.c = &tokens[PADDING]
+        # copy spans after setting tokens so that SpanGroup.copy can verify
+        # that the start/end offsets are valid
+        other.spans = self.spans.copy(doc=other)
         return other

     def to_disk(self, path, *, exclude=tuple()):

@@ -1364,6 +1362,10 @@ cdef class Doc:
         for group in self.spans.values():
             for span in group:
                 strings.add(span.label_)
+                if span.kb_id in span.doc.vocab.strings:
+                    strings.add(span.kb_id_)
+                if span.id in span.doc.vocab.strings:
+                    strings.add(span.id_)
         # Msgpack doesn't distinguish between lists and tuples, which is
         # vexing for user data. As a best guess, we *know* that within
         # keys, we must have tuples. In values we just have to hope
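For illustration (not from the diff itself): span groups survive `Doc.copy`, and with this change they are recreated only after the copied tokens are in place, so their offsets can be validated.

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("Berlin is a city in Germany")
doc.spans["places"] = [doc[0:1], doc[5:6]]

copied = doc.copy()
# The copied doc has its own SpanGroup with equivalent offsets
print([span.text for span in copied.spans["places"]])  # ['Berlin', 'Germany']
```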
@@ -124,6 +124,10 @@ class DocBin:
         for key, group in doc.spans.items():
             for span in group:
                 self.strings.add(span.label_)
+                if span.kb_id in span.doc.vocab.strings:
+                    self.strings.add(span.kb_id_)
+                if span.id in span.doc.vocab.strings:
+                    self.strings.add(span.id_)

     def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
         """Recover Doc objects from the annotations, using the given vocab.
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Iterator, List, Union
+from typing import Any, Dict, Iterator, List, Optional, Union
 from ..vocab import Vocab

 class MorphAnalysis:

@@ -13,7 +13,7 @@ class MorphAnalysis:
     def __hash__(self) -> int: ...
     def __eq__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override]
     def __ne__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override]
-    def get(self, field: Any) -> List[str]: ...
+    def get(self, field: Any, default: Optional[List[str]]) -> List[str]: ...
     def to_json(self) -> str: ...
    def to_dict(self) -> Dict[str, str]: ...
     def __str__(self) -> str: ...

@@ -62,10 +62,14 @@ cdef class MorphAnalysis:
     def __ne__(self, other):
         return self.key != other.key

-    def get(self, field):
+    def get(self, field, default=None):
         """Retrieve feature values by field."""
         cdef attr_t field_id = self.vocab.strings.as_int(field)
         cdef np.ndarray results = get_by_field(self.c, field_id)
+        if len(results) == 0:
+            if default is None:
+                default = []
+            return default
         features = [self.vocab.strings[result] for result in results]
         return [f.split(Morphology.FIELD_SEP)[1] for f in features]
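A minimal sketch of the new `default` argument (it assumes a build that includes this change, i.e. spaCy 3.5.3+): with no morphologizer in the pipeline the analysis is empty, so the default is returned.

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("cats")
print(doc[0].morph.get("Number"))                     # [] (unchanged default)
print(doc[0].morph.get("Number", default=["unset"]))  # ['unset']
```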
@@ -1,10 +1,12 @@
-from typing import Callable, Protocol, Iterator, Optional, Union, Tuple, Any, overload
-from thinc.types import Floats1d, Ints2d, FloatsXd
+from typing import Any, Callable, Iterator, Optional, Protocol, Tuple, Union, overload
+
+from thinc.types import Floats1d, FloatsXd, Ints2d
+
+from ..lexeme import Lexeme
+from ..vocab import Vocab
 from .doc import Doc
 from .token import Token
 from .underscore import Underscore
-from ..lexeme import Lexeme
-from ..vocab import Vocab

 class SpanMethod(Protocol):
     def __call__(self: Span, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc]

@@ -51,7 +53,12 @@ class Span:
         kb_id: Union[str, int] = ...,
         span_id: Union[str, int] = ...,
     ) -> None: ...
-    def __richcmp__(self, other: Span, op: int) -> bool: ...
+    def __lt__(self, other: Any) -> bool: ...
+    def __le__(self, other: Any) -> bool: ...
+    def __eq__(self, other: Any) -> bool: ...
+    def __ne__(self, other: Any) -> bool: ...
+    def __gt__(self, other: Any) -> bool: ...
+    def __ge__(self, other: Any) -> bool: ...
     def __hash__(self) -> int: ...
     def __len__(self) -> int: ...
     def __repr__(self) -> str: ...

@@ -494,10 +494,12 @@ cdef class Span:
                     start = i
                 if start >= self.end:
                     break
-            if start < self.end:
-                spans.append(Span(self.doc, start, self.end))
-            return tuple(spans)
+                elif i == self.doc.length - 1:
+                    yield Span(self.doc, start, self.doc.length)
+
+            # Ensure that trailing parts of the Span instance are included in last element of .sents.
+            if start == self.doc.length - 1:
+                yield Span(self.doc, start, self.doc.length)

     @property
     def ents(self):
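Two behaviours touched above, sketched outside the diff: explicit rich comparisons on `Span`, and the `.sents` fix that keeps the trailing, unterminated part of a span. A `sentencizer` stands in for a trained pipeline here; the sentence text is illustrative.

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc = nlp("One sentence. Another sentence. And a trailing fragment")

# Spans compare by their position in the doc
assert doc[0:2] < doc[3:5]
assert doc[0:2] == doc[0:2]

# The last sentence yielded for a span now also covers its trailing, unterminated text
span = doc[2:9]
print([sent.text for sent in span.sents])
```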
@@ -1,4 +1,5 @@
-from typing import Any, Dict, Iterable, Optional
+from typing import Any, Dict, Iterable, Iterator, Optional
+
 from .doc import Doc
 from .span import Span

@@ -18,7 +19,7 @@ class SpanGroup:
     def doc(self) -> Doc: ...
     @property
     def has_overlap(self) -> bool: ...
-    def __iter__(self): ...
+    def __iter__(self) -> Iterator[Span]: ...
     def __len__(self) -> int: ...
     def append(self, span: Span) -> None: ...
     def extend(self, spans: Iterable[Span]) -> None: ...

@@ -53,6 +53,8 @@ cdef class SpanGroup:
         if len(spans) :
             self.c.reserve(len(spans))
         for span in spans:
+            if doc is not span.doc:
+                raise ValueError(Errors.E855.format(obj="span"))
             self.push_back(span.c)

     def __repr__(self):

@@ -264,11 +266,22 @@ cdef class SpanGroup:
         """
         if doc is None:
             doc = self.doc
+        if doc is self.doc:
+            spans = list(self)
+        else:
+            spans = [doc.char_span(span.start_char, span.end_char, label=span.label_, kb_id=span.kb_id, span_id=span.id) for span in self]
+            for i, span in enumerate(spans):
+                if span is None:
+                    raise ValueError(Errors.E1052.format(i=i))
+                if span.kb_id in self.doc.vocab.strings:
+                    doc.vocab.strings.add(span.kb_id_)
+                if span.id in span.doc.vocab.strings:
+                    doc.vocab.strings.add(span.id_)
         return SpanGroup(
             doc,
             name=self.name,
             attrs=deepcopy(self.attrs),
-            spans=list(self),
+            spans=spans,
         )

     def _concat(
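A sketch of what the updated `SpanGroup.copy` does when a different target doc is passed (assuming a spaCy version where `copy` accepts a `doc` argument): the spans are recreated on the target via character offsets, so the target needs compatible text.

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
words = ["spaCy", "is", "written", "in", "Python"]
doc_a = Doc(nlp.vocab, words=words)
doc_b = Doc(nlp.vocab, words=words)

doc_a.spans["facts"] = [doc_a[0:1], doc_a[2:5]]
# Spans are re-anchored on doc_b using their character offsets
copied = doc_a.spans["facts"].copy(doc=doc_b)
print([span.text for span in copied])  # ['spaCy', 'written in Python']
```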
@@ -222,10 +222,11 @@ def init_vocab(
         logger.info("Added vectors: %s", vectors)
     # warn if source model vectors are not identical
     sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
-    vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
-    for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
-        if vectors_hash != sourced_vectors_hash:
-            warnings.warn(Warnings.W113.format(name=sourced_component))
+    if len(sourced_vectors_hashes) > 0:
+        vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
+        for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
+            if vectors_hash != sourced_vectors_hash:
+                warnings.warn(Warnings.W113.format(name=sourced_component))
     logger.info("Finished initializing nlp object")
@@ -24,6 +24,7 @@ def pretrain(
     epoch_resume: Optional[int] = None,
     use_gpu: int = -1,
     silent: bool = True,
+    skip_last: bool = False,
 ):
     msg = Printer(no_print=silent)
     if config["training"]["seed"] is not None:

@@ -60,10 +61,14 @@ def pretrain(
     row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
     msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

-    def _save_model(epoch, is_temp=False):
+    def _save_model(epoch, is_temp=False, is_last=False):
         is_temp_str = ".temp" if is_temp else ""
         with model.use_params(optimizer.averages):
-            with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
+            if is_last:
+                save_path = output_dir / f"model-last.bin"
+            else:
+                save_path = output_dir / f"model{epoch}{is_temp_str}.bin"
+            with (save_path).open("wb") as file_:
                 file_.write(model.get_ref("tok2vec").to_bytes())
         log = {
             "nr_word": tracker.nr_word,

@@ -76,22 +81,26 @@ def pretrain(

     # TODO: I think we probably want this to look more like the
     # 'create_train_batches' function?
-    for epoch in range(epoch_resume, P["max_epochs"]):
-        for batch_id, batch in enumerate(batcher(corpus(nlp))):
-            docs = ensure_docs(batch)
-            loss = make_update(model, docs, optimizer, objective)
-            progress = tracker.update(epoch, loss, docs)
-            if progress:
-                msg.row(progress, **row_settings)
-            if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
-                _save_model(epoch, is_temp=True)
-
-        if P["n_save_epoch"]:
-            if epoch % P["n_save_epoch"] == 0 or epoch == P["max_epochs"] - 1:
-                _save_model(epoch)
-        else:
-            _save_model(epoch)
-        tracker.epoch_loss = 0.0
+    try:
+        for epoch in range(epoch_resume, P["max_epochs"]):
+            for batch_id, batch in enumerate(batcher(corpus(nlp))):
+                docs = ensure_docs(batch)
+                loss = make_update(model, docs, optimizer, objective)
+                progress = tracker.update(epoch, loss, docs)
+                if progress:
+                    msg.row(progress, **row_settings)
+                if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
+                    _save_model(epoch, is_temp=True)
+
+            if P["n_save_epoch"]:
+                if epoch % P["n_save_epoch"] == 0 or epoch == P["max_epochs"] - 1:
+                    _save_model(epoch)
+            else:
+                _save_model(epoch)
+            tracker.epoch_loss = 0.0
+    finally:
+        if not skip_last:
+            _save_model(P["max_epochs"], is_last=True)


 def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
@@ -5,6 +5,7 @@ from thinc.api import Optimizer, Model

 if TYPE_CHECKING:
     from .training import Example
+    from .language import Language


 @runtime_checkable

@@ -50,7 +51,7 @@ class InitializableComponent(Protocol):
     def initialize(
         self,
         get_examples: Callable[[], Iterable["Example"]],
-        nlp: Iterable["Example"],
+        nlp: "Language",
         **kwargs: Any
     ):
         ...
@@ -1121,17 +1121,18 @@ auto-generated by setting `--pretraining` on
 $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [--gpu-id] [overrides]
 ```

 | Name | Description |
-| ----------------------- | -------------------------------------------------------------------------------- |
+| -------------------------------------------------- | -------------------------------------------------------------------------------- |
 | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
 | `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
 | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
 | `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
 | `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
 | `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
+| `--skip-last`, `-L` <Tag variant="new">3.5.2</Tag> | Skip saving `model-last.bin`. Defaults to `False`. ~~bool (flag)~~ |
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
 | **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |

 ## evaluate {id="evaluate",version="2",tag="command"}
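As a hedged illustration of the new flag (not part of the documented examples), the CLI can be driven from Python with Typer's test runner, the same way the CLI tests in this diff drive `app`. The config and output paths below are placeholders.

```python
from typer.testing import CliRunner

from spacy.cli._util import app

# "pretrain.cfg" and "output_dir" are placeholder paths for this sketch
result = CliRunner().invoke(
    app, ["pretrain", "pretrain.cfg", "output_dir", "--skip-last"]
)
print(result.exit_code)  # non-zero here unless the paths actually exist
```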
@@ -1161,18 +1162,19 @@ skew. To render a sample of dependency parses in a HTML file using the
 $ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit]
 ```

 | Name | Description |
-| ----------------------------------------- | ------------------------------------------------------------------------------ |
+| ---------------------------------------------------- | ------------------------------------------------------------------------------ |
 | `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
 | `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
 | `--output`, `-o` | Output JSON file for metrics. If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ |
 | `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
 | `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
 | `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
 | `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ |
 | `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ |
+| `--per-component`, `-P` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~ |
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | **CREATES** | Training results and optional metrics and visualizations. |

 ### speed {id="benchmark-speed", version="3.5", tag="command"}
@@ -1218,7 +1220,7 @@ $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key]
 | ------------------------- | ------------------------------------------------------------------------------ |
 | `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ |
 | `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ |
-| `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ |
+| `output-file` | Output `DocBin` path. ~~str (positional)~~ |
 | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
 | `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ |
 | `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ |
@@ -1253,19 +1255,19 @@ be provided.
 > $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f
 > ```

 | Name | Description |
-| ----------------------- | ------------------------------------------------------------------------------ |
+| ------------------------ | ------------------------------------------------------------------------------ |
 | `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
 | `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ |
 | `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ |
 | `threshold_key` | Key of threshold attribute in component's configuration. ~~str (positional)~~ |
 | `scores_key` | Name of score to metric to optimize. ~~str (positional)~~ |
 | `--n_trials`, `-n` | Number of trials to determine optimal thresholds. ~~int (option)~~ |
 | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
 | `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
 | `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
-| `--silent`, `-V`, `-VV` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
+| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~ |
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |

 ## assemble {id="assemble",tag="command"}
@@ -1638,7 +1640,7 @@ with [`spacy package`](/api/cli#package) and `--build wheel`. For more details,
 see the spaCy project [integration](/usage/projects#huggingface_hub).

 ```bash
-$ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--local-repo] [--verbose]
+$ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--verbose]
 ```

 > #### Example

@@ -1647,11 +1649,10 @@ $ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--local-repo]
 > $ python -m spacy huggingface-hub push en_ner_fashion-0.0.0-py3-none-any.whl
 > ```

 | Name | Description |
-| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
+| ----------------- | ------------------------------------------------------------------------------------------------------------------- |
 | `whl_path` | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path(positional)~~ |
 | `--org`, `-o` | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~ |
 | `--msg`, `-m` | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~ |
-| `--local-repo`, `-l` | Local path to the model repository (will be created if it doesn't exist). Defaults to `hub` in the current working directory. ~~Path (option)~~ |
 | `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~ |
 | **UPLOADS** | The pipeline to the hub. |
@@ -64,7 +64,7 @@ details on the architectures and their arguments and hyperparameters.
 > config={
 >    "model": DEFAULT_COREF_MODEL,
 >    "span_cluster_prefix": DEFAULT_CLUSTER_PREFIX,
-> },
+> }
 > nlp.add_pipe("experimental_coref", config=config)
 > ```
@@ -68,28 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
 come directly from
 [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):

 | Symbol | Description |
-| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
+| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
 | `A < B` | `A` is the immediate dependent of `B`. |
 | `A > B` | `A` is the immediate head of `B`. |
 | `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
 | `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
 | `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
-| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
+| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(Semgrex counterpart: `..`)_. |
-| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
+| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(Semgrex counterpart: `-`)_. |
-| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
+| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(Semgrex counterpart: `--`)_. |
 | `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
 | `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
 | `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
 | `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
 | `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
 | `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
-| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
+| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i`. |
-| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
+| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i`. |
 | `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
 | `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
-| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
+| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i`. |
-| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
+| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i`. |

 ## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"}
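A short sketch of one of the operators above in a `DependencyMatcher` pattern. It assumes the `en_core_web_sm` pipeline is installed; the pattern names and example sentence are illustrative only.

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")  # assumption: this trained pipeline is installed
matcher = DependencyMatcher(nlp.vocab)
pattern = [
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
    {
        "LEFT_ID": "verb",
        "REL_OP": ">++",  # B is a right child of A (see the table above)
        "RIGHT_ID": "object",
        "RIGHT_ATTRS": {"DEP": "dobj"},
    },
]
matcher.add("VERB_OBJECT", [pattern])

doc = nlp("She kicked the ball")
for match_id, token_ids in matcher(doc):
    print([doc[i].text for i in token_ids])  # e.g. ['kicked', 'ball']
```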
@@ -64,10 +64,10 @@ architectures and their arguments and hyperparameters.
 | `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
 | `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
 | `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
-| `generate_empty_kb` <Tag variant="new">3.6</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
+| `generate_empty_kb` <Tag variant="new">3.5.1</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
 | `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
 | `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
 | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ |
 | `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |

 ```python
@@ -277,7 +277,7 @@ Restore the state of the knowledge base from a given directory. Note that the
 > ```python
 > from spacy.vocab import Vocab
 > vocab = Vocab().from_disk("/path/to/vocab")
-> kb = FullyImplementedKB(vocab=vocab, entity_vector_length=64)
+> kb = InMemoryLookupKB(vocab=vocab, entity_vector_length=64)
 > kb.from_disk("/path/to/kb")
 > ```
@@ -401,15 +401,16 @@ objects instead of tuples of `Doc` and `GoldParse` objects.
 > print(scores)
 > ```

 | Name | Description |
-| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
 | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
 | _keyword-only_ | |
 | `batch_size` | The batch size to use. ~~Optional[int]~~ |
 | `scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. ~~Optional[Scorer]~~ |
 | `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
 | `scorer_cfg` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~ |
+| `per_component` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool~~ |
 | **RETURNS** | A dictionary of evaluation scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |

 ## Language.use_params {id="use_params",tag="contextmanager, method"}
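A minimal sketch of the new `per_component` flag on `Language.evaluate` (assumes spaCy 3.6+); the tiny example only carries sentence boundaries, which is enough to show the nested score layout.

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc = nlp.make_doc("This is a sentence. This is another.")
example = Example.from_dict(
    doc, {"sent_starts": [1, 0, 0, 0, 0, 1, 0, 0, 0]}
)

scores = nlp.evaluate([example], per_component=True)
print(scores.keys())  # scores are keyed by component name, e.g. "sentencizer"
```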
@@ -45,9 +45,10 @@ architectures and their arguments and hyperparameters.

| Setting | Description |
| ----------------------------------------------- | ----------- |
| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ |
| `extend` <Tag variant="new">3.2</Tag> | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
| `label_smoothing` <Tag variant="new">3.6</Tag> | [Label smoothing](https://arxiv.org/abs/1906.02629) factor. Defaults to `0.0`. ~~float~~ |
| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ |
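As a minimal sketch, the settings above can be overridden when adding the component; the values chosen here are illustrative, not recommendations:

```python
import spacy

nlp = spacy.blank("en")
morphologizer = nlp.add_pipe(
    "morphologizer",
    # illustrative overrides of the documented settings
    config={"overwrite": True, "extend": False, "label_smoothing": 0.05},
)
```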

@@ -213,10 +213,11 @@ Retrieve values for a feature by field.

> assert morph.get("Feat1") == ["Val1", "Val2"]
> ```

| Name | Description |
| ---------------------------------------- | ----------- |
| `field` | The field to retrieve. ~~str~~ |
| `default` <Tag variant="new">3.5.3</Tag> | The value to return if the field is not present. If unset or `None`, the default return value is `[]`. ~~Optional[List[str]]~~ |
| **RETURNS** | A list of the individual features. ~~List[str]~~ |
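A small sketch of the `default` argument, assuming a blank pipeline where the token carries no morphological features; the feature name and fallback value are made up for illustration:

```python
import spacy

nlp = spacy.blank("en")
morph = nlp("test")[0].morph              # empty MorphAnalysis in a blank pipeline
assert morph.get("Case") == []            # missing field falls back to []
assert morph.get("Case", default=["Unk"]) == ["Unk"]   # custom fallback (v3.5.3+)
```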

### MorphAnalysis.to_dict {id="morphanalysis-to_dict",tag="method"}

@@ -33,7 +33,7 @@ Create a new `Scorer`.

| `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `mul`. ~~str~~ |
| `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[string]~~ |
| _keyword-only_ | |
| `**kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ |

@@ -67,10 +67,12 @@ core pipeline components, the individual score names start with the `Token` or

> scores = scorer.score(examples)
> ```

| Name | Description |
| -------------------------------------------- | ----------- |
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `per_component` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool~~ |
| **RETURNS** | A dictionary of scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
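A minimal sketch of the grouped output added in v3.6; the pipeline and examples below are placeholders:

```python
from spacy.lang.en import English
from spacy.scorer import Scorer
from spacy.training import Example

nlp = English()
examples = [Example.from_dict(nlp.make_doc("A short text"), {"words": ["A", "short", "text"]})]
scorer = Scorer(nlp)

flat = scorer.score(examples)
per_component = scorer.score(examples, per_component=True)  # keyed by component name
```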

## Scorer.score_tokenization {id="score_tokenization",tag="staticmethod",version="3"}

@@ -13,8 +13,16 @@ A span categorizer consists of two parts: a [suggester function](#suggesters)

that proposes candidate spans, which may or may not overlap, and a labeler model
that predicts zero or more labels for each candidate.

This component comes in two forms: `spancat` and `spancat_singlelabel` (added in
spaCy v3.5.1). When you need to perform multi-label classification on your
spans, use `spancat`. The `spancat` component uses a `Logistic` layer where the
output class probabilities are independent for each class. However, if you need
to predict at most one true class for a span, then use `spancat_singlelabel`. It
uses a `Softmax` layer and treats the task as a multi-class problem.

Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc
under `doc.spans[spans_key]`, where `spans_key` is a component config setting.
Individual span scores are stored in `doc.spans[spans_key].attrs["scores"]`.

## Assigned Attributes {id="assigned-attributes"}

@@ -22,7 +30,9 @@ Predictions will be saved to `Doc.spans[spans_key]` as a

[`SpanGroup`](/api/spangroup). The scores for the spans in the `SpanGroup` will
be saved in `SpanGroup.attrs["scores"]`.

`spans_key` defaults to `"sc"`, but can be passed as a parameter. The `spancat`
component will overwrite any existing spans under the spans key
`doc.spans[spans_key]`.
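A short sketch of reading these attributes back, assuming `nlp` already contains a trained `spancat` component that uses the default key `"sc"`:

```python
doc = nlp("Grilled cheese on toast with a side of tomato soup")
spans = doc.spans["sc"]              # SpanGroup with the predicted spans
scores = spans.attrs["scores"]       # one score per span, in the same order
for span, score in zip(spans, scores):
    print(span.text, span.label_, score)
```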

| Location                               | Value                                                      |
| -------------------------------------- | ---------------------------------------------------------- |
@@ -38,7 +48,7 @@ how the component should be configured. You can override its settings via the

[model architectures](/api/architectures) documentation for details on the
architectures and their arguments and hyperparameters.

> #### Example (spancat)
>
> ```python
> from spacy.pipeline.spancat import DEFAULT_SPANCAT_MODEL
@@ -52,15 +62,34 @@ architectures and their arguments and hyperparameters.

> nlp.add_pipe("spancat", config=config)
> ```

> #### Example (spancat_singlelabel)
>
> ```python
> from spacy.pipeline.spancat import DEFAULT_SPANCAT_SINGLELABEL_MODEL
> config = {
>     "threshold": 0.5,
>     "spans_key": "labeled_spans",
>     "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
>     "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
>     # Additional spancat_singlelabel parameters
>     "negative_weight": 0.8,
>     "allow_overlap": True,
> }
> nlp.add_pipe("spancat_singlelabel", config=config)
> ```

| Setting | Description |
| --------------------------------------------------- | ----------- |
| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
| `model` | A model instance that is given a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Meant to be used in combination with the multi-class `spancat` component with a `Logistic` scoring layer. Defaults to `0.5`. ~~float~~ |
| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. Meant to be used together with the `spancat` component and defaults to `0` with `spancat_singlelabel`. ~~Optional[int]~~ |
| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
| `add_negative_label` <Tag variant="new">3.5.1</Tag> | Whether to learn to predict a special negative label for each unannotated `Span`. This should be `True` when using a `Softmax` classifier layer, and so it is `True` by default for `spancat_singlelabel`. Spans with negative labels and their scores are not stored as annotations. ~~bool~~ |
| `negative_weight` <Tag variant="new">3.5.1</Tag> | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~ |
| `allow_overlap` <Tag variant="new">3.5.1</Tag> | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~ |
| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"indices"` and `"scores"`. ~~Union[bool, list[str]]~~ |

```python
%%GITHUB_SPACY/spacy/pipeline/spancat.py
```
|
||||||
>
|
>
|
||||||
> ```python
|
> ```python
|
||||||
> # Construction via add_pipe with default model
|
> # Construction via add_pipe with default model
|
||||||
|
> # Replace 'spancat' with 'spancat_singlelabel' for exclusive classes
|
||||||
> spancat = nlp.add_pipe("spancat")
|
> spancat = nlp.add_pipe("spancat")
|
||||||
>
|
>
|
||||||
> # Construction via add_pipe with custom model
|
> # Construction via add_pipe with custom model
|
||||||
> config = {"model": {"@architectures": "my_spancat"}}
|
> config = {"model": {"@architectures": "my_spancat"}}
|
||||||
> parser = nlp.add_pipe("spancat", config=config)
|
> spancat = nlp.add_pipe("spancat", config=config)
|
||||||
>
|
>
|
||||||
> # Construction from class
|
> # Construction from class
|
||||||
> from spacy.pipeline import SpanCategorizer
|
> from spacy.pipeline import SpanCategorizer
|
||||||
|
@@ -87,16 +117,19 @@ Create a new pipeline instance. In your application, you would normally use a

shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#create_pipe).

| Name | Description |
| --------------------------------------------------- | ----------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | A model instance that is given a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ |
| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ |
| `allow_overlap` <Tag variant="new">3.5.1</Tag> | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~ |
| `add_negative_label` <Tag variant="new">3.5.1</Tag> | Whether to learn to predict a special negative label for each unannotated `Span`. This should be `True` when using a `Softmax` classifier layer, and so it is `True` by default for `spancat_singlelabel`. Spans with negative labels and their scores are not stored as annotations. ~~bool~~ |
| `negative_weight` <Tag variant="new">3.5.1</Tag> | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~ |

## SpanCategorizer.\_\_call\_\_ {id="call",tag="method"}

@@ -492,3 +525,22 @@ has two columns, indicating the start and end position.

| `min_size`  | The minimal phrase lengths to suggest (inclusive). ~~[int]~~                 |
| `max_size`  | The maximal phrase lengths to suggest (exclusive). ~~[int]~~                 |
| **CREATES** | The suggester function. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |

### spacy.preset_spans_suggester.v1 {id="preset_spans_suggester"}

> #### Example Config
>
> ```ini
> [components.spancat.suggester]
> @misc = "spacy.preset_spans_suggester.v1"
> spans_key = "my_spans"
> ```

Suggest all spans that are already stored in `doc.spans[spans_key]`. This is
useful when an upstream component such as a [`SpanRuler`](/api/spanruler) or
[`SpanFinder`](/api/spanfinder) is used to set the spans on the Doc.

| Name        | Description                                                                    |
| ----------- | ------------------------------------------------------------------------------ |
| `spans_key` | Key of [`Doc.spans`](/api/doc/#spans) that provides spans to suggest. ~~str~~  |
| **CREATES** | The suggester function. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~   |
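A hypothetical sketch of the typical setup: an upstream `span_ruler` writes spans to a shared key and the categorizer labels exactly those spans via the preset suggester. The component order, key name and pattern below are illustrative:

```python
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("span_ruler", config={"spans_key": "my_spans"})
ruler.add_patterns([{"label": "FOOD", "pattern": "tomato soup"}])

nlp.add_pipe(
    "spancat",
    config={
        "spans_key": "my_spans",
        "suggester": {"@misc": "spacy.preset_spans_suggester.v1", "spans_key": "my_spans"},
    },
)
```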
372
website/docs/api/spanfinder.mdx
Normal file
@@ -0,0 +1,372 @@

---
title: SpanFinder
tag: class,experimental
source: spacy/pipeline/span_finder.py
version: 3.6
teaser:
  'Pipeline component for identifying potentially overlapping spans of text'
api_base_class: /api/pipe
api_string_name: span_finder
api_trainable: true
---

The span finder identifies potentially overlapping, unlabeled spans. It
identifies tokens that start or end spans and annotates unlabeled spans between
starts and ends, with optional filters for min and max span length. It is
intended for use in combination with a component like
[`SpanCategorizer`](/api/spancategorizer) that may further filter or label the
spans. Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the
doc under `doc.spans[spans_key]`, where `spans_key` is a component config
setting.

## Assigned Attributes {id="assigned-attributes"}

Predictions will be saved to `Doc.spans[spans_key]` as a
[`SpanGroup`](/api/spangroup).

`spans_key` defaults to `"sc"`, but can be passed as a parameter. The
`span_finder` component will overwrite any existing spans under the spans key
`doc.spans[spans_key]`.

| Location               | Value                              |
| ---------------------- | ---------------------------------- |
| `Doc.spans[spans_key]` | The unlabeled spans. ~~SpanGroup~~ |
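A small sketch of consuming the assigned spans, assuming `nlp` already contains a trained `span_finder` component configured with `spans_key = "my_spans"`:

```python
doc = nlp("The span finder proposes unlabeled candidate spans.")
for span in doc.spans["my_spans"]:
    print(span.start, span.end, span.text)   # candidates carry no labels yet
```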
## Config and implementation {id="config"}

The default config is defined by the pipeline component factory and describes
how the component should be configured. You can override its settings via the
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
[`config.cfg` for training](/usage/training#config). See the
[model architectures](/api/architectures) documentation for details on the
architectures and their arguments and hyperparameters.

> #### Example
>
> ```python
> from spacy.pipeline.span_finder import DEFAULT_SPAN_FINDER_MODEL
> config = {
>     "threshold": 0.5,
>     "spans_key": "my_spans",
>     "max_length": None,
>     "min_length": None,
>     "model": DEFAULT_SPAN_FINDER_MODEL,
> }
> nlp.add_pipe("span_finder", config=config)
> ```

| Setting      | Description |
| ------------ | ----------- |
| `model`      | A model instance that is given a list of documents and predicts a probability for each token. ~~Model[List[Doc], Floats2d]~~ |
| `spans_key`  | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
| `threshold`  | Minimum probability to consider a prediction positive. Defaults to `0.5`. ~~float~~ |
| `max_length` | Maximum length of the produced spans, defaults to `None` meaning unlimited length. ~~Optional[int]~~ |
| `min_length` | Minimum length of the produced spans, defaults to `None` meaning shortest span length is 1. ~~Optional[int]~~ |
| `scorer`     | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/span_finder.py
```

## SpanFinder.\_\_init\_\_ {id="init",tag="method"}

> #### Example
>
> ```python
> # Construction via add_pipe with default model
> span_finder = nlp.add_pipe("span_finder")
>
> # Construction via add_pipe with custom model
> config = {"model": {"@architectures": "my_span_finder"}}
> span_finder = nlp.add_pipe("span_finder", config=config)
>
> # Construction from class
> from spacy.pipeline import SpanFinder
> span_finder = SpanFinder(nlp.vocab, model)
> ```

Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#create_pipe).

| Name           | Description |
| -------------- | ----------- |
| `vocab`        | The shared vocabulary. ~~Vocab~~ |
| `model`        | A model instance that is given a list of documents and predicts a probability for each token. ~~Model[List[Doc], Floats2d]~~ |
| `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| `spans_key`    | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
| `threshold`    | Minimum probability to consider a prediction positive. Defaults to `0.5`. ~~float~~ |
| `max_length`   | Maximum length of the produced spans, defaults to `None` meaning unlimited length. ~~Optional[int]~~ |
| `min_length`   | Minimum length of the produced spans, defaults to `None` meaning shortest span length is 1. ~~Optional[int]~~ |
| `scorer`       | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
## SpanFinder.\_\_call\_\_ {id="call",tag="method"}

Apply the pipe to one document. The document is modified in place, and returned.
This usually happens under the hood when the `nlp` object is called on a text
and all pipeline components are applied to the `Doc` in order. Both
[`__call__`](/api/spanfinder#call) and [`pipe`](/api/spanfinder#pipe) delegate
to the [`predict`](/api/spanfinder#predict) and
[`set_annotations`](/api/spanfinder#set_annotations) methods.

> #### Example
>
> ```python
> doc = nlp("This is a sentence.")
> span_finder = nlp.add_pipe("span_finder")
> # This usually happens under the hood
> processed = span_finder(doc)
> ```

| Name        | Description                      |
| ----------- | -------------------------------- |
| `doc`       | The document to process. ~~Doc~~ |
| **RETURNS** | The processed document. ~~Doc~~  |

## SpanFinder.pipe {id="pipe",tag="method"}

Apply the pipe to a stream of documents. This usually happens under the hood
when the `nlp` object is called on a text and all pipeline components are
applied to the `Doc` in order. Both [`__call__`](/api/spanfinder#call) and
[`pipe`](/api/spanfinder#pipe) delegate to the
[`predict`](/api/spanfinder#predict) and
[`set_annotations`](/api/spanfinder#set_annotations) methods.

> #### Example
>
> ```python
> span_finder = nlp.add_pipe("span_finder")
> for doc in span_finder.pipe(docs, batch_size=50):
>     pass
> ```

| Name           | Description                                                   |
| -------------- | ------------------------------------------------------------- |
| `stream`       | A stream of documents. ~~Iterable[Doc]~~                      |
| _keyword-only_ |                                                               |
| `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS**     | The processed documents in order. ~~Doc~~                     |
## SpanFinder.initialize {id="initialize",tag="method"}

Initialize the component for training. `get_examples` should be a function that
returns an iterable of [`Example`](/api/example) objects. **At least one example
should be supplied.** The data examples are used to **initialize the model** of
the component and can either be the full training data or a representative
sample. Initialization includes validating the network and
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation). This
method is typically called by [`Language.initialize`](/api/language#initialize)
and lets you customize arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config.

> #### Example
>
> ```python
> span_finder = nlp.add_pipe("span_finder")
> span_finder.initialize(lambda: examples, nlp=nlp)
> ```

| Name           | Description |
| -------------- | ----------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | |
| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
## SpanFinder.predict {id="predict",tag="method"}

Apply the component's model to a batch of [`Doc`](/api/doc) objects without
modifying them.

> #### Example
>
> ```python
> span_finder = nlp.add_pipe("span_finder")
> scores = span_finder.predict([doc1, doc2])
> ```

| Name        | Description                                 |
| ----------- | ------------------------------------------- |
| `docs`      | The documents to predict. ~~Iterable[Doc]~~ |
| **RETURNS** | The model's prediction for each document.   |

## SpanFinder.set_annotations {id="set_annotations",tag="method"}

Modify a batch of [`Doc`](/api/doc) objects using pre-computed scores.

> #### Example
>
> ```python
> span_finder = nlp.add_pipe("span_finder")
> scores = span_finder.predict(docs)
> span_finder.set_annotations(docs, scores)
> ```

| Name     | Description                                           |
| -------- | ----------------------------------------------------- |
| `docs`   | The documents to modify. ~~Iterable[Doc]~~            |
| `scores` | The scores to set, produced by `SpanFinder.predict`.  |
## SpanFinder.update {id="update",tag="method"}

Learn from a batch of [`Example`](/api/example) objects containing the
predictions and gold-standard annotations, and update the component's model.
Delegates to [`predict`](/api/spanfinder#predict) and
[`get_loss`](/api/spanfinder#get_loss).

> #### Example
>
> ```python
> span_finder = nlp.add_pipe("span_finder")
> optimizer = nlp.initialize()
> losses = span_finder.update(examples, sgd=optimizer)
> ```

| Name           | Description |
| -------------- | ----------- |
| `examples`     | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop`         | The dropout rate. ~~float~~ |
| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses`       | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~ |

## SpanFinder.get_loss {id="get_loss",tag="method"}

Find the loss and gradient of loss for the batch of documents and their
predicted scores.

> #### Example
>
> ```python
> span_finder = nlp.add_pipe("span_finder")
> scores = span_finder.predict([eg.predicted for eg in examples])
> loss, d_loss = span_finder.get_loss(examples, scores)
> ```

| Name           | Description                                                                     |
| -------------- | ------------------------------------------------------------------------------- |
| `examples`     | The batch of examples. ~~Iterable[Example]~~                                    |
| `spans_scores` | Scores representing the model's predictions. ~~Tuple[Ragged, Floats2d]~~        |
| **RETURNS**    | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, Floats2d]~~  |
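A rough sketch of a minimal update loop built from the two methods above; `nlp` and `train_examples` (reference docs with gold spans under the component's `spans_key`) are assumed to exist and are not part of the documented API:

```python
import random

# assumes an existing `nlp` pipeline and a list of Example objects `train_examples`
span_finder = nlp.add_pipe("span_finder")
optimizer = nlp.initialize(lambda: train_examples)
for epoch in range(10):
    random.shuffle(train_examples)
    losses = {}
    span_finder.update(train_examples, sgd=optimizer, losses=losses)
    print(epoch, losses.get("span_finder"))
```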
## SpanFinder.create_optimizer {id="create_optimizer",tag="method"}

Create an optimizer for the pipeline component.

> #### Example
>
> ```python
> span_finder = nlp.add_pipe("span_finder")
> optimizer = span_finder.create_optimizer()
> ```

| Name        | Description                  |
| ----------- | ---------------------------- |
| **RETURNS** | The optimizer. ~~Optimizer~~ |

## SpanFinder.use_params {id="use_params",tag="method, contextmanager"}

Modify the pipe's model to use the given parameter values.

> #### Example
>
> ```python
> span_finder = nlp.add_pipe("span_finder")
> with span_finder.use_params(optimizer.averages):
>     span_finder.to_disk("/best_model")
> ```

| Name     | Description                                         |
| -------- | --------------------------------------------------- |
| `params` | The parameter values to use in the model. ~~dict~~  |

## SpanFinder.to_disk {id="to_disk",tag="method"}

Serialize the pipe to disk.

> #### Example
>
> ```python
> span_finder = nlp.add_pipe("span_finder")
> span_finder.to_disk("/path/to/span_finder")
> ```

| Name           | Description |
| -------------- | ----------- |
| `path`         | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | |
| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## SpanFinder.from_disk {id="from_disk",tag="method"}

Load the pipe from disk. Modifies the object in place and returns it.

> #### Example
>
> ```python
> span_finder = nlp.add_pipe("span_finder")
> span_finder.from_disk("/path/to/span_finder")
> ```

| Name           | Description |
| -------------- | ----------- |
| `path`         | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | |
| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS**    | The modified `SpanFinder` object. ~~SpanFinder~~ |

## SpanFinder.to_bytes {id="to_bytes",tag="method"}

> #### Example
>
> ```python
> span_finder = nlp.add_pipe("span_finder")
> span_finder_bytes = span_finder.to_bytes()
> ```

Serialize the pipe to a bytestring.

| Name           | Description |
| -------------- | ----------- |
| _keyword-only_ | |
| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS**    | The serialized form of the `SpanFinder` object. ~~bytes~~ |

## SpanFinder.from_bytes {id="from_bytes",tag="method"}

Load the pipe from a bytestring. Modifies the object in place and returns it.

> #### Example
>
> ```python
> span_finder_bytes = span_finder.to_bytes()
> span_finder = nlp.add_pipe("span_finder")
> span_finder.from_bytes(span_finder_bytes)
> ```

| Name           | Description |
| -------------- | ----------- |
| `bytes_data`   | The data to load from. ~~bytes~~ |
| _keyword-only_ | |
| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS**    | The `SpanFinder` object. ~~SpanFinder~~ |

## Serialization fields {id="serialization-fields"}

During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from
serialization by passing in the string names via the `exclude` argument.

> #### Example
>
> ```python
> data = span_finder.to_disk("/path", exclude=["vocab"])
> ```

| Name    | Description                                                     |
| ------- | --------------------------------------------------------------- |
| `vocab` | The shared [`Vocab`](/api/vocab).                               |
| `cfg`   | The config file. You usually don't want to exclude this.        |
| `model` | The binary model data. You usually don't want to exclude this.  |
@@ -46,6 +46,7 @@ architectures and their arguments and hyperparameters.

| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ |
| `neg_prefix` <Tag variant="new">3.2.1</Tag> | The prefix used to specify incorrect tags while training. The tagger will learn not to predict exactly this tag. Defaults to `!`. ~~str~~ |
| `label_smoothing` <Tag variant="new">3.6</Tag> | [Label smoothing](https://arxiv.org/abs/1906.02629) factor. Defaults to `0.0`. ~~float~~ |
| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ |
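As a minimal sketch, the new `label_smoothing` setting can be passed to the tagger like any other override; the value chosen here is illustrative:

```python
import spacy

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger", config={"label_smoothing": 0.05})
```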
@@ -25,7 +25,10 @@ and call the package's own `load()` method. If a pipeline is loaded from a path,

spaCy will assume it's a data directory, load its
[`config.cfg`](/api/data-formats#config) and use the language and pipeline
information to construct the `Language` class. The data will be loaded in via
[`Language.from_disk`](/api/language#from_disk). Loading a pipeline from a
package will also import any custom code, if present, whereas loading from a
directory does not. For these cases, you need to manually import your custom
code.
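A hypothetical sketch of the manual import when loading from a directory; the module and path names are placeholders for your own project:

```python
import spacy

# Importing the module runs the registration decorators it contains,
# making the custom factories available before the pipeline is loaded.
import my_project.custom_components  # noqa: F401

nlp = spacy.load("/path/to/pipeline")
```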

<Infobox variant="warning" title="Changed in v3.0">

@@ -291,7 +294,7 @@ the `manual=True` argument in `displacy.render`.

| Name        | Description                                                          |
| ----------- | -------------------------------------------------------------------- |
| `orig_doc`  | Doc or span to parse dependencies. ~~Union[Doc, Span]~~              |
| `options`   | Dependency parse specific visualisation options. ~~Dict[str, Any]~~  |
| **RETURNS** | Generated dependency parse keyed by words and arcs. ~~dict~~         |
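A small sketch of passing a `Span` instead of a full `Doc`; it assumes a pipeline with a dependency parser, e.g. `en_core_web_sm`, is installed:

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence. Here is another one.")
sent = list(doc.sents)[0]           # a Span covering the first sentence
parse = displacy.parse_deps(sent)   # manual render data for that Span only
html = displacy.render(parse, style="dep", manual=True)
```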

@@ -466,7 +469,7 @@ factories.

| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
| `readers` | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus). |
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
| `scorers` | Registry for functions that create scoring methods for use with the [`Scorer`](/api/scorer). Scoring methods are called with `Iterable[Example]` and arbitrary `**kwargs` and return scores as `Dict[str, Any]`. |
| `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. |

### spacy-transformers registry {id="registry-transformers"}
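A minimal sketch of registering a scorer factory that matches the calling convention described above; the registry name and metric are made up for illustration:

```python
from typing import Any, Dict, Iterable

import spacy
from spacy.training import Example


@spacy.registry.scorers("my_custom_scorer.v1")
def make_my_scorer():
    def score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
        # compute and return arbitrary metrics
        return {"my_metric": 1.0}
    return score
```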

@@ -577,7 +580,7 @@ start decreasing across epochs.

> ```ini
> [training.logger]
> @loggers = "spacy.ConsoleLogger.v3"
> progress_bar = "eval"
> console_output = true
> output_file = "training_log.jsonl"
> ```