mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-05 21:00:19 +03:00
Compare commits
73 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
512241e124 | ||
|
424e917c6c | ||
|
9beaec6a03 | ||
|
931a46308f | ||
|
7c49d251c7 | ||
|
81488fa88b | ||
|
54d9198e62 | ||
|
7ae4fc19a1 | ||
|
2cfbc1209d | ||
|
42e5043816 | ||
|
4e1db35f6e | ||
|
9ec12fcfde | ||
|
139368d9ce | ||
|
0de1f8bf73 | ||
|
1f8f910554 | ||
|
e9945ccd04 | ||
|
664a53ffbe | ||
|
e05b2ccc7c | ||
|
357fdd4871 | ||
|
7bf1db87ad | ||
|
b0e5aed5ed | ||
|
6be67db59f | ||
|
18a2a88a95 | ||
|
aea4a96f92 | ||
|
e4bbdf7b50 | ||
|
f66d55fe5b | ||
|
9fbb8ee912 | ||
|
314a7cea73 | ||
|
2fbd080a03 | ||
|
bbf232e355 | ||
|
0ec4dc5c29 | ||
|
a5406a6c45 | ||
|
57ee1212de | ||
|
b228875600 | ||
|
8d064872ff | ||
|
26da226a39 | ||
|
888332dfb2 | ||
|
1b4a67bc54 | ||
|
79dcef17f7 | ||
|
0ecbeff1a6 | ||
|
4380d750f9 | ||
|
2953e7b7ce | ||
|
d2d9e9e139 | ||
|
f1a42b6fcc | ||
|
f9c0220ea5 | ||
|
6183906a0b | ||
|
bd0768c05c | ||
|
be644caa13 | ||
|
7880da952b | ||
|
545218a7d9 | ||
|
d00e58d1ac | ||
|
9ca67dc539 | ||
|
ed83cafe46 | ||
|
9da333cbfa | ||
|
8153bd573f | ||
|
83056bb44c | ||
|
03b320b3bd | ||
|
c2810575c0 | ||
|
53687b5bca | ||
|
5398e9f276 | ||
|
69ca6eb041 | ||
|
cbd85c9608 | ||
|
a1fc4ed962 | ||
|
6177c87539 | ||
|
a86ec1b2b1 | ||
|
e381efd936 | ||
|
6f1632b3e9 | ||
|
e325de3ff8 | ||
|
b3e7364551 | ||
|
f87919d8f0 | ||
|
5ed1db7ae4 | ||
|
18f4378a91 | ||
|
be673462be |
119
.github/azure-steps.yml
vendored
119
.github/azure-steps.yml
vendored
|
@ -1,119 +0,0 @@
|
|||
parameters:
|
||||
python_version: ''
|
||||
architecture: 'x64'
|
||||
num_build_jobs: 2
|
||||
|
||||
steps:
|
||||
- task: UsePythonVersion@0
|
||||
inputs:
|
||||
versionSpec: ${{ parameters.python_version }}
|
||||
architecture: ${{ parameters.architecture }}
|
||||
allowUnstable: true
|
||||
|
||||
- bash: |
|
||||
echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
|
||||
displayName: 'Set variables'
|
||||
|
||||
- script: |
|
||||
python -m pip install -U build pip setuptools
|
||||
python -m pip install -U -r requirements.txt
|
||||
displayName: "Install dependencies"
|
||||
|
||||
- script: |
|
||||
python -m build --sdist
|
||||
displayName: "Build sdist"
|
||||
|
||||
- script: |
|
||||
python -m mypy spacy
|
||||
displayName: 'Run mypy'
|
||||
condition: ne(variables['python_version'], '3.6')
|
||||
|
||||
- task: DeleteFiles@1
|
||||
inputs:
|
||||
contents: "spacy"
|
||||
displayName: "Delete source directory"
|
||||
|
||||
- task: DeleteFiles@1
|
||||
inputs:
|
||||
contents: "*.egg-info"
|
||||
displayName: "Delete egg-info directory"
|
||||
|
||||
- script: |
|
||||
python -m pip freeze > installed.txt
|
||||
python -m pip uninstall -y -r installed.txt
|
||||
displayName: "Uninstall all packages"
|
||||
|
||||
- bash: |
|
||||
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
|
||||
SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
|
||||
displayName: "Install from sdist"
|
||||
|
||||
- script: |
|
||||
python -W error -c "import spacy"
|
||||
displayName: "Test import"
|
||||
|
||||
- script: |
|
||||
python -m spacy download ca_core_news_sm
|
||||
python -m spacy download ca_core_news_md
|
||||
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
||||
displayName: 'Test download CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
||||
displayName: 'Test no warnings on load (#11713)'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
|
||||
displayName: 'Test convert CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -m spacy init config -p ner -l ca ner.cfg
|
||||
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
|
||||
displayName: 'Test debug config CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
# will have errors due to sparse data, check for summary in output
|
||||
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
|
||||
displayName: 'Test debug data CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
|
||||
displayName: 'Test train CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
||||
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
||||
displayName: 'Test assemble CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
||||
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
||||
displayName: 'Test assemble CLI vectors warning'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -m pip install -U -r requirements.txt
|
||||
displayName: "Install test requirements"
|
||||
|
||||
- script: |
|
||||
python -m pytest --pyargs spacy -W error
|
||||
displayName: "Run CPU tests"
|
||||
|
||||
- script: |
|
||||
python -m pip install 'spacy[apple]'
|
||||
python -m pytest --pyargs spacy
|
||||
displayName: "Run CPU tests with thinc-apple-ops"
|
||||
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
|
||||
|
||||
- script: |
|
||||
python .github/validate_universe_json.py website/meta/universe.json
|
||||
displayName: 'Test website/meta/universe.json'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
45
.github/workflows/autoblack.yml
vendored
45
.github/workflows/autoblack.yml
vendored
|
@ -1,45 +0,0 @@
|
|||
# GitHub Action that uses Black to reformat all Python code and submits a PR
|
||||
# in regular intervals. Inspired by: https://github.com/cclauss/autoblack
|
||||
|
||||
name: autoblack
|
||||
on:
|
||||
workflow_dispatch: # allow manual trigger
|
||||
schedule:
|
||||
- cron: '0 8 * * 5' # every Friday at 8am UTC
|
||||
|
||||
jobs:
|
||||
autoblack:
|
||||
if: github.repository_owner == 'explosion'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ github.head_ref }}
|
||||
- uses: actions/setup-python@v4
|
||||
- run: pip install black -c requirements.txt
|
||||
- name: Auto-format code if needed
|
||||
run: black spacy
|
||||
# We can't run black --check here because that returns a non-zero exit
|
||||
# code and makes GitHub think the action failed
|
||||
- name: Check for modified files
|
||||
id: git-check
|
||||
run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Create Pull Request
|
||||
if: steps.git-check.outputs.modified == 'true'
|
||||
uses: peter-evans/create-pull-request@v4
|
||||
with:
|
||||
title: Auto-format code with black
|
||||
labels: meta
|
||||
commit-message: Auto-format code with black
|
||||
committer: GitHub <noreply@github.com>
|
||||
author: explosion-bot <explosion-bot@users.noreply.github.com>
|
||||
body: _This PR is auto-generated._
|
||||
branch: autoblack
|
||||
delete-branch: true
|
||||
draft: false
|
||||
- name: Check outputs
|
||||
if: steps.git-check.outputs.modified == 'true'
|
||||
run: |
|
||||
echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
|
||||
echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"
|
1
.github/workflows/explosionbot.yml
vendored
1
.github/workflows/explosionbot.yml
vendored
|
@ -8,6 +8,7 @@ on:
|
|||
|
||||
jobs:
|
||||
explosion-bot:
|
||||
if: github.repository_owner == 'explosion'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Dump GitHub context
|
||||
|
|
1
.github/workflows/issue-manager.yml
vendored
1
.github/workflows/issue-manager.yml
vendored
|
@ -13,6 +13,7 @@ on:
|
|||
|
||||
jobs:
|
||||
issue-manager:
|
||||
if: github.repository_owner == 'explosion'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: tiangolo/issue-manager@0.4.0
|
||||
|
|
1
.github/workflows/lock.yml
vendored
1
.github/workflows/lock.yml
vendored
|
@ -13,6 +13,7 @@ concurrency:
|
|||
|
||||
jobs:
|
||||
action:
|
||||
if: github.repository_owner == 'explosion'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: dessant/lock-threads@v4
|
||||
|
|
1
.github/workflows/spacy_universe_alert.yml
vendored
1
.github/workflows/spacy_universe_alert.yml
vendored
|
@ -7,6 +7,7 @@ on:
|
|||
|
||||
jobs:
|
||||
build:
|
||||
if: github.repository_owner == 'explosion'
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
|
|
174
.github/workflows/tests.yml
vendored
Normal file
174
.github/workflows/tests.yml
vendored
Normal file
|
@ -0,0 +1,174 @@
|
|||
name: tests
|
||||
|
||||
on:
|
||||
push:
|
||||
branches-ignore:
|
||||
- "spacy.io"
|
||||
- "nightly.spacy.io"
|
||||
- "v2.spacy.io"
|
||||
paths-ignore:
|
||||
- "*.md"
|
||||
- "*.mdx"
|
||||
- "website/**"
|
||||
- ".github/workflows/**"
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened, edited]
|
||||
paths-ignore:
|
||||
- "*.md"
|
||||
- "*.mdx"
|
||||
- "website/**"
|
||||
|
||||
jobs:
|
||||
validate:
|
||||
name: Validate
|
||||
if: github.repository_owner == 'explosion'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check out repo
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Configure Python version
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.7"
|
||||
architecture: x64
|
||||
|
||||
- name: black
|
||||
run: |
|
||||
python -m pip install black -c requirements.txt
|
||||
python -m black spacy --check
|
||||
- name: flake8
|
||||
run: |
|
||||
python -m pip install flake8==5.0.4
|
||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
||||
tests:
|
||||
name: Test
|
||||
needs: Validate
|
||||
strategy:
|
||||
fail-fast: true
|
||||
matrix:
|
||||
os: [ubuntu-latest, windows-latest, macos-latest]
|
||||
python_version: ["3.11"]
|
||||
include:
|
||||
- os: ubuntu-20.04
|
||||
python_version: "3.6"
|
||||
- os: windows-latest
|
||||
python_version: "3.7"
|
||||
- os: macos-latest
|
||||
python_version: "3.8"
|
||||
- os: ubuntu-latest
|
||||
python_version: "3.9"
|
||||
- os: windows-latest
|
||||
python_version: "3.10"
|
||||
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
steps:
|
||||
- name: Check out repo
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Configure Python version
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ matrix.python_version }}
|
||||
architecture: x64
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install -U build pip setuptools
|
||||
python -m pip install -U -r requirements.txt
|
||||
|
||||
- name: Build sdist
|
||||
run: |
|
||||
python -m build --sdist
|
||||
|
||||
- name: Run mypy
|
||||
run: |
|
||||
python -m mypy spacy
|
||||
if: matrix.python_version != '3.6'
|
||||
|
||||
- name: Delete source directory and .egg-info
|
||||
run: |
|
||||
rm -rf spacy *.egg-info
|
||||
shell: bash
|
||||
|
||||
- name: Uninstall all packages
|
||||
run: |
|
||||
python -m pip freeze
|
||||
python -m pip freeze --exclude pywin32 > installed.txt
|
||||
python -m pip uninstall -y -r installed.txt
|
||||
|
||||
- name: Install from sdist
|
||||
run: |
|
||||
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
|
||||
SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
|
||||
shell: bash
|
||||
|
||||
- name: Test import
|
||||
run: python -W error -c "import spacy"
|
||||
|
||||
- name: "Test download CLI"
|
||||
run: |
|
||||
python -m spacy download ca_core_news_sm
|
||||
python -m spacy download ca_core_news_md
|
||||
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test download_url in info CLI"
|
||||
run: |
|
||||
python -W error -m spacy info ca_core_news_sm | grep -q download_url
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test no warnings on load (#11713)"
|
||||
run: |
|
||||
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test convert CLI"
|
||||
run: |
|
||||
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test debug config CLI"
|
||||
run: |
|
||||
python -m spacy init config -p ner -l ca ner.cfg
|
||||
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test debug data CLI"
|
||||
run: |
|
||||
# will have errors due to sparse data, check for summary in output
|
||||
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test train CLI"
|
||||
run: |
|
||||
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test assemble CLI"
|
||||
run: |
|
||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
||||
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test assemble CLI vectors warning"
|
||||
run: |
|
||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
||||
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Install test requirements"
|
||||
run: |
|
||||
python -m pip install -U -r requirements.txt
|
||||
|
||||
- name: "Run CPU tests"
|
||||
run: |
|
||||
python -m pytest --pyargs spacy -W error
|
||||
if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.11')"
|
||||
|
||||
- name: "Run CPU tests with thinc-apple-ops"
|
||||
run: |
|
||||
python -m pip install 'spacy[apple]'
|
||||
python -m pytest --pyargs spacy
|
||||
if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11'
|
33
.github/workflows/universe_validation.yml
vendored
Normal file
33
.github/workflows/universe_validation.yml
vendored
Normal file
|
@ -0,0 +1,33 @@
|
|||
name: universe validation
|
||||
|
||||
on:
|
||||
push:
|
||||
branches-ignore:
|
||||
- "spacy.io"
|
||||
- "nightly.spacy.io"
|
||||
- "v2.spacy.io"
|
||||
paths:
|
||||
- "website/meta/universe.json"
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened, edited]
|
||||
paths:
|
||||
- "website/meta/universe.json"
|
||||
|
||||
jobs:
|
||||
validate:
|
||||
name: Validate
|
||||
if: github.repository_owner == 'explosion'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check out repo
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Configure Python version
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.7"
|
||||
architecture: x64
|
||||
|
||||
- name: Validate website/meta/universe.json
|
||||
run: |
|
||||
python .github/validate_universe_json.py website/meta/universe.json
|
|
@ -16,6 +16,9 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
|
|||
model packaging, deployment and workflow management. spaCy is commercial
|
||||
open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
|
||||
|
||||
💥 **We'd love to hear more about your experience with spaCy!**
|
||||
[Fill out our survey here.](https://form.typeform.com/to/aMel9q9f)
|
||||
|
||||
💫 **Version 3.5 out now!**
|
||||
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
|
||||
|
||||
|
|
|
@ -1,117 +0,0 @@
|
|||
trigger:
|
||||
batch: true
|
||||
branches:
|
||||
include:
|
||||
- "*"
|
||||
exclude:
|
||||
- "spacy.io"
|
||||
- "nightly.spacy.io"
|
||||
- "v2.spacy.io"
|
||||
paths:
|
||||
exclude:
|
||||
- "website/*"
|
||||
- "*.md"
|
||||
- "*.mdx"
|
||||
- ".github/workflows/*"
|
||||
pr:
|
||||
paths:
|
||||
exclude:
|
||||
- "*.md"
|
||||
- "*.mdx"
|
||||
- "website/docs/*"
|
||||
- "website/src/*"
|
||||
- "website/meta/*.tsx"
|
||||
- "website/meta/*.mjs"
|
||||
- "website/meta/languages.json"
|
||||
- "website/meta/site.json"
|
||||
- "website/meta/sidebars.json"
|
||||
- "website/meta/type-annotations.json"
|
||||
- "website/pages/*"
|
||||
- ".github/workflows/*"
|
||||
|
||||
jobs:
|
||||
# Check formatting and linting. Perform basic checks for most important errors
|
||||
# (syntax etc.) Uses the config defined in setup.cfg and overwrites the
|
||||
# selected codes.
|
||||
- job: "Validate"
|
||||
pool:
|
||||
vmImage: "ubuntu-latest"
|
||||
steps:
|
||||
- task: UsePythonVersion@0
|
||||
inputs:
|
||||
versionSpec: "3.7"
|
||||
- script: |
|
||||
pip install black -c requirements.txt
|
||||
python -m black spacy --check
|
||||
displayName: "black"
|
||||
- script: |
|
||||
pip install flake8==5.0.4
|
||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
||||
displayName: "flake8"
|
||||
|
||||
- job: "Test"
|
||||
dependsOn: "Validate"
|
||||
strategy:
|
||||
matrix:
|
||||
# We're only running one platform per Python version to speed up builds
|
||||
Python36Linux:
|
||||
imageName: "ubuntu-20.04"
|
||||
python.version: "3.6"
|
||||
# Python36Windows:
|
||||
# imageName: "windows-latest"
|
||||
# python.version: "3.6"
|
||||
# Python36Mac:
|
||||
# imageName: "macos-latest"
|
||||
# python.version: "3.6"
|
||||
# Python37Linux:
|
||||
# imageName: "ubuntu-20.04"
|
||||
# python.version: "3.7"
|
||||
Python37Windows:
|
||||
imageName: "windows-latest"
|
||||
python.version: "3.7"
|
||||
# Python37Mac:
|
||||
# imageName: "macos-latest"
|
||||
# python.version: "3.7"
|
||||
# Python38Linux:
|
||||
# imageName: "ubuntu-latest"
|
||||
# python.version: "3.8"
|
||||
# Python38Windows:
|
||||
# imageName: "windows-latest"
|
||||
# python.version: "3.8"
|
||||
Python38Mac:
|
||||
imageName: "macos-latest"
|
||||
python.version: "3.8"
|
||||
Python39Linux:
|
||||
imageName: "ubuntu-latest"
|
||||
python.version: "3.9"
|
||||
# Python39Windows:
|
||||
# imageName: "windows-latest"
|
||||
# python.version: "3.9"
|
||||
# Python39Mac:
|
||||
# imageName: "macos-latest"
|
||||
# python.version: "3.9"
|
||||
# Python310Linux:
|
||||
# imageName: "ubuntu-latest"
|
||||
# python.version: "3.10"
|
||||
Python310Windows:
|
||||
imageName: "windows-latest"
|
||||
python.version: "3.10"
|
||||
# Python310Mac:
|
||||
# imageName: "macos-latest"
|
||||
# python.version: "3.10"
|
||||
Python311Linux:
|
||||
imageName: 'ubuntu-latest'
|
||||
python.version: '3.11'
|
||||
Python311Windows:
|
||||
imageName: 'windows-latest'
|
||||
python.version: '3.11'
|
||||
Python311Mac:
|
||||
imageName: 'macos-latest'
|
||||
python.version: '3.11'
|
||||
maxParallel: 4
|
||||
pool:
|
||||
vmImage: $(imageName)
|
||||
steps:
|
||||
- template: .github/azure-steps.yml
|
||||
parameters:
|
||||
python_version: '$(python.version)'
|
|
@ -5,7 +5,7 @@ requires = [
|
|||
"cymem>=2.0.2,<2.1.0",
|
||||
"preshed>=3.0.2,<3.1.0",
|
||||
"murmurhash>=0.28.0,<1.1.0",
|
||||
"thinc>=8.1.0,<8.2.0",
|
||||
"thinc>=8.1.8,<8.2.0",
|
||||
"numpy>=1.15.0",
|
||||
]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
|
|
@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
|
|||
spacy-loggers>=1.0.0,<2.0.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.1.0,<8.2.0
|
||||
thinc>=8.1.8,<8.2.0
|
||||
ml_datasets>=0.2.0,<0.3.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
wasabi>=0.9.1,<1.2.0
|
||||
|
|
40
setup.cfg
40
setup.cfg
|
@ -39,7 +39,7 @@ setup_requires =
|
|||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
thinc>=8.1.0,<8.2.0
|
||||
thinc>=8.1.8,<8.2.0
|
||||
install_requires =
|
||||
# Our libraries
|
||||
spacy-legacy>=3.0.11,<3.1.0
|
||||
|
@ -47,7 +47,7 @@ install_requires =
|
|||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.1.0,<8.2.0
|
||||
thinc>=8.1.8,<8.2.0
|
||||
wasabi>=0.9.1,<1.2.0
|
||||
srsly>=2.4.3,<3.0.0
|
||||
catalogue>=2.0.6,<2.1.0
|
||||
|
@ -78,41 +78,41 @@ transformers =
|
|||
ray =
|
||||
spacy_ray>=0.1.0,<1.0.0
|
||||
cuda =
|
||||
cupy>=5.0.0b4,<12.0.0
|
||||
cupy>=5.0.0b4,<13.0.0
|
||||
cuda80 =
|
||||
cupy-cuda80>=5.0.0b4,<12.0.0
|
||||
cupy-cuda80>=5.0.0b4,<13.0.0
|
||||
cuda90 =
|
||||
cupy-cuda90>=5.0.0b4,<12.0.0
|
||||
cupy-cuda90>=5.0.0b4,<13.0.0
|
||||
cuda91 =
|
||||
cupy-cuda91>=5.0.0b4,<12.0.0
|
||||
cupy-cuda91>=5.0.0b4,<13.0.0
|
||||
cuda92 =
|
||||
cupy-cuda92>=5.0.0b4,<12.0.0
|
||||
cupy-cuda92>=5.0.0b4,<13.0.0
|
||||
cuda100 =
|
||||
cupy-cuda100>=5.0.0b4,<12.0.0
|
||||
cupy-cuda100>=5.0.0b4,<13.0.0
|
||||
cuda101 =
|
||||
cupy-cuda101>=5.0.0b4,<12.0.0
|
||||
cupy-cuda101>=5.0.0b4,<13.0.0
|
||||
cuda102 =
|
||||
cupy-cuda102>=5.0.0b4,<12.0.0
|
||||
cupy-cuda102>=5.0.0b4,<13.0.0
|
||||
cuda110 =
|
||||
cupy-cuda110>=5.0.0b4,<12.0.0
|
||||
cupy-cuda110>=5.0.0b4,<13.0.0
|
||||
cuda111 =
|
||||
cupy-cuda111>=5.0.0b4,<12.0.0
|
||||
cupy-cuda111>=5.0.0b4,<13.0.0
|
||||
cuda112 =
|
||||
cupy-cuda112>=5.0.0b4,<12.0.0
|
||||
cupy-cuda112>=5.0.0b4,<13.0.0
|
||||
cuda113 =
|
||||
cupy-cuda113>=5.0.0b4,<12.0.0
|
||||
cupy-cuda113>=5.0.0b4,<13.0.0
|
||||
cuda114 =
|
||||
cupy-cuda114>=5.0.0b4,<12.0.0
|
||||
cupy-cuda114>=5.0.0b4,<13.0.0
|
||||
cuda115 =
|
||||
cupy-cuda115>=5.0.0b4,<12.0.0
|
||||
cupy-cuda115>=5.0.0b4,<13.0.0
|
||||
cuda116 =
|
||||
cupy-cuda116>=5.0.0b4,<12.0.0
|
||||
cupy-cuda116>=5.0.0b4,<13.0.0
|
||||
cuda117 =
|
||||
cupy-cuda117>=5.0.0b4,<12.0.0
|
||||
cupy-cuda117>=5.0.0b4,<13.0.0
|
||||
cuda11x =
|
||||
cupy-cuda11x>=11.0.0,<12.0.0
|
||||
cupy-cuda11x>=11.0.0,<13.0.0
|
||||
cuda-autodetect =
|
||||
cupy-wheel>=11.0.0,<12.0.0
|
||||
cupy-wheel>=11.0.0,<13.0.0
|
||||
apple =
|
||||
thinc-apple-ops>=0.1.0.dev0,<1.0.0
|
||||
# Language tokenizers with external dependencies
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# fmt: off
|
||||
__title__ = "spacy"
|
||||
__version__ = "3.5.0"
|
||||
__version__ = "3.5.3"
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
__projects__ = "https://github.com/explosion/projects"
|
||||
|
|
|
@ -336,7 +336,7 @@ def debug_data(
|
|||
show=verbose,
|
||||
)
|
||||
else:
|
||||
msg.good("Examples without ocurrences available for all labels")
|
||||
msg.good("Examples without occurrences available for all labels")
|
||||
|
||||
if "ner" in factory_names:
|
||||
# Get all unique NER labels present in the data
|
||||
|
|
|
@ -81,11 +81,8 @@ def download(
|
|||
|
||||
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
|
||||
dl_tpl = "{m}-{v}/{m}-{v}{s}"
|
||||
egg_tpl = "#egg={m}=={v}"
|
||||
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
|
||||
filename = dl_tpl.format(m=model_name, v=version, s=suffix)
|
||||
if sdist:
|
||||
filename += egg_tpl.format(m=model_name, v=version)
|
||||
return filename
|
||||
|
||||
|
||||
|
|
|
@ -122,6 +122,8 @@ def evaluate(
|
|||
docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit]))
|
||||
render_deps = "parser" in factory_names
|
||||
render_ents = "ner" in factory_names
|
||||
render_spans = "spancat" in factory_names
|
||||
|
||||
render_parses(
|
||||
docs,
|
||||
displacy_path,
|
||||
|
@ -129,6 +131,7 @@ def evaluate(
|
|||
limit=displacy_limit,
|
||||
deps=render_deps,
|
||||
ents=render_ents,
|
||||
spans=render_spans,
|
||||
)
|
||||
msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
|
||||
|
||||
|
@ -182,6 +185,7 @@ def render_parses(
|
|||
limit: int = 250,
|
||||
deps: bool = True,
|
||||
ents: bool = True,
|
||||
spans: bool = True,
|
||||
):
|
||||
docs[0].user_data["title"] = model_name
|
||||
if ents:
|
||||
|
@ -195,6 +199,11 @@ def render_parses(
|
|||
with (output_path / "parses.html").open("w", encoding="utf8") as file_:
|
||||
file_.write(html)
|
||||
|
||||
if spans:
|
||||
html = displacy.render(docs[:limit], style="span", page=True)
|
||||
with (output_path / "spans.html").open("w", encoding="utf8") as file_:
|
||||
file_.write(html)
|
||||
|
||||
|
||||
def print_prf_per_type(
|
||||
msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str
|
||||
|
|
|
@ -35,7 +35,7 @@ def find_threshold_cli(
|
|||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||
gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
|
||||
verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
from typing import Optional, Dict, Any, Union, List
|
||||
import platform
|
||||
import pkg_resources
|
||||
import json
|
||||
from pathlib import Path
|
||||
from wasabi import Printer, MarkdownRenderer
|
||||
|
@ -10,6 +9,7 @@ from ._util import app, Arg, Opt, string_to_list
|
|||
from .download import get_model_filename, get_latest_version
|
||||
from .. import util
|
||||
from .. import about
|
||||
from ..compat import importlib_metadata
|
||||
|
||||
|
||||
@app.command("info")
|
||||
|
@ -137,15 +137,14 @@ def info_installed_model_url(model: str) -> Optional[str]:
|
|||
dist-info available.
|
||||
"""
|
||||
try:
|
||||
dist = pkg_resources.get_distribution(model)
|
||||
data = json.loads(dist.get_metadata("direct_url.json"))
|
||||
return data["url"]
|
||||
except pkg_resources.DistributionNotFound:
|
||||
# no such package
|
||||
return None
|
||||
dist = importlib_metadata.distribution(model)
|
||||
text = dist.read_text("direct_url.json")
|
||||
if isinstance(text, str):
|
||||
data = json.loads(text)
|
||||
return data["url"]
|
||||
except Exception:
|
||||
# something else, like no file or invalid JSON
|
||||
return None
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def info_model_url(model: str) -> Dict[str, Any]:
|
||||
|
|
|
@ -23,6 +23,7 @@ def pretrain_cli(
|
|||
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
|
||||
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
|
||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||
skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
@ -74,6 +75,7 @@ def pretrain_cli(
|
|||
epoch_resume=epoch_resume,
|
||||
use_gpu=use_gpu,
|
||||
silent=False,
|
||||
skip_last=skip_last,
|
||||
)
|
||||
msg.good("Successfully finished pretrain")
|
||||
|
||||
|
|
|
@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
|
|||
import os.path
|
||||
from pathlib import Path
|
||||
|
||||
import pkg_resources
|
||||
from wasabi import msg
|
||||
from wasabi.util import locale_escape
|
||||
import sys
|
||||
|
@ -331,6 +330,7 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
|
|||
RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
|
||||
exist.
|
||||
"""
|
||||
import pkg_resources
|
||||
|
||||
failed_pkgs_msgs: List[str] = []
|
||||
conflicting_pkgs_msgs: List[str] = []
|
||||
|
|
|
@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
|
|||
can help generate the best possible configuration, given a user's requirements. #}
|
||||
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
|
||||
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
||||
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
|
||||
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
|
||||
[paths]
|
||||
train = null
|
||||
dev = null
|
||||
|
@ -24,8 +24,11 @@ gpu_allocator = null
|
|||
lang = "{{ lang }}"
|
||||
{%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
|
||||
{%- set with_accuracy = optimize == "accuracy" -%}
|
||||
{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
|
||||
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%}
|
||||
{# The BOW textcat doesn't need a source of features, so it can omit the
|
||||
tok2vec/transformer. #}
|
||||
{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
|
||||
{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
|
||||
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
|
||||
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
|
||||
{%- else -%}
|
||||
{%- set full_pipeline = components -%}
|
||||
|
@ -156,6 +159,36 @@ grad_factor = 1.0
|
|||
sizes = [1,2,3]
|
||||
{% endif -%}
|
||||
|
||||
{% if "spancat_singlelabel" in components %}
|
||||
[components.spancat_singlelabel]
|
||||
factory = "spancat_singlelabel"
|
||||
negative_weight = 1.0
|
||||
allow_overlap = true
|
||||
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
|
||||
spans_key = "sc"
|
||||
|
||||
[components.spancat_singlelabel.model]
|
||||
@architectures = "spacy.SpanCategorizer.v1"
|
||||
|
||||
[components.spancat_singlelabel.model.reducer]
|
||||
@layers = "spacy.mean_max_reducer.v1"
|
||||
hidden_size = 128
|
||||
|
||||
[components.spancat_singlelabel.model.scorer]
|
||||
@layers = "Softmax.v2"
|
||||
|
||||
[components.spancat_singlelabel.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.spancat_singlelabel.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
|
||||
[components.spancat_singlelabel.suggester]
|
||||
@misc = "spacy.ngram_suggester.v1"
|
||||
sizes = [1,2,3]
|
||||
{% endif %}
|
||||
|
||||
{% if "trainable_lemmatizer" in components -%}
|
||||
[components.trainable_lemmatizer]
|
||||
factory = "trainable_lemmatizer"
|
||||
|
@ -221,10 +254,16 @@ no_output_layer = false
|
|||
|
||||
{% else -%}
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatCNN.v2"
|
||||
exclusive_classes = true
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
nO = null
|
||||
|
||||
[components.textcat.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.textcat.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
|
||||
|
@ -252,10 +291,16 @@ no_output_layer = false
|
|||
|
||||
{% else -%}
|
||||
[components.textcat_multilabel.model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatCNN.v2"
|
||||
exclusive_classes = false
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
nO = null
|
||||
|
||||
[components.textcat_multilabel.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.textcat_multilabel.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
|
||||
|
@ -374,6 +419,33 @@ width = ${components.tok2vec.model.encode.width}
|
|||
sizes = [1,2,3]
|
||||
{% endif %}
|
||||
|
||||
{% if "spancat_singlelabel" in components %}
|
||||
[components.spancat_singlelabel]
|
||||
factory = "spancat_singlelabel"
|
||||
negative_weight = 1.0
|
||||
allow_overlap = true
|
||||
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
|
||||
spans_key = "sc"
|
||||
|
||||
[components.spancat_singlelabel.model]
|
||||
@architectures = "spacy.SpanCategorizer.v1"
|
||||
|
||||
[components.spancat_singlelabel.model.reducer]
|
||||
@layers = "spacy.mean_max_reducer.v1"
|
||||
hidden_size = 128
|
||||
|
||||
[components.spancat_singlelabel.model.scorer]
|
||||
@layers = "Softmax.v2"
|
||||
|
||||
[components.spancat_singlelabel.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
|
||||
[components.spancat_singlelabel.suggester]
|
||||
@misc = "spacy.ngram_suggester.v1"
|
||||
sizes = [1,2,3]
|
||||
{% endif %}
|
||||
|
||||
{% if "trainable_lemmatizer" in components -%}
|
||||
[components.trainable_lemmatizer]
|
||||
factory = "trainable_lemmatizer"
|
||||
|
|
|
@ -125,13 +125,17 @@ def app(environ, start_response):
|
|||
return [res]
|
||||
|
||||
|
||||
def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
||||
def parse_deps(
|
||||
orig_doc: Union[Doc, Span], options: Dict[str, Any] = {}
|
||||
) -> Dict[str, Any]:
|
||||
"""Generate dependency parse in {'words': [], 'arcs': []} format.
|
||||
|
||||
orig_doc (Doc): Document to parse.
|
||||
orig_doc (Union[Doc, Span]): Document to parse.
|
||||
options (Dict[str, Any]): Dependency parse specific visualisation options.
|
||||
RETURNS (dict): Generated dependency parse keyed by words and arcs.
|
||||
"""
|
||||
if isinstance(orig_doc, Span):
|
||||
orig_doc = orig_doc.as_doc()
|
||||
doc = Doc(orig_doc.vocab).from_bytes(
|
||||
orig_doc.to_bytes(exclude=["user_data", "user_hooks"])
|
||||
)
|
||||
|
|
|
@ -444,8 +444,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E133 = ("The sum of prior probabilities for alias '{alias}' should not "
|
||||
"exceed 1, but found {sum}.")
|
||||
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
|
||||
E139 = ("Knowledge base for component '{name}' is empty. Use the methods "
|
||||
"`kb.add_entity` and `kb.add_alias` to add entries.")
|
||||
E139 = ("Knowledge base for component '{name}' is empty.")
|
||||
E140 = ("The list of entities, prior probabilities and entity vectors "
|
||||
"should be of equal length.")
|
||||
E141 = ("Entity vectors should be of length {required} instead of the "
|
||||
|
@ -550,6 +549,8 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"during training, make sure to include it in 'annotating components'")
|
||||
|
||||
# New errors added in v3.x
|
||||
E850 = ("The PretrainVectors objective currently only supports default or "
|
||||
"floret vectors, not {mode} vectors.")
|
||||
E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
|
||||
"but found value of '{val}'.")
|
||||
E852 = ("The tar file pulled from the remote attempted an unsafe path "
|
||||
|
@ -967,7 +968,8 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
|
||||
"with `displacy.serve(doc, port=port)`")
|
||||
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
|
||||
"or use `auto_switch_port=True` to pick an available port automatically.")
|
||||
"or use `auto_select_port=True` to pick an available port automatically.")
|
||||
E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
|
||||
|
||||
|
||||
# Deprecated model shortcuts, only used in errors and warnings
|
||||
|
|
|
@ -46,6 +46,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
self._alias_index = PreshMap(nr_aliases + 1)
|
||||
self._aliases_table = alias_vec(nr_aliases + 1)
|
||||
|
||||
def is_empty(self):
|
||||
return len(self) == 0
|
||||
|
||||
def __len__(self):
|
||||
return self.get_size_entities()
|
||||
|
||||
|
|
|
@ -25,7 +25,8 @@ class Lexeme:
|
|||
def orth_(self) -> str: ...
|
||||
@property
|
||||
def text(self) -> str: ...
|
||||
lower: str
|
||||
orth: int
|
||||
lower: int
|
||||
norm: int
|
||||
shape: int
|
||||
prefix: int
|
||||
|
|
|
@ -199,7 +199,7 @@ cdef class Lexeme:
|
|||
return self.orth_
|
||||
|
||||
property lower:
|
||||
"""RETURNS (str): Lowercase form of the lexeme."""
|
||||
"""RETURNS (uint64): Lowercase form of the lexeme."""
|
||||
def __get__(self):
|
||||
return self.c.lower
|
||||
|
||||
|
|
|
@ -82,8 +82,12 @@ cdef class DependencyMatcher:
|
|||
"$-": self._imm_left_sib,
|
||||
"$++": self._right_sib,
|
||||
"$--": self._left_sib,
|
||||
">+": self._imm_right_child,
|
||||
">-": self._imm_left_child,
|
||||
">++": self._right_child,
|
||||
">--": self._left_child,
|
||||
"<+": self._imm_right_parent,
|
||||
"<-": self._imm_left_parent,
|
||||
"<++": self._right_parent,
|
||||
"<--": self._left_parent,
|
||||
}
|
||||
|
@ -427,11 +431,33 @@ cdef class DependencyMatcher:
|
|||
def _left_sib(self, doc, node):
|
||||
return [doc[child.i] for child in doc[node].head.children if child.i < node]
|
||||
|
||||
def _imm_right_child(self, doc, node):
|
||||
for child in doc[node].rights:
|
||||
if child.i == node + 1:
|
||||
return [doc[child.i]]
|
||||
return []
|
||||
|
||||
def _imm_left_child(self, doc, node):
|
||||
for child in doc[node].lefts:
|
||||
if child.i == node - 1:
|
||||
return [doc[child.i]]
|
||||
return []
|
||||
|
||||
def _right_child(self, doc, node):
|
||||
return [doc[child.i] for child in doc[node].children if child.i > node]
|
||||
return [child for child in doc[node].rights]
|
||||
|
||||
def _left_child(self, doc, node):
|
||||
return [doc[child.i] for child in doc[node].children if child.i < node]
|
||||
return [child for child in doc[node].lefts]
|
||||
|
||||
def _imm_right_parent(self, doc, node):
|
||||
if doc[node].head.i == node + 1:
|
||||
return [doc[node].head]
|
||||
return []
|
||||
|
||||
def _imm_left_parent(self, doc, node):
|
||||
if doc[node].head.i == node - 1:
|
||||
return [doc[node].head]
|
||||
return []
|
||||
|
||||
def _right_parent(self, doc, node):
|
||||
if doc[node].head.i > node:
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Tuple, Callable
|
||||
from typing import List, Tuple, Callable
|
||||
from thinc.api import Model, to_numpy
|
||||
from thinc.types import Ragged, Ints1d
|
||||
|
||||
|
@ -52,14 +52,14 @@ def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:
|
|||
indices will be [5, 6, 7, 8, 8, 9].
|
||||
"""
|
||||
spans, lengths = _ensure_cpu(spans, lengths)
|
||||
indices = []
|
||||
indices: List[int] = []
|
||||
offset = 0
|
||||
for i, length in enumerate(lengths):
|
||||
spans_i = spans[i].dataXd + offset
|
||||
for j in range(spans_i.shape[0]):
|
||||
indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1])) # type: ignore[call-overload, index]
|
||||
indices.extend(range(spans_i[j, 0], spans_i[j, 1])) # type: ignore[arg-type, call-overload]
|
||||
offset += length
|
||||
return ops.flatten(indices, dtype="i", ndim_if_empty=1)
|
||||
return ops.asarray1i(indices)
|
||||
|
||||
|
||||
def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:
|
||||
|
|
|
@ -89,6 +89,14 @@ def load_kb(
|
|||
return kb_from_file
|
||||
|
||||
|
||||
@registry.misc("spacy.EmptyKB.v2")
|
||||
def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
|
||||
def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
|
||||
return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
|
||||
|
||||
return empty_kb_factory
|
||||
|
||||
|
||||
@registry.misc("spacy.EmptyKB.v1")
|
||||
def empty_kb(
|
||||
entity_vector_length: int,
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from typing import Any, Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING, cast
|
||||
from thinc.types import Floats2d
|
||||
from thinc.types import Floats2d, Ints1d
|
||||
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
|
||||
from thinc.api import MultiSoftmax, list2array
|
||||
from thinc.api import to_categorical, CosineDistance, L2Distance
|
||||
|
@ -7,7 +7,8 @@ from thinc.loss import Loss
|
|||
|
||||
from ...util import registry, OOV_RANK
|
||||
from ...errors import Errors
|
||||
from ...attrs import ID
|
||||
from ...attrs import ID, ORTH
|
||||
from ...vectors import Mode as VectorsMode
|
||||
|
||||
import numpy
|
||||
from functools import partial
|
||||
|
@ -67,14 +68,23 @@ def get_vectors_loss(ops, docs, prediction, distance):
|
|||
"""Compute a loss based on a distance between the documents' vectors and
|
||||
the prediction.
|
||||
"""
|
||||
# The simplest way to implement this would be to vstack the
|
||||
# token.vector values, but that's a bit inefficient, especially on GPU.
|
||||
# Instead we fetch the index into the vectors table for each of our tokens,
|
||||
# and look them up all at once. This prevents data copying.
|
||||
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
|
||||
target = docs[0].vocab.vectors.data[ids]
|
||||
target[ids == OOV_RANK] = 0
|
||||
d_target, loss = distance(prediction, target)
|
||||
vocab = docs[0].vocab
|
||||
if vocab.vectors.mode == VectorsMode.default:
|
||||
# The simplest way to implement this would be to vstack the
|
||||
# token.vector values, but that's a bit inefficient, especially on GPU.
|
||||
# Instead we fetch the index into the vectors table for each of our
|
||||
# tokens, and look them up all at once. This prevents data copying.
|
||||
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
|
||||
target = docs[0].vocab.vectors.data[ids]
|
||||
target[ids == OOV_RANK] = 0
|
||||
d_target, loss = distance(prediction, target)
|
||||
elif vocab.vectors.mode == VectorsMode.floret:
|
||||
keys = ops.flatten([cast(Ints1d, doc.to_array(ORTH)) for doc in docs])
|
||||
target = vocab.vectors.get_batch(keys)
|
||||
target = ops.as_contig(target)
|
||||
d_target, loss = distance(prediction, target)
|
||||
else:
|
||||
raise ValueError(Errors.E850.format(mode=vocab.vectors.mode))
|
||||
return loss, d_target
|
||||
|
||||
|
||||
|
|
|
@ -54,6 +54,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"entity_vector_length": 64,
|
||||
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
||||
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
|
||||
"generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
|
||||
"overwrite": True,
|
||||
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
|
||||
"use_gold_ents": True,
|
||||
|
@ -80,6 +81,7 @@ def make_entity_linker(
|
|||
get_candidates_batch: Callable[
|
||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||
],
|
||||
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
use_gold_ents: bool,
|
||||
|
@ -101,6 +103,7 @@ def make_entity_linker(
|
|||
get_candidates_batch (
|
||||
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]
|
||||
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
||||
component must provide entity annotations.
|
||||
|
@ -135,6 +138,7 @@ def make_entity_linker(
|
|||
entity_vector_length=entity_vector_length,
|
||||
get_candidates=get_candidates,
|
||||
get_candidates_batch=get_candidates_batch,
|
||||
generate_empty_kb=generate_empty_kb,
|
||||
overwrite=overwrite,
|
||||
scorer=scorer,
|
||||
use_gold_ents=use_gold_ents,
|
||||
|
@ -175,6 +179,7 @@ class EntityLinker(TrainablePipe):
|
|||
get_candidates_batch: Callable[
|
||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||
],
|
||||
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
||||
overwrite: bool = BACKWARD_OVERWRITE,
|
||||
scorer: Optional[Callable] = entity_linker_score,
|
||||
use_gold_ents: bool,
|
||||
|
@ -198,6 +203,7 @@ class EntityLinker(TrainablePipe):
|
|||
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
|
||||
Iterable[Candidate]]
|
||||
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
|
||||
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
||||
component must provide entity annotations.
|
||||
|
@ -220,6 +226,7 @@ class EntityLinker(TrainablePipe):
|
|||
self.model = model
|
||||
self.name = name
|
||||
self.labels_discard = list(labels_discard)
|
||||
# how many neighbour sentences to take into account
|
||||
self.n_sents = n_sents
|
||||
self.incl_prior = incl_prior
|
||||
self.incl_context = incl_context
|
||||
|
@ -227,9 +234,7 @@ class EntityLinker(TrainablePipe):
|
|||
self.get_candidates_batch = get_candidates_batch
|
||||
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
|
||||
self.distance = CosineDistance(normalize=False)
|
||||
# how many neighbour sentences to take into account
|
||||
# create an empty KB by default
|
||||
self.kb = empty_kb(entity_vector_length)(self.vocab)
|
||||
self.kb = generate_empty_kb(self.vocab, entity_vector_length)
|
||||
self.scorer = scorer
|
||||
self.use_gold_ents = use_gold_ents
|
||||
self.candidates_batch_size = candidates_batch_size
|
||||
|
@ -250,7 +255,7 @@ class EntityLinker(TrainablePipe):
|
|||
# Raise an error if the knowledge base is not initialized.
|
||||
if self.kb is None:
|
||||
raise ValueError(Errors.E1018.format(name=self.name))
|
||||
if len(self.kb) == 0:
|
||||
if hasattr(self.kb, "is_empty") and self.kb.is_empty():
|
||||
raise ValueError(Errors.E139.format(name=self.name))
|
||||
|
||||
def initialize(
|
||||
|
@ -469,18 +474,24 @@ class EntityLinker(TrainablePipe):
|
|||
|
||||
# Looping through each entity in batch (TODO: rewrite)
|
||||
for j, ent in enumerate(ent_batch):
|
||||
sent_index = sentences.index(ent.sent)
|
||||
assert sent_index >= 0
|
||||
assert hasattr(ent, "sents")
|
||||
sents = list(ent.sents)
|
||||
sent_indices = (
|
||||
sentences.index(sents[0]),
|
||||
sentences.index(sents[-1]),
|
||||
)
|
||||
assert sent_indices[1] >= sent_indices[0] >= 0
|
||||
|
||||
if self.incl_context:
|
||||
# get n_neighbour sentences, clipped to the length of the document
|
||||
start_sentence = max(0, sent_index - self.n_sents)
|
||||
start_sentence = max(0, sent_indices[0] - self.n_sents)
|
||||
end_sentence = min(
|
||||
len(sentences) - 1, sent_index + self.n_sents
|
||||
len(sentences) - 1, sent_indices[1] + self.n_sents
|
||||
)
|
||||
start_token = sentences[start_sentence].start
|
||||
end_token = sentences[end_sentence].end
|
||||
sent_doc = doc[start_token:end_token].as_doc()
|
||||
|
||||
# currently, the context is the same for each entity in a sentence (should be refined)
|
||||
sentence_encoding = self.model.predict([sent_doc])[0]
|
||||
sentence_encoding_t = sentence_encoding.T
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any
|
||||
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast, Union
|
||||
from dataclasses import dataclass
|
||||
from functools import partial
|
||||
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
|
||||
from thinc.api import Optimizer
|
||||
from thinc.types import Ragged, Ints2d, Floats2d
|
||||
|
@ -43,7 +45,36 @@ maxout_pieces = 3
|
|||
depth = 4
|
||||
"""
|
||||
|
||||
spancat_singlelabel_default_config = """
|
||||
[model]
|
||||
@architectures = "spacy.SpanCategorizer.v1"
|
||||
scorer = {"@layers": "Softmax.v2"}
|
||||
|
||||
[model.reducer]
|
||||
@layers = spacy.mean_max_reducer.v1
|
||||
hidden_size = 128
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.Tok2Vec.v2"
|
||||
[model.tok2vec.embed]
|
||||
@architectures = "spacy.MultiHashEmbed.v1"
|
||||
width = 96
|
||||
rows = [5000, 1000, 2500, 1000]
|
||||
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
|
||||
include_static_vectors = false
|
||||
|
||||
[model.tok2vec.encode]
|
||||
@architectures = "spacy.MaxoutWindowEncoder.v2"
|
||||
width = ${model.tok2vec.embed.width}
|
||||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
depth = 4
|
||||
"""
|
||||
|
||||
DEFAULT_SPANCAT_MODEL = Config().from_str(spancat_default_config)["model"]
|
||||
DEFAULT_SPANCAT_SINGLELABEL_MODEL = Config().from_str(
|
||||
spancat_singlelabel_default_config
|
||||
)["model"]
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
|
@ -52,39 +83,42 @@ class Suggester(Protocol):
|
|||
...
|
||||
|
||||
|
||||
def ngram_suggester(
|
||||
docs: Iterable[Doc], sizes: List[int], *, ops: Optional[Ops] = None
|
||||
) -> Ragged:
|
||||
if ops is None:
|
||||
ops = get_current_ops()
|
||||
spans = []
|
||||
lengths = []
|
||||
for doc in docs:
|
||||
starts = ops.xp.arange(len(doc), dtype="i")
|
||||
starts = starts.reshape((-1, 1))
|
||||
length = 0
|
||||
for size in sizes:
|
||||
if size <= len(doc):
|
||||
starts_size = starts[: len(doc) - (size - 1)]
|
||||
spans.append(ops.xp.hstack((starts_size, starts_size + size)))
|
||||
length += spans[-1].shape[0]
|
||||
if spans:
|
||||
assert spans[-1].ndim == 2, spans[-1].shape
|
||||
lengths.append(length)
|
||||
lengths_array = ops.asarray1i(lengths)
|
||||
if len(spans) > 0:
|
||||
output = Ragged(ops.xp.vstack(spans), lengths_array)
|
||||
else:
|
||||
output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
|
||||
|
||||
assert output.dataXd.ndim == 2
|
||||
return output
|
||||
|
||||
|
||||
@registry.misc("spacy.ngram_suggester.v1")
|
||||
def build_ngram_suggester(sizes: List[int]) -> Suggester:
|
||||
"""Suggest all spans of the given lengths. Spans are returned as a ragged
|
||||
array of integers. The array has two columns, indicating the start and end
|
||||
position."""
|
||||
|
||||
def ngram_suggester(docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged:
|
||||
if ops is None:
|
||||
ops = get_current_ops()
|
||||
spans = []
|
||||
lengths = []
|
||||
for doc in docs:
|
||||
starts = ops.xp.arange(len(doc), dtype="i")
|
||||
starts = starts.reshape((-1, 1))
|
||||
length = 0
|
||||
for size in sizes:
|
||||
if size <= len(doc):
|
||||
starts_size = starts[: len(doc) - (size - 1)]
|
||||
spans.append(ops.xp.hstack((starts_size, starts_size + size)))
|
||||
length += spans[-1].shape[0]
|
||||
if spans:
|
||||
assert spans[-1].ndim == 2, spans[-1].shape
|
||||
lengths.append(length)
|
||||
lengths_array = ops.asarray1i(lengths)
|
||||
if len(spans) > 0:
|
||||
output = Ragged(ops.xp.vstack(spans), lengths_array)
|
||||
else:
|
||||
output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
|
||||
|
||||
assert output.dataXd.ndim == 2
|
||||
return output
|
||||
|
||||
return ngram_suggester
|
||||
return partial(ngram_suggester, sizes=sizes)
|
||||
|
||||
|
||||
@registry.misc("spacy.ngram_range_suggester.v1")
|
||||
|
@ -119,10 +153,14 @@ def make_spancat(
|
|||
threshold: float,
|
||||
max_positive: Optional[int],
|
||||
) -> "SpanCategorizer":
|
||||
"""Create a SpanCategorizer component. The span categorizer consists of two
|
||||
"""Create a SpanCategorizer component and configure it for multi-label
|
||||
classification to be able to assign multiple labels for each span.
|
||||
The span categorizer consists of two
|
||||
parts: a suggester function that proposes candidate spans, and a labeller
|
||||
model that predicts one or more labels for each span.
|
||||
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
|
||||
Spans are returned as a ragged array with two integer columns, for the
|
||||
start and end positions.
|
||||
|
@ -144,12 +182,80 @@ def make_spancat(
|
|||
"""
|
||||
return SpanCategorizer(
|
||||
nlp.vocab,
|
||||
suggester=suggester,
|
||||
model=model,
|
||||
spans_key=spans_key,
|
||||
threshold=threshold,
|
||||
max_positive=max_positive,
|
||||
suggester=suggester,
|
||||
name=name,
|
||||
spans_key=spans_key,
|
||||
negative_weight=None,
|
||||
allow_overlap=True,
|
||||
max_positive=max_positive,
|
||||
threshold=threshold,
|
||||
scorer=scorer,
|
||||
add_negative_label=False,
|
||||
)
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"spancat_singlelabel",
|
||||
assigns=["doc.spans"],
|
||||
default_config={
|
||||
"spans_key": "sc",
|
||||
"model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
|
||||
"negative_weight": 1.0,
|
||||
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
|
||||
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
|
||||
"allow_overlap": True,
|
||||
},
|
||||
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
|
||||
)
|
||||
def make_spancat_singlelabel(
|
||||
nlp: Language,
|
||||
name: str,
|
||||
suggester: Suggester,
|
||||
model: Model[Tuple[List[Doc], Ragged], Floats2d],
|
||||
spans_key: str,
|
||||
negative_weight: float,
|
||||
allow_overlap: bool,
|
||||
scorer: Optional[Callable],
|
||||
) -> "SpanCategorizer":
|
||||
"""Create a SpanCategorizer component and configure it for multi-class
|
||||
classification. With this configuration each span can get at most one
|
||||
label. The span categorizer consists of two
|
||||
parts: a suggester function that proposes candidate spans, and a labeller
|
||||
model that predicts at most one label for each span.
|
||||
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
|
||||
Spans are returned as a ragged array with two integer columns, for the
|
||||
start and end positions.
|
||||
model (Model[Tuple[List[Doc], Ragged], Floats2d]): A model instance that
|
||||
is given a list of documents and (start, end) indices representing
|
||||
candidate span offsets. The model predicts a probability for each category
|
||||
for each span.
|
||||
spans_key (str): Key of the doc.spans dict to save the spans under. During
|
||||
initialization and training, the component will look for spans on the
|
||||
reference document under the same key.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
|
||||
spans allowed.
|
||||
negative_weight (float): Multiplier for the loss terms.
|
||||
Can be used to downweight the negative samples if there are too many.
|
||||
allow_overlap (bool): If True the data is assumed to contain overlapping spans.
|
||||
Otherwise it produces non-overlapping spans greedily prioritizing
|
||||
higher assigned label scores.
|
||||
"""
|
||||
return SpanCategorizer(
|
||||
nlp.vocab,
|
||||
model=model,
|
||||
suggester=suggester,
|
||||
name=name,
|
||||
spans_key=spans_key,
|
||||
negative_weight=negative_weight,
|
||||
allow_overlap=allow_overlap,
|
||||
max_positive=1,
|
||||
add_negative_label=True,
|
||||
threshold=None,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
@ -172,6 +278,27 @@ def make_spancat_scorer():
|
|||
return spancat_score
|
||||
|
||||
|
||||
@dataclass
|
||||
class _Intervals:
|
||||
"""
|
||||
Helper class to avoid storing overlapping spans.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.ranges = set()
|
||||
|
||||
def add(self, i, j):
|
||||
for e in range(i, j):
|
||||
self.ranges.add(e)
|
||||
|
||||
def __contains__(self, rang):
|
||||
i, j = rang
|
||||
for e in range(i, j):
|
||||
if e in self.ranges:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
class SpanCategorizer(TrainablePipe):
|
||||
"""Pipeline component to label spans of text.
|
||||
|
||||
|
@ -185,25 +312,43 @@ class SpanCategorizer(TrainablePipe):
|
|||
suggester: Suggester,
|
||||
name: str = "spancat",
|
||||
*,
|
||||
add_negative_label: bool = False,
|
||||
spans_key: str = "spans",
|
||||
threshold: float = 0.5,
|
||||
negative_weight: Optional[float] = 1.0,
|
||||
allow_overlap: Optional[bool] = True,
|
||||
max_positive: Optional[int] = None,
|
||||
threshold: Optional[float] = 0.5,
|
||||
scorer: Optional[Callable] = spancat_score,
|
||||
) -> None:
|
||||
"""Initialize the span categorizer.
|
||||
"""Initialize the multi-label or multi-class span categorizer.
|
||||
|
||||
vocab (Vocab): The shared vocabulary.
|
||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
For multi-class classification (single label per span) we recommend
|
||||
using a Softmax classifier as the final layer, while for multi-label
|
||||
classification (multiple possible labels per span) we recommend Logistic.
|
||||
suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
|
||||
Spans are returned as a ragged array with two integer columns, for the
|
||||
start and end positions.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
spans_key (str): Key of the Doc.spans dict to save the spans under.
|
||||
During initialization and training, the component will look for
|
||||
spans on the reference document under the same key. Defaults to
|
||||
`"spans"`.
|
||||
threshold (float): Minimum probability to consider a prediction
|
||||
positive. Spans with a positive prediction will be saved on the Doc.
|
||||
Defaults to 0.5.
|
||||
add_negative_label (bool): Learn to predict a special 'negative_label'
|
||||
when a Span is not annotated.
|
||||
threshold (Optional[float]): Minimum probability to consider a prediction
|
||||
positive. Defaults to 0.5. Spans with a positive prediction will be saved
|
||||
on the Doc.
|
||||
max_positive (Optional[int]): Maximum number of labels to consider
|
||||
positive per span. Defaults to None, indicating no limit.
|
||||
negative_weight (float): Multiplier for the loss terms.
|
||||
Can be used to downweight the negative samples if there are too many
|
||||
when add_negative_label is True. Otherwise it's unused.
|
||||
allow_overlap (bool): If True the data is assumed to contain overlapping spans.
|
||||
Otherwise it produces non-overlapping spans greedily prioritizing
|
||||
higher assigned label scores. Only used when max_positive is 1.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
|
||||
spans allowed.
|
||||
|
@ -215,12 +360,17 @@ class SpanCategorizer(TrainablePipe):
|
|||
"spans_key": spans_key,
|
||||
"threshold": threshold,
|
||||
"max_positive": max_positive,
|
||||
"negative_weight": negative_weight,
|
||||
"allow_overlap": allow_overlap,
|
||||
}
|
||||
self.vocab = vocab
|
||||
self.suggester = suggester
|
||||
self.model = model
|
||||
self.name = name
|
||||
self.scorer = scorer
|
||||
self.add_negative_label = add_negative_label
|
||||
if not allow_overlap and max_positive is not None and max_positive > 1:
|
||||
raise ValueError(Errors.E1051.format(max_positive=max_positive))
|
||||
|
||||
@property
|
||||
def key(self) -> str:
|
||||
|
@ -230,6 +380,21 @@ class SpanCategorizer(TrainablePipe):
|
|||
"""
|
||||
return str(self.cfg["spans_key"])
|
||||
|
||||
def _allow_extra_label(self) -> None:
|
||||
"""Raise an error if the component can not add any more labels."""
|
||||
nO = None
|
||||
if self.model.has_dim("nO"):
|
||||
nO = self.model.get_dim("nO")
|
||||
elif self.model.has_ref("output_layer") and self.model.get_ref(
|
||||
"output_layer"
|
||||
).has_dim("nO"):
|
||||
nO = self.model.get_ref("output_layer").get_dim("nO")
|
||||
if nO is not None and nO == self._n_labels:
|
||||
if not self.is_resizable:
|
||||
raise ValueError(
|
||||
Errors.E922.format(name=self.name, nO=self.model.get_dim("nO"))
|
||||
)
|
||||
|
||||
def add_label(self, label: str) -> int:
|
||||
"""Add a new label to the pipe.
|
||||
|
||||
|
@ -263,6 +428,27 @@ class SpanCategorizer(TrainablePipe):
|
|||
"""
|
||||
return list(self.labels)
|
||||
|
||||
@property
|
||||
def _label_map(self) -> Dict[str, int]:
|
||||
"""RETURNS (Dict[str, int]): The label map."""
|
||||
return {label: i for i, label in enumerate(self.labels)}
|
||||
|
||||
@property
|
||||
def _n_labels(self) -> int:
|
||||
"""RETURNS (int): Number of labels."""
|
||||
if self.add_negative_label:
|
||||
return len(self.labels) + 1
|
||||
else:
|
||||
return len(self.labels)
|
||||
|
||||
@property
|
||||
def _negative_label_i(self) -> Union[int, None]:
|
||||
"""RETURNS (Union[int, None]): Index of the negative label."""
|
||||
if self.add_negative_label:
|
||||
return len(self.label_data)
|
||||
else:
|
||||
return None
|
||||
|
||||
def predict(self, docs: Iterable[Doc]):
|
||||
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||
|
||||
|
@ -304,14 +490,24 @@ class SpanCategorizer(TrainablePipe):
|
|||
|
||||
DOCS: https://spacy.io/api/spancategorizer#set_annotations
|
||||
"""
|
||||
labels = self.labels
|
||||
indices, scores = indices_scores
|
||||
offset = 0
|
||||
for i, doc in enumerate(docs):
|
||||
indices_i = indices[i].dataXd
|
||||
doc.spans[self.key] = self._make_span_group(
|
||||
doc, indices_i, scores[offset : offset + indices.lengths[i]], labels # type: ignore[arg-type]
|
||||
)
|
||||
allow_overlap = cast(bool, self.cfg["allow_overlap"])
|
||||
if self.cfg["max_positive"] == 1:
|
||||
doc.spans[self.key] = self._make_span_group_singlelabel(
|
||||
doc,
|
||||
indices_i,
|
||||
scores[offset : offset + indices.lengths[i]],
|
||||
allow_overlap,
|
||||
)
|
||||
else:
|
||||
doc.spans[self.key] = self._make_span_group_multilabel(
|
||||
doc,
|
||||
indices_i,
|
||||
scores[offset : offset + indices.lengths[i]],
|
||||
)
|
||||
offset += indices.lengths[i]
|
||||
|
||||
def update(
|
||||
|
@ -371,9 +567,11 @@ class SpanCategorizer(TrainablePipe):
|
|||
spans = Ragged(
|
||||
self.model.ops.to_numpy(spans.data), self.model.ops.to_numpy(spans.lengths)
|
||||
)
|
||||
label_map = {label: i for i, label in enumerate(self.labels)}
|
||||
target = numpy.zeros(scores.shape, dtype=scores.dtype)
|
||||
if self.add_negative_label:
|
||||
negative_spans = numpy.ones((scores.shape[0]))
|
||||
offset = 0
|
||||
label_map = self._label_map
|
||||
for i, eg in enumerate(examples):
|
||||
# Map (start, end) offset of spans to the row in the d_scores array,
|
||||
# so that we can adjust the gradient for predictions that were
|
||||
|
@ -390,10 +588,16 @@ class SpanCategorizer(TrainablePipe):
|
|||
row = spans_index[key]
|
||||
k = label_map[gold_span.label_]
|
||||
target[row, k] = 1.0
|
||||
if self.add_negative_label:
|
||||
# delete negative label target.
|
||||
negative_spans[row] = 0.0
|
||||
# The target is a flat array for all docs. Track the position
|
||||
# we're at within the flat array.
|
||||
offset += spans.lengths[i]
|
||||
target = self.model.ops.asarray(target, dtype="f") # type: ignore
|
||||
if self.add_negative_label:
|
||||
negative_samples = numpy.nonzero(negative_spans)[0]
|
||||
target[negative_samples, self._negative_label_i] = 1.0 # type: ignore
|
||||
# The target will have the values 0 (for untrue predictions) or 1
|
||||
# (for true predictions).
|
||||
# The scores should be in the range [0, 1].
|
||||
|
@ -402,6 +606,10 @@ class SpanCategorizer(TrainablePipe):
|
|||
# If the prediction is 0.9 and it's false, the gradient will be
|
||||
# 0.9 (0.9 - 0.0)
|
||||
d_scores = scores - target
|
||||
if self.add_negative_label:
|
||||
neg_weight = cast(float, self.cfg["negative_weight"])
|
||||
if neg_weight != 1.0:
|
||||
d_scores[negative_samples] *= neg_weight
|
||||
loss = float((d_scores**2).sum())
|
||||
return loss, d_scores
|
||||
|
||||
|
@ -438,7 +646,7 @@ class SpanCategorizer(TrainablePipe):
|
|||
if subbatch:
|
||||
docs = [eg.x for eg in subbatch]
|
||||
spans = build_ngram_suggester(sizes=[1])(docs)
|
||||
Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels))
|
||||
Y = self.model.ops.alloc2f(spans.dataXd.shape[0], self._n_labels)
|
||||
self.model.initialize(X=(docs, spans), Y=Y)
|
||||
else:
|
||||
self.model.initialize()
|
||||
|
@ -452,31 +660,98 @@ class SpanCategorizer(TrainablePipe):
|
|||
eg.reference.spans.get(self.key, []), allow_overlap=True
|
||||
)
|
||||
|
||||
def _make_span_group(
|
||||
self, doc: Doc, indices: Ints2d, scores: Floats2d, labels: List[str]
|
||||
def _make_span_group_multilabel(
|
||||
self,
|
||||
doc: Doc,
|
||||
indices: Ints2d,
|
||||
scores: Floats2d,
|
||||
) -> SpanGroup:
|
||||
"""Find the top-k labels for each span (k=max_positive)."""
|
||||
spans = SpanGroup(doc, name=self.key)
|
||||
max_positive = self.cfg["max_positive"]
|
||||
if scores.size == 0:
|
||||
return spans
|
||||
scores = self.model.ops.to_numpy(scores)
|
||||
indices = self.model.ops.to_numpy(indices)
|
||||
threshold = self.cfg["threshold"]
|
||||
max_positive = self.cfg["max_positive"]
|
||||
|
||||
keeps = scores >= threshold
|
||||
ranked = (scores * -1).argsort() # type: ignore
|
||||
if max_positive is not None:
|
||||
assert isinstance(max_positive, int)
|
||||
if self.add_negative_label:
|
||||
negative_scores = numpy.copy(scores[:, self._negative_label_i])
|
||||
scores[:, self._negative_label_i] = -numpy.inf
|
||||
ranked = (scores * -1).argsort() # type: ignore
|
||||
scores[:, self._negative_label_i] = negative_scores
|
||||
else:
|
||||
ranked = (scores * -1).argsort() # type: ignore
|
||||
span_filter = ranked[:, max_positive:]
|
||||
for i, row in enumerate(span_filter):
|
||||
keeps[i, row] = False
|
||||
spans.attrs["scores"] = scores[keeps].flatten()
|
||||
|
||||
indices = self.model.ops.to_numpy(indices)
|
||||
keeps = self.model.ops.to_numpy(keeps)
|
||||
|
||||
attrs_scores = []
|
||||
for i in range(indices.shape[0]):
|
||||
start = indices[i, 0]
|
||||
end = indices[i, 1]
|
||||
|
||||
for j, keep in enumerate(keeps[i]):
|
||||
if keep:
|
||||
spans.append(Span(doc, start, end, label=labels[j]))
|
||||
|
||||
if j != self._negative_label_i:
|
||||
spans.append(Span(doc, start, end, label=self.labels[j]))
|
||||
attrs_scores.append(scores[i, j])
|
||||
spans.attrs["scores"] = numpy.array(attrs_scores)
|
||||
return spans
|
||||
|
||||
def _make_span_group_singlelabel(
    self,
    doc: Doc,
    indices: Ints2d,
    scores: Floats2d,
    allow_overlap: bool = True,
) -> SpanGroup:
    """Find the argmax label for each span.

    doc (Doc): The document the spans belong to.
    indices (Ints2d): One row per candidate span; column 0 is read as the
        start and column 1 as the end.
    scores (Floats2d): One row of label scores per candidate span.
    allow_overlap (bool): If False, candidates are visited from highest to
        lowest argmax score and a span is skipped when `_Intervals` reports
        it as already seen (presumably an overlap check -- confirm against
        `_Intervals`).
    RETURNS (SpanGroup): The kept spans; their argmax scores are stored
        under `attrs["scores"]`.
    """
    # Handle cases when there are zero suggestions
    if scores.size == 0:
        return SpanGroup(doc, name=self.key)
    scores = self.model.ops.to_numpy(scores)
    indices = self.model.ops.to_numpy(indices)
    best_label = scores.argmax(axis=1)
    best_score = numpy.take_along_axis(
        scores, numpy.expand_dims(best_label, 1), axis=1
    )
    keep_mask = numpy.ones(best_label.shape, dtype=bool)
    # Remove samples where the negative label is the argmax.
    if self.add_negative_label:
        keep_mask = numpy.logical_and(
            keep_mask, best_label != self._negative_label_i
        )
    # Filter samples according to threshold.
    min_score = self.cfg["threshold"]
    if min_score is not None:
        keep_mask = numpy.logical_and(
            keep_mask, (best_score >= min_score).squeeze()
        )
    # Sort spans according to argmax probability (highest first), so that
    # the best-scoring span is the one that claims its interval.
    if not allow_overlap:
        order = (best_score.squeeze() * -1).argsort()
        best_score = best_score[order]
        best_label = best_label[order]
        indices = indices[order]
        keep_mask = keep_mask[order]
    occupied = _Intervals()
    group = SpanGroup(doc, name=self.key)
    kept_scores = []
    for row in range(indices.shape[0]):
        if not keep_mask[row]:
            continue

        label = best_label[row]
        start = indices[row, 0]
        end = indices[row, 1]

        if not allow_overlap:
            if (start, end) in occupied:
                continue
            occupied.add(start, end)
        kept_scores.append(best_score[row])
        group.append(Span(doc, start, end, label=self.labels[label]))

    group.attrs["scores"] = numpy.array(kept_scores)
    return group
|
||||
|
|
|
@ -33,6 +33,8 @@ def test_token_morph_key(i_has):
|
|||
def test_morph_props(i_has):
|
||||
assert i_has[0].morph.get("PronType") == ["prs"]
|
||||
assert i_has[1].morph.get("PronType") == []
|
||||
assert i_has[1].morph.get("AsdfType", ["asdf"]) == ["asdf"]
|
||||
assert i_has[1].morph.get("AsdfType", default=["asdf", "qwer"]) == ["asdf", "qwer"]
|
||||
|
||||
|
||||
def test_morph_iter(i_has):
|
||||
|
|
|
@ -700,3 +700,34 @@ def test_span_group_copy(doc):
|
|||
assert len(doc.spans["test"]) == 3
|
||||
# check that the copy spans were not modified and this is an isolated doc
|
||||
assert len(doc_copy.spans["test"]) == 2
|
||||
|
||||
|
||||
def test_for_partial_ent_sents():
    """Spans may be associated with multiple sentences. These .sents should always be complete, not partial, sentences,
    which this tests for.
    """
    doc = Doc(
        English().vocab,
        words=["Mahler's", "Symphony", "No.", "8", "was", "beautiful."],
        sent_starts=[1, 0, 0, 1, 0, 0],
    )
    doc.set_ents([Span(doc, 1, 4, "WORK")])
    # The specified entity is associated with both sentences in this doc, so we expect all sentences in the doc to be
    # equal to the sentences referenced in ent.sents.
    doc_sents = list(doc.sents)
    ent_sents = list(doc.ents[0].sents)
    # zip() stops at the shorter input, so compare the counts first; otherwise
    # an entity yielding too few (or no) sentences would pass unnoticed.
    assert len(ent_sents) == len(doc_sents)
    for doc_sent, ent_sent in zip(doc_sents, ent_sents):
        assert doc_sent == ent_sent
|
||||
|
||||
|
||||
def test_for_no_ent_sents():
    """Span.sents() should set .sents correctly, even if Span in question is trailing and doesn't form a full
    sentence.
    """
    words = ["This", "is", "a", "test.", "ENTITY"]
    doc = Doc(English().vocab, words=words, sent_starts=[1, 0, 0, 0, 1])
    doc.set_ents([Span(doc, 4, 5, "WORK")])
    entity = doc.ents[0]
    entity_sents = list(entity.sents)
    # The trailing one-token entity must yield exactly its own sentence.
    assert len(entity_sents) == 1
    assert str(entity_sents[0]) == str(entity.sent) == "ENTITY"
|
||||
|
|
|
@ -316,16 +316,32 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
|
|||
("the", "brown", "$--", 0),
|
||||
("brown", "the", "$--", 1),
|
||||
("brown", "brown", "$--", 0),
|
||||
("over", "jumped", "<+", 0),
|
||||
("quick", "fox", "<+", 0),
|
||||
("the", "quick", "<+", 0),
|
||||
("brown", "fox", "<+", 1),
|
||||
("quick", "fox", "<++", 1),
|
||||
("quick", "over", "<++", 0),
|
||||
("over", "jumped", "<++", 0),
|
||||
("the", "fox", "<++", 2),
|
||||
("brown", "fox", "<-", 0),
|
||||
("fox", "over", "<-", 0),
|
||||
("the", "over", "<-", 0),
|
||||
("over", "jumped", "<-", 1),
|
||||
("brown", "fox", "<--", 0),
|
||||
("fox", "jumped", "<--", 0),
|
||||
("fox", "over", "<--", 1),
|
||||
("fox", "brown", ">+", 0),
|
||||
("over", "fox", ">+", 0),
|
||||
("over", "the", ">+", 0),
|
||||
("jumped", "over", ">+", 1),
|
||||
("jumped", "over", ">++", 1),
|
||||
("fox", "lazy", ">++", 0),
|
||||
("over", "the", ">++", 0),
|
||||
("jumped", "over", ">-", 0),
|
||||
("fox", "quick", ">-", 0),
|
||||
("brown", "quick", ">-", 0),
|
||||
("fox", "brown", ">-", 1),
|
||||
("brown", "fox", ">--", 0),
|
||||
("fox", "brown", ">--", 1),
|
||||
("jumped", "fox", ">--", 1),
|
||||
|
|
|
@ -9,6 +9,8 @@ from spacy.lang.en import English
|
|||
from spacy.lang.it import Italian
|
||||
from spacy.language import Language
|
||||
from spacy.lookups import Lookups
|
||||
from spacy.pipeline import EntityRecognizer
|
||||
from spacy.pipeline.ner import DEFAULT_NER_MODEL
|
||||
from spacy.pipeline._parser_internals.ner import BiluoPushDown
|
||||
from spacy.training import Example, iob_to_biluo, split_bilu_label
|
||||
from spacy.tokens import Doc, Span
|
||||
|
@ -16,8 +18,6 @@ from spacy.vocab import Vocab
|
|||
import logging
|
||||
|
||||
from ..util import make_tempdir
|
||||
from ...pipeline import EntityRecognizer
|
||||
from ...pipeline.ner import DEFAULT_NER_MODEL
|
||||
|
||||
TRAIN_DATA = [
|
||||
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
|
||||
|
|
|
@ -8,11 +8,11 @@ from spacy.lang.en import English
|
|||
from spacy.tokens import Doc
|
||||
from spacy.training import Example
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.pipeline import DependencyParser
|
||||
from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
|
||||
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
||||
|
||||
from ...pipeline import DependencyParser
|
||||
from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL
|
||||
from ..util import apply_transition_sequence, make_tempdir
|
||||
from ...pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
||||
|
||||
TRAIN_DATA = [
|
||||
(
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
from typing import Callable, Iterable, Dict, Any
|
||||
from typing import Callable, Iterable, Dict, Any, Tuple
|
||||
|
||||
import pytest
|
||||
from numpy.testing import assert_equal
|
||||
|
||||
from spacy import registry, util
|
||||
from spacy import registry, util, Language
|
||||
from spacy.attrs import ENT_KB_ID
|
||||
from spacy.compat import pickle
|
||||
from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
|
||||
|
@ -108,18 +108,23 @@ def test_issue7065():
|
|||
|
||||
|
||||
@pytest.mark.issue(7065)
|
||||
def test_issue7065_b():
|
||||
@pytest.mark.parametrize("entity_in_first_sentence", [True, False])
|
||||
def test_sentence_crossing_ents(entity_in_first_sentence: bool):
|
||||
"""Tests if NEL crashes if entities cross sentence boundaries and the first associated sentence doesn't have an
|
||||
entity.
|
||||
entity_in_first_sentence (bool): Whether to include an entity in the first sentence associated with the
|
||||
sentence-crossing entity.
|
||||
"""
|
||||
# Test that the NEL doesn't crash when an entity crosses a sentence boundary
|
||||
nlp = English()
|
||||
vector_length = 3
|
||||
nlp.add_pipe("sentencizer")
|
||||
text = "Mahler 's Symphony No. 8 was beautiful."
|
||||
entities = [(0, 6, "PERSON"), (10, 24, "WORK")]
|
||||
links = {
|
||||
(0, 6): {"Q7304": 1.0, "Q270853": 0.0},
|
||||
(10, 24): {"Q7304": 0.0, "Q270853": 1.0},
|
||||
}
|
||||
sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
|
||||
entities = [(10, 24, "WORK")]
|
||||
links = {(10, 24): {"Q7304": 0.0, "Q270853": 1.0}}
|
||||
if entity_in_first_sentence:
|
||||
entities.append((0, 6, "PERSON"))
|
||||
links[(0, 6)] = {"Q7304": 1.0, "Q270853": 0.0}
|
||||
sent_starts = [1, -1, 0, 0, 0, 1, 0, 0, 0]
|
||||
doc = nlp(text)
|
||||
example = Example.from_dict(
|
||||
doc, {"entities": entities, "links": links, "sent_starts": sent_starts}
|
||||
|
@ -145,31 +150,14 @@ def test_issue7065_b():
|
|||
|
||||
# Create the Entity Linker component and add it to the pipeline
|
||||
entity_linker = nlp.add_pipe("entity_linker", last=True)
|
||||
entity_linker.set_kb(create_kb)
|
||||
entity_linker.set_kb(create_kb) # type: ignore
|
||||
# train the NEL pipe
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
for i in range(2):
|
||||
losses = {}
|
||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||
nlp.update(train_examples, sgd=optimizer)
|
||||
|
||||
# Add a custom rule-based component to mimic NER
|
||||
patterns = [
|
||||
{"label": "PERSON", "pattern": [{"LOWER": "mahler"}]},
|
||||
{
|
||||
"label": "WORK",
|
||||
"pattern": [
|
||||
{"LOWER": "symphony"},
|
||||
{"LOWER": "no"},
|
||||
{"LOWER": "."},
|
||||
{"LOWER": "8"},
|
||||
],
|
||||
},
|
||||
]
|
||||
ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
|
||||
ruler.add_patterns(patterns)
|
||||
# test the trained model - this should not throw E148
|
||||
doc = nlp(text)
|
||||
assert doc
|
||||
# This shouldn't crash.
|
||||
entity_linker.predict([example.reference]) # type: ignore
|
||||
|
||||
|
||||
def test_no_entities():
|
||||
|
@ -353,6 +341,9 @@ def test_kb_default(nlp):
|
|||
"""Test that the default (empty) KB is loaded upon construction"""
|
||||
entity_linker = nlp.add_pipe("entity_linker", config={})
|
||||
assert len(entity_linker.kb) == 0
|
||||
with pytest.raises(ValueError, match="E139"):
|
||||
# this raises an error because the KB is empty
|
||||
entity_linker.validate_kb()
|
||||
assert entity_linker.kb.get_size_entities() == 0
|
||||
assert entity_linker.kb.get_size_aliases() == 0
|
||||
# 64 is the default value from pipeline.entity_linker
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import pytest
|
||||
import numpy
|
||||
from numpy.testing import assert_array_equal, assert_almost_equal
|
||||
from thinc.api import get_current_ops, Ragged
|
||||
from thinc.api import get_current_ops, NumpyOps, Ragged
|
||||
|
||||
from spacy import util
|
||||
from spacy.lang.en import English
|
||||
|
@ -15,6 +15,8 @@ OPS = get_current_ops()
|
|||
|
||||
SPAN_KEY = "labeled_spans"
|
||||
|
||||
SPANCAT_COMPONENTS = ["spancat", "spancat_singlelabel"]
|
||||
|
||||
TRAIN_DATA = [
|
||||
("Who is Shaka Khan?", {"spans": {SPAN_KEY: [(7, 17, "PERSON")]}}),
|
||||
(
|
||||
|
@ -41,38 +43,42 @@ def make_examples(nlp, data=TRAIN_DATA):
|
|||
return train_examples
|
||||
|
||||
|
||||
def test_no_label():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_no_label(name):
|
||||
nlp = Language()
|
||||
nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
||||
nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||
with pytest.raises(ValueError):
|
||||
nlp.initialize()
|
||||
|
||||
|
||||
def test_no_resize():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_no_resize(name):
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
||||
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||
spancat.add_label("Thing")
|
||||
spancat.add_label("Phrase")
|
||||
assert spancat.labels == ("Thing", "Phrase")
|
||||
nlp.initialize()
|
||||
assert spancat.model.get_dim("nO") == 2
|
||||
assert spancat.model.get_dim("nO") == spancat._n_labels
|
||||
# this throws an error because the spancat can't be resized after initialization
|
||||
with pytest.raises(ValueError):
|
||||
spancat.add_label("Stuff")
|
||||
|
||||
|
||||
def test_implicit_labels():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_implicit_labels(name):
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
||||
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||
assert len(spancat.labels) == 0
|
||||
train_examples = make_examples(nlp)
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
assert spancat.labels == ("PERSON", "LOC")
|
||||
|
||||
|
||||
def test_explicit_labels():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_explicit_labels(name):
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
||||
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||
assert len(spancat.labels) == 0
|
||||
spancat.add_label("PERSON")
|
||||
spancat.add_label("LOC")
|
||||
|
@ -102,13 +108,13 @@ def test_doc_gc():
|
|||
# XXX This fails with length 0 sometimes
|
||||
assert len(spangroup) > 0
|
||||
with pytest.raises(RuntimeError):
|
||||
span = spangroup[0]
|
||||
spangroup[0]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"max_positive,nr_results", [(None, 4), (1, 2), (2, 3), (3, 4), (4, 4)]
|
||||
)
|
||||
def test_make_spangroup(max_positive, nr_results):
|
||||
def test_make_spangroup_multilabel(max_positive, nr_results):
|
||||
fix_random_seed(0)
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe(
|
||||
|
@ -120,10 +126,12 @@ def test_make_spangroup(max_positive, nr_results):
|
|||
indices = ngram_suggester([doc])[0].dataXd
|
||||
assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
|
||||
labels = ["Thing", "City", "Person", "GreatCity"]
|
||||
for label in labels:
|
||||
spancat.add_label(label)
|
||||
scores = numpy.asarray(
|
||||
[[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"
|
||||
)
|
||||
spangroup = spancat._make_span_group(doc, indices, scores, labels)
|
||||
spangroup = spancat._make_span_group_multilabel(doc, indices, scores)
|
||||
assert len(spangroup) == nr_results
|
||||
|
||||
# first span is always the second token "London"
|
||||
|
@ -154,6 +162,130 @@ def test_make_spangroup(max_positive, nr_results):
|
|||
assert_almost_equal(0.9, spangroup.attrs["scores"][-1], 5)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "threshold,allow_overlap,nr_results",
    [(0.05, True, 3), (0.05, False, 1), (0.5, True, 2), (0.5, False, 1)],
)
def test_make_spangroup_singlelabel(threshold, allow_overlap, nr_results):
    """Check single-label (argmax) span decoding for each threshold/overlap combination.

    threshold (float): Value passed as the component's "threshold" setting.
    allow_overlap (bool): Forwarded to `_make_span_group_singlelabel`.
    nr_results (int): Number of spans expected in the resulting group.
    """
    fix_random_seed(0)
    nlp = Language()
    spancat = nlp.add_pipe(
        "spancat",
        config={
            "spans_key": SPAN_KEY,
            "threshold": threshold,
            "max_positive": 1,
        },
    )
    doc = nlp.make_doc("Greater London")
    ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2])
    indices = ngram_suggester([doc])[0].dataXd
    assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
    labels = ["Thing", "City", "Person", "GreatCity"]
    for label in labels:
        spancat.add_label(label)
    scores = numpy.asarray(
        [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"
    )
    spangroup = spancat._make_span_group_singlelabel(
        doc, indices, scores, allow_overlap
    )
    # Pin the number of kept spans so the parametrized expectation is enforced.
    assert len(spangroup) == nr_results
    # Scores are float32, so they are compared with a tolerance rather than
    # with exact equality.
    if threshold > 0.4:
        if allow_overlap:
            assert spangroup[0].text == "London"
            assert spangroup[0].label_ == "City"
            assert_almost_equal(0.6, spangroup.attrs["scores"][0], 5)
            assert spangroup[1].text == "Greater London"
            assert spangroup[1].label_ == "GreatCity"
            assert_almost_equal(0.9, spangroup.attrs["scores"][1], 5)
        else:
            assert spangroup[0].text == "Greater London"
            assert spangroup[0].label_ == "GreatCity"
            assert_almost_equal(0.9, spangroup.attrs["scores"][0], 5)
    else:
        if allow_overlap:
            assert spangroup[0].text == "Greater"
            assert spangroup[0].label_ == "City"
            assert spangroup[1].text == "London"
            assert spangroup[1].label_ == "City"
            assert spangroup[2].text == "Greater London"
            assert spangroup[2].label_ == "GreatCity"
        else:
            assert spangroup[0].text == "Greater London"
|
||||
|
||||
|
||||
def test_make_spangroup_negative_label():
    """Check that the negative label never shows up in decoded span groups.

    The score matrix has five columns: four for the added labels plus a last
    one for the negative label. The test runs both the single-label and the
    multi-label decoder on it.
    """
    fix_random_seed(0)
    nlp_single = Language()
    nlp_multi = Language()
    spancat_single = nlp_single.add_pipe(
        "spancat",
        config={
            "spans_key": SPAN_KEY,
            "threshold": 0.1,
            "max_positive": 1,
        },
    )
    spancat_multi = nlp_multi.add_pipe(
        "spancat",
        config={
            "spans_key": SPAN_KEY,
            "threshold": 0.1,
            "max_positive": 2,
        },
    )
    spancat_single.add_negative_label = True
    spancat_multi.add_negative_label = True
    doc = nlp_single.make_doc("Greater London")
    labels = ["Thing", "City", "Person", "GreatCity"]
    for label in labels:
        spancat_multi.add_label(label)
        spancat_single.add_label(label)
    ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2])
    indices = ngram_suggester([doc])[0].dataXd
    assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
    scores = numpy.asarray(
        [
            [0.2, 0.4, 0.3, 0.1, 0.1],
            [0.1, 0.6, 0.2, 0.4, 0.9],
            [0.8, 0.7, 0.3, 0.9, 0.1],
        ],
        dtype="f",
    )
    spangroup_multi = spancat_multi._make_span_group_multilabel(doc, indices, scores)
    spangroup_single = spancat_single._make_span_group_singlelabel(doc, indices, scores)

    # Single-label: the second row ("London") has the negative label as its
    # argmax, so only two spans remain.
    assert len(spangroup_single) == 2
    assert spangroup_single[0].text == "Greater"
    assert spangroup_single[0].label_ == "City"
    assert_almost_equal(0.4, spangroup_single.attrs["scores"][0], 5)
    assert spangroup_single[1].text == "Greater London"
    assert spangroup_single[1].label_ == "GreatCity"
    # float32 scores are compared with a tolerance, not exact equality.
    assert_almost_equal(0.9, spangroup_single.attrs["scores"][1], 5)

    # Multi-label: the two best non-negative labels are kept for each span.
    assert len(spangroup_multi) == 6
    assert spangroup_multi[0].text == "Greater"
    assert spangroup_multi[0].label_ == "City"
    assert_almost_equal(0.4, spangroup_multi.attrs["scores"][0], 5)
    assert spangroup_multi[1].text == "Greater"
    assert spangroup_multi[1].label_ == "Person"
    assert_almost_equal(0.3, spangroup_multi.attrs["scores"][1], 5)
    assert spangroup_multi[2].text == "London"
    assert spangroup_multi[2].label_ == "City"
    assert_almost_equal(0.6, spangroup_multi.attrs["scores"][2], 5)
    assert spangroup_multi[3].text == "London"
    assert spangroup_multi[3].label_ == "GreatCity"
    assert_almost_equal(0.4, spangroup_multi.attrs["scores"][3], 5)
    assert spangroup_multi[4].text == "Greater London"
    assert spangroup_multi[4].label_ == "Thing"
    assert_almost_equal(0.8, spangroup_multi.attrs["scores"][4], 5)
    assert spangroup_multi[5].text == "Greater London"
    assert spangroup_multi[5].label_ == "GreatCity"
    assert_almost_equal(0.9, spangroup_multi.attrs["scores"][5], 5)
|
||||
|
||||
|
||||
def test_ngram_suggester(en_tokenizer):
|
||||
# test different n-gram lengths
|
||||
for size in [1, 2, 3]:
|
||||
|
@ -371,9 +503,9 @@ def test_overfitting_IO_overlapping():
|
|||
assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"}
|
||||
|
||||
|
||||
def test_zero_suggestions():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_zero_suggestions(name):
|
||||
# Test with a suggester that can return 0 suggestions
|
||||
|
||||
@registry.misc("test_mixed_zero_suggester")
|
||||
def make_mixed_zero_suggester():
|
||||
def mixed_zero_suggester(docs, *, ops=None):
|
||||
|
@ -400,7 +532,7 @@ def test_zero_suggestions():
|
|||
fix_random_seed(0)
|
||||
nlp = English()
|
||||
spancat = nlp.add_pipe(
|
||||
"spancat",
|
||||
name,
|
||||
config={
|
||||
"suggester": {"@misc": "test_mixed_zero_suggester"},
|
||||
"spans_key": SPAN_KEY,
|
||||
|
@ -408,7 +540,7 @@ def test_zero_suggestions():
|
|||
)
|
||||
train_examples = make_examples(nlp)
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
assert spancat.model.get_dim("nO") == 2
|
||||
assert spancat.model.get_dim("nO") == spancat._n_labels
|
||||
assert set(spancat.labels) == {"LOC", "PERSON"}
|
||||
|
||||
nlp.update(train_examples, sgd=optimizer)
|
||||
|
@ -424,9 +556,10 @@ def test_zero_suggestions():
|
|||
list(nlp.pipe(["", "one", "three three three"]))
|
||||
|
||||
|
||||
def test_set_candidates():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_set_candidates(name):
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
||||
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||
train_examples = make_examples(nlp)
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
texts = [
|
||||
|
@ -444,3 +577,21 @@ def test_set_candidates():
|
|||
assert len(docs[0].spans["candidates"]) == 9
|
||||
assert docs[0].spans["candidates"][0].text == "Just"
|
||||
assert docs[0].spans["candidates"][4].text == "Just a"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
@pytest.mark.parametrize("n_process", [1, 2])
def test_spancat_multiprocessing(name, n_process):
    """Check that `nlp.pipe` returns one doc per text for each `n_process` setting.

    name (str): Factory name of the span categorizer component to add.
    n_process (int): Number of processes passed to `nlp.pipe`.
    """
    # get_current_ops has to be called: the function object itself is never a
    # NumpyOps instance, which would make the n_process=2 case a silent no-op.
    # The multi-process case only runs when the current ops are NumpyOps.
    if isinstance(get_current_ops(), NumpyOps) or n_process < 2:
        nlp = Language()
        nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
        train_examples = make_examples(nlp)
        nlp.initialize(get_examples=lambda: train_examples)
        texts = [
            "Just a sentence.",
            "I like London and Berlin",
            "I like Berlin",
            "I eat ham.",
        ]
        docs = list(nlp.pipe(texts, n_process=n_process))
        assert len(docs) == len(texts)
|
||||
|
|
|
@ -213,6 +213,13 @@ def test_serialize_doc_exclude(en_vocab):
|
|||
|
||||
def test_serialize_doc_span_groups(en_vocab):
|
||||
doc = Doc(en_vocab, words=["hello", "world", "!"])
|
||||
doc.spans["content"] = [doc[0:2]]
|
||||
span = doc[0:2]
|
||||
span.label_ = "test_serialize_doc_span_groups_label"
|
||||
span.id_ = "test_serialize_doc_span_groups_id"
|
||||
span.kb_id_ = "test_serialize_doc_span_groups_kb_id"
|
||||
doc.spans["content"] = [span]
|
||||
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
|
||||
assert len(new_doc.spans["content"]) == 1
|
||||
assert new_doc.spans["content"][0].label_ == "test_serialize_doc_span_groups_label"
|
||||
assert new_doc.spans["content"][0].id_ == "test_serialize_doc_span_groups_id"
|
||||
assert new_doc.spans["content"][0].kb_id_ == "test_serialize_doc_span_groups_kb_id"
|
||||
|
|
|
@ -49,7 +49,11 @@ def test_serialize_doc_bin():
|
|||
nlp = English()
|
||||
for doc in nlp.pipe(texts):
|
||||
doc.cats = cats
|
||||
doc.spans["start"] = [doc[0:2]]
|
||||
span = doc[0:2]
|
||||
span.label_ = "UNUSUAL_SPAN_LABEL"
|
||||
span.id_ = "UNUSUAL_SPAN_ID"
|
||||
span.kb_id_ = "UNUSUAL_SPAN_KB_ID"
|
||||
doc.spans["start"] = [span]
|
||||
doc[0].norm_ = "UNUSUAL_TOKEN_NORM"
|
||||
doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID"
|
||||
doc_bin.add(doc)
|
||||
|
@ -63,6 +67,9 @@ def test_serialize_doc_bin():
|
|||
assert doc.text == texts[i]
|
||||
assert doc.cats == cats
|
||||
assert len(doc.spans) == 1
|
||||
assert doc.spans["start"][0].label_ == "UNUSUAL_SPAN_LABEL"
|
||||
assert doc.spans["start"][0].id_ == "UNUSUAL_SPAN_ID"
|
||||
assert doc.spans["start"][0].kb_id_ == "UNUSUAL_SPAN_KB_ID"
|
||||
assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM"
|
||||
assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID"
|
||||
|
||||
|
|
|
@ -1,7 +1,10 @@
|
|||
from typing import Callable
|
||||
from pathlib import Path
|
||||
from typing import Callable, Iterable, Any, Dict
|
||||
|
||||
from spacy import util
|
||||
from spacy.util import ensure_path, registry, load_model_from_config
|
||||
import srsly
|
||||
|
||||
from spacy import util, Errors
|
||||
from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList
|
||||
from spacy.kb.kb_in_memory import InMemoryLookupKB
|
||||
from spacy.vocab import Vocab
|
||||
from thinc.api import Config
|
||||
|
@ -92,6 +95,9 @@ def test_serialize_subclassed_kb():
|
|||
[components.entity_linker]
|
||||
factory = "entity_linker"
|
||||
|
||||
[components.entity_linker.generate_empty_kb]
|
||||
@misc = "kb_test.CustomEmptyKB.v1"
|
||||
|
||||
[initialize]
|
||||
|
||||
[initialize.components]
|
||||
|
@ -99,7 +105,7 @@ def test_serialize_subclassed_kb():
|
|||
[initialize.components.entity_linker]
|
||||
|
||||
[initialize.components.entity_linker.kb_loader]
|
||||
@misc = "spacy.CustomKB.v1"
|
||||
@misc = "kb_test.CustomKB.v1"
|
||||
entity_vector_length = 342
|
||||
custom_field = 666
|
||||
"""
|
||||
|
@ -109,10 +115,57 @@ def test_serialize_subclassed_kb():
|
|||
super().__init__(vocab, entity_vector_length)
|
||||
self.custom_field = custom_field
|
||||
|
||||
@registry.misc("spacy.CustomKB.v1")
|
||||
def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
    """Write the KB to a directory, including `self.custom_field`.

    We overwrite InMemoryLookupKB.to_disk() to ensure that self.custom_field is stored as well.

    path: Target directory; created (with parents) if it does not exist yet.
    exclude (Iterable[str]): Forwarded unchanged to `util.to_disk`.
    RAISES (ValueError): With Errors.E928 if the path is not a directory.
    """
    target_dir = ensure_path(path)
    if not target_dir.exists():
        target_dir.mkdir(parents=True)
    if not target_dir.is_dir():
        raise ValueError(Errors.E928.format(loc=target_dir))

    def write_custom_fields(file_path: Path) -> None:
        # The extra field is stored as a small JSON document.
        srsly.write_json(file_path, {"custom_field": self.custom_field})

    writers = {
        "contents": self.write_contents,
        "strings.json": self.vocab.strings.to_disk,
        "custom_fields": write_custom_fields,
    }
    util.to_disk(target_dir, writers, exclude)
|
||||
|
||||
def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
    """Load the KB from a directory, including `self.custom_field`.

    We overwrite InMemoryLookupKB.from_disk() to ensure that self.custom_field is loaded as well.

    path: Directory to read from.
    exclude (Iterable[str]): Forwarded unchanged to `util.from_disk`.
    RAISES (ValueError): With Errors.E929 if the path does not exist, or
        with Errors.E928 if it is not a directory.
    """
    source_dir = ensure_path(path)
    if not source_dir.exists():
        raise ValueError(Errors.E929.format(loc=source_dir))
    if not source_dir.is_dir():
        raise ValueError(Errors.E928.format(loc=source_dir))

    def read_custom_fields(file_path: Path) -> None:
        # Restore the extra field from its JSON document.
        self.custom_field = srsly.read_json(file_path)["custom_field"]

    readers: Dict[str, Callable[[Any], Any]] = {
        "contents": self.read_contents,
        "strings.json": self.vocab.strings.from_disk,
        "custom_fields": read_custom_fields,
    }
    util.from_disk(source_dir, readers, exclude)
|
||||
|
||||
@registry.misc("kb_test.CustomEmptyKB.v1")
def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]:
    """Registered function returning a factory for empty `SubInMemoryLookupKB` objects.

    RETURNS (Callable[[Vocab, int], SubInMemoryLookupKB]): Factory that takes
        a vocab and an entity vector length and builds a KB whose
        `custom_field` is 0.
    """

    def build_empty_kb(vocab: Vocab, entity_vector_length: int):
        kb_kwargs = {
            "vocab": vocab,
            "entity_vector_length": entity_vector_length,
            "custom_field": 0,
        }
        return SubInMemoryLookupKB(**kb_kwargs)

    return build_empty_kb
|
||||
|
||||
@registry.misc("kb_test.CustomKB.v1")
|
||||
def custom_kb(
|
||||
entity_vector_length: int, custom_field: int
|
||||
) -> Callable[[Vocab], InMemoryLookupKB]:
|
||||
) -> Callable[[Vocab], SubInMemoryLookupKB]:
|
||||
def custom_kb_factory(vocab):
|
||||
kb = SubInMemoryLookupKB(
|
||||
vocab=vocab,
|
||||
|
@ -139,6 +192,6 @@ def test_serialize_subclassed_kb():
|
|||
nlp2 = util.load_model_from_path(tmp_dir)
|
||||
entity_linker2 = nlp2.get_pipe("entity_linker")
|
||||
# After IO, the KB is the standard one
|
||||
assert type(entity_linker2.kb) == InMemoryLookupKB
|
||||
assert type(entity_linker2.kb) == SubInMemoryLookupKB
|
||||
assert entity_linker2.kb.entity_vector_length == 342
|
||||
assert not hasattr(entity_linker2.kb, "custom_field")
|
||||
assert entity_linker2.kb.custom_field == 666
|
||||
|
|
|
@ -2,7 +2,6 @@ import os
|
|||
import math
|
||||
from collections import Counter
|
||||
from typing import Tuple, List, Dict, Any
|
||||
import pkg_resources
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
|
@ -13,6 +12,7 @@ import srsly
|
|||
from click import NoSuchOption
|
||||
from packaging.specifiers import SpecifierSet
|
||||
from thinc.api import Config, ConfigValidationError
|
||||
from spacy.tokens import DocBin
|
||||
|
||||
from spacy import about
|
||||
from spacy.cli import info
|
||||
|
@ -28,7 +28,9 @@ from spacy.cli.debug_data import _get_span_characteristics
|
|||
from spacy.cli.debug_data import _print_span_characteristics
|
||||
from spacy.cli.debug_data import _get_spans_length_freq_dist
|
||||
from spacy.cli.download import get_compatibility, get_version
|
||||
from spacy.cli.evaluate import render_parses
|
||||
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
|
||||
from spacy.cli.init_pipeline import _init_labels
|
||||
from spacy.cli.package import get_third_party_dependencies
|
||||
from spacy.cli.package import _is_permitted_package_name
|
||||
from spacy.cli.project.remote_storage import RemoteStorage
|
||||
|
@ -47,7 +49,6 @@ from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
|
|||
from spacy.training.converters import iob_to_docs
|
||||
from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config
|
||||
|
||||
from ..cli.init_pipeline import _init_labels
|
||||
from .util import make_tempdir
|
||||
|
||||
|
||||
|
@ -145,6 +146,70 @@ def test_issue11235():
|
|||
assert cfg["commands"][0]["script"][0] == f"hello {lang_var}"
|
||||
|
||||
|
||||
@pytest.mark.issue(12566)
|
||||
@pytest.mark.parametrize(
|
||||
"factory,output_file",
|
||||
[("deps", "parses.html"), ("ents", "entities.html"), ("spans", "spans.html")],
|
||||
)
|
||||
def test_issue12566(factory: str, output_file: str):
|
||||
"""
|
||||
Test if all displaCy types (ents, dep, spans) produce an HTML file
|
||||
"""
|
||||
with make_tempdir() as tmp_dir:
|
||||
# Create sample spaCy file
|
||||
doc_json = {
|
||||
"ents": [
|
||||
{"end": 54, "label": "nam_adj_country", "start": 44},
|
||||
{"end": 83, "label": "nam_liv_person", "start": 69},
|
||||
{"end": 100, "label": "nam_pro_title_book", "start": 86},
|
||||
],
|
||||
"spans": {
|
||||
"sc": [
|
||||
{"end": 54, "kb_id": "", "label": "nam_adj_country", "start": 44},
|
||||
{"end": 83, "kb_id": "", "label": "nam_liv_person", "start": 69},
|
||||
{
|
||||
"end": 100,
|
||||
"kb_id": "",
|
||||
"label": "nam_pro_title_book",
|
||||
"start": 86,
|
||||
},
|
||||
]
|
||||
},
|
||||
"text": "Niedawno czytał em nową książkę znakomitego szkockiego medioznawcy , "
|
||||
"Briana McNaira - Cultural Chaos .",
|
||||
"tokens": [
|
||||
# fmt: off
|
||||
{"id": 0, "start": 0, "end": 8, "tag": "ADV", "pos": "ADV", "morph": "Degree=Pos", "lemma": "niedawno", "dep": "advmod", "head": 1, },
|
||||
{"id": 1, "start": 9, "end": 15, "tag": "PRAET", "pos": "VERB", "morph": "Animacy=Hum|Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act", "lemma": "czytać", "dep": "ROOT", "head": 1, },
|
||||
{"id": 2, "start": 16, "end": 18, "tag": "AGLT", "pos": "NOUN", "morph": "Animacy=Inan|Case=Ins|Gender=Masc|Number=Sing", "lemma": "em", "dep": "iobj", "head": 1, },
|
||||
{"id": 3, "start": 19, "end": 23, "tag": "ADJ", "pos": "ADJ", "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing", "lemma": "nowy", "dep": "amod", "head": 4, },
|
||||
{"id": 4, "start": 24, "end": 31, "tag": "SUBST", "pos": "NOUN", "morph": "Case=Acc|Gender=Fem|Number=Sing", "lemma": "książka", "dep": "obj", "head": 1, },
|
||||
{"id": 5, "start": 32, "end": 43, "tag": "ADJ", "pos": "ADJ", "morph": "Animacy=Nhum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", "lemma": "znakomit", "dep": "acl", "head": 4, },
|
||||
{"id": 6, "start": 44, "end": 54, "tag": "ADJ", "pos": "ADJ", "morph": "Animacy=Hum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", "lemma": "szkockiy", "dep": "amod", "head": 7, },
|
||||
{"id": 7, "start": 55, "end": 66, "tag": "SUBST", "pos": "NOUN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "medioznawca", "dep": "iobj", "head": 5, },
|
||||
{"id": 8, "start": 67, "end": 68, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Comm", "lemma": ",", "dep": "punct", "head": 9, },
|
||||
{"id": 9, "start": 69, "end": 75, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "Brian", "dep": "nmod", "head": 4, },
|
||||
{"id": 10, "start": 76, "end": 83, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "McNair", "dep": "flat", "head": 9, },
|
||||
{"id": 11, "start": 84, "end": 85, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Dash", "lemma": "-", "dep": "punct", "head": 12, },
|
||||
{"id": 12, "start": 86, "end": 94, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", "lemma": "Cultural", "dep": "conj", "head": 4, },
|
||||
{"id": 13, "start": 95, "end": 100, "tag": "SUBST", "pos": "NOUN", "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", "lemma": "Chaos", "dep": "flat", "head": 12, },
|
||||
{"id": 14, "start": 101, "end": 102, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Peri", "lemma": ".", "dep": "punct", "head": 1, },
|
||||
# fmt: on
|
||||
],
|
||||
}
|
||||
|
||||
# Create a .spacy file
|
||||
nlp = spacy.blank("pl")
|
||||
doc = Doc(nlp.vocab).from_json(doc_json)
|
||||
|
||||
# Run the evaluate command and check if the html files exist
|
||||
render_parses(
|
||||
docs=[doc], output_path=tmp_dir, model_name="", limit=1, **{factory: True}
|
||||
)
|
||||
|
||||
assert (tmp_dir / output_file).is_file()
|
||||
|
||||
|
||||
def test_cli_info():
|
||||
nlp = Dutch()
|
||||
nlp.add_pipe("textcat")
|
||||
|
@ -553,7 +618,14 @@ def test_parse_cli_overrides():
|
|||
|
||||
@pytest.mark.parametrize("lang", ["en", "nl"])
|
||||
@pytest.mark.parametrize(
|
||||
"pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]]
|
||||
"pipeline",
|
||||
[
|
||||
["tagger", "parser", "ner"],
|
||||
[],
|
||||
["ner", "textcat", "sentencizer"],
|
||||
["morphologizer", "spancat", "entity_linker"],
|
||||
["spancat_singlelabel", "textcat_multilabel"],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("optimize", ["efficiency", "accuracy"])
|
||||
@pytest.mark.parametrize("pretraining", [True, False])
|
||||
|
@ -1126,6 +1198,7 @@ def test_cli_find_threshold(capsys):
|
|||
)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
|
||||
@pytest.mark.parametrize(
|
||||
"reqs,output",
|
||||
[
|
||||
|
@ -1158,6 +1231,8 @@ def test_cli_find_threshold(capsys):
|
|||
],
|
||||
)
|
||||
def test_project_check_requirements(reqs, output):
|
||||
import pkg_resources
|
||||
|
||||
# excessive guard against unlikely package name
|
||||
try:
|
||||
pkg_resources.require("spacyunknowndoesnotexist12345")
|
||||
|
|
|
@ -5,10 +5,18 @@ import srsly
|
|||
from typer.testing import CliRunner
|
||||
from spacy.tokens import DocBin, Doc
|
||||
|
||||
from spacy.cli._util import app
|
||||
from spacy.cli._util import app, get_git_version
|
||||
from .util import make_tempdir, normalize_whitespace
|
||||
|
||||
|
||||
def has_git():
|
||||
try:
|
||||
get_git_version()
|
||||
return True
|
||||
except RuntimeError:
|
||||
return False
|
||||
|
||||
|
||||
def test_convert_auto():
|
||||
with make_tempdir() as d_in, make_tempdir() as d_out:
|
||||
for f in ["data1.iob", "data2.iob", "data3.iob"]:
|
||||
|
@ -181,6 +189,7 @@ def test_project_run(project_dir):
|
|||
assert "okokok" in result.stdout
|
||||
|
||||
|
||||
@pytest.mark.skipif(not has_git(), reason="git not installed")
|
||||
@pytest.mark.parametrize(
|
||||
"options",
|
||||
[
|
||||
|
|
|
@ -275,6 +275,20 @@ def test_displacy_parse_deps(en_vocab):
|
|||
{"start": 2, "end": 3, "label": "det", "dir": "left"},
|
||||
{"start": 1, "end": 3, "label": "attr", "dir": "right"},
|
||||
]
|
||||
# Test that displacy.parse_deps converts Span to Doc
|
||||
deps = displacy.parse_deps(doc[:])
|
||||
assert isinstance(deps, dict)
|
||||
assert deps["words"] == [
|
||||
{"lemma": None, "text": words[0], "tag": pos[0]},
|
||||
{"lemma": None, "text": words[1], "tag": pos[1]},
|
||||
{"lemma": None, "text": words[2], "tag": pos[2]},
|
||||
{"lemma": None, "text": words[3], "tag": pos[3]},
|
||||
]
|
||||
assert deps["arcs"] == [
|
||||
{"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
|
||||
{"start": 2, "end": 3, "label": "det", "dir": "left"},
|
||||
{"start": 1, "end": 3, "label": "attr", "dir": "right"},
|
||||
]
|
||||
|
||||
|
||||
def test_displacy_invalid_arcs():
|
||||
|
|
|
@ -2,17 +2,19 @@ from pathlib import Path
|
|||
import numpy as np
|
||||
import pytest
|
||||
import srsly
|
||||
from spacy.vocab import Vocab
|
||||
from thinc.api import Config
|
||||
from thinc.api import Config, get_current_ops
|
||||
|
||||
from spacy import util
|
||||
from spacy.lang.en import English
|
||||
from spacy.training.initialize import init_nlp
|
||||
from spacy.training.loop import train
|
||||
from spacy.training.pretrain import pretrain
|
||||
from spacy.tokens import Doc, DocBin
|
||||
from spacy.language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH
|
||||
from spacy.ml.models.multi_task import create_pretrain_vectors
|
||||
from spacy.vectors import Vectors
|
||||
from spacy.vocab import Vocab
|
||||
from ..util import make_tempdir
|
||||
from ... import util
|
||||
from ...lang.en import English
|
||||
from ...training.initialize import init_nlp
|
||||
from ...training.loop import train
|
||||
from ...training.pretrain import pretrain
|
||||
from ...tokens import Doc, DocBin
|
||||
from ...language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH
|
||||
|
||||
pretrain_string_listener = """
|
||||
[nlp]
|
||||
|
@ -163,7 +165,8 @@ def test_pretraining_default():
|
|||
|
||||
|
||||
@pytest.mark.parametrize("objective", CHAR_OBJECTIVES)
|
||||
def test_pretraining_tok2vec_characters(objective):
|
||||
@pytest.mark.parametrize("skip_last", (True, False))
|
||||
def test_pretraining_tok2vec_characters(objective, skip_last):
|
||||
"""Test that pretraining works with the character objective"""
|
||||
config = Config().from_str(pretrain_string_listener)
|
||||
config["pretraining"]["objective"] = objective
|
||||
|
@ -176,10 +179,14 @@ def test_pretraining_tok2vec_characters(objective):
|
|||
filled["paths"]["raw_text"] = file_path
|
||||
filled = filled.interpolate()
|
||||
assert filled["pretraining"]["component"] == "tok2vec"
|
||||
pretrain(filled, tmp_dir)
|
||||
pretrain(filled, tmp_dir, skip_last=skip_last)
|
||||
assert Path(tmp_dir / "model0.bin").exists()
|
||||
assert Path(tmp_dir / "model4.bin").exists()
|
||||
assert not Path(tmp_dir / "model5.bin").exists()
|
||||
if skip_last:
|
||||
assert not Path(tmp_dir / "model-last.bin").exists()
|
||||
else:
|
||||
assert Path(tmp_dir / "model-last.bin").exists()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("objective", VECTOR_OBJECTIVES)
|
||||
|
@ -235,6 +242,7 @@ def test_pretraining_tagger_tok2vec(config):
|
|||
pretrain(filled, tmp_dir)
|
||||
assert Path(tmp_dir / "model0.bin").exists()
|
||||
assert Path(tmp_dir / "model4.bin").exists()
|
||||
assert Path(tmp_dir / "model-last.bin").exists()
|
||||
assert not Path(tmp_dir / "model5.bin").exists()
|
||||
|
||||
|
||||
|
@ -346,3 +354,26 @@ def write_vectors_model(tmp_dir):
|
|||
nlp = English(vocab)
|
||||
nlp.to_disk(nlp_path)
|
||||
return str(nlp_path)
|
||||
|
||||
|
||||
def test_pretrain_default_vectors():
|
||||
nlp = English()
|
||||
nlp.add_pipe("tok2vec")
|
||||
nlp.initialize()
|
||||
|
||||
# default vectors are supported
|
||||
nlp.vocab.vectors = Vectors(shape=(10, 10))
|
||||
create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model)
|
||||
|
||||
# floret vectors are supported
|
||||
nlp.vocab.vectors = Vectors(
|
||||
data=get_current_ops().xp.zeros((10, 10)), mode="floret", hash_count=1
|
||||
)
|
||||
create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model)
|
||||
|
||||
# error for no vectors
|
||||
with pytest.raises(ValueError, match="E875"):
|
||||
nlp.vocab.vectors = Vectors()
|
||||
create_pretrain_vectors(1, 1, "cosine")(
|
||||
nlp.vocab, nlp.get_pipe("tok2vec").model
|
||||
)
|
||||
|
|
|
@ -834,10 +834,12 @@ cdef class Tokenizer:
|
|||
self.token_match = re.compile(data["token_match"]).match
|
||||
if "url_match" in data and isinstance(data["url_match"], str):
|
||||
self.url_match = re.compile(data["url_match"]).match
|
||||
if "rules" in data and isinstance(data["rules"], dict):
|
||||
self.rules = data["rules"]
|
||||
if "faster_heuristics" in data:
|
||||
self.faster_heuristics = data["faster_heuristics"]
|
||||
# always load rules last so that all other settings are set before the
|
||||
# internal tokenization for the phrase matcher
|
||||
if "rules" in data and isinstance(data["rules"], dict):
|
||||
self.rules = data["rules"]
|
||||
return self
|
||||
|
||||
|
||||
|
|
|
@ -124,6 +124,10 @@ class DocBin:
|
|||
for key, group in doc.spans.items():
|
||||
for span in group:
|
||||
self.strings.add(span.label_)
|
||||
if span.kb_id in span.doc.vocab.strings:
|
||||
self.strings.add(span.kb_id_)
|
||||
if span.id in span.doc.vocab.strings:
|
||||
self.strings.add(span.id_)
|
||||
|
||||
def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
|
||||
"""Recover Doc objects from the annotations, using the given vocab.
|
||||
|
|
|
@ -544,10 +544,6 @@ cdef class Doc:
|
|||
|
||||
DOCS: https://spacy.io/api/doc#char_span
|
||||
"""
|
||||
if not isinstance(label, int):
|
||||
label = self.vocab.strings.add(label)
|
||||
if not isinstance(kb_id, int):
|
||||
kb_id = self.vocab.strings.add(kb_id)
|
||||
alignment_modes = ("strict", "contract", "expand")
|
||||
if alignment_mode not in alignment_modes:
|
||||
raise ValueError(
|
||||
|
@ -1350,6 +1346,10 @@ cdef class Doc:
|
|||
for group in self.spans.values():
|
||||
for span in group:
|
||||
strings.add(span.label_)
|
||||
if span.kb_id in span.doc.vocab.strings:
|
||||
strings.add(span.kb_id_)
|
||||
if span.id in span.doc.vocab.strings:
|
||||
strings.add(span.id_)
|
||||
# Msgpack doesn't distinguish between lists and tuples, which is
|
||||
# vexing for user data. As a best guess, we *know* that within
|
||||
# keys, we must have tuples. In values we just have to hope
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Any, Dict, Iterator, List, Union
|
||||
from typing import Any, Dict, Iterator, List, Optional, Union
|
||||
from ..vocab import Vocab
|
||||
|
||||
class MorphAnalysis:
|
||||
|
@ -13,7 +13,7 @@ class MorphAnalysis:
|
|||
def __hash__(self) -> int: ...
|
||||
def __eq__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override]
|
||||
def __ne__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override]
|
||||
def get(self, field: Any) -> List[str]: ...
|
||||
def get(self, field: Any, default: Optional[List[str]]) -> List[str]: ...
|
||||
def to_json(self) -> str: ...
|
||||
def to_dict(self) -> Dict[str, str]: ...
|
||||
def __str__(self) -> str: ...
|
||||
|
|
|
@ -58,10 +58,14 @@ cdef class MorphAnalysis:
|
|||
def __ne__(self, other):
|
||||
return self.key != other.key
|
||||
|
||||
def get(self, field):
|
||||
def get(self, field, default=None):
|
||||
"""Retrieve feature values by field."""
|
||||
cdef attr_t field_id = self.vocab.strings.as_int(field)
|
||||
cdef np.ndarray results = get_by_field(&self.c, field_id)
|
||||
if len(results) == 0:
|
||||
if default is None:
|
||||
default = []
|
||||
return default
|
||||
features = [self.vocab.strings[result] for result in results]
|
||||
return [f.split(Morphology.FIELD_SEP)[1] for f in features]
|
||||
|
||||
|
|
|
@ -460,9 +460,12 @@ cdef class Span:
|
|||
start = i
|
||||
if start >= self.end:
|
||||
break
|
||||
if start < self.end:
|
||||
yield Span(self.doc, start, self.end)
|
||||
elif i == self.doc.length - 1:
|
||||
yield Span(self.doc, start, self.doc.length)
|
||||
|
||||
# Ensure that trailing parts of the Span instance are included in last element of .sents.
|
||||
if start == self.doc.length - 1:
|
||||
yield Span(self.doc, start, self.doc.length)
|
||||
|
||||
@property
|
||||
def ents(self):
|
||||
|
|
|
@ -133,10 +133,11 @@ def init_vocab(
|
|||
logger.info("Added vectors: %s", vectors)
|
||||
# warn if source model vectors are not identical
|
||||
sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
|
||||
vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
|
||||
for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
|
||||
if vectors_hash != sourced_vectors_hash:
|
||||
warnings.warn(Warnings.W113.format(name=sourced_component))
|
||||
if len(sourced_vectors_hashes) > 0:
|
||||
vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
|
||||
for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
|
||||
if vectors_hash != sourced_vectors_hash:
|
||||
warnings.warn(Warnings.W113.format(name=sourced_component))
|
||||
logger.info("Finished initializing nlp object")
|
||||
|
||||
|
||||
|
|
|
@ -24,6 +24,7 @@ def pretrain(
|
|||
epoch_resume: Optional[int] = None,
|
||||
use_gpu: int = -1,
|
||||
silent: bool = True,
|
||||
skip_last: bool = False,
|
||||
):
|
||||
msg = Printer(no_print=silent)
|
||||
if config["training"]["seed"] is not None:
|
||||
|
@ -60,10 +61,14 @@ def pretrain(
|
|||
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
|
||||
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
|
||||
|
||||
def _save_model(epoch, is_temp=False):
|
||||
def _save_model(epoch, is_temp=False, is_last=False):
|
||||
is_temp_str = ".temp" if is_temp else ""
|
||||
with model.use_params(optimizer.averages):
|
||||
with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
|
||||
if is_last:
|
||||
save_path = output_dir / f"model-last.bin"
|
||||
else:
|
||||
save_path = output_dir / f"model{epoch}{is_temp_str}.bin"
|
||||
with (save_path).open("wb") as file_:
|
||||
file_.write(model.get_ref("tok2vec").to_bytes())
|
||||
log = {
|
||||
"nr_word": tracker.nr_word,
|
||||
|
@ -76,22 +81,26 @@ def pretrain(
|
|||
|
||||
# TODO: I think we probably want this to look more like the
|
||||
# 'create_train_batches' function?
|
||||
for epoch in range(epoch_resume, P["max_epochs"]):
|
||||
for batch_id, batch in enumerate(batcher(corpus(nlp))):
|
||||
docs = ensure_docs(batch)
|
||||
loss = make_update(model, docs, optimizer, objective)
|
||||
progress = tracker.update(epoch, loss, docs)
|
||||
if progress:
|
||||
msg.row(progress, **row_settings)
|
||||
if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
|
||||
_save_model(epoch, is_temp=True)
|
||||
try:
|
||||
for epoch in range(epoch_resume, P["max_epochs"]):
|
||||
for batch_id, batch in enumerate(batcher(corpus(nlp))):
|
||||
docs = ensure_docs(batch)
|
||||
loss = make_update(model, docs, optimizer, objective)
|
||||
progress = tracker.update(epoch, loss, docs)
|
||||
if progress:
|
||||
msg.row(progress, **row_settings)
|
||||
if P["n_save_every"] and (batch_id % P["n_save_every"] == 0):
|
||||
_save_model(epoch, is_temp=True)
|
||||
|
||||
if P["n_save_epoch"]:
|
||||
if epoch % P["n_save_epoch"] == 0 or epoch == P["max_epochs"] - 1:
|
||||
if P["n_save_epoch"]:
|
||||
if epoch % P["n_save_epoch"] == 0 or epoch == P["max_epochs"] - 1:
|
||||
_save_model(epoch)
|
||||
else:
|
||||
_save_model(epoch)
|
||||
else:
|
||||
_save_model(epoch)
|
||||
tracker.epoch_loss = 0.0
|
||||
tracker.epoch_loss = 0.0
|
||||
finally:
|
||||
if not skip_last:
|
||||
_save_model(P["max_epochs"], is_last=True)
|
||||
|
||||
|
||||
def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
|
||||
|
|
|
@ -899,15 +899,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a
|
|||
| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
|
||||
|
||||
### spacy.EmptyKB.v1 {id="EmptyKB"}
|
||||
### spacy.EmptyKB.v1 {id="EmptyKB.v1"}
|
||||
|
||||
A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
|
||||
instance. This is the default when a new entity linker component is created.
|
||||
instance.
|
||||
|
||||
| Name | Description |
|
||||
| ---------------------- | ----------------------------------------------------------------------------------- |
|
||||
| `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ |
|
||||
|
||||
### spacy.EmptyKB.v2 {id="EmptyKB"}
|
||||
|
||||
A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
|
||||
instance. This is the default when a new entity linker component is created. It
|
||||
returns a `Callable[[Vocab, int], InMemoryLookupKB]`.
|
||||
|
||||
### spacy.KBFromFile.v1 {id="KBFromFile"}
|
||||
|
||||
A function that reads an existing `KnowledgeBase` from file.
|
||||
|
@ -924,6 +930,15 @@ plausible [`Candidate`](/api/kb/#candidate) objects. The default
|
|||
`CandidateGenerator` uses the text of a mention to find its potential aliases in
|
||||
the `KnowledgeBase`. Note that this function is case-dependent.
|
||||
|
||||
### spacy.CandidateBatchGenerator.v1 {id="CandidateBatchGenerator"}
|
||||
|
||||
A function that takes as input a [`KnowledgeBase`](/api/kb) and an `Iterable` of
|
||||
[`Span`](/api/span) objects denoting named entities, and returns a list of
|
||||
plausible [`Candidate`](/api/kb/#candidate) objects per specified
|
||||
[`Span`](/api/span). The default `CandidateBatchGenerator` uses the text of a
|
||||
mention to find its potential aliases in the `KnowledgeBase`. Note that this
|
||||
function is case-dependent.
|
||||
|
||||
## Coreference {id="coref-architectures",tag="experimental"}
|
||||
|
||||
A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to
|
||||
|
|
|
@ -1122,17 +1122,18 @@ auto-generated by setting `--pretraining` on
|
|||
$ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [--gpu-id] [overrides]
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
|
||||
| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
|
||||
| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
|
||||
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
|
||||
| **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |
|
||||
| Name | Description |
|
||||
| -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
|
||||
| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
|
||||
| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
|
||||
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
|
||||
| `--skip-last`, `-L` <Tag variant="new">3.5.2</Tag> | Skip saving `model-last.bin`. Defaults to `False`. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
|
||||
| **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |
|
||||
|
||||
## evaluate {id="evaluate",version="2",tag="command"}
|
||||
|
||||
|
@ -1254,19 +1255,19 @@ be provided.
|
|||
> $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
||||
| `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ |
|
||||
| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ |
|
||||
| `threshold_key` | Key of threshold attribute in component's configuration. ~~str (positional)~~ |
|
||||
| `scores_key` | Name of score to metric to optimize. ~~str (positional)~~ |
|
||||
| `--n_trials`, `-n` | Number of trials to determine optimal thresholds. ~~int (option)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
||||
| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
|
||||
| `--silent`, `-V`, `-VV` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| Name | Description |
|
||||
| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
||||
| `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ |
|
||||
| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ |
|
||||
| `threshold_key` | Key of threshold attribute in component's configuration. ~~str (positional)~~ |
|
||||
| `scores_key` | Name of score to metric to optimize. ~~str (positional)~~ |
|
||||
| `--n_trials`, `-n` | Number of trials to determine optimal thresholds. ~~int (option)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
||||
| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
|
||||
| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
|
||||
## assemble {id="assemble",tag="command"}
|
||||
|
||||
|
@ -1639,7 +1640,7 @@ with [`spacy package`](/api/cli#package) and `--build wheel`. For more details,
|
|||
see the spaCy project [integration](/usage/projects#huggingface_hub).
|
||||
|
||||
```bash
|
||||
$ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--local-repo] [--verbose]
|
||||
$ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--verbose]
|
||||
```
|
||||
|
||||
> #### Example
|
||||
|
@ -1653,6 +1654,5 @@ $ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--local-repo]
|
|||
| `whl_path` | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path(positional)~~ |
|
||||
| `--org`, `-o` | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~ |
|
||||
| `--msg`, `-m` | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~ |
|
||||
| `--local-repo`, `-l` | Local path to the model repository (will be created if it doesn't exist). Defaults to `hub` in the current working directory. ~~Path (option)~~ |
|
||||
| `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~ |
|
||||
| **UPLOADS** | The pipeline to the hub. |
|
||||
|
|
|
@ -64,7 +64,7 @@ details on the architectures and their arguments and hyperparameters.
|
|||
> config={
|
||||
> "model": DEFAULT_COREF_MODEL,
|
||||
> "span_cluster_prefix": DEFAULT_CLUSTER_PREFIX,
|
||||
> },
|
||||
> }
|
||||
> nlp.add_pipe("experimental_coref", config=config)
|
||||
> ```
|
||||
|
||||
|
|
|
@ -68,24 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
|
|||
come directly from
|
||||
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
|
||||
|
||||
| Symbol | Description |
|
||||
| --------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| `A < B` | `A` is the immediate dependent of `B`. |
|
||||
| `A > B` | `A` is the immediate head of `B`. |
|
||||
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
|
||||
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
|
||||
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
|
||||
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
|
||||
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
|
||||
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
|
||||
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
|
||||
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||
| Symbol | Description |
|
||||
| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `A < B` | `A` is the immediate dependent of `B`. |
|
||||
| `A > B` | `A` is the immediate head of `B`. |
|
||||
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
|
||||
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
|
||||
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
|
||||
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(Semgrex counterpart: `..`)_. |
|
||||
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(Semgrex counterpart: `-`)_. |
|
||||
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(Semgrex counterpart: `--`)_. |
|
||||
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
|
||||
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
|
||||
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
|
||||
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
|
||||
| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||
| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i`. |
|
||||
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i`. |
|
||||
| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||
| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i`. |
|
||||
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i`. |
|
||||
|
||||
## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"}
|
||||
|
||||
|
|
|
@ -53,19 +53,21 @@ architectures and their arguments and hyperparameters.
|
|||
> nlp.add_pipe("entity_linker", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Description |
|
||||
| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
|
||||
| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
|
||||
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
|
||||
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
|
||||
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
|
||||
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
|
||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
||||
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
||||
| Setting | Description |
|
||||
| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
|
||||
| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
|
||||
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
|
||||
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
|
||||
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ |
|
||||
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
|
||||
| `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
|
||||
| `generate_empty_kb` <Tag variant="new">3.6</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
|
||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
||||
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
||||
|
||||
```python
|
||||
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
|
||||
|
|
|
@ -292,7 +292,7 @@ Restore the state of the knowledge base from a given directory. Note that the
|
|||
> ```python
|
||||
> from spacy.vocab import Vocab
|
||||
> vocab = Vocab().from_disk("/path/to/vocab")
|
||||
> kb = FullyImplementedKB(vocab=vocab, entity_vector_length=64)
|
||||
> kb = InMemoryLookupKB(vocab=vocab, entity_vector_length=64)
|
||||
> kb.from_disk("/path/to/kb")
|
||||
> ```
|
||||
|
||||
|
|
|
@ -213,10 +213,11 @@ Retrieve values for a feature by field.
|
|||
> assert morph.get("Feat1") == ["Val1", "Val2"]
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------ |
|
||||
| `field` | The field to retrieve. ~~str~~ |
|
||||
| **RETURNS** | A list of the individual features. ~~List[str]~~ |
|
||||
| Name | Description |
|
||||
| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `field` | The field to retrieve. ~~str~~ |
|
||||
| `default` <Tag variant="new">3.6</Tag> | The value to return if the field is not present. If unset or `None`, the default return value is `[]`. ~~Optional[List[str]]~~ |
|
||||
| **RETURNS** | A list of the individual features. ~~List[str]~~ |
|
||||
|
||||
### MorphAnalysis.to_dict {id="morphanalysis-to_dict",tag="method"}
|
||||
|
||||
|
|
|
@ -13,8 +13,16 @@ A span categorizer consists of two parts: a [suggester function](#suggesters)
|
|||
that proposes candidate spans, which may or may not overlap, and a labeler model
|
||||
that predicts zero or more labels for each candidate.
|
||||
|
||||
Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc.
|
||||
Individual span scores can be found in `spangroup.attrs["scores"]`.
|
||||
This component comes in two forms: `spancat` and `spancat_singlelabel` (added in
|
||||
spaCy v3.5.1). When you need to perform multi-label classification on your
|
||||
spans, use `spancat`. The `spancat` component uses a `Logistic` layer where the
|
||||
output class probabilities are independent for each class. However, if you need
|
||||
to predict at most one true class for a span, then use `spancat_singlelabel`. It
|
||||
uses a `Softmax` layer and treats the task as a multi-class problem.
|
||||
|
||||
Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc
|
||||
under `doc.spans[spans_key]`, where `spans_key` is a component config setting.
|
||||
Individual span scores are stored in `doc.spans[spans_key].attrs["scores"]`.
|
||||
|
||||
## Assigned Attributes {id="assigned-attributes"}
|
||||
|
||||
|
@ -22,7 +30,9 @@ Predictions will be saved to `Doc.spans[spans_key]` as a
|
|||
[`SpanGroup`](/api/spangroup). The scores for the spans in the `SpanGroup` will
|
||||
be saved in `SpanGroup.attrs["scores"]`.
|
||||
|
||||
`spans_key` defaults to `"sc"`, but can be passed as a parameter.
|
||||
`spans_key` defaults to `"sc"`, but can be passed as a parameter. The `spancat`
|
||||
component will overwrite any existing spans under the spans key
|
||||
`doc.spans[spans_key]`.
|
||||
|
||||
| Location | Value |
|
||||
| -------------------------------------- | -------------------------------------------------------- |
|
||||
|
@ -38,7 +48,7 @@ how the component should be configured. You can override its settings via the
|
|||
[model architectures](/api/architectures) documentation for details on the
|
||||
architectures and their arguments and hyperparameters.
|
||||
|
||||
> #### Example
|
||||
> #### Example (spancat)
|
||||
>
|
||||
> ```python
|
||||
> from spacy.pipeline.spancat import DEFAULT_SPANCAT_MODEL
|
||||
|
@ -52,14 +62,33 @@ architectures and their arguments and hyperparameters.
|
|||
> nlp.add_pipe("spancat", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
|
||||
| `model` | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
|
||||
| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
|
||||
| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ |
|
||||
| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ |
|
||||
| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
|
||||
> #### Example (spancat_singlelabel)
|
||||
>
|
||||
> ```python
|
||||
> from spacy.pipeline.spancat import DEFAULT_SPANCAT_SINGLELABEL_MODEL
|
||||
> config = {
|
||||
> "threshold": 0.5,
|
||||
> "spans_key": "labeled_spans",
|
||||
> "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
|
||||
> "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
|
||||
> # Additional spancat_singlelabel parameters
|
||||
> "negative_weight": 0.8,
|
||||
> "allow_overlap": True,
|
||||
> }
|
||||
> nlp.add_pipe("spancat_singlelabel", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Description |
|
||||
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
|
||||
| `model` | A model instance that is given a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
|
||||
| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
|
||||
| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Meant to be used in combination with the multi-label `spancat` component with a `Logistic` scoring layer. Defaults to `0.5`. ~~float~~ |
|
||||
| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. Meant to be used together with the `spancat` component and defaults to 0 with `spancat_singlelabel`. ~~Optional[int]~~ |
|
||||
| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
|
||||
| `add_negative_label` <Tag variant="new">3.5.1</Tag> | Whether to learn to predict a special negative label for each unannotated `Span`. This should be `True` when using a `Softmax` classifier layer and so it's `True` by default for `spancat_singlelabel`. Spans with negative labels and their scores are not stored as annotations. ~~bool~~ |
|
||||
| `negative_weight` <Tag variant="new">3.5.1</Tag> | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~ |
|
||||
| `allow_overlap` <Tag variant="new">3.5.1</Tag> | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~ |
|
||||
|
||||
```python
|
||||
%%GITHUB_SPACY/spacy/pipeline/spancat.py
|
||||
|
@ -71,6 +100,7 @@ architectures and their arguments and hyperparameters.
|
|||
>
|
||||
> ```python
|
||||
> # Construction via add_pipe with default model
|
||||
> # Replace 'spancat' with 'spancat_singlelabel' for exclusive classes
|
||||
> spancat = nlp.add_pipe("spancat")
|
||||
>
|
||||
> # Construction via add_pipe with custom model
|
||||
|
@ -86,16 +116,19 @@ Create a new pipeline instance. In your application, you would normally use a
|
|||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#create_pipe).
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
|
||||
| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `spans_key` | Key of the [`Doc.spans`](/api/doc#sans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
|
||||
| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ |
|
||||
| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ |
|
||||
| Name | Description |
|
||||
| --------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | A model instance that is given a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
|
||||
| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
|
||||
| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ |
|
||||
| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ |
|
||||
| `allow_overlap` <Tag variant="new">3.5.1</Tag> | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~ |
|
||||
| `add_negative_label` <Tag variant="new">3.5.1</Tag> | Whether to learn to predict a special negative label for each unannotated `Span`. This should be `True` when using a `Softmax` classifier layer and so it's `True` by default for `spancat_singlelabel`. Spans with negative labels and their scores are not stored as annotations. ~~bool~~ |
|
||||
| `negative_weight` <Tag variant="new">3.5.1</Tag> | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~ |
|
||||
|
||||
## SpanCategorizer.\_\_call\_\_ {id="call",tag="method"}
|
||||
|
||||
|
|
|
@ -8,6 +8,13 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of
|
|||
integer IDs. This ensures that strings always map to the same ID, even from
|
||||
different `StringStores`.
|
||||
|
||||
<Infobox variant="warning">
|
||||
|
||||
Note that a `StringStore` instance is not static. It increases in size as texts
|
||||
with new tokens are processed.
|
||||
|
||||
</Infobox>
|
||||
|
||||
## StringStore.\_\_init\_\_ {id="init",tag="method"}
|
||||
|
||||
Create the `StringStore`.
|
||||
|
|
|
@ -25,7 +25,10 @@ and call the package's own `load()` method. If a pipeline is loaded from a path,
|
|||
spaCy will assume it's a data directory, load its
|
||||
[`config.cfg`](/api/data-formats#config) and use the language and pipeline
|
||||
information to construct the `Language` class. The data will be loaded in via
|
||||
[`Language.from_disk`](/api/language#from_disk).
|
||||
[`Language.from_disk`](/api/language#from_disk). Loading a pipeline from a
|
||||
package will also import any custom code, if present, whereas loading from a
|
||||
directory does not. For these cases, you need to manually import your custom
|
||||
code.
|
||||
|
||||
<Infobox variant="warning" title="Changed in v3.0">
|
||||
|
||||
|
@ -291,7 +294,7 @@ the `manual=True` argument in `displacy.render`.
|
|||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------- |
|
||||
| `orig_doc` | Doc to parse dependencies. ~~Doc~~ |
|
||||
| `orig_doc` | Doc or span to parse dependencies. ~~Union[Doc, Span]~~ |
|
||||
| `options` | Dependency parse specific visualisation options. ~~Dict[str, Any]~~ |
|
||||
| **RETURNS** | Generated dependency parse keyed by words and arcs. ~~dict~~ |
|
||||
|
||||
|
@ -354,22 +357,22 @@ If a setting is not present in the options, the default value will be used.
|
|||
> displacy.serve(doc, style="dep", options=options)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
|
||||
| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
|
||||
| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
|
||||
| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
|
||||
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
|
||||
| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
|
||||
| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
|
||||
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
|
||||
| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ |
|
||||
| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ |
|
||||
| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ |
|
||||
| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ |
|
||||
| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ |
|
||||
| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ |
|
||||
| Name | Description |
|
||||
| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
|
||||
| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
|
||||
| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
|
||||
| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
|
||||
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
|
||||
| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
|
||||
| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
|
||||
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
|
||||
| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ |
|
||||
| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ |
|
||||
| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ |
|
||||
| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ |
|
||||
| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ |
|
||||
| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ |
|
||||
|
||||
#### Named Entity Visualizer options {id="displacy_options-ent"}
|
||||
|
||||
|
@ -577,7 +580,7 @@ start decreasing across epochs.
|
|||
> ```ini
|
||||
> [training.logger]
|
||||
> @loggers = "spacy.ConsoleLogger.v3"
|
||||
> progress_bar = "all_steps"
|
||||
> progress_bar = "eval"
|
||||
> console_output = true
|
||||
> output_file = "training_log.jsonl"
|
||||
> ```
|
||||
|
|
|
@ -10,6 +10,13 @@ The `Vocab` object provides a lookup table that allows you to access
|
|||
[`StringStore`](/api/stringstore). It also owns underlying C-data that is shared
|
||||
between `Doc` objects.
|
||||
|
||||
<Infobox variant="warning">
|
||||
|
||||
Note that a `Vocab` instance is not static. It increases in size as texts with
|
||||
new tokens are processed.
|
||||
|
||||
</Infobox>
|
||||
|
||||
## Vocab.\_\_init\_\_ {id="init",tag="method"}
|
||||
|
||||
Create the vocabulary.
|
||||
|
|
|
@ -746,13 +746,16 @@ this by setting `initialize.init_tok2vec` to the filename of the `.bin` file
|
|||
that you want to use from pretraining.
|
||||
|
||||
A pretraining step that runs for 5 epochs with an output path of `pretrain/`, as
|
||||
an example, produces `pretrain/model0.bin` through `pretrain/model4.bin`. To
|
||||
make use of the final output, you could fill in this value in your config file:
|
||||
an example, produces `pretrain/model0.bin` through `pretrain/model4.bin` plus a
|
||||
copy of the last iteration as `pretrain/model-last.bin`. Additionally, you can
|
||||
configure `n_save_epoch` to tell pretraining in which epoch interval it should
|
||||
save the current training progress. To use the final output to initialize your
|
||||
`tok2vec` layer, you could fill in this value in your config file:
|
||||
|
||||
```ini {title="config.cfg"}
|
||||
|
||||
[paths]
|
||||
init_tok2vec = "pretrain/model4.bin"
|
||||
init_tok2vec = "pretrain/model-last.bin"
|
||||
|
||||
[initialize]
|
||||
init_tok2vec = ${paths.init_tok2vec}
|
||||
|
|
|
@ -1096,20 +1096,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
|
|||
come directly from
|
||||
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
|
||||
|
||||
| Symbol | Description |
|
||||
| --------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| `A < B` | `A` is the immediate dependent of `B`. |
|
||||
| `A > B` | `A` is the immediate head of `B`. |
|
||||
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
|
||||
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
|
||||
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
|
||||
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
|
||||
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
|
||||
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
|
||||
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
|
||||
| Symbol | Description |
|
||||
| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `A < B` | `A` is the immediate dependent of `B`. |
|
||||
| `A > B` | `A` is the immediate head of `B`. |
|
||||
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
|
||||
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
|
||||
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
|
||||
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(Semgrex counterpart: `..`)_. |
|
||||
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(Semgrex counterpart: `-`)_. |
|
||||
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(Semgrex counterpart: `--`)_. |
|
||||
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
|
||||
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
|
||||
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
|
||||
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
|
||||
| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||
| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i`. |
|
||||
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i`. |
|
||||
| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||
| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i`. |
|
||||
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i`. |
|
||||
|
||||
### Designing dependency matcher patterns {id="dependencymatcher-patterns"}
|
||||
|
||||
|
@ -1674,6 +1682,8 @@ def expand_person_entities(doc):
|
|||
if prev_token.text in ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms."):
|
||||
new_ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
|
||||
new_ents.append(new_ent)
|
||||
else:
|
||||
new_ents.append(ent)
|
||||
else:
|
||||
new_ents.append(ent)
|
||||
doc.ents = new_ents
|
||||
|
|
|
@ -758,6 +758,15 @@ any custom architectures, functions or
|
|||
your pipeline and registered when it's loaded. See the documentation on
|
||||
[saving and loading pipelines](/usage/saving-loading#models-custom) for details.
|
||||
|
||||
<Infobox variant="warning">
|
||||
|
||||
Note that the unpackaged models produced by `spacy train` are data directories
|
||||
that **do not include custom code**. You need to import the code in your script
|
||||
before loading in unpackaged models. For more details, see
|
||||
[`spacy.load`](/api/top-level#spacy.load).
|
||||
|
||||
</Infobox>
|
||||
|
||||
#### Example: Modifying the nlp object {id="custom-code-nlp-callbacks"}
|
||||
|
||||
For many use cases, you don't necessarily want to implement the whole `Language`
|
||||
|
|
|
@ -58,12 +58,12 @@ arcs.
|
|||
|
||||
</Infobox>
|
||||
|
||||
| Argument | Description |
|
||||
| --------- | ----------------------------------------------------------------------------------------- |
|
||||
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
|
||||
| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
|
||||
| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
|
||||
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
|
||||
| Argument | Description |
|
||||
| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
|
||||
| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
|
||||
| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
|
||||
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
|
||||
|
||||
For a list of all available options, see the
|
||||
[`displacy` API documentation](/api/top-level#displacy_options).
|
||||
|
|
|
@ -1,5 +1,98 @@
|
|||
{
|
||||
"resources": [
|
||||
{
|
||||
"id": "parsigs",
|
||||
"title": "parsigs",
|
||||
"slogan": "Structuring prescriptions text made simple using spaCy",
|
||||
"description": "Parsigs is an open-source project that aims to extract the relevant dosage information from prescriptions text without compromising the patient's privacy.\n\nNotice you also need to install the model in order to use the package: `pip install https://huggingface.co/royashcenazi/en_parsigs/resolve/main/en_parsigs-any-py3-none-any.whl`",
|
||||
"github": "royashcenazi/parsigs",
|
||||
"pip": "parsigs",
|
||||
"code_language": "python",
|
||||
"author": "Roy Ashcenazi",
|
||||
"code_example": [
|
||||
"# You'll need to install the trained model, see instructions in the description section",
|
||||
"from parsigs.parse_sig_api import StructuredSig, SigParser",
|
||||
"sig_parser = SigParser()",
|
||||
"",
|
||||
"sig = 'Take 1 tablet of ibuprofen 200mg 3 times every day for 3 weeks'",
|
||||
"parsed_sig = sig_parser.parse(sig)"
|
||||
],
|
||||
"author_links": {
|
||||
"github": "royashcenazi"
|
||||
},
|
||||
"category": ["model", "research", "biomedical"],
|
||||
"tags": ["sigs", "prescription","pharma"]
|
||||
},
|
||||
{
|
||||
"id": "latincy",
|
||||
"title": "LatinCy",
|
||||
"thumb": "https://raw.githubusercontent.com/diyclassics/la_core_web_lg/main/latincy-logo.png",
|
||||
"slogan": "Synthetic trained spaCy pipelines for Latin NLP",
|
||||
"description": "Set of trained general purpose Latin-language 'core' pipelines for use with spaCy. The models are trained on a large amount of available Latin data, including all five of the Latin Universal Dependency treebanks, which have been preprocessed to be compatible with each other.",
|
||||
"url": "https://huggingface.co/latincy",
|
||||
"code_example": [
|
||||
"# pip install https://huggingface.co/latincy/la_core_web_lg/resolve/main/la_core_web_lg-any-py3-none-any.whl",
|
||||
"import spacy",
|
||||
"nlp = spacy.load('la_core_web_lg')",
|
||||
"doc = nlp('Haec narrantur a poetis de Perseo')",
|
||||
"",
|
||||
"print(f'{doc[0].text}, {doc[0].norm_}, {doc[0].lemma_}, {doc[0].pos_}')",
|
||||
"",
|
||||
"# > Haec, haec, hic, DET"
|
||||
],
|
||||
"code_language": "python",
|
||||
"author": "Patrick J. Burns",
|
||||
"author_links": {
|
||||
"twitter": "@diyclassics",
|
||||
"github": "diyclassics",
|
||||
"website": "https://diyclassics.github.io/"
|
||||
},
|
||||
"category": ["pipeline", "research"],
|
||||
"tags": ["latin"]
|
||||
},
|
||||
{
|
||||
"id": "spacy-wasm",
|
||||
"title": "spacy-wasm",
|
||||
"slogan": "spaCy in the browser using WebAssembly",
|
||||
"description": "Run spaCy directly in the browser with WebAssembly. Using Pyodide, the application loads the spaCy model and renders the text prompt with displaCy.",
|
||||
"url": "https://spacy-wasm.vercel.app/",
|
||||
"github": "SyedAhkam/spacy-wasm",
|
||||
"code_language": "python",
|
||||
"author": "Syed Ahkam",
|
||||
"author_links": {
|
||||
"twitter": "@SyedAhkam1",
|
||||
"github": "SyedAhkam"
|
||||
},
|
||||
"category": ["visualizers"],
|
||||
"tags": ["visualization", "deployment"]
|
||||
},
|
||||
{
|
||||
"id": "spacysee",
|
||||
"title": "spaCysee",
|
||||
"slogan": "Visualize spaCy's Dependency Parsing, POS tagging, and morphological analysis",
|
||||
"description": "A project that helps you visualize your spaCy docs in Jupyter notebooks. Each of the dependency tags, POS tags and morphological features are clickable. Clicking on a tag will bring up the relevant documentation for that tag.",
|
||||
"github": "moxley01/spacysee",
|
||||
"pip": "spacysee",
|
||||
"code_example": [
|
||||
"import spacy",
|
||||
"from spacysee import render",
|
||||
"",
|
||||
"nlp = spacy.load('en_core_web_sm')",
|
||||
"doc = nlp('This is a neat way to visualize your spaCy docs')",
|
||||
"render(doc, width='500', height='500')"
|
||||
],
|
||||
"code_language": "python",
|
||||
"thumb": "https://www.mattoxley.com/static/images/spacysee_logo.svg",
|
||||
"image": "https://www.mattoxley.com/static/images/spacysee_logo.svg",
|
||||
"author": "Matt Oxley",
|
||||
"author_links": {
|
||||
"twitter": "matt0xley",
|
||||
"github": "moxley01",
|
||||
"website": "https://mattoxley.com"
|
||||
},
|
||||
"category": ["visualizers"],
|
||||
"tags": ["visualization"]
|
||||
},
|
||||
{
|
||||
"id": "grecy",
|
||||
"title": "greCy",
|
||||
|
@ -1555,7 +1648,7 @@
|
|||
"twitter": "allenai_org",
|
||||
"website": "http://allenai.org"
|
||||
},
|
||||
"category": ["scientific", "models", "research"]
|
||||
"category": ["scientific", "models", "research", "biomedical"]
|
||||
},
|
||||
{
|
||||
"id": "textacy",
|
||||
|
@ -2767,6 +2860,56 @@
|
|||
"tags": ["coreference", "multi-lingual", "cross-lingual", "allennlp"],
|
||||
"spacy_version": 3
|
||||
},
|
||||
{
|
||||
"id": "adeptaugmentations",
|
||||
"title": "Adept Augmentations",
|
||||
"slogan": " A Python library aimed at dissecting and augmenting NER training data for a few-shot scenario.",
|
||||
"description": "EntitySwapAugmenter takes either a `datasets.Dataset` or a `spacy.tokens.DocBin`. Additionally, it is optional to provide a set of labels. It initially creates a knowledge base of entities belonging to a certain label. When running `augmenter.augment()` for N runs, it then creates N new sentences with random swaps of the original entities with an entity of the same corresponding label from the knowledge base.\n\nFor example, assuming that we have knowledge base for `PERSONS`, `LOCATIONS` and `PRODUCTS`. We can then create additional data for the sentence \"Momofuko Ando created instant noodles in Osaka.\" using `augmenter.augment(N=2)`, resulting in \"David created instant noodles in Madrid.\" or \"Tom created Adept Augmentations in the Netherlands\".",
|
||||
"github": "davidberenstein1957/adept-augmentations",
|
||||
"pip": "adept-augmentations",
|
||||
"thumb": "https://raw.githubusercontent.com/Pandora-Intelligence/crosslingual-coreference/master/img/logo.png",
|
||||
"code_example": [
|
||||
"import spacy",
|
||||
"from spacy.tokens import DocBin",
|
||||
"",
|
||||
"from adept_augmentations import EntitySwapAugmenter",
|
||||
"",
|
||||
"nlp = spacy.load(\"en_core_web_sm\")",
|
||||
"",
|
||||
"TRAIN_DATA = [",
|
||||
" \"Apple is looking at buying U.K. startup for $1 billion\",",
|
||||
" \"Microsoft acquires GitHub for $7.5 billion\"",
|
||||
"]",
|
||||
"docs = nlp.pipe(TRAIN_DATA)",
|
||||
"",
|
||||
"# Create a new DocBin",
|
||||
"doc_bin = DocBin(docs=docs)",
|
||||
"",
|
||||
"# Augment Data",
|
||||
"doc_bin = EntitySwapAugmenter(doc_bin).augment(4)",
|
||||
"for doc in doc_bin.get_docs(nlp.vocab):",
|
||||
" print(doc.text)",
|
||||
"",
|
||||
"# Output",
|
||||
"#",
|
||||
"# GitHub is looking at buying U.K. startup for $ 7.5 billion",
|
||||
"# Microsoft is looking at buying U.K. startup for $ 1 billion",
|
||||
"# Microsoft is looking at buying U.K. startup for $ 7.5 billion",
|
||||
"# GitHub is looking at buying U.K. startup for $ 1 billion",
|
||||
"# Microsoft acquires Apple for $ 7.5 billion",
|
||||
"# Apple acquires Microsoft for $ 1 billion",
|
||||
"# Microsoft acquires Microsoft for $ 7.5 billion",
|
||||
"# GitHub acquires GitHub for $ 1 billion"
|
||||
],
|
||||
"author": "David Berenstein",
|
||||
"author_links": {
|
||||
"github": "davidberenstein1957",
|
||||
"website": "https://www.linkedin.com/in/david-berenstein-1bab11105/"
|
||||
},
|
||||
"category": ["standalone"],
|
||||
"tags": ["ner", "few-shot", "augmentation", "datasets", "training"],
|
||||
"spacy_version": 3
|
||||
},
|
||||
{
|
||||
"id": "blackstone",
|
||||
"title": "Blackstone",
|
||||
|
@ -3215,6 +3358,51 @@
|
|||
"category": ["pipeline"],
|
||||
"tags": ["syllables", "multilingual"]
|
||||
},
|
||||
{
|
||||
"id": "sentimental-onix",
|
||||
"title": "Sentimental Onix",
|
||||
"slogan": "Use onnx for sentiment models",
|
||||
"description": "spaCy pipeline component for sentiment analysis using onnx",
|
||||
"github": "sloev/sentimental-onix",
|
||||
"pip": "sentimental-onix",
|
||||
"code_example": [
|
||||
"# Download model:",
|
||||
"# python -m sentimental_onix download en",
|
||||
"import spacy",
|
||||
"from sentimental_onix import pipeline",
|
||||
"",
|
||||
"nlp = spacy.load(\"en_core_web_sm\")",
|
||||
"nlp.add_pipe(\"sentencizer\")",
|
||||
"nlp.add_pipe(\"sentimental_onix\", after=\"sentencizer\")",
|
||||
"",
|
||||
"sentences = [",
|
||||
" (sent.text, sent._.sentiment)",
|
||||
" for doc in nlp.pipe(",
|
||||
" [",
|
||||
" \"i hate pasta on tuesdays\",",
|
||||
" \"i like movies on wednesdays\",",
|
||||
" \"i find your argument ridiculous\",",
|
||||
" \"soda with straws are my favorite\",",
|
||||
" ]",
|
||||
" )",
|
||||
" for sent in doc.sents",
|
||||
"]",
|
||||
"",
|
||||
"assert sentences == [",
|
||||
" (\"i hate pasta on tuesdays\", \"Negative\"),",
|
||||
" (\"i like movies on wednesdays\", \"Positive\"),",
|
||||
" (\"i find your argument ridiculous\", \"Negative\"),",
|
||||
" (\"soda with straws are my favorite\", \"Positive\"),",
|
||||
"]"
|
||||
],
|
||||
"thumb": "https://raw.githubusercontent.com/sloev/sentimental-onix/master/.github/onix.webp",
|
||||
"author": "Johannes Valbjørn",
|
||||
"author_links": {
|
||||
"github": "sloev"
|
||||
},
|
||||
"category": ["pipeline"],
|
||||
"tags": ["sentiment", "english"]
|
||||
},
|
||||
{
|
||||
"id": "gobbli",
|
||||
"title": "gobbli",
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
"dev": "next dev",
|
||||
"build": "next build && npm run sitemap && next export",
|
||||
"prebuild": "pip install -r setup/requirements.txt && sh setup/setup.sh",
|
||||
"predev": "npm run prebuild",
|
||||
"sitemap": "next-sitemap --config next-sitemap.config.mjs",
|
||||
"start": "next start",
|
||||
"lint": "next lint",
|
||||
|
|
|
@ -111,11 +111,12 @@
|
|||
line-height: var(--line-height-xs)
|
||||
text-align: center
|
||||
|
||||
@include breakpoint(max, xs)
|
||||
.list
|
||||
@include breakpoint(max, md)
|
||||
.alert
|
||||
display: none
|
||||
|
||||
.alert
|
||||
@include breakpoint(max, xs)
|
||||
.list
|
||||
display: none
|
||||
|
||||
.has-alert
|
||||
|
|
|
@ -57,9 +57,15 @@ const AlertSpace = ({ nightly, legacy }) => {
|
|||
)
|
||||
}
|
||||
|
||||
// const navAlert = (
|
||||
// <Link to="/usage/v3-5" noLinkLayout>
|
||||
// <strong>💥 Out now:</strong> spaCy v3.5
|
||||
// </Link>
|
||||
// )
|
||||
|
||||
const navAlert = (
|
||||
<Link to="/usage/v3-5" noLinkLayout>
|
||||
<strong>💥 Out now:</strong> spaCy v3.5
|
||||
<Link to="https://form.typeform.com/to/aMel9q9f" noLinkLayout>
|
||||
<strong>💥 Take the user survey!</strong>
|
||||
</Link>
|
||||
)
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user