mirror of
https://github.com/explosion/spaCy.git
synced 2025-06-05 05:33:15 +03:00
Merge pull request #12494 from adrianeboyd/backport/v3.5.2-1
Backports for v3.5.2
This commit is contained in:
commit
e4bbdf7b50
24
.github/azure-steps.yml
vendored
24
.github/azure-steps.yml
vendored
|
@ -57,51 +57,51 @@ steps:
|
||||||
python -m spacy download ca_core_news_md
|
python -m spacy download ca_core_news_md
|
||||||
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
||||||
displayName: 'Test download CLI'
|
displayName: 'Test download CLI'
|
||||||
condition: eq(variables['python_version'], '3.8')
|
condition: eq(variables['python_version'], '3.9')
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
python -W error -m spacy info ca_core_news_sm | grep -q download_url
|
python -W error -m spacy info ca_core_news_sm | grep -q download_url
|
||||||
displayName: 'Test download_url in info CLI'
|
displayName: 'Test download_url in info CLI'
|
||||||
condition: eq(variables['python_version'], '3.8')
|
condition: eq(variables['python_version'], '3.9')
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
||||||
displayName: 'Test no warnings on load (#11713)'
|
displayName: 'Test no warnings on load (#11713)'
|
||||||
condition: eq(variables['python_version'], '3.8')
|
condition: eq(variables['python_version'], '3.9')
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
|
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
|
||||||
displayName: 'Test convert CLI'
|
displayName: 'Test convert CLI'
|
||||||
condition: eq(variables['python_version'], '3.8')
|
condition: eq(variables['python_version'], '3.9')
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
python -m spacy init config -p ner -l ca ner.cfg
|
python -m spacy init config -p ner -l ca ner.cfg
|
||||||
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
|
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
|
||||||
displayName: 'Test debug config CLI'
|
displayName: 'Test debug config CLI'
|
||||||
condition: eq(variables['python_version'], '3.8')
|
condition: eq(variables['python_version'], '3.9')
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
# will have errors due to sparse data, check for summary in output
|
# will have errors due to sparse data, check for summary in output
|
||||||
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
|
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
|
||||||
displayName: 'Test debug data CLI'
|
displayName: 'Test debug data CLI'
|
||||||
condition: eq(variables['python_version'], '3.8')
|
condition: eq(variables['python_version'], '3.9')
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
|
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
|
||||||
displayName: 'Test train CLI'
|
displayName: 'Test train CLI'
|
||||||
condition: eq(variables['python_version'], '3.8')
|
condition: eq(variables['python_version'], '3.9')
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
||||||
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
||||||
displayName: 'Test assemble CLI'
|
displayName: 'Test assemble CLI'
|
||||||
condition: eq(variables['python_version'], '3.8')
|
condition: eq(variables['python_version'], '3.9')
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
||||||
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
||||||
displayName: 'Test assemble CLI vectors warning'
|
displayName: 'Test assemble CLI vectors warning'
|
||||||
condition: eq(variables['python_version'], '3.8')
|
condition: eq(variables['python_version'], '3.9')
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
python -m pip install -U -r requirements.txt
|
python -m pip install -U -r requirements.txt
|
||||||
|
@ -116,9 +116,3 @@ steps:
|
||||||
python -m pytest --pyargs spacy
|
python -m pytest --pyargs spacy
|
||||||
displayName: "Run CPU tests with thinc-apple-ops"
|
displayName: "Run CPU tests with thinc-apple-ops"
|
||||||
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
|
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
|
||||||
|
|
||||||
- script: |
|
|
||||||
python .github/validate_universe_json.py website/meta/universe.json
|
|
||||||
displayName: 'Test website/meta/universe.json'
|
|
||||||
condition: eq(variables['python_version'], '3.8')
|
|
||||||
|
|
||||||
|
|
45
.github/workflows/autoblack.yml
vendored
45
.github/workflows/autoblack.yml
vendored
|
@ -1,45 +0,0 @@
|
||||||
# GitHub Action that uses Black to reformat all Python code and submits a PR
|
|
||||||
# in regular intervals. Inspired by: https://github.com/cclauss/autoblack
|
|
||||||
|
|
||||||
name: autoblack
|
|
||||||
on:
|
|
||||||
workflow_dispatch: # allow manual trigger
|
|
||||||
schedule:
|
|
||||||
- cron: '0 8 * * 5' # every Friday at 8am UTC
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
autoblack:
|
|
||||||
if: github.repository_owner == 'explosion'
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v3
|
|
||||||
with:
|
|
||||||
ref: ${{ github.head_ref }}
|
|
||||||
- uses: actions/setup-python@v4
|
|
||||||
- run: pip install black -c requirements.txt
|
|
||||||
- name: Auto-format code if needed
|
|
||||||
run: black spacy
|
|
||||||
# We can't run black --check here because that returns a non-zero excit
|
|
||||||
# code and makes GitHub think the action failed
|
|
||||||
- name: Check for modified files
|
|
||||||
id: git-check
|
|
||||||
run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
|
|
||||||
|
|
||||||
- name: Create Pull Request
|
|
||||||
if: steps.git-check.outputs.modified == 'true'
|
|
||||||
uses: peter-evans/create-pull-request@v4
|
|
||||||
with:
|
|
||||||
title: Auto-format code with black
|
|
||||||
labels: meta
|
|
||||||
commit-message: Auto-format code with black
|
|
||||||
committer: GitHub <noreply@github.com>
|
|
||||||
author: explosion-bot <explosion-bot@users.noreply.github.com>
|
|
||||||
body: _This PR is auto-generated._
|
|
||||||
branch: autoblack
|
|
||||||
delete-branch: true
|
|
||||||
draft: false
|
|
||||||
- name: Check outputs
|
|
||||||
if: steps.git-check.outputs.modified == 'true'
|
|
||||||
run: |
|
|
||||||
echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
|
|
||||||
echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"
|
|
1
.github/workflows/explosionbot.yml
vendored
1
.github/workflows/explosionbot.yml
vendored
|
@ -8,6 +8,7 @@ on:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
explosion-bot:
|
explosion-bot:
|
||||||
|
if: github.repository_owner == 'explosion'
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Dump GitHub context
|
- name: Dump GitHub context
|
||||||
|
|
1
.github/workflows/issue-manager.yml
vendored
1
.github/workflows/issue-manager.yml
vendored
|
@ -13,6 +13,7 @@ on:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
issue-manager:
|
issue-manager:
|
||||||
|
if: github.repository_owner == 'explosion'
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: tiangolo/issue-manager@0.4.0
|
- uses: tiangolo/issue-manager@0.4.0
|
||||||
|
|
1
.github/workflows/lock.yml
vendored
1
.github/workflows/lock.yml
vendored
|
@ -13,6 +13,7 @@ concurrency:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
action:
|
action:
|
||||||
|
if: github.repository_owner == 'explosion'
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: dessant/lock-threads@v4
|
- uses: dessant/lock-threads@v4
|
||||||
|
|
1
.github/workflows/spacy_universe_alert.yml
vendored
1
.github/workflows/spacy_universe_alert.yml
vendored
|
@ -7,6 +7,7 @@ on:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build:
|
build:
|
||||||
|
if: github.repository_owner == 'explosion'
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
|
|
173
.github/workflows/tests.yml
vendored
Normal file
173
.github/workflows/tests.yml
vendored
Normal file
|
@ -0,0 +1,173 @@
|
||||||
|
name: tests
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches-ignore:
|
||||||
|
- "spacy.io"
|
||||||
|
- "nightly.spacy.io"
|
||||||
|
- "v2.spacy.io"
|
||||||
|
paths-ignore:
|
||||||
|
- "*.md"
|
||||||
|
- "*.mdx"
|
||||||
|
- "website/**"
|
||||||
|
- ".github/workflows/**"
|
||||||
|
pull_request:
|
||||||
|
types: [opened, synchronize, reopened, edited]
|
||||||
|
paths-ignore:
|
||||||
|
- "*.md"
|
||||||
|
- "*.mdx"
|
||||||
|
- "website/**"
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
validate:
|
||||||
|
name: Validate
|
||||||
|
if: github.repository_owner == 'explosion'
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Check out repo
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Configure Python version
|
||||||
|
uses: actions/setup-python@v4
|
||||||
|
with:
|
||||||
|
python-version: "3.7"
|
||||||
|
architecture: x64
|
||||||
|
|
||||||
|
- name: black
|
||||||
|
run: |
|
||||||
|
python -m pip install black -c requirements.txt
|
||||||
|
python -m black spacy --check
|
||||||
|
- name: flake8
|
||||||
|
run: |
|
||||||
|
python -m pip install flake8==5.0.4
|
||||||
|
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
||||||
|
tests:
|
||||||
|
name: Test
|
||||||
|
needs: Validate
|
||||||
|
strategy:
|
||||||
|
fail-fast: true
|
||||||
|
matrix:
|
||||||
|
os: [ubuntu-latest, windows-latest, macos-latest]
|
||||||
|
python_version: ["3.11"]
|
||||||
|
include:
|
||||||
|
- os: ubuntu-20.04
|
||||||
|
python_version: "3.6"
|
||||||
|
- os: windows-latest
|
||||||
|
python_version: "3.7"
|
||||||
|
- os: macos-latest
|
||||||
|
python_version: "3.8"
|
||||||
|
- os: ubuntu-latest
|
||||||
|
python_version: "3.9"
|
||||||
|
- os: windows-latest
|
||||||
|
python_version: "3.10"
|
||||||
|
|
||||||
|
runs-on: ${{ matrix.os }}
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Check out repo
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Configure Python version
|
||||||
|
uses: actions/setup-python@v4
|
||||||
|
with:
|
||||||
|
python-version: ${{ matrix.python_version }}
|
||||||
|
architecture: x64
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
python -m pip install -U build pip setuptools
|
||||||
|
python -m pip install -U -r requirements.txt
|
||||||
|
|
||||||
|
- name: Build sdist
|
||||||
|
run: |
|
||||||
|
python -m build --sdist
|
||||||
|
|
||||||
|
- name: Run mypy
|
||||||
|
run: |
|
||||||
|
python -m mypy spacy
|
||||||
|
if: matrix.python_version != '3.6'
|
||||||
|
|
||||||
|
- name: Delete source directory and .egg-info
|
||||||
|
run: |
|
||||||
|
rm -rf spacy *.egg-info
|
||||||
|
shell: bash
|
||||||
|
|
||||||
|
- name: Uninstall all packages
|
||||||
|
run: |
|
||||||
|
python -m pip freeze
|
||||||
|
python -m pip freeze --exclude pywin32 > installed.txt
|
||||||
|
python -m pip uninstall -y -r installed.txt
|
||||||
|
|
||||||
|
- name: Install from sdist
|
||||||
|
run: |
|
||||||
|
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
|
||||||
|
SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
|
||||||
|
shell: bash
|
||||||
|
|
||||||
|
- name: Test import
|
||||||
|
run: python -W error -c "import spacy"
|
||||||
|
|
||||||
|
- name: "Test download CLI"
|
||||||
|
run: |
|
||||||
|
python -m spacy download ca_core_news_sm
|
||||||
|
python -m spacy download ca_core_news_md
|
||||||
|
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
||||||
|
if: matrix.python_version == '3.9'
|
||||||
|
|
||||||
|
- name: "Test download_url in info CLI"
|
||||||
|
run: |
|
||||||
|
python -W error -m spacy info ca_core_news_sm | grep -q download_url
|
||||||
|
if: matrix.python_version == '3.9'
|
||||||
|
|
||||||
|
- name: "Test no warnings on load (#11713)"
|
||||||
|
run: |
|
||||||
|
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
||||||
|
if: matrix.python_version == '3.9'
|
||||||
|
|
||||||
|
- name: "Test convert CLI"
|
||||||
|
run: |
|
||||||
|
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
|
||||||
|
if: matrix.python_version == '3.9'
|
||||||
|
|
||||||
|
- name: "Test debug config CLI"
|
||||||
|
run: |
|
||||||
|
python -m spacy init config -p ner -l ca ner.cfg
|
||||||
|
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
|
||||||
|
if: matrix.python_version == '3.9'
|
||||||
|
|
||||||
|
- name: "Test debug data CLI"
|
||||||
|
run: |
|
||||||
|
# will have errors due to sparse data, check for summary in output
|
||||||
|
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
|
||||||
|
if: matrix.python_version == '3.9'
|
||||||
|
|
||||||
|
- name: "Test train CLI"
|
||||||
|
run: |
|
||||||
|
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
|
||||||
|
if: matrix.python_version == '3.9'
|
||||||
|
|
||||||
|
- name: "Test assemble CLI"
|
||||||
|
run: |
|
||||||
|
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
||||||
|
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
||||||
|
if: matrix.python_version == '3.9'
|
||||||
|
|
||||||
|
- name: "Test assemble CLI vectors warning"
|
||||||
|
run: |
|
||||||
|
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
||||||
|
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
||||||
|
if: matrix.python_version == '3.9'
|
||||||
|
|
||||||
|
- name: "Install test requirements"
|
||||||
|
run: |
|
||||||
|
python -m pip install -U -r requirements.txt
|
||||||
|
|
||||||
|
- name: "Run CPU tests"
|
||||||
|
run: |
|
||||||
|
python -m pytest --pyargs spacy -W error
|
||||||
|
|
||||||
|
- name: "Run CPU tests with thinc-apple-ops"
|
||||||
|
run: |
|
||||||
|
python -m pip install 'spacy[apple]'
|
||||||
|
python -m pytest --pyargs spacy
|
||||||
|
if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11'
|
33
.github/workflows/universe_validation.yml
vendored
Normal file
33
.github/workflows/universe_validation.yml
vendored
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
name: universe validation
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches-ignore:
|
||||||
|
- "spacy.io"
|
||||||
|
- "nightly.spacy.io"
|
||||||
|
- "v2.spacy.io"
|
||||||
|
paths:
|
||||||
|
- "website/meta/universe.json"
|
||||||
|
pull_request:
|
||||||
|
types: [opened, synchronize, reopened, edited]
|
||||||
|
paths:
|
||||||
|
- "website/meta/universe.json"
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
validate:
|
||||||
|
name: Validate
|
||||||
|
if: github.repository_owner == 'explosion'
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Check out repo
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Configure Python version
|
||||||
|
uses: actions/setup-python@v4
|
||||||
|
with:
|
||||||
|
python-version: "3.7"
|
||||||
|
architecture: x64
|
||||||
|
|
||||||
|
- name: Validate website/meta/universe.json
|
||||||
|
run: |
|
||||||
|
python .github/validate_universe_json.py website/meta/universe.json
|
|
@ -48,6 +48,9 @@ jobs:
|
||||||
pip install flake8==5.0.4
|
pip install flake8==5.0.4
|
||||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
||||||
displayName: "flake8"
|
displayName: "flake8"
|
||||||
|
- script: |
|
||||||
|
python .github/validate_universe_json.py website/meta/universe.json
|
||||||
|
displayName: 'Validate website/meta/universe.json'
|
||||||
|
|
||||||
- job: "Test"
|
- job: "Test"
|
||||||
dependsOn: "Validate"
|
dependsOn: "Validate"
|
||||||
|
|
36
setup.cfg
36
setup.cfg
|
@ -78,41 +78,41 @@ transformers =
|
||||||
ray =
|
ray =
|
||||||
spacy_ray>=0.1.0,<1.0.0
|
spacy_ray>=0.1.0,<1.0.0
|
||||||
cuda =
|
cuda =
|
||||||
cupy>=5.0.0b4,<12.0.0
|
cupy>=5.0.0b4,<13.0.0
|
||||||
cuda80 =
|
cuda80 =
|
||||||
cupy-cuda80>=5.0.0b4,<12.0.0
|
cupy-cuda80>=5.0.0b4,<13.0.0
|
||||||
cuda90 =
|
cuda90 =
|
||||||
cupy-cuda90>=5.0.0b4,<12.0.0
|
cupy-cuda90>=5.0.0b4,<13.0.0
|
||||||
cuda91 =
|
cuda91 =
|
||||||
cupy-cuda91>=5.0.0b4,<12.0.0
|
cupy-cuda91>=5.0.0b4,<13.0.0
|
||||||
cuda92 =
|
cuda92 =
|
||||||
cupy-cuda92>=5.0.0b4,<12.0.0
|
cupy-cuda92>=5.0.0b4,<13.0.0
|
||||||
cuda100 =
|
cuda100 =
|
||||||
cupy-cuda100>=5.0.0b4,<12.0.0
|
cupy-cuda100>=5.0.0b4,<13.0.0
|
||||||
cuda101 =
|
cuda101 =
|
||||||
cupy-cuda101>=5.0.0b4,<12.0.0
|
cupy-cuda101>=5.0.0b4,<13.0.0
|
||||||
cuda102 =
|
cuda102 =
|
||||||
cupy-cuda102>=5.0.0b4,<12.0.0
|
cupy-cuda102>=5.0.0b4,<13.0.0
|
||||||
cuda110 =
|
cuda110 =
|
||||||
cupy-cuda110>=5.0.0b4,<12.0.0
|
cupy-cuda110>=5.0.0b4,<13.0.0
|
||||||
cuda111 =
|
cuda111 =
|
||||||
cupy-cuda111>=5.0.0b4,<12.0.0
|
cupy-cuda111>=5.0.0b4,<13.0.0
|
||||||
cuda112 =
|
cuda112 =
|
||||||
cupy-cuda112>=5.0.0b4,<12.0.0
|
cupy-cuda112>=5.0.0b4,<13.0.0
|
||||||
cuda113 =
|
cuda113 =
|
||||||
cupy-cuda113>=5.0.0b4,<12.0.0
|
cupy-cuda113>=5.0.0b4,<13.0.0
|
||||||
cuda114 =
|
cuda114 =
|
||||||
cupy-cuda114>=5.0.0b4,<12.0.0
|
cupy-cuda114>=5.0.0b4,<13.0.0
|
||||||
cuda115 =
|
cuda115 =
|
||||||
cupy-cuda115>=5.0.0b4,<12.0.0
|
cupy-cuda115>=5.0.0b4,<13.0.0
|
||||||
cuda116 =
|
cuda116 =
|
||||||
cupy-cuda116>=5.0.0b4,<12.0.0
|
cupy-cuda116>=5.0.0b4,<13.0.0
|
||||||
cuda117 =
|
cuda117 =
|
||||||
cupy-cuda117>=5.0.0b4,<12.0.0
|
cupy-cuda117>=5.0.0b4,<13.0.0
|
||||||
cuda11x =
|
cuda11x =
|
||||||
cupy-cuda11x>=11.0.0,<12.0.0
|
cupy-cuda11x>=11.0.0,<13.0.0
|
||||||
cuda-autodetect =
|
cuda-autodetect =
|
||||||
cupy-wheel>=11.0.0,<12.0.0
|
cupy-wheel>=11.0.0,<13.0.0
|
||||||
apple =
|
apple =
|
||||||
thinc-apple-ops>=0.1.0.dev0,<1.0.0
|
thinc-apple-ops>=0.1.0.dev0,<1.0.0
|
||||||
# Language tokenizers with external dependencies
|
# Language tokenizers with external dependencies
|
||||||
|
|
|
@ -35,7 +35,7 @@ def find_threshold_cli(
|
||||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||||
use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||||
gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
|
gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
|
||||||
verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"),
|
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -23,6 +23,7 @@ def pretrain_cli(
|
||||||
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
|
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
|
||||||
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
|
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
|
||||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||||
|
skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
@ -74,6 +75,7 @@ def pretrain_cli(
|
||||||
epoch_resume=epoch_resume,
|
epoch_resume=epoch_resume,
|
||||||
use_gpu=use_gpu,
|
use_gpu=use_gpu,
|
||||||
silent=False,
|
silent=False,
|
||||||
|
skip_last=skip_last,
|
||||||
)
|
)
|
||||||
msg.good("Successfully finished pretrain")
|
msg.good("Successfully finished pretrain")
|
||||||
|
|
||||||
|
|
|
@ -125,13 +125,17 @@ def app(environ, start_response):
|
||||||
return [res]
|
return [res]
|
||||||
|
|
||||||
|
|
||||||
def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
def parse_deps(
|
||||||
|
orig_doc: Union[Doc, Span], options: Dict[str, Any] = {}
|
||||||
|
) -> Dict[str, Any]:
|
||||||
"""Generate dependency parse in {'words': [], 'arcs': []} format.
|
"""Generate dependency parse in {'words': [], 'arcs': []} format.
|
||||||
|
|
||||||
orig_doc (Doc): Document to parse.
|
orig_doc (Union[Doc, Span]): Document to parse.
|
||||||
options (Dict[str, Any]): Dependency parse specific visualisation options.
|
options (Dict[str, Any]): Dependency parse specific visualisation options.
|
||||||
RETURNS (dict): Generated dependency parse keyed by words and arcs.
|
RETURNS (dict): Generated dependency parse keyed by words and arcs.
|
||||||
"""
|
"""
|
||||||
|
if isinstance(orig_doc, Span):
|
||||||
|
orig_doc = orig_doc.as_doc()
|
||||||
doc = Doc(orig_doc.vocab).from_bytes(
|
doc = Doc(orig_doc.vocab).from_bytes(
|
||||||
orig_doc.to_bytes(exclude=["user_data", "user_hooks"])
|
orig_doc.to_bytes(exclude=["user_data", "user_hooks"])
|
||||||
)
|
)
|
||||||
|
|
|
@ -549,8 +549,8 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
"during training, make sure to include it in 'annotating components'")
|
"during training, make sure to include it in 'annotating components'")
|
||||||
|
|
||||||
# New errors added in v3.x
|
# New errors added in v3.x
|
||||||
E850 = ("The PretrainVectors objective currently only supports default "
|
E850 = ("The PretrainVectors objective currently only supports default or "
|
||||||
"vectors, not {mode} vectors.")
|
"floret vectors, not {mode} vectors.")
|
||||||
E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
|
E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
|
||||||
"but found value of '{val}'.")
|
"but found value of '{val}'.")
|
||||||
E852 = ("The tar file pulled from the remote attempted an unsafe path "
|
E852 = ("The tar file pulled from the remote attempted an unsafe path "
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
from typing import Any, Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING, cast
|
from typing import Any, Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING, cast
|
||||||
from thinc.types import Floats2d
|
from thinc.types import Floats2d, Ints1d
|
||||||
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
|
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
|
||||||
from thinc.api import MultiSoftmax, list2array
|
from thinc.api import MultiSoftmax, list2array
|
||||||
from thinc.api import to_categorical, CosineDistance, L2Distance
|
from thinc.api import to_categorical, CosineDistance, L2Distance
|
||||||
|
@ -7,7 +7,7 @@ from thinc.loss import Loss
|
||||||
|
|
||||||
from ...util import registry, OOV_RANK
|
from ...util import registry, OOV_RANK
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
from ...attrs import ID
|
from ...attrs import ID, ORTH
|
||||||
from ...vectors import Mode as VectorsMode
|
from ...vectors import Mode as VectorsMode
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
|
@ -24,8 +24,6 @@ def create_pretrain_vectors(
|
||||||
maxout_pieces: int, hidden_size: int, loss: str
|
maxout_pieces: int, hidden_size: int, loss: str
|
||||||
) -> Callable[["Vocab", Model], Model]:
|
) -> Callable[["Vocab", Model], Model]:
|
||||||
def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
|
def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
|
||||||
if vocab.vectors.mode != VectorsMode.default:
|
|
||||||
raise ValueError(Errors.E850.format(mode=vocab.vectors.mode))
|
|
||||||
if vocab.vectors.shape[1] == 0:
|
if vocab.vectors.shape[1] == 0:
|
||||||
raise ValueError(Errors.E875)
|
raise ValueError(Errors.E875)
|
||||||
model = build_cloze_multi_task_model(
|
model = build_cloze_multi_task_model(
|
||||||
|
@ -70,14 +68,23 @@ def get_vectors_loss(ops, docs, prediction, distance):
|
||||||
"""Compute a loss based on a distance between the documents' vectors and
|
"""Compute a loss based on a distance between the documents' vectors and
|
||||||
the prediction.
|
the prediction.
|
||||||
"""
|
"""
|
||||||
|
vocab = docs[0].vocab
|
||||||
|
if vocab.vectors.mode == VectorsMode.default:
|
||||||
# The simplest way to implement this would be to vstack the
|
# The simplest way to implement this would be to vstack the
|
||||||
# token.vector values, but that's a bit inefficient, especially on GPU.
|
# token.vector values, but that's a bit inefficient, especially on GPU.
|
||||||
# Instead we fetch the index into the vectors table for each of our tokens,
|
# Instead we fetch the index into the vectors table for each of our
|
||||||
# and look them up all at once. This prevents data copying.
|
# tokens, and look them up all at once. This prevents data copying.
|
||||||
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
|
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
|
||||||
target = docs[0].vocab.vectors.data[ids]
|
target = docs[0].vocab.vectors.data[ids]
|
||||||
target[ids == OOV_RANK] = 0
|
target[ids == OOV_RANK] = 0
|
||||||
d_target, loss = distance(prediction, target)
|
d_target, loss = distance(prediction, target)
|
||||||
|
elif vocab.vectors.mode == VectorsMode.floret:
|
||||||
|
keys = ops.flatten([cast(Ints1d, doc.to_array(ORTH)) for doc in docs])
|
||||||
|
target = vocab.vectors.get_batch(keys)
|
||||||
|
target = ops.as_contig(target)
|
||||||
|
d_target, loss = distance(prediction, target)
|
||||||
|
else:
|
||||||
|
raise ValueError(Errors.E850.format(mode=vocab.vectors.mode))
|
||||||
return loss, d_target
|
return loss, d_target
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -474,18 +474,24 @@ class EntityLinker(TrainablePipe):
|
||||||
|
|
||||||
# Looping through each entity in batch (TODO: rewrite)
|
# Looping through each entity in batch (TODO: rewrite)
|
||||||
for j, ent in enumerate(ent_batch):
|
for j, ent in enumerate(ent_batch):
|
||||||
sent_index = sentences.index(ent.sent)
|
assert hasattr(ent, "sents")
|
||||||
assert sent_index >= 0
|
sents = list(ent.sents)
|
||||||
|
sent_indices = (
|
||||||
|
sentences.index(sents[0]),
|
||||||
|
sentences.index(sents[-1]),
|
||||||
|
)
|
||||||
|
assert sent_indices[1] >= sent_indices[0] >= 0
|
||||||
|
|
||||||
if self.incl_context:
|
if self.incl_context:
|
||||||
# get n_neighbour sentences, clipped to the length of the document
|
# get n_neighbour sentences, clipped to the length of the document
|
||||||
start_sentence = max(0, sent_index - self.n_sents)
|
start_sentence = max(0, sent_indices[0] - self.n_sents)
|
||||||
end_sentence = min(
|
end_sentence = min(
|
||||||
len(sentences) - 1, sent_index + self.n_sents
|
len(sentences) - 1, sent_indices[1] + self.n_sents
|
||||||
)
|
)
|
||||||
start_token = sentences[start_sentence].start
|
start_token = sentences[start_sentence].start
|
||||||
end_token = sentences[end_sentence].end
|
end_token = sentences[end_sentence].end
|
||||||
sent_doc = doc[start_token:end_token].as_doc()
|
sent_doc = doc[start_token:end_token].as_doc()
|
||||||
|
|
||||||
# currently, the context is the same for each entity in a sentence (should be refined)
|
# currently, the context is the same for each entity in a sentence (should be refined)
|
||||||
sentence_encoding = self.model.predict([sent_doc])[0]
|
sentence_encoding = self.model.predict([sent_doc])[0]
|
||||||
sentence_encoding_t = sentence_encoding.T
|
sentence_encoding_t = sentence_encoding.T
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast, Union
|
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast, Union
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
from functools import partial
|
||||||
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
|
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
|
||||||
from thinc.api import Optimizer
|
from thinc.api import Optimizer
|
||||||
from thinc.types import Ragged, Ints2d, Floats2d
|
from thinc.types import Ragged, Ints2d, Floats2d
|
||||||
|
@ -82,13 +83,9 @@ class Suggester(Protocol):
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
@registry.misc("spacy.ngram_suggester.v1")
|
def ngram_suggester(
|
||||||
def build_ngram_suggester(sizes: List[int]) -> Suggester:
|
docs: Iterable[Doc], sizes: List[int], *, ops: Optional[Ops] = None
|
||||||
"""Suggest all spans of the given lengths. Spans are returned as a ragged
|
) -> Ragged:
|
||||||
array of integers. The array has two columns, indicating the start and end
|
|
||||||
position."""
|
|
||||||
|
|
||||||
def ngram_suggester(docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged:
|
|
||||||
if ops is None:
|
if ops is None:
|
||||||
ops = get_current_ops()
|
ops = get_current_ops()
|
||||||
spans = []
|
spans = []
|
||||||
|
@ -114,7 +111,14 @@ def build_ngram_suggester(sizes: List[int]) -> Suggester:
|
||||||
assert output.dataXd.ndim == 2
|
assert output.dataXd.ndim == 2
|
||||||
return output
|
return output
|
||||||
|
|
||||||
return ngram_suggester
|
|
||||||
|
@registry.misc("spacy.ngram_suggester.v1")
|
||||||
|
def build_ngram_suggester(sizes: List[int]) -> Suggester:
|
||||||
|
"""Suggest all spans of the given lengths. Spans are returned as a ragged
|
||||||
|
array of integers. The array has two columns, indicating the start and end
|
||||||
|
position."""
|
||||||
|
|
||||||
|
return partial(ngram_suggester, sizes=sizes)
|
||||||
|
|
||||||
|
|
||||||
@registry.misc("spacy.ngram_range_suggester.v1")
|
@registry.misc("spacy.ngram_range_suggester.v1")
|
||||||
|
@ -726,6 +730,7 @@ class SpanCategorizer(TrainablePipe):
|
||||||
if not allow_overlap:
|
if not allow_overlap:
|
||||||
# Get the probabilities
|
# Get the probabilities
|
||||||
sort_idx = (argmax_scores.squeeze() * -1).argsort()
|
sort_idx = (argmax_scores.squeeze() * -1).argsort()
|
||||||
|
argmax_scores = argmax_scores[sort_idx]
|
||||||
predicted = predicted[sort_idx]
|
predicted = predicted[sort_idx]
|
||||||
indices = indices[sort_idx]
|
indices = indices[sort_idx]
|
||||||
keeps = keeps[sort_idx]
|
keeps = keeps[sort_idx]
|
||||||
|
@ -748,4 +753,5 @@ class SpanCategorizer(TrainablePipe):
|
||||||
attrs_scores.append(argmax_scores[i])
|
attrs_scores.append(argmax_scores[i])
|
||||||
spans.append(Span(doc, start, end, label=self.labels[label]))
|
spans.append(Span(doc, start, end, label=self.labels[label]))
|
||||||
|
|
||||||
|
spans.attrs["scores"] = numpy.array(attrs_scores)
|
||||||
return spans
|
return spans
|
||||||
|
|
|
@ -700,3 +700,34 @@ def test_span_group_copy(doc):
|
||||||
assert len(doc.spans["test"]) == 3
|
assert len(doc.spans["test"]) == 3
|
||||||
# check that the copy spans were not modified and this is an isolated doc
|
# check that the copy spans were not modified and this is an isolated doc
|
||||||
assert len(doc_copy.spans["test"]) == 2
|
assert len(doc_copy.spans["test"]) == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_for_partial_ent_sents():
|
||||||
|
"""Spans may be associated with multiple sentences. These .sents should always be complete, not partial, sentences,
|
||||||
|
which this tests for.
|
||||||
|
"""
|
||||||
|
doc = Doc(
|
||||||
|
English().vocab,
|
||||||
|
words=["Mahler's", "Symphony", "No.", "8", "was", "beautiful."],
|
||||||
|
sent_starts=[1, 0, 0, 1, 0, 0],
|
||||||
|
)
|
||||||
|
doc.set_ents([Span(doc, 1, 4, "WORK")])
|
||||||
|
# The specified entity is associated with both sentences in this doc, so we expect all sentences in the doc to be
|
||||||
|
# equal to the sentences referenced in ent.sents.
|
||||||
|
for doc_sent, ent_sent in zip(doc.sents, doc.ents[0].sents):
|
||||||
|
assert doc_sent == ent_sent
|
||||||
|
|
||||||
|
|
||||||
|
def test_for_no_ent_sents():
|
||||||
|
"""Span.sents() should set .sents correctly, even if Span in question is trailing and doesn't form a full
|
||||||
|
sentence.
|
||||||
|
"""
|
||||||
|
doc = Doc(
|
||||||
|
English().vocab,
|
||||||
|
words=["This", "is", "a", "test.", "ENTITY"],
|
||||||
|
sent_starts=[1, 0, 0, 0, 1],
|
||||||
|
)
|
||||||
|
doc.set_ents([Span(doc, 4, 5, "WORK")])
|
||||||
|
sents = list(doc.ents[0].sents)
|
||||||
|
assert len(sents) == 1
|
||||||
|
assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY"
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
from typing import Callable, Iterable, Dict, Any
|
from typing import Callable, Iterable, Dict, Any, Tuple
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from numpy.testing import assert_equal
|
from numpy.testing import assert_equal
|
||||||
|
|
||||||
from spacy import registry, util
|
from spacy import registry, util, Language
|
||||||
from spacy.attrs import ENT_KB_ID
|
from spacy.attrs import ENT_KB_ID
|
||||||
from spacy.compat import pickle
|
from spacy.compat import pickle
|
||||||
from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
|
from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
|
||||||
|
@ -108,18 +108,23 @@ def test_issue7065():
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.issue(7065)
|
@pytest.mark.issue(7065)
|
||||||
def test_issue7065_b():
|
@pytest.mark.parametrize("entity_in_first_sentence", [True, False])
|
||||||
|
def test_sentence_crossing_ents(entity_in_first_sentence: bool):
|
||||||
|
"""Tests if NEL crashes if entities cross sentence boundaries and the first associated sentence doesn't have an
|
||||||
|
entity.
|
||||||
|
entity_in_prior_sentence (bool): Whether to include an entity in the first sentence associated with the
|
||||||
|
sentence-crossing entity.
|
||||||
|
"""
|
||||||
# Test that the NEL doesn't crash when an entity crosses a sentence boundary
|
# Test that the NEL doesn't crash when an entity crosses a sentence boundary
|
||||||
nlp = English()
|
nlp = English()
|
||||||
vector_length = 3
|
vector_length = 3
|
||||||
nlp.add_pipe("sentencizer")
|
|
||||||
text = "Mahler 's Symphony No. 8 was beautiful."
|
text = "Mahler 's Symphony No. 8 was beautiful."
|
||||||
entities = [(0, 6, "PERSON"), (10, 24, "WORK")]
|
entities = [(10, 24, "WORK")]
|
||||||
links = {
|
links = {(10, 24): {"Q7304": 0.0, "Q270853": 1.0}}
|
||||||
(0, 6): {"Q7304": 1.0, "Q270853": 0.0},
|
if entity_in_first_sentence:
|
||||||
(10, 24): {"Q7304": 0.0, "Q270853": 1.0},
|
entities.append((0, 6, "PERSON"))
|
||||||
}
|
links[(0, 6)] = {"Q7304": 1.0, "Q270853": 0.0}
|
||||||
sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
|
sent_starts = [1, -1, 0, 0, 0, 1, 0, 0, 0]
|
||||||
doc = nlp(text)
|
doc = nlp(text)
|
||||||
example = Example.from_dict(
|
example = Example.from_dict(
|
||||||
doc, {"entities": entities, "links": links, "sent_starts": sent_starts}
|
doc, {"entities": entities, "links": links, "sent_starts": sent_starts}
|
||||||
|
@ -145,31 +150,14 @@ def test_issue7065_b():
|
||||||
|
|
||||||
# Create the Entity Linker component and add it to the pipeline
|
# Create the Entity Linker component and add it to the pipeline
|
||||||
entity_linker = nlp.add_pipe("entity_linker", last=True)
|
entity_linker = nlp.add_pipe("entity_linker", last=True)
|
||||||
entity_linker.set_kb(create_kb)
|
entity_linker.set_kb(create_kb) # type: ignore
|
||||||
# train the NEL pipe
|
# train the NEL pipe
|
||||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||||
for i in range(2):
|
for i in range(2):
|
||||||
losses = {}
|
nlp.update(train_examples, sgd=optimizer)
|
||||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
|
||||||
|
|
||||||
# Add a custom rule-based component to mimick NER
|
# This shouldn't crash.
|
||||||
patterns = [
|
entity_linker.predict([example.reference]) # type: ignore
|
||||||
{"label": "PERSON", "pattern": [{"LOWER": "mahler"}]},
|
|
||||||
{
|
|
||||||
"label": "WORK",
|
|
||||||
"pattern": [
|
|
||||||
{"LOWER": "symphony"},
|
|
||||||
{"LOWER": "no"},
|
|
||||||
{"LOWER": "."},
|
|
||||||
{"LOWER": "8"},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
]
|
|
||||||
ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
|
|
||||||
ruler.add_patterns(patterns)
|
|
||||||
# test the trained model - this should not throw E148
|
|
||||||
doc = nlp(text)
|
|
||||||
assert doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_no_entities():
|
def test_no_entities():
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import pytest
|
import pytest
|
||||||
import numpy
|
import numpy
|
||||||
from numpy.testing import assert_array_equal, assert_almost_equal
|
from numpy.testing import assert_array_equal, assert_almost_equal
|
||||||
from thinc.api import get_current_ops, Ragged
|
from thinc.api import get_current_ops, NumpyOps, Ragged
|
||||||
|
|
||||||
from spacy import util
|
from spacy import util
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
|
@ -190,17 +190,19 @@ def test_make_spangroup_singlelabel(threshold, allow_overlap, nr_results):
|
||||||
spangroup = spancat._make_span_group_singlelabel(
|
spangroup = spancat._make_span_group_singlelabel(
|
||||||
doc, indices, scores, allow_overlap
|
doc, indices, scores, allow_overlap
|
||||||
)
|
)
|
||||||
assert len(spangroup) == nr_results
|
|
||||||
if threshold > 0.4:
|
if threshold > 0.4:
|
||||||
if allow_overlap:
|
if allow_overlap:
|
||||||
assert spangroup[0].text == "London"
|
assert spangroup[0].text == "London"
|
||||||
assert spangroup[0].label_ == "City"
|
assert spangroup[0].label_ == "City"
|
||||||
|
assert_almost_equal(0.6, spangroup.attrs["scores"][0], 5)
|
||||||
assert spangroup[1].text == "Greater London"
|
assert spangroup[1].text == "Greater London"
|
||||||
assert spangroup[1].label_ == "GreatCity"
|
assert spangroup[1].label_ == "GreatCity"
|
||||||
|
assert spangroup.attrs["scores"][1] == 0.9
|
||||||
|
assert_almost_equal(0.9, spangroup.attrs["scores"][1], 5)
|
||||||
else:
|
else:
|
||||||
assert spangroup[0].text == "Greater London"
|
assert spangroup[0].text == "Greater London"
|
||||||
assert spangroup[0].label_ == "GreatCity"
|
assert spangroup[0].label_ == "GreatCity"
|
||||||
|
assert spangroup.attrs["scores"][0] == 0.9
|
||||||
else:
|
else:
|
||||||
if allow_overlap:
|
if allow_overlap:
|
||||||
assert spangroup[0].text == "Greater"
|
assert spangroup[0].text == "Greater"
|
||||||
|
@ -256,22 +258,32 @@ def test_make_spangroup_negative_label():
|
||||||
assert len(spangroup_single) == 2
|
assert len(spangroup_single) == 2
|
||||||
assert spangroup_single[0].text == "Greater"
|
assert spangroup_single[0].text == "Greater"
|
||||||
assert spangroup_single[0].label_ == "City"
|
assert spangroup_single[0].label_ == "City"
|
||||||
|
assert_almost_equal(0.4, spangroup_single.attrs["scores"][0], 5)
|
||||||
assert spangroup_single[1].text == "Greater London"
|
assert spangroup_single[1].text == "Greater London"
|
||||||
assert spangroup_single[1].label_ == "GreatCity"
|
assert spangroup_single[1].label_ == "GreatCity"
|
||||||
|
assert spangroup_single.attrs["scores"][1] == 0.9
|
||||||
|
assert_almost_equal(0.9, spangroup_single.attrs["scores"][1], 5)
|
||||||
|
|
||||||
assert len(spangroup_multi) == 6
|
assert len(spangroup_multi) == 6
|
||||||
assert spangroup_multi[0].text == "Greater"
|
assert spangroup_multi[0].text == "Greater"
|
||||||
assert spangroup_multi[0].label_ == "City"
|
assert spangroup_multi[0].label_ == "City"
|
||||||
|
assert_almost_equal(0.4, spangroup_multi.attrs["scores"][0], 5)
|
||||||
assert spangroup_multi[1].text == "Greater"
|
assert spangroup_multi[1].text == "Greater"
|
||||||
assert spangroup_multi[1].label_ == "Person"
|
assert spangroup_multi[1].label_ == "Person"
|
||||||
|
assert_almost_equal(0.3, spangroup_multi.attrs["scores"][1], 5)
|
||||||
assert spangroup_multi[2].text == "London"
|
assert spangroup_multi[2].text == "London"
|
||||||
assert spangroup_multi[2].label_ == "City"
|
assert spangroup_multi[2].label_ == "City"
|
||||||
|
assert_almost_equal(0.6, spangroup_multi.attrs["scores"][2], 5)
|
||||||
assert spangroup_multi[3].text == "London"
|
assert spangroup_multi[3].text == "London"
|
||||||
assert spangroup_multi[3].label_ == "GreatCity"
|
assert spangroup_multi[3].label_ == "GreatCity"
|
||||||
|
assert_almost_equal(0.4, spangroup_multi.attrs["scores"][3], 5)
|
||||||
assert spangroup_multi[4].text == "Greater London"
|
assert spangroup_multi[4].text == "Greater London"
|
||||||
assert spangroup_multi[4].label_ == "Thing"
|
assert spangroup_multi[4].label_ == "Thing"
|
||||||
|
assert spangroup_multi[4].text == "Greater London"
|
||||||
|
assert_almost_equal(0.8, spangroup_multi.attrs["scores"][4], 5)
|
||||||
assert spangroup_multi[5].text == "Greater London"
|
assert spangroup_multi[5].text == "Greater London"
|
||||||
assert spangroup_multi[5].label_ == "GreatCity"
|
assert spangroup_multi[5].label_ == "GreatCity"
|
||||||
|
assert_almost_equal(0.9, spangroup_multi.attrs["scores"][5], 5)
|
||||||
|
|
||||||
|
|
||||||
def test_ngram_suggester(en_tokenizer):
|
def test_ngram_suggester(en_tokenizer):
|
||||||
|
@ -565,3 +577,21 @@ def test_set_candidates(name):
|
||||||
assert len(docs[0].spans["candidates"]) == 9
|
assert len(docs[0].spans["candidates"]) == 9
|
||||||
assert docs[0].spans["candidates"][0].text == "Just"
|
assert docs[0].spans["candidates"][0].text == "Just"
|
||||||
assert docs[0].spans["candidates"][4].text == "Just a"
|
assert docs[0].spans["candidates"][4].text == "Just a"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||||
|
@pytest.mark.parametrize("n_process", [1, 2])
|
||||||
|
def test_spancat_multiprocessing(name, n_process):
|
||||||
|
if isinstance(get_current_ops, NumpyOps) or n_process < 2:
|
||||||
|
nlp = Language()
|
||||||
|
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||||
|
train_examples = make_examples(nlp)
|
||||||
|
nlp.initialize(get_examples=lambda: train_examples)
|
||||||
|
texts = [
|
||||||
|
"Just a sentence.",
|
||||||
|
"I like London and Berlin",
|
||||||
|
"I like Berlin",
|
||||||
|
"I eat ham.",
|
||||||
|
]
|
||||||
|
docs = list(nlp.pipe(texts, n_process=n_process))
|
||||||
|
assert len(docs) == len(texts)
|
||||||
|
|
|
@ -213,6 +213,13 @@ def test_serialize_doc_exclude(en_vocab):
|
||||||
|
|
||||||
def test_serialize_doc_span_groups(en_vocab):
|
def test_serialize_doc_span_groups(en_vocab):
|
||||||
doc = Doc(en_vocab, words=["hello", "world", "!"])
|
doc = Doc(en_vocab, words=["hello", "world", "!"])
|
||||||
doc.spans["content"] = [doc[0:2]]
|
span = doc[0:2]
|
||||||
|
span.label_ = "test_serialize_doc_span_groups_label"
|
||||||
|
span.id_ = "test_serialize_doc_span_groups_id"
|
||||||
|
span.kb_id_ = "test_serialize_doc_span_groups_kb_id"
|
||||||
|
doc.spans["content"] = [span]
|
||||||
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
|
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
|
||||||
assert len(new_doc.spans["content"]) == 1
|
assert len(new_doc.spans["content"]) == 1
|
||||||
|
assert new_doc.spans["content"][0].label_ == "test_serialize_doc_span_groups_label"
|
||||||
|
assert new_doc.spans["content"][0].id_ == "test_serialize_doc_span_groups_id"
|
||||||
|
assert new_doc.spans["content"][0].kb_id_ == "test_serialize_doc_span_groups_kb_id"
|
||||||
|
|
|
@ -49,7 +49,11 @@ def test_serialize_doc_bin():
|
||||||
nlp = English()
|
nlp = English()
|
||||||
for doc in nlp.pipe(texts):
|
for doc in nlp.pipe(texts):
|
||||||
doc.cats = cats
|
doc.cats = cats
|
||||||
doc.spans["start"] = [doc[0:2]]
|
span = doc[0:2]
|
||||||
|
span.label_ = "UNUSUAL_SPAN_LABEL"
|
||||||
|
span.id_ = "UNUSUAL_SPAN_ID"
|
||||||
|
span.kb_id_ = "UNUSUAL_SPAN_KB_ID"
|
||||||
|
doc.spans["start"] = [span]
|
||||||
doc[0].norm_ = "UNUSUAL_TOKEN_NORM"
|
doc[0].norm_ = "UNUSUAL_TOKEN_NORM"
|
||||||
doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID"
|
doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID"
|
||||||
doc_bin.add(doc)
|
doc_bin.add(doc)
|
||||||
|
@ -63,6 +67,9 @@ def test_serialize_doc_bin():
|
||||||
assert doc.text == texts[i]
|
assert doc.text == texts[i]
|
||||||
assert doc.cats == cats
|
assert doc.cats == cats
|
||||||
assert len(doc.spans) == 1
|
assert len(doc.spans) == 1
|
||||||
|
assert doc.spans["start"][0].label_ == "UNUSUAL_SPAN_LABEL"
|
||||||
|
assert doc.spans["start"][0].id_ == "UNUSUAL_SPAN_ID"
|
||||||
|
assert doc.spans["start"][0].kb_id_ == "UNUSUAL_SPAN_KB_ID"
|
||||||
assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM"
|
assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM"
|
||||||
assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID"
|
assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID"
|
||||||
|
|
||||||
|
|
|
@ -275,6 +275,20 @@ def test_displacy_parse_deps(en_vocab):
|
||||||
{"start": 2, "end": 3, "label": "det", "dir": "left"},
|
{"start": 2, "end": 3, "label": "det", "dir": "left"},
|
||||||
{"start": 1, "end": 3, "label": "attr", "dir": "right"},
|
{"start": 1, "end": 3, "label": "attr", "dir": "right"},
|
||||||
]
|
]
|
||||||
|
# Test that displacy.parse_deps converts Span to Doc
|
||||||
|
deps = displacy.parse_deps(doc[:])
|
||||||
|
assert isinstance(deps, dict)
|
||||||
|
assert deps["words"] == [
|
||||||
|
{"lemma": None, "text": words[0], "tag": pos[0]},
|
||||||
|
{"lemma": None, "text": words[1], "tag": pos[1]},
|
||||||
|
{"lemma": None, "text": words[2], "tag": pos[2]},
|
||||||
|
{"lemma": None, "text": words[3], "tag": pos[3]},
|
||||||
|
]
|
||||||
|
assert deps["arcs"] == [
|
||||||
|
{"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
|
||||||
|
{"start": 2, "end": 3, "label": "det", "dir": "left"},
|
||||||
|
{"start": 1, "end": 3, "label": "attr", "dir": "right"},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_displacy_invalid_arcs():
|
def test_displacy_invalid_arcs():
|
||||||
|
|
|
@ -165,7 +165,8 @@ def test_pretraining_default():
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("objective", CHAR_OBJECTIVES)
|
@pytest.mark.parametrize("objective", CHAR_OBJECTIVES)
|
||||||
def test_pretraining_tok2vec_characters(objective):
|
@pytest.mark.parametrize("skip_last", (True, False))
|
||||||
|
def test_pretraining_tok2vec_characters(objective, skip_last):
|
||||||
"""Test that pretraining works with the character objective"""
|
"""Test that pretraining works with the character objective"""
|
||||||
config = Config().from_str(pretrain_string_listener)
|
config = Config().from_str(pretrain_string_listener)
|
||||||
config["pretraining"]["objective"] = objective
|
config["pretraining"]["objective"] = objective
|
||||||
|
@ -178,10 +179,14 @@ def test_pretraining_tok2vec_characters(objective):
|
||||||
filled["paths"]["raw_text"] = file_path
|
filled["paths"]["raw_text"] = file_path
|
||||||
filled = filled.interpolate()
|
filled = filled.interpolate()
|
||||||
assert filled["pretraining"]["component"] == "tok2vec"
|
assert filled["pretraining"]["component"] == "tok2vec"
|
||||||
pretrain(filled, tmp_dir)
|
pretrain(filled, tmp_dir, skip_last=skip_last)
|
||||||
assert Path(tmp_dir / "model0.bin").exists()
|
assert Path(tmp_dir / "model0.bin").exists()
|
||||||
assert Path(tmp_dir / "model4.bin").exists()
|
assert Path(tmp_dir / "model4.bin").exists()
|
||||||
assert not Path(tmp_dir / "model5.bin").exists()
|
assert not Path(tmp_dir / "model5.bin").exists()
|
||||||
|
if skip_last:
|
||||||
|
assert not Path(tmp_dir / "model-last.bin").exists()
|
||||||
|
else:
|
||||||
|
assert Path(tmp_dir / "model-last.bin").exists()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("objective", VECTOR_OBJECTIVES)
|
@pytest.mark.parametrize("objective", VECTOR_OBJECTIVES)
|
||||||
|
@ -237,6 +242,7 @@ def test_pretraining_tagger_tok2vec(config):
|
||||||
pretrain(filled, tmp_dir)
|
pretrain(filled, tmp_dir)
|
||||||
assert Path(tmp_dir / "model0.bin").exists()
|
assert Path(tmp_dir / "model0.bin").exists()
|
||||||
assert Path(tmp_dir / "model4.bin").exists()
|
assert Path(tmp_dir / "model4.bin").exists()
|
||||||
|
assert Path(tmp_dir / "model-last.bin").exists()
|
||||||
assert not Path(tmp_dir / "model5.bin").exists()
|
assert not Path(tmp_dir / "model5.bin").exists()
|
||||||
|
|
||||||
|
|
||||||
|
@ -359,19 +365,15 @@ def test_pretrain_default_vectors():
|
||||||
nlp.vocab.vectors = Vectors(shape=(10, 10))
|
nlp.vocab.vectors = Vectors(shape=(10, 10))
|
||||||
create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model)
|
create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model)
|
||||||
|
|
||||||
|
# floret vectors are supported
|
||||||
|
nlp.vocab.vectors = Vectors(
|
||||||
|
data=get_current_ops().xp.zeros((10, 10)), mode="floret", hash_count=1
|
||||||
|
)
|
||||||
|
create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model)
|
||||||
|
|
||||||
# error for no vectors
|
# error for no vectors
|
||||||
with pytest.raises(ValueError, match="E875"):
|
with pytest.raises(ValueError, match="E875"):
|
||||||
nlp.vocab.vectors = Vectors()
|
nlp.vocab.vectors = Vectors()
|
||||||
create_pretrain_vectors(1, 1, "cosine")(
|
create_pretrain_vectors(1, 1, "cosine")(
|
||||||
nlp.vocab, nlp.get_pipe("tok2vec").model
|
nlp.vocab, nlp.get_pipe("tok2vec").model
|
||||||
)
|
)
|
||||||
|
|
||||||
# error for floret vectors
|
|
||||||
with pytest.raises(ValueError, match="E850"):
|
|
||||||
ops = get_current_ops()
|
|
||||||
nlp.vocab.vectors = Vectors(
|
|
||||||
data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1
|
|
||||||
)
|
|
||||||
create_pretrain_vectors(1, 1, "cosine")(
|
|
||||||
nlp.vocab, nlp.get_pipe("tok2vec").model
|
|
||||||
)
|
|
||||||
|
|
|
@ -124,6 +124,10 @@ class DocBin:
|
||||||
for key, group in doc.spans.items():
|
for key, group in doc.spans.items():
|
||||||
for span in group:
|
for span in group:
|
||||||
self.strings.add(span.label_)
|
self.strings.add(span.label_)
|
||||||
|
if span.kb_id in span.doc.vocab.strings:
|
||||||
|
self.strings.add(span.kb_id_)
|
||||||
|
if span.id in span.doc.vocab.strings:
|
||||||
|
self.strings.add(span.id_)
|
||||||
|
|
||||||
def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
|
def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
|
||||||
"""Recover Doc objects from the annotations, using the given vocab.
|
"""Recover Doc objects from the annotations, using the given vocab.
|
||||||
|
|
|
@ -544,10 +544,6 @@ cdef class Doc:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/doc#char_span
|
DOCS: https://spacy.io/api/doc#char_span
|
||||||
"""
|
"""
|
||||||
if not isinstance(label, int):
|
|
||||||
label = self.vocab.strings.add(label)
|
|
||||||
if not isinstance(kb_id, int):
|
|
||||||
kb_id = self.vocab.strings.add(kb_id)
|
|
||||||
alignment_modes = ("strict", "contract", "expand")
|
alignment_modes = ("strict", "contract", "expand")
|
||||||
if alignment_mode not in alignment_modes:
|
if alignment_mode not in alignment_modes:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
@ -1350,6 +1346,10 @@ cdef class Doc:
|
||||||
for group in self.spans.values():
|
for group in self.spans.values():
|
||||||
for span in group:
|
for span in group:
|
||||||
strings.add(span.label_)
|
strings.add(span.label_)
|
||||||
|
if span.kb_id in span.doc.vocab.strings:
|
||||||
|
strings.add(span.kb_id_)
|
||||||
|
if span.id in span.doc.vocab.strings:
|
||||||
|
strings.add(span.id_)
|
||||||
# Msgpack doesn't distinguish between lists and tuples, which is
|
# Msgpack doesn't distinguish between lists and tuples, which is
|
||||||
# vexing for user data. As a best guess, we *know* that within
|
# vexing for user data. As a best guess, we *know* that within
|
||||||
# keys, we must have tuples. In values we just have to hope
|
# keys, we must have tuples. In values we just have to hope
|
||||||
|
|
|
@ -460,9 +460,12 @@ cdef class Span:
|
||||||
start = i
|
start = i
|
||||||
if start >= self.end:
|
if start >= self.end:
|
||||||
break
|
break
|
||||||
if start < self.end:
|
elif i == self.doc.length - 1:
|
||||||
yield Span(self.doc, start, self.end)
|
yield Span(self.doc, start, self.doc.length)
|
||||||
|
|
||||||
|
# Ensure that trailing parts of the Span instance are included in last element of .sents.
|
||||||
|
if start == self.doc.length - 1:
|
||||||
|
yield Span(self.doc, start, self.doc.length)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def ents(self):
|
def ents(self):
|
||||||
|
|
|
@ -24,6 +24,7 @@ def pretrain(
|
||||||
epoch_resume: Optional[int] = None,
|
epoch_resume: Optional[int] = None,
|
||||||
use_gpu: int = -1,
|
use_gpu: int = -1,
|
||||||
silent: bool = True,
|
silent: bool = True,
|
||||||
|
skip_last: bool = False,
|
||||||
):
|
):
|
||||||
msg = Printer(no_print=silent)
|
msg = Printer(no_print=silent)
|
||||||
if config["training"]["seed"] is not None:
|
if config["training"]["seed"] is not None:
|
||||||
|
@ -60,10 +61,14 @@ def pretrain(
|
||||||
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
|
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
|
||||||
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
|
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
|
||||||
|
|
||||||
def _save_model(epoch, is_temp=False):
|
def _save_model(epoch, is_temp=False, is_last=False):
|
||||||
is_temp_str = ".temp" if is_temp else ""
|
is_temp_str = ".temp" if is_temp else ""
|
||||||
with model.use_params(optimizer.averages):
|
with model.use_params(optimizer.averages):
|
||||||
with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
|
if is_last:
|
||||||
|
save_path = output_dir / f"model-last.bin"
|
||||||
|
else:
|
||||||
|
save_path = output_dir / f"model{epoch}{is_temp_str}.bin"
|
||||||
|
with (save_path).open("wb") as file_:
|
||||||
file_.write(model.get_ref("tok2vec").to_bytes())
|
file_.write(model.get_ref("tok2vec").to_bytes())
|
||||||
log = {
|
log = {
|
||||||
"nr_word": tracker.nr_word,
|
"nr_word": tracker.nr_word,
|
||||||
|
@ -76,6 +81,7 @@ def pretrain(
|
||||||
|
|
||||||
# TODO: I think we probably want this to look more like the
|
# TODO: I think we probably want this to look more like the
|
||||||
# 'create_train_batches' function?
|
# 'create_train_batches' function?
|
||||||
|
try:
|
||||||
for epoch in range(epoch_resume, P["max_epochs"]):
|
for epoch in range(epoch_resume, P["max_epochs"]):
|
||||||
for batch_id, batch in enumerate(batcher(corpus(nlp))):
|
for batch_id, batch in enumerate(batcher(corpus(nlp))):
|
||||||
docs = ensure_docs(batch)
|
docs = ensure_docs(batch)
|
||||||
|
@ -92,6 +98,9 @@ def pretrain(
|
||||||
else:
|
else:
|
||||||
_save_model(epoch)
|
_save_model(epoch)
|
||||||
tracker.epoch_loss = 0.0
|
tracker.epoch_loss = 0.0
|
||||||
|
finally:
|
||||||
|
if not skip_last:
|
||||||
|
_save_model(P["max_epochs"], is_last=True)
|
||||||
|
|
||||||
|
|
||||||
def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
|
def ensure_docs(examples_or_docs: Iterable[Union[Doc, Example]]) -> List[Doc]:
|
||||||
|
|
|
@ -1123,13 +1123,14 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [
|
||||||
```
|
```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
|
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
|
||||||
| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
|
| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
|
||||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||||
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
|
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
|
||||||
| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
|
| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
|
||||||
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
|
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
|
||||||
|
| `--skip-last`, `-L` <Tag variant="new">3.5.2</Tag> | Skip saving `model-last.bin`. Defaults to `False`. ~~bool (flag)~~ |
|
||||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
|
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
|
||||||
| **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |
|
| **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |
|
||||||
|
@ -1255,7 +1256,7 @@ be provided.
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
||||||
| `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ |
|
| `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ |
|
||||||
| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ |
|
| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ |
|
||||||
|
@ -1265,7 +1266,7 @@ be provided.
|
||||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||||
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
||||||
| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
|
| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ |
|
||||||
| `--silent`, `-V`, `-VV` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
| `--verbose`, `-V`, `-VV` | Display more information for debugging purposes. ~~bool (flag)~~ |
|
||||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||||
|
|
||||||
## assemble {id="assemble",tag="command"}
|
## assemble {id="assemble",tag="command"}
|
||||||
|
|
|
@ -64,7 +64,7 @@ details on the architectures and their arguments and hyperparameters.
|
||||||
> config={
|
> config={
|
||||||
> "model": DEFAULT_COREF_MODEL,
|
> "model": DEFAULT_COREF_MODEL,
|
||||||
> "span_cluster_prefix": DEFAULT_CLUSTER_PREFIX,
|
> "span_cluster_prefix": DEFAULT_CLUSTER_PREFIX,
|
||||||
> },
|
> }
|
||||||
> nlp.add_pipe("experimental_coref", config=config)
|
> nlp.add_pipe("experimental_coref", config=config)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
|
|
@ -20,8 +20,9 @@ output class probabilities are independent for each class. However, if you need
|
||||||
to predict at most one true class for a span, then use `spancat_singlelabel`. It
|
to predict at most one true class for a span, then use `spancat_singlelabel`. It
|
||||||
uses a `Softmax` layer and treats the task as a multi-class problem.
|
uses a `Softmax` layer and treats the task as a multi-class problem.
|
||||||
|
|
||||||
Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc.
|
Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc
|
||||||
Individual span scores can be found in `spangroup.attrs["scores"]`.
|
under `doc.spans[spans_key]`, where `spans_key` is a component config setting.
|
||||||
|
Individual span scores are stored in `doc.spans[spans_key].attrs["scores"]`.
|
||||||
|
|
||||||
## Assigned Attributes {id="assigned-attributes"}
|
## Assigned Attributes {id="assigned-attributes"}
|
||||||
|
|
||||||
|
@ -29,7 +30,9 @@ Predictions will be saved to `Doc.spans[spans_key]` as a
|
||||||
[`SpanGroup`](/api/spangroup). The scores for the spans in the `SpanGroup` will
|
[`SpanGroup`](/api/spangroup). The scores for the spans in the `SpanGroup` will
|
||||||
be saved in `SpanGroup.attrs["scores"]`.
|
be saved in `SpanGroup.attrs["scores"]`.
|
||||||
|
|
||||||
`spans_key` defaults to `"sc"`, but can be passed as a parameter.
|
`spans_key` defaults to `"sc"`, but can be passed as a parameter. The `spancat`
|
||||||
|
component will overwrite any existing spans under the spans key
|
||||||
|
`doc.spans[spans_key]`.
|
||||||
|
|
||||||
| Location | Value |
|
| Location | Value |
|
||||||
| -------------------------------------- | -------------------------------------------------------- |
|
| -------------------------------------- | -------------------------------------------------------- |
|
||||||
|
|
|
@ -8,6 +8,13 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of
|
||||||
integer IDs. This ensures that strings always map to the same ID, even from
|
integer IDs. This ensures that strings always map to the same ID, even from
|
||||||
different `StringStores`.
|
different `StringStores`.
|
||||||
|
|
||||||
|
<Infobox variant="warning">
|
||||||
|
|
||||||
|
Note that a `StringStore` instance is not static. It increases in size as texts
|
||||||
|
with new tokens are processed.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
## StringStore.\_\_init\_\_ {id="init",tag="method"}
|
## StringStore.\_\_init\_\_ {id="init",tag="method"}
|
||||||
|
|
||||||
Create the `StringStore`.
|
Create the `StringStore`.
|
||||||
|
|
|
@ -25,7 +25,10 @@ and call the package's own `load()` method. If a pipeline is loaded from a path,
|
||||||
spaCy will assume it's a data directory, load its
|
spaCy will assume it's a data directory, load its
|
||||||
[`config.cfg`](/api/data-formats#config) and use the language and pipeline
|
[`config.cfg`](/api/data-formats#config) and use the language and pipeline
|
||||||
information to construct the `Language` class. The data will be loaded in via
|
information to construct the `Language` class. The data will be loaded in via
|
||||||
[`Language.from_disk`](/api/language#from_disk).
|
[`Language.from_disk`](/api/language#from_disk). Loading a pipeline from a
|
||||||
|
package will also import any custom code, if present, whereas loading from a
|
||||||
|
directory does not. For these cases, you need to manually import your custom
|
||||||
|
code.
|
||||||
|
|
||||||
<Infobox variant="warning" title="Changed in v3.0">
|
<Infobox variant="warning" title="Changed in v3.0">
|
||||||
|
|
||||||
|
@ -291,7 +294,7 @@ the `manual=True` argument in `displacy.render`.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | ------------------------------------------------------------------- |
|
| ----------- | ------------------------------------------------------------------- |
|
||||||
| `orig_doc` | Doc to parse dependencies. ~~Doc~~ |
|
| `orig_doc` | Doc or span to parse dependencies. ~~Union[Doc, Span]~~ |
|
||||||
| `options` | Dependency parse specific visualisation options. ~~Dict[str, Any]~~ |
|
| `options` | Dependency parse specific visualisation options. ~~Dict[str, Any]~~ |
|
||||||
| **RETURNS** | Generated dependency parse keyed by words and arcs. ~~dict~~ |
|
| **RETURNS** | Generated dependency parse keyed by words and arcs. ~~dict~~ |
|
||||||
|
|
||||||
|
@ -577,7 +580,7 @@ start decreasing across epochs.
|
||||||
> ```ini
|
> ```ini
|
||||||
> [training.logger]
|
> [training.logger]
|
||||||
> @loggers = "spacy.ConsoleLogger.v3"
|
> @loggers = "spacy.ConsoleLogger.v3"
|
||||||
> progress_bar = "all_steps"
|
> progress_bar = "eval"
|
||||||
> console_output = true
|
> console_output = true
|
||||||
> output_file = "training_log.jsonl"
|
> output_file = "training_log.jsonl"
|
||||||
> ```
|
> ```
|
||||||
|
|
|
@ -10,6 +10,13 @@ The `Vocab` object provides a lookup table that allows you to access
|
||||||
[`StringStore`](/api/stringstore). It also owns underlying C-data that is shared
|
[`StringStore`](/api/stringstore). It also owns underlying C-data that is shared
|
||||||
between `Doc` objects.
|
between `Doc` objects.
|
||||||
|
|
||||||
|
<Infobox variant="warning">
|
||||||
|
|
||||||
|
Note that a `Vocab` instance is not static. It increases in size as texts with
|
||||||
|
new tokens are processed.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
## Vocab.\_\_init\_\_ {id="init",tag="method"}
|
## Vocab.\_\_init\_\_ {id="init",tag="method"}
|
||||||
|
|
||||||
Create the vocabulary.
|
Create the vocabulary.
|
||||||
|
|
|
@ -746,13 +746,16 @@ this by setting `initialize.init_tok2vec` to the filename of the `.bin` file
|
||||||
that you want to use from pretraining.
|
that you want to use from pretraining.
|
||||||
|
|
||||||
A pretraining step that runs for 5 epochs with an output path of `pretrain/`, as
|
A pretraining step that runs for 5 epochs with an output path of `pretrain/`, as
|
||||||
an example, produces `pretrain/model0.bin` through `pretrain/model4.bin`. To
|
an example, produces `pretrain/model0.bin` through `pretrain/model4.bin` plus a
|
||||||
make use of the final output, you could fill in this value in your config file:
|
copy of the last iteration as `pretrain/model-last.bin`. Additionally, you can
|
||||||
|
configure `n_save_epoch` to tell pretraining in which epoch interval it should
|
||||||
|
save the current training progress. To use the final output to initialize your
|
||||||
|
`tok2vec` layer, you could fill in this value in your config file:
|
||||||
|
|
||||||
```ini {title="config.cfg"}
|
```ini {title="config.cfg"}
|
||||||
|
|
||||||
[paths]
|
[paths]
|
||||||
init_tok2vec = "pretrain/model4.bin"
|
init_tok2vec = "pretrain/model-last.bin"
|
||||||
|
|
||||||
[initialize]
|
[initialize]
|
||||||
init_tok2vec = ${paths.init_tok2vec}
|
init_tok2vec = ${paths.init_tok2vec}
|
||||||
|
|
|
@ -1684,6 +1684,8 @@ def expand_person_entities(doc):
|
||||||
new_ents.append(new_ent)
|
new_ents.append(new_ent)
|
||||||
else:
|
else:
|
||||||
new_ents.append(ent)
|
new_ents.append(ent)
|
||||||
|
else:
|
||||||
|
new_ents.append(ent)
|
||||||
doc.ents = new_ents
|
doc.ents = new_ents
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
|
|
@ -758,6 +758,15 @@ any custom architectures, functions or
|
||||||
your pipeline and registered when it's loaded. See the documentation on
|
your pipeline and registered when it's loaded. See the documentation on
|
||||||
[saving and loading pipelines](/usage/saving-loading#models-custom) for details.
|
[saving and loading pipelines](/usage/saving-loading#models-custom) for details.
|
||||||
|
|
||||||
|
<Infobox variant="warning">
|
||||||
|
|
||||||
|
Note that the unpackaged models produced by `spacy train` are data directories
|
||||||
|
that **do not include custom code**. You need to import the code in your script
|
||||||
|
before loading in unpackaged models. For more details, see
|
||||||
|
[`spacy.load`](/api/top-level#spacy.load).
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
#### Example: Modifying the nlp object {id="custom-code-nlp-callbacks"}
|
#### Example: Modifying the nlp object {id="custom-code-nlp-callbacks"}
|
||||||
|
|
||||||
For many use cases, you don't necessarily want to implement the whole `Language`
|
For many use cases, you don't necessarily want to implement the whole `Language`
|
||||||
|
|
|
@ -3215,6 +3215,51 @@
|
||||||
"category": ["pipeline"],
|
"category": ["pipeline"],
|
||||||
"tags": ["syllables", "multilingual"]
|
"tags": ["syllables", "multilingual"]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"id": "sentimental-onix",
|
||||||
|
"title": "Sentimental Onix",
|
||||||
|
"slogan": "Use onnx for sentiment models",
|
||||||
|
"description": "spaCy pipeline component for sentiment analysis using onnx",
|
||||||
|
"github": "sloev/sentimental-onix",
|
||||||
|
"pip": "sentimental-onix",
|
||||||
|
"code_example": [
|
||||||
|
"# Download model:",
|
||||||
|
"# python -m sentimental_onix download en",
|
||||||
|
"import spacy",
|
||||||
|
"from sentimental_onix import pipeline",
|
||||||
|
"",
|
||||||
|
"nlp = spacy.load(\"en_core_web_sm\")",
|
||||||
|
"nlp.add_pipe(\"sentencizer\")",
|
||||||
|
"nlp.add_pipe(\"sentimental_onix\", after=\"sentencizer\")",
|
||||||
|
"",
|
||||||
|
"sentences = [",
|
||||||
|
" (sent.text, sent._.sentiment)",
|
||||||
|
" for doc in nlp.pipe(",
|
||||||
|
" [",
|
||||||
|
" \"i hate pasta on tuesdays\",",
|
||||||
|
" \"i like movies on wednesdays\",",
|
||||||
|
" \"i find your argument ridiculous\",",
|
||||||
|
" \"soda with straws are my favorite\",",
|
||||||
|
" ]",
|
||||||
|
" )",
|
||||||
|
" for sent in doc.sents",
|
||||||
|
"]",
|
||||||
|
"",
|
||||||
|
"assert sentences == [",
|
||||||
|
" (\"i hate pasta on tuesdays\", \"Negative\"),",
|
||||||
|
" (\"i like movies on wednesdays\", \"Positive\"),",
|
||||||
|
" (\"i find your argument ridiculous\", \"Negative\"),",
|
||||||
|
" (\"soda with straws are my favorite\", \"Positive\"),",
|
||||||
|
"]"
|
||||||
|
],
|
||||||
|
"thumb": "https://raw.githubusercontent.com/sloev/sentimental-onix/master/.github/onix.webp",
|
||||||
|
"author": "Johannes Valbjørn",
|
||||||
|
"author_links": {
|
||||||
|
"github": "sloev"
|
||||||
|
},
|
||||||
|
"category": ["pipeline"],
|
||||||
|
"tags": ["sentiment", "english"]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"id": "gobbli",
|
"id": "gobbli",
|
||||||
"title": "gobbli",
|
"title": "gobbli",
|
||||||
|
|
|
@ -111,11 +111,12 @@
|
||||||
line-height: var(--line-height-xs)
|
line-height: var(--line-height-xs)
|
||||||
text-align: center
|
text-align: center
|
||||||
|
|
||||||
@include breakpoint(max, xs)
|
@include breakpoint(max, md)
|
||||||
.list
|
.alert
|
||||||
display: none
|
display: none
|
||||||
|
|
||||||
.alert
|
@include breakpoint(max, xs)
|
||||||
|
.list
|
||||||
display: none
|
display: none
|
||||||
|
|
||||||
.has-alert
|
.has-alert
|
||||||
|
|
|
@ -25,11 +25,6 @@ const AlertSpace = ({ nightly, legacy }) => {
|
||||||
const isOnline = useOnlineStatus()
|
const isOnline = useOnlineStatus()
|
||||||
return (
|
return (
|
||||||
<>
|
<>
|
||||||
{isOnline && (
|
|
||||||
<Alert title="💥 We'd love to learn more about your experience with spaCy!">
|
|
||||||
<Link to="https://form.typeform.com/to/aMel9q9f">Take our survey here.</Link>
|
|
||||||
</Alert>
|
|
||||||
)}
|
|
||||||
{nightly && (
|
{nightly && (
|
||||||
<Alert
|
<Alert
|
||||||
title="You're viewing the pre-release docs."
|
title="You're viewing the pre-release docs."
|
||||||
|
@ -62,9 +57,15 @@ const AlertSpace = ({ nightly, legacy }) => {
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// const navAlert = (
|
||||||
|
// <Link to="/usage/v3-5" noLinkLayout>
|
||||||
|
// <strong>💥 Out now:</strong> spaCy v3.5
|
||||||
|
// </Link>
|
||||||
|
// )
|
||||||
|
|
||||||
const navAlert = (
|
const navAlert = (
|
||||||
<Link to="/usage/v3-5" noLinkLayout>
|
<Link to="https://form.typeform.com/to/aMel9q9f" noLinkLayout>
|
||||||
<strong>💥 Out now:</strong> spaCy v3.5
|
<strong>💥 Take the user survey!</strong>
|
||||||
</Link>
|
</Link>
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user