Merge pull request #12784 from explosion/master

Merge `master` into `develop`
This commit is contained in:
Sofie Van Landeghem 2023-07-04 15:05:15 +02:00 committed by GitHub
commit 6f3a71999e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
872 changed files with 42733 additions and 39253 deletions

View File

@ -1,118 +0,0 @@
parameters:
python_version: ''
architecture: ''
prefix: ''
gpu: false
num_build_jobs: 1
steps:
- task: UsePythonVersion@0
inputs:
versionSpec: ${{ parameters.python_version }}
architecture: ${{ parameters.architecture }}
allowUnstable: true
- bash: |
echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
displayName: 'Set variables'
- script: |
${{ parameters.prefix }} python -m pip install -U pip setuptools
${{ parameters.prefix }} python -m pip install -U -r requirements.txt
displayName: "Install dependencies"
- script: |
${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
${{ parameters.prefix }} python setup.py sdist --formats=gztar
displayName: "Compile and build sdist"
- script: python -m mypy spacy
displayName: 'Run mypy'
condition: ne(variables['python_version'], '3.6')
- task: DeleteFiles@1
inputs:
contents: "spacy"
displayName: "Delete source directory"
- script: |
${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
displayName: "Uninstall all packages"
- bash: |
${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
${{ parameters.prefix }} SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
displayName: "Install from sdist"
- script: |
${{ parameters.prefix }} python -m pip install -U -r requirements.txt
displayName: "Install test requirements"
- script: |
${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
displayName: "Install GPU requirements"
condition: eq(${{ parameters.gpu }}, true)
- script: |
${{ parameters.prefix }} python -m pytest --pyargs spacy -W error
displayName: "Run CPU tests"
condition: eq(${{ parameters.gpu }}, false)
- script: |
${{ parameters.prefix }} python -m pytest --pyargs spacy -W error -p spacy.tests.enable_gpu
displayName: "Run GPU tests"
condition: eq(${{ parameters.gpu }}, true)
- script: |
python -m spacy download ca_core_news_sm
python -m spacy download ca_core_news_md
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
displayName: 'Test download CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
displayName: 'Test convert CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy init config -p ner -l ca ner.cfg
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
displayName: 'Test debug config CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
# will have errors due to sparse data, check for summary in output
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
displayName: 'Test debug data CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
displayName: 'Test train CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
displayName: 'Test assemble CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
displayName: 'Test assemble CLI vectors warning'
condition: eq(variables['python_version'], '3.8')
- script: |
python .github/validate_universe_json.py website/meta/universe.json
displayName: 'Test website/meta/universe.json'
condition: eq(variables['python_version'], '3.8')
- script: |
${{ parameters.prefix }} python -m pip install --pre thinc-apple-ops
${{ parameters.prefix }} python -m pytest --pyargs spacy
displayName: "Run CPU tests with thinc-apple-ops"
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10'))

View File

@ -1,44 +0,0 @@
# GitHub Action that uses Black to reformat all Python code and submits a PR
# in regular intervals. Inspired by: https://github.com/cclauss/autoblack
name: autoblack
on:
workflow_dispatch: # allow manual trigger
schedule:
- cron: '0 8 * * 5' # every Friday at 8am UTC
jobs:
autoblack:
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
ref: ${{ github.head_ref }}
- uses: actions/setup-python@v2
- run: pip install black
- name: Auto-format code if needed
run: black spacy
# We can't run black --check here because that returns a non-zero excit
# code and makes GitHub think the action failed
- name: Check for modified files
id: git-check
run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi)
- name: Create Pull Request
if: steps.git-check.outputs.modified == 'true'
uses: peter-evans/create-pull-request@v3
with:
title: Auto-format code with black
labels: meta
commit-message: Auto-format code with black
committer: GitHub <noreply@github.com>
author: explosion-bot <explosion-bot@users.noreply.github.com>
body: _This PR is auto-generated._
branch: autoblack
delete-branch: true
draft: false
- name: Check outputs
if: steps.git-check.outputs.modified == 'true'
run: |
echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"

View File

@ -8,14 +8,15 @@ on:
jobs:
explosion-bot:
runs-on: ubuntu-18.04
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- name: Dump GitHub context
env:
GITHUB_CONTEXT: ${{ toJson(github) }}
run: echo "$GITHUB_CONTEXT"
- uses: actions/checkout@v1
- uses: actions/setup-python@v1
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
- name: Install and run explosion-bot
run: |
pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot

View File

@ -13,6 +13,7 @@ on:
jobs:
issue-manager:
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- uses: tiangolo/issue-manager@0.4.0

View File

@ -13,9 +13,10 @@ concurrency:
jobs:
action:
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- uses: dessant/lock-threads@v3
- uses: dessant/lock-threads@v4
with:
process-only: 'issues'
issue-inactive-days: '30'

View File

@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v1
uses: actions/checkout@v3
with:
ref: ${{ matrix.branch }}
- name: Get commits from past 24 hours
@ -23,9 +23,9 @@ jobs:
today=$(date '+%Y-%m-%d %H:%M:%S')
yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
if git log --after="$yesterday" --before="$today" | grep commit ; then
echo "::set-output name=run_tests::true"
echo run_tests=true >> $GITHUB_OUTPUT
else
echo "::set-output name=run_tests::false"
echo run_tests=false >> $GITHUB_OUTPUT
fi
- name: Trigger buildkite build

View File

@ -7,6 +7,7 @@ on:
jobs:
build:
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
@ -17,8 +18,10 @@ jobs:
run: |
echo "$GITHUB_CONTEXT"
- uses: actions/checkout@v1
- uses: actions/setup-python@v1
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Install Bernadette app dependency and send an alert
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

178
.github/workflows/tests.yml vendored Normal file
View File

@ -0,0 +1,178 @@
name: tests
on:
push:
branches-ignore:
- "spacy.io"
- "nightly.spacy.io"
- "v2.spacy.io"
paths-ignore:
- "*.md"
- "*.mdx"
- "website/**"
- ".github/workflows/**"
pull_request:
types: [opened, synchronize, reopened, edited]
paths-ignore:
- "*.md"
- "*.mdx"
- "website/**"
jobs:
validate:
name: Validate
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Configure Python version
uses: actions/setup-python@v4
with:
python-version: "3.7"
architecture: x64
- name: black
run: |
python -m pip install black -c requirements.txt
python -m black spacy --check
- name: isort
run: |
python -m pip install isort -c requirements.txt
python -m isort spacy --check
- name: flake8
run: |
python -m pip install flake8==5.0.4
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
tests:
name: Test
needs: Validate
strategy:
fail-fast: true
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
python_version: ["3.11"]
include:
- os: ubuntu-20.04
python_version: "3.6"
- os: windows-latest
python_version: "3.7"
- os: macos-latest
python_version: "3.8"
- os: ubuntu-latest
python_version: "3.9"
- os: windows-latest
python_version: "3.10"
runs-on: ${{ matrix.os }}
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Configure Python version
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python_version }}
architecture: x64
- name: Install dependencies
run: |
python -m pip install -U build pip setuptools
python -m pip install -U -r requirements.txt
- name: Build sdist
run: |
python -m build --sdist
- name: Run mypy
run: |
python -m mypy spacy
if: matrix.python_version != '3.6'
- name: Delete source directory and .egg-info
run: |
rm -rf spacy *.egg-info
shell: bash
- name: Uninstall all packages
run: |
python -m pip freeze
python -m pip freeze --exclude pywin32 > installed.txt
python -m pip uninstall -y -r installed.txt
- name: Install from sdist
run: |
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
shell: bash
- name: Test import
run: python -W error -c "import spacy"
# - name: "Test download CLI"
# run: |
# python -m spacy download ca_core_news_sm
# python -m spacy download ca_core_news_md
# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
# if: matrix.python_version == '3.9'
#
# - name: "Test download_url in info CLI"
# run: |
# python -W error -m spacy info ca_core_news_sm | grep -q download_url
# if: matrix.python_version == '3.9'
#
# - name: "Test no warnings on load (#11713)"
# run: |
# python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
# if: matrix.python_version == '3.9'
- name: "Test convert CLI"
run: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
if: matrix.python_version == '3.9'
- name: "Test debug config CLI"
run: |
python -m spacy init config -p ner -l ca ner.cfg
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
if: matrix.python_version == '3.9'
- name: "Test debug data CLI"
run: |
# will have errors due to sparse data, check for summary in output
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
if: matrix.python_version == '3.9'
- name: "Test train CLI"
run: |
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
if: matrix.python_version == '3.9'
# - name: "Test assemble CLI"
# run: |
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
# if: matrix.python_version == '3.9'
#
# - name: "Test assemble CLI vectors warning"
# run: |
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
# if: matrix.python_version == '3.9'
- name: "Install test requirements"
run: |
python -m pip install -U -r requirements.txt
- name: "Run CPU tests"
run: |
python -m pytest --pyargs spacy -W error
if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.11')"
- name: "Run CPU tests with thinc-apple-ops"
run: |
python -m pip install 'spacy[apple]'
python -m pytest --pyargs spacy
if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11'

View File

@ -0,0 +1,33 @@
name: universe validation
on:
push:
branches-ignore:
- "spacy.io"
- "nightly.spacy.io"
- "v2.spacy.io"
paths:
- "website/meta/universe.json"
pull_request:
types: [opened, synchronize, reopened, edited]
paths:
- "website/meta/universe.json"
jobs:
validate:
name: Validate
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Configure Python version
uses: actions/setup-python@v4
with:
python-version: "3.7"
architecture: x64
- name: Validate website/meta/universe.json
run: |
python .github/validate_universe_json.py website/meta/universe.json

10
.gitignore vendored
View File

@ -10,16 +10,6 @@ spacy/tests/package/setup.cfg
spacy/tests/package/pyproject.toml
spacy/tests/package/requirements.txt
# Website
website/.cache/
website/public/
website/node_modules
website/.npm
website/logs
*.log
npm-debug.log*
quickstart-training-generator.js
# Cython / C extensions
cythonize.json
spacy/*.html

View File

@ -5,7 +5,7 @@ repos:
- id: black
language_version: python3.7
additional_dependencies: ['click==8.0.4']
- repo: https://gitlab.com/pycqa/flake8
- repo: https://github.com/pycqa/flake8
rev: 5.0.4
hooks:
- id: flake8

View File

@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
Python modules. If you've built spaCy from source, you'll already have both
tools installed.
As a general rule of thumb, we use f-strings for any formatting of strings.
One exception are calls to Python's `logging` functionality.
To avoid unnecessary string conversions in these cases, we use string formatting
templates with `%s` and `%d` etc.
**⚠️ Note that formatting and linting is currently only possible for Python
modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**

View File

@ -8,15 +8,18 @@ be used in real products.
spaCy comes with
[pretrained pipelines](https://spacy.io/models) and
currently supports tokenization and training for **60+ languages**. It features
currently supports tokenization and training for **70+ languages**. It features
state-of-the-art speed and **neural network models** for tagging,
parsing, **named entity recognition**, **text classification** and more,
multi-task learning with pretrained **transformers** like BERT, as well as a
production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial
open-source software, released under the MIT license.
open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
💫 **Version 3.4.0 out now!**
💥 **We'd love to hear more about your experience with spaCy!**
[Fill out our survey here.](https://form.typeform.com/to/aMel9q9f)
💫 **Version 3.5 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@ -32,20 +35,22 @@ open-source software, released under the MIT license.
## 📖 Documentation
| Documentation | |
| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
| 📚 **[Usage Guides]** | How to use spaCy and its features. |
| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
| 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
| 📦 **[Models]** | Download trained pipelines for spaCy. |
| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
| 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
| 🛠 **[Changelog]** | Changes and version history. |
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
| Documentation | |
| ----------------------------- | ---------------------------------------------------------------------- |
| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
| 📚 **[Usage Guides]** | How to use spaCy and its features. |
| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
| 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
| 📦 **[Models]** | Download trained pipelines for spaCy. |
| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
| ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
| 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
| 🛠 **[Changelog]** | Changes and version history. |
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-pipelines)** |
| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-analysis)** |
[spacy 101]: https://spacy.io/usage/spacy-101
[new in v3.0]: https://spacy.io/usage/v3
@ -53,6 +58,7 @@ open-source software, released under the MIT license.
[api reference]: https://spacy.io/api/
[models]: https://spacy.io/models
[universe]: https://spacy.io/universe
[spaCy VS Code Extension]: https://github.com/explosion/spacy-vscode
[videos]: https://www.youtube.com/c/ExplosionAI
[online course]: https://course.spacy.io
[project templates]: https://github.com/explosion/projects
@ -79,7 +85,7 @@ more people can benefit from it.
## Features
- Support for **60+ languages**
- Support for **70+ languages**
- **Trained pipelines** for different languages and tasks
- Multi-task learning with pretrained **transformers** like BERT
- Support for pretrained **word vectors** and embeddings

View File

@ -1,120 +0,0 @@
trigger:
batch: true
branches:
include:
- "*"
exclude:
- "spacy.io"
- "nightly.spacy.io"
- "v2.spacy.io"
paths:
exclude:
- "website/*"
- "*.md"
- ".github/workflows/*"
pr:
paths:
exclude:
- "*.md"
- "website/docs/*"
- "website/src/*"
- ".github/workflows/*"
jobs:
# Perform basic checks for most important errors (syntax etc.) Uses the config
# defined in .flake8 and overwrites the selected codes.
- job: "Validate"
pool:
vmImage: "ubuntu-latest"
steps:
- task: UsePythonVersion@0
inputs:
versionSpec: "3.7"
- script: |
pip install flake8==5.0.4
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
displayName: "flake8"
- job: "Test"
dependsOn: "Validate"
strategy:
matrix:
# We're only running one platform per Python version to speed up builds
Python36Linux:
imageName: "ubuntu-latest"
python.version: "3.6"
# Python36Windows:
# imageName: "windows-latest"
# python.version: "3.6"
# Python36Mac:
# imageName: "macos-latest"
# python.version: "3.6"
# Python37Linux:
# imageName: "ubuntu-latest"
# python.version: "3.7"
Python37Windows:
imageName: "windows-latest"
python.version: "3.7"
# Python37Mac:
# imageName: "macos-latest"
# python.version: "3.7"
# Python38Linux:
# imageName: "ubuntu-latest"
# python.version: "3.8"
# Python38Windows:
# imageName: "windows-latest"
# python.version: "3.8"
Python38Mac:
imageName: "macos-latest"
python.version: "3.8"
Python39Linux:
imageName: "ubuntu-latest"
python.version: "3.9"
# Python39Windows:
# imageName: "windows-latest"
# python.version: "3.9"
# Python39Mac:
# imageName: "macos-latest"
# python.version: "3.9"
Python310Linux:
imageName: "ubuntu-latest"
python.version: "3.10"
Python310Windows:
imageName: "windows-latest"
python.version: "3.10"
Python310Mac:
imageName: "macos-latest"
python.version: "3.10"
Python311Linux:
imageName: 'ubuntu-latest'
python.version: '3.11.0-rc.2'
Python311Windows:
imageName: 'windows-latest'
python.version: '3.11.0-rc.2'
Python311Mac:
imageName: 'macos-latest'
python.version: '3.11.0-rc.2'
maxParallel: 4
pool:
vmImage: $(imageName)
steps:
- template: .github/azure-steps.yml
parameters:
python_version: '$(python.version)'
architecture: 'x64'
# - job: "TestGPU"
# dependsOn: "Validate"
# strategy:
# matrix:
# Python38LinuxX64_GPU:
# python.version: '3.8'
# pool:
# name: "LinuxX64_GPU"
# steps:
# - template: .github/azure-steps.yml
# parameters:
# python_version: '$(python.version)'
# architecture: 'x64'
# gpu: true
# num_build_jobs: 24

View File

@ -5,4 +5,5 @@ numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
numpy==1.19.3; python_version=='3.9'
numpy==1.21.3; python_version=='3.10'
numpy; python_version>='3.11'
numpy==1.23.2; python_version=='3.11'
numpy; python_version>='3.12'

View File

@ -5,7 +5,10 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.1.0,<8.2.0",
"thinc>=8.1.8,<8.2.0",
"numpy>=1.15.0",
]
build-backend = "setuptools.build_meta"
[tool.isort]
profile = "black"

View File

@ -1,16 +1,17 @@
# Our libraries
spacy-legacy>=3.0.10,<3.1.0
spacy-legacy>=3.0.11,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.1.0,<8.2.0
thinc>=8.1.8,<8.2.0
ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.1.0
wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.5.0
pathy>=0.3.5
typer>=0.3.0,<0.10.0
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
# Third party dependencies
numpy>=1.15.0
requests>=2.13.0,<3.0.0
@ -21,7 +22,7 @@ langcodes>=3.2.0,<4.0.0
# Official Python utilities
setuptools
packaging>=20.0
typing_extensions>=3.7.4.1,<4.2.0; python_version < "3.8"
typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
# Development dependencies
pre-commit>=2.13.0
cython>=0.25,<3.0
@ -30,10 +31,11 @@ pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.8.0,<6.0.0
hypothesis>=3.27.0,<7.0.0
mypy>=0.980,<0.990; platform_machine != "aarch64" and python_version >= "3.7"
mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7"
types-dataclasses>=0.1.3; python_version < "3.7"
types-mock>=0.1.1
types-setuptools>=57.0.0
types-requests
types-setuptools>=57.0.0
black>=22.0,<23.0
black==22.3.0
isort>=5.0,<6.0

View File

@ -22,6 +22,7 @@ classifiers =
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10
Programming Language :: Python :: 3.11
Topic :: Scientific/Engineering
project_urls =
Release notes = https://github.com/explosion/spaCy/releases
@ -38,21 +39,22 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=8.1.0,<8.2.0
thinc>=8.1.8,<8.2.0
install_requires =
# Our libraries
spacy-legacy>=3.0.10,<3.1.0
spacy-legacy>=3.0.11,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.1.0,<8.2.0
wasabi>=0.9.1,<1.1.0
thinc>=8.1.8,<8.2.0
wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
# Third-party dependencies
typer>=0.3.0,<0.5.0
pathy>=0.3.5
typer>=0.3.0,<0.10.0
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0
requests>=2.13.0,<3.0.0
@ -61,7 +63,7 @@ install_requires =
# Official Python utilities
setuptools
packaging>=20.0
typing_extensions>=3.7.4,<4.2.0; python_version < "3.8"
typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
langcodes>=3.2.0,<4.0.0
[options.entry_points]
@ -72,45 +74,45 @@ console_scripts =
lookups =
spacy_lookups_data>=1.0.3,<1.1.0
transformers =
spacy_transformers>=1.1.2,<1.2.0
spacy_transformers>=1.1.2,<1.3.0
ray =
spacy_ray>=0.1.0,<1.0.0
cuda =
cupy>=5.0.0b4,<12.0.0
cupy>=5.0.0b4,<13.0.0
cuda80 =
cupy-cuda80>=5.0.0b4,<12.0.0
cupy-cuda80>=5.0.0b4,<13.0.0
cuda90 =
cupy-cuda90>=5.0.0b4,<12.0.0
cupy-cuda90>=5.0.0b4,<13.0.0
cuda91 =
cupy-cuda91>=5.0.0b4,<12.0.0
cupy-cuda91>=5.0.0b4,<13.0.0
cuda92 =
cupy-cuda92>=5.0.0b4,<12.0.0
cupy-cuda92>=5.0.0b4,<13.0.0
cuda100 =
cupy-cuda100>=5.0.0b4,<12.0.0
cupy-cuda100>=5.0.0b4,<13.0.0
cuda101 =
cupy-cuda101>=5.0.0b4,<12.0.0
cupy-cuda101>=5.0.0b4,<13.0.0
cuda102 =
cupy-cuda102>=5.0.0b4,<12.0.0
cupy-cuda102>=5.0.0b4,<13.0.0
cuda110 =
cupy-cuda110>=5.0.0b4,<12.0.0
cupy-cuda110>=5.0.0b4,<13.0.0
cuda111 =
cupy-cuda111>=5.0.0b4,<12.0.0
cupy-cuda111>=5.0.0b4,<13.0.0
cuda112 =
cupy-cuda112>=5.0.0b4,<12.0.0
cupy-cuda112>=5.0.0b4,<13.0.0
cuda113 =
cupy-cuda113>=5.0.0b4,<12.0.0
cupy-cuda113>=5.0.0b4,<13.0.0
cuda114 =
cupy-cuda114>=5.0.0b4,<12.0.0
cupy-cuda114>=5.0.0b4,<13.0.0
cuda115 =
cupy-cuda115>=5.0.0b4,<12.0.0
cupy-cuda115>=5.0.0b4,<13.0.0
cuda116 =
cupy-cuda116>=5.0.0b4,<12.0.0
cupy-cuda116>=5.0.0b4,<13.0.0
cuda117 =
cupy-cuda117>=5.0.0b4,<12.0.0
cupy-cuda117>=5.0.0b4,<13.0.0
cuda11x =
cupy-cuda11x>=11.0.0,<12.0.0
cupy-cuda11x>=11.0.0,<13.0.0
cuda-autodetect =
cupy-wheel>=11.0.0,<12.0.0
cupy-wheel>=11.0.0,<13.0.0
apple =
thinc-apple-ops>=0.1.0.dev0,<1.0.0
# Language tokenizers with external dependencies

View File

@ -1,6 +1,6 @@
from typing import Union, Iterable, Dict, Any
from pathlib import Path
import sys
from pathlib import Path
from typing import Any, Dict, Iterable, Union
# set library-specific custom warning handling before doing anything else
from .errors import setup_default_warnings
@ -8,20 +8,17 @@ from .errors import setup_default_warnings
setup_default_warnings() # noqa: E402
# These are imported as part of the API
from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401
from thinc.api import Config
from thinc.api import Config, prefer_gpu, require_cpu, require_gpu # noqa: F401
from . import pipeline # noqa: F401
from .cli.info import info # noqa: F401
from .glossary import explain # noqa: F401
from .about import __version__ # noqa: F401
from .util import registry, logger # noqa: F401
from .errors import Errors
from .language import Language
from .vocab import Vocab
from . import util
from .about import __version__ # noqa: F401
from .cli.info import info # noqa: F401
from .errors import Errors
from .glossary import explain # noqa: F401
from .language import Language
from .util import logger, registry # noqa: F401
from .vocab import Vocab
if sys.maxunicode == 65535:
raise SystemError(Errors.E130)

View File

@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.4.2"
__version__ = "3.6.0"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"

View File

@ -1,6 +1,7 @@
# Reserve 64 values for flag features
from . cimport symbols
cdef enum attr_id_t:
NULL_ATTR
IS_ALPHA

View File

@ -1,32 +1,35 @@
from wasabi import msg
from ._util import app, setup_cli # noqa: F401
from .apply import apply # noqa: F401
from .assemble import assemble_cli # noqa: F401
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
# are registered automatically and won't have to be imported here.
from .download import download # noqa: F401
from .info import info # noqa: F401
from .package import package # noqa: F401
from .profile import profile # noqa: F401
from .train import train_cli # noqa: F401
from .assemble import assemble_cli # noqa: F401
from .pretrain import pretrain # noqa: F401
from .debug_data import debug_data # noqa: F401
from .debug_config import debug_config # noqa: F401
from .debug_model import debug_model # noqa: F401
from .debug_diff import debug_diff # noqa: F401
from .evaluate import evaluate # noqa: F401
from .benchmark_speed import benchmark_speed_cli # noqa: F401
from .convert import convert # noqa: F401
from .debug_config import debug_config # noqa: F401
from .debug_data import debug_data # noqa: F401
from .debug_diff import debug_diff # noqa: F401
from .debug_model import debug_model # noqa: F401
from .download import download # noqa: F401
from .evaluate import evaluate # noqa: F401
from .find_threshold import find_threshold # noqa: F401
from .info import info # noqa: F401
from .init_config import fill_config, init_config # noqa: F401
from .init_pipeline import init_pipeline_cli # noqa: F401
from .init_config import init_config, fill_config # noqa: F401
from .validate import validate # noqa: F401
from .project.clone import project_clone # noqa: F401
from .package import package # noqa: F401
from .pretrain import pretrain # noqa: F401
from .profile import profile # noqa: F401
from .project.assets import project_assets # noqa: F401
from .project.run import project_run # noqa: F401
from .project.dvc import project_update_dvc # noqa: F401
from .project.push import project_push # noqa: F401
from .project.pull import project_pull # noqa: F401
from .project.clone import project_clone # noqa: F401
from .project.document import project_document # noqa: F401
from .project.dvc import project_update_dvc # noqa: F401
from .project.pull import project_pull # noqa: F401
from .project.push import project_push # noqa: F401
from .project.run import project_run # noqa: F401
from .train import train_cli # noqa: F401
from .validate import validate # noqa: F401
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)

View File

@ -1,29 +1,47 @@
from typing import Dict, Any, Union, List, Optional, Tuple, Iterable
from typing import TYPE_CHECKING, overload
import sys
import shutil
from pathlib import Path
from wasabi import msg, Printer
import srsly
import hashlib
import os
import shutil
import sys
from configparser import InterpolationError
from contextlib import contextmanager
from pathlib import Path
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterable,
List,
Optional,
Tuple,
Union,
overload,
)
import srsly
import typer
from click import NoSuchOption
from click.parser import split_arg_string
from typer.main import get_command
from contextlib import contextmanager
from thinc.api import Config, ConfigValidationError, require_gpu
from thinc.util import gpu_is_available
from configparser import InterpolationError
import os
from typer.main import get_command
from wasabi import Printer, msg
from .. import about
from ..compat import Literal
from ..schemas import ProjectConfigSchema, validate
from ..util import import_file, run_command, make_tempdir, registry, logger
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
from .. import about
from ..util import (
ENV_VARS,
SimpleFrozenDict,
import_file,
is_compatible_version,
logger,
make_tempdir,
registry,
run_command,
)
if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
from pathy import FluidPath # noqa: F401
SDIST_SUFFIX = ".tar.gz"
@ -46,6 +64,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
commands to check and validate your config files, training and evaluation data,
and custom model implementations.
"""
BENCHMARK_HELP = """Commands for benchmarking pipelines."""
INIT_HELP = """Commands for initializing configs and pipeline packages."""
# Wrappers for Typer's annotations. Initially created to set defaults and to
@ -54,12 +73,14 @@ Arg = typer.Argument
Opt = typer.Option
app = typer.Typer(name=NAME, help=HELP)
benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
app.add_typer(project_cli)
app.add_typer(debug_cli)
app.add_typer(benchmark_cli)
app.add_typer(init_cli)
@ -87,9 +108,9 @@ def parse_config_overrides(
cli_overrides = _parse_overrides(args, is_cli=True)
if cli_overrides:
keys = [k for k in cli_overrides if k not in env_overrides]
logger.debug(f"Config overrides from CLI: {keys}")
logger.debug("Config overrides from CLI: %s", keys)
if env_overrides:
logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
logger.debug("Config overrides from env variables: %s", list(env_overrides))
return {**cli_overrides, **env_overrides}
@ -158,15 +179,15 @@ def load_project_config(
sys.exit(1)
validate_project_version(config)
validate_project_commands(config)
if interpolate:
err = f"{PROJECT_FILE} validation error"
with show_validation_error(title=err, hint_fill=False):
config = substitute_project_variables(config, overrides)
# Make sure directories defined in config exist
for subdir in config.get("directories", []):
dir_path = path / subdir
if not dir_path.exists():
dir_path.mkdir(parents=True)
if interpolate:
err = f"{PROJECT_FILE} validation error"
with show_validation_error(title=err, hint_fill=False):
config = substitute_project_variables(config, overrides)
return config
@ -331,7 +352,7 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None:
"""Upload a file.
src (Path): The source path.
@ -339,13 +360,20 @@ def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
"""
import smart_open
# Create parent directories for local paths
if isinstance(dest, Path):
if not dest.parent.exists():
dest.parent.mkdir(parents=True)
dest = str(dest)
with smart_open.open(dest, mode="wb") as output_file:
with src.open(mode="rb") as input_file:
output_file.write(input_file.read())
def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None:
def download_file(
src: Union[str, "FluidPath"], dest: Path, *, force: bool = False
) -> None:
"""Download a file using smart_open.
url (str): The URL of the file.
@ -358,7 +386,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
if dest.exists() and not force:
return None
src = str(src)
with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
with smart_open.open(src, mode="rb", compression="disable") as input_file:
with dest.open(mode="wb") as output_file:
shutil.copyfileobj(input_file, output_file)
@ -368,7 +396,7 @@ def ensure_pathy(path):
slow and annoying Google Cloud warning)."""
from pathy import Pathy # noqa: F811
return Pathy(path)
return Pathy.fluid(path)
def git_checkout(
@ -575,6 +603,33 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
"""Given a directory and a suffix, recursively find all files matching the suffix.
Directories or files with names beginning with a . are ignored, but hidden flags on
filesystems are not checked.
When provided with a suffix `None`, there is no suffix-based filtering."""
if not path.is_dir():
return [path]
paths = [path]
locs = []
seen = set()
for path in paths:
if str(path) in seen:
continue
seen.add(str(path))
if path.parts[-1].startswith("."):
continue
elif path.is_dir():
paths.extend(path.iterdir())
elif suffix is not None and not path.parts[-1].endswith(suffix):
continue
else:
locs.append(path)
# It's good to sort these, in case the ordering messes up cache.
locs.sort()
return locs
def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
"""Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s,
as happens with `round(number, ndigits)`"""

140
spacy/cli/apply.py Normal file
View File

@ -0,0 +1,140 @@
from itertools import chain
from pathlib import Path
from typing import Iterable, List, Optional, Union, cast
import srsly
import tqdm
from wasabi import msg
from ..tokens import Doc, DocBin
from ..util import ensure_path, load_model
from ..vocab import Vocab
from ._util import Arg, Opt, app, import_code, setup_gpu, walk_directory
path_help = """Location of the documents to predict on.
Can be a single file in .spacy format or a .jsonl file.
Files with other extensions are treated as single plain text documents.
If a directory is provided it is traversed recursively to grab
all files to be processed.
The files can be a mixture of .spacy, .jsonl and text files.
If .jsonl is provided the specified field is going
to be grabbed ("text" by default)."""
out_help = "Path to save the resulting .spacy file"
code_help = (
"Path to Python file with additional " "code (registered functions) to be imported"
)
gold_help = "Use gold preprocessing provided in the .spacy files"
force_msg = (
"The provided output file already exists. "
"To force overwriting the output file, set the --force or -F flag."
)
DocOrStrStream = Union[Iterable[str], Iterable[Doc]]
def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
"""
Stream Doc objects from DocBin.
"""
docbin = DocBin().from_disk(path)
for doc in docbin.get_docs(vocab):
yield doc
def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
"""
Stream "text" field from JSONL. If the field "text" is
not found it raises error.
"""
for entry in srsly.read_jsonl(path):
if field not in entry:
msg.fail(f"{path} does not contain the required '{field}' field.", exits=1)
else:
yield entry[field]
def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
"""
Yields strings from text files in paths.
"""
for path in paths:
with open(path, "r") as fin:
text = fin.read()
yield text
@app.command("apply")
def apply_cli(
# fmt: off
model: str = Arg(..., help="Model name or path"),
data_path: Path = Arg(..., help=path_help, exists=True),
output_file: Path = Arg(..., help=out_help, dir_okay=False),
code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"),
force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use.")
):
"""
Apply a trained pipeline to documents to get predictions.
Expects a loadable spaCy pipeline and path to the data, which
can be a directory or a file.
The data files can be provided in multiple formats:
1. .spacy files
2. .jsonl files with a specified "field" to read the text from.
3. Files with any other extension are assumed to be containing
a single document.
DOCS: https://spacy.io/api/cli#apply
"""
data_path = ensure_path(data_path)
output_file = ensure_path(output_file)
code_path = ensure_path(code_path)
if output_file.exists() and not force_overwrite:
msg.fail(force_msg, exits=1)
if not data_path.exists():
msg.fail(f"Couldn't find data path: {data_path}", exits=1)
import_code(code_path)
setup_gpu(use_gpu)
apply(data_path, output_file, model, text_key, batch_size, n_process)
def apply(
data_path: Path,
output_file: Path,
model: str,
json_field: str,
batch_size: int,
n_process: int,
):
docbin = DocBin(store_user_data=True)
paths = walk_directory(data_path)
if len(paths) == 0:
docbin.to_disk(output_file)
msg.warn(
"Did not find data to process,"
f" {data_path} seems to be an empty directory."
)
return
nlp = load_model(model)
msg.good(f"Loaded model {model}")
vocab = nlp.vocab
streams: List[DocOrStrStream] = []
text_files = []
for path in paths:
if path.suffix == ".spacy":
streams.append(_stream_docbin(path, vocab))
elif path.suffix == ".jsonl":
streams.append(_stream_jsonl(path, json_field))
else:
text_files.append(path)
if len(text_files) > 0:
streams.append(_stream_texts(text_files))
datagen = cast(DocOrStrStream, chain(*streams))
for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
docbin.add(doc)
if output_file.suffix == "":
output_file = output_file.with_suffix(".spacy")
docbin.to_disk(output_file)

View File

@ -1,13 +1,20 @@
from typing import Optional
from pathlib import Path
from wasabi import msg
import typer
import logging
from pathlib import Path
from typing import Optional
import typer
from wasabi import msg
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code
from .. import util
from ..util import get_sourced_components, load_model_from_config
from ._util import (
Arg,
Opt,
app,
import_code,
parse_config_overrides,
show_validation_error,
)
@app.command(

View File

@ -0,0 +1,175 @@
import random
import time
from itertools import islice
from pathlib import Path
from typing import Iterable, List, Optional
import numpy
import typer
from tqdm import tqdm
from wasabi import msg
from .. import util
from ..language import Language
from ..tokens import Doc
from ..training import Corpus
from ._util import Arg, Opt, benchmark_cli, setup_gpu
@benchmark_cli.command(
"speed",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def benchmark_speed_cli(
# fmt: off
ctx: typer.Context,
model: str = Arg(..., help="Model name or path"),
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
batch_size: Optional[int] = Opt(None, "--batch-size", "-b", min=1, help="Override the pipeline batch size"),
no_shuffle: bool = Opt(False, "--no-shuffle", help="Do not shuffle benchmark data"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
# fmt: on
):
"""
Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
data in the binary .spacy format.
"""
setup_gpu(use_gpu=use_gpu, silent=False)
nlp = util.load_model(model)
batch_size = batch_size if batch_size is not None else nlp.batch_size
corpus = Corpus(data_path)
docs = [eg.predicted for eg in corpus(nlp)]
if len(docs) == 0:
msg.fail("Cannot benchmark speed using an empty corpus.", exits=1)
print(f"Warming up for {warmup_epochs} epochs...")
warmup(nlp, docs, warmup_epochs, batch_size)
print()
print(f"Benchmarking {n_batches} batches...")
wps = benchmark(nlp, docs, n_batches, batch_size, not no_shuffle)
print()
print_outliers(wps)
print_mean_with_ci(wps)
# Lowercased, behaves as a context manager function.
class time_context:
"""Register the running time of a context."""
def __enter__(self):
self.start = time.perf_counter()
return self
def __exit__(self, type, value, traceback):
self.elapsed = time.perf_counter() - self.start
class Quartiles:
"""Calculate the q1, q2, q3 quartiles and the inter-quartile range (iqr)
of a sample."""
q1: float
q2: float
q3: float
iqr: float
def __init__(self, sample: numpy.ndarray) -> None:
self.q1 = numpy.quantile(sample, 0.25)
self.q2 = numpy.quantile(sample, 0.5)
self.q3 = numpy.quantile(sample, 0.75)
self.iqr = self.q3 - self.q1
def annotate(
nlp: Language, docs: List[Doc], batch_size: Optional[int]
) -> numpy.ndarray:
docs = nlp.pipe(tqdm(docs, unit="doc"), batch_size=batch_size)
wps = []
while True:
with time_context() as elapsed:
batch_docs = list(
islice(docs, batch_size if batch_size else nlp.batch_size)
)
if len(batch_docs) == 0:
break
n_tokens = count_tokens(batch_docs)
wps.append(n_tokens / elapsed.elapsed)
return numpy.array(wps)
def benchmark(
nlp: Language,
docs: List[Doc],
n_batches: int,
batch_size: int,
shuffle: bool,
) -> numpy.ndarray:
if shuffle:
bench_docs = [
nlp.make_doc(random.choice(docs).text)
for _ in range(n_batches * batch_size)
]
else:
bench_docs = [
nlp.make_doc(docs[i % len(docs)].text)
for i in range(n_batches * batch_size)
]
return annotate(nlp, bench_docs, batch_size)
def bootstrap(x, statistic=numpy.mean, iterations=10000) -> numpy.ndarray:
"""Apply a statistic to repeated random samples of an array."""
return numpy.fromiter(
(
statistic(numpy.random.choice(x, len(x), replace=True))
for _ in range(iterations)
),
numpy.float64,
)
def count_tokens(docs: Iterable[Doc]) -> int:
return sum(len(doc) for doc in docs)
def print_mean_with_ci(sample: numpy.ndarray):
mean = numpy.mean(sample)
bootstrap_means = bootstrap(sample)
bootstrap_means.sort()
# 95% confidence interval
low = bootstrap_means[int(len(bootstrap_means) * 0.025)]
high = bootstrap_means[int(len(bootstrap_means) * 0.975)]
print(f"Mean: {mean:.1f} words/s (95% CI: {low-mean:.1f} +{high-mean:.1f})")
def print_outliers(sample: numpy.ndarray):
quartiles = Quartiles(sample)
n_outliers = numpy.sum(
(sample < (quartiles.q1 - 1.5 * quartiles.iqr))
| (sample > (quartiles.q3 + 1.5 * quartiles.iqr))
)
n_extreme_outliers = numpy.sum(
(sample < (quartiles.q1 - 3.0 * quartiles.iqr))
| (sample > (quartiles.q3 + 3.0 * quartiles.iqr))
)
print(
f"Outliers: {(100 * n_outliers) / len(sample):.1f}%, extreme outliers: {(100 * n_extreme_outliers) / len(sample)}%"
)
def warmup(
nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
) -> numpy.ndarray:
docs = warmup_epochs * docs
return annotate(nlp, docs, batch_size)

View File

@ -1,18 +1,22 @@
from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
from enum import Enum
from pathlib import Path
from wasabi import Printer
import srsly
import itertools
import re
import sys
import itertools
from enum import Enum
from pathlib import Path
from typing import Any, Callable, Iterable, Mapping, Optional, Union
import srsly
from wasabi import Printer
from ._util import app, Arg, Opt
from ..training import docs_to_json
from ..tokens import Doc, DocBin
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
from ..training.converters import conllu_to_docs
from ..training import docs_to_json
from ..training.converters import (
conll_ner_to_docs,
conllu_to_docs,
iob_to_docs,
json_to_docs,
)
from ._util import Arg, Opt, app, walk_directory
# Converters are matched by file extension except for ner/iob, which are
# matched by file extension and content. To add a converter, add a new
@ -28,6 +32,8 @@ CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = {
"json": json_to_docs,
}
AUTO = "auto"
# File types that can be written to stdout
FILE_TYPES_STDOUT = ("json",)
@ -49,7 +55,7 @@ def convert_cli(
model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
@ -70,8 +76,8 @@ def convert_cli(
output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
silent = output_dir == "-"
msg = Printer(no_print=silent)
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
converter = _get_converter(msg, converter, input_path)
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
convert(
input_path,
output_dir,
@ -100,7 +106,7 @@ def convert(
model: Optional[str] = None,
morphology: bool = False,
merge_subtokens: bool = False,
converter: str = "auto",
converter: str,
ner_map: Optional[Path] = None,
lang: Optional[str] = None,
concatenate: bool = False,
@ -189,33 +195,6 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
return None
def walk_directory(path: Path, converter: str) -> List[Path]:
if not path.is_dir():
return [path]
paths = [path]
locs = []
seen = set()
for path in paths:
if str(path) in seen:
continue
seen.add(str(path))
if path.parts[-1].startswith("."):
continue
elif path.is_dir():
paths.extend(path.iterdir())
elif converter == "json" and not path.parts[-1].endswith("json"):
continue
elif converter == "conll" and not path.parts[-1].endswith("conll"):
continue
elif converter == "iob" and not path.parts[-1].endswith("iob"):
continue
else:
locs.append(path)
# It's good to sort these, in case the ordering messes up cache.
locs.sort()
return locs
def verify_cli_args(
msg: Printer,
input_path: Path,
@ -239,18 +218,22 @@ def verify_cli_args(
input_locs = walk_directory(input_path, converter)
if len(input_locs) == 0:
msg.fail("No input files in directory", input_path, exits=1)
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
if converter == "auto" and len(file_types) >= 2:
file_types_str = ",".join(file_types)
msg.fail("All input files must be same type", file_types_str, exits=1)
if converter != "auto" and converter not in CONVERTERS:
if converter not in CONVERTERS:
msg.fail(f"Can't find converter for {converter}", exits=1)
def _get_converter(msg, converter, input_path: Path):
if input_path.is_dir():
input_path = walk_directory(input_path, converter)[0]
if converter == "auto":
if converter == AUTO:
input_locs = walk_directory(input_path, suffix=None)
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
if len(file_types) >= 2:
file_types_str = ",".join(file_types)
msg.fail("All input files must be same type", file_types_str, exits=1)
input_path = input_locs[0]
else:
input_path = walk_directory(input_path, suffix=converter)[0]
if converter == AUTO:
converter = input_path.suffix[1:]
if converter == "ner" or converter == "iob":
with input_path.open(encoding="utf8") as file_:

View File

@ -1,15 +1,22 @@
from typing import Optional, Dict, Any, Union, List
from pathlib import Path
from wasabi import msg, table
from typing import Any, Dict, List, Optional, Union
import typer
from thinc.api import Config
from thinc.config import VARIABLE_RE
import typer
from wasabi import msg, table
from ._util import Arg, Opt, show_validation_error, parse_config_overrides
from ._util import import_code, debug_cli
from .. import util
from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
from ..util import registry
from .. import util
from ._util import (
Arg,
Opt,
debug_cli,
import_code,
parse_config_overrides,
show_validation_error,
)
@debug_cli.command(

View File

@ -1,28 +1,49 @@
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
from typing import cast, overload
from pathlib import Path
from collections import Counter
import sys
import srsly
from wasabi import Printer, MESSAGES, msg
import typer
import math
import sys
from collections import Counter
from pathlib import Path
from typing import (
Any,
Dict,
Iterable,
List,
Optional,
Sequence,
Set,
Tuple,
Union,
cast,
overload,
)
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
from ._util import import_code, debug_cli, _format_number
from ..training import Example, remove_bilu_prefix
from ..training.initialize import get_sourced_components
from ..schemas import ConfigSchemaTraining
import numpy
import srsly
import typer
from wasabi import MESSAGES, Printer, msg
from .. import util
from ..compat import Literal
from ..language import Language
from ..morphology import Morphology
from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe
from ..pipeline._edit_tree_internals.edit_trees import EditTrees
from ..pipeline._parser_internals import nonproj
from ..pipeline._parser_internals.nonproj import DELIMITER
from ..pipeline import Morphologizer, SpanCategorizer
from ..morphology import Morphology
from ..language import Language
from ..schemas import ConfigSchemaTraining
from ..training import Example, remove_bilu_prefix
from ..training.initialize import get_sourced_components
from ..util import registry, resolve_dot_names
from ..compat import Literal
from ..vectors import Mode as VectorsMode
from .. import util
from ._util import (
Arg,
Opt,
_format_number,
app,
debug_cli,
import_code,
parse_config_overrides,
show_validation_error,
)
# Minimum number of expected occurrences of NER label in data to train new label
NEW_LABEL_THRESHOLD = 50
@ -209,7 +230,7 @@ def debug_data(
else:
msg.info("No word vectors present in the package")
if "spancat" in factory_names:
if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
model_labels_spancat = _get_labels_from_spancat(nlp)
has_low_data_warning = False
has_no_neg_warning = False
@ -334,7 +355,7 @@ def debug_data(
show=verbose,
)
else:
msg.good("Examples without ocurrences available for all labels")
msg.good("Examples without occurrences available for all labels")
if "ner" in factory_names:
# Get all unique NER labels present in the data
@ -519,9 +540,13 @@ def debug_data(
if "tagger" in factory_names:
msg.divider("Part-of-speech Tagging")
label_list = [label for label in gold_train_data["tags"]]
model_labels = _get_labels_from_model(nlp, "tagger")
label_list, counts = zip(*gold_train_data["tags"].items())
msg.info(f"{len(label_list)} label(s) in train data")
p = numpy.array(counts)
p = p / p.sum()
norm_entropy = (-p * numpy.log2(p)).sum() / numpy.log2(len(label_list))
msg.info(f"{norm_entropy} is the normalised label entropy")
model_labels = _get_labels_from_model(nlp, "tagger")
labels = set(label_list)
missing_labels = model_labels - labels
if missing_labels:
@ -670,6 +695,59 @@ def debug_data(
f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
)
if "trainable_lemmatizer" in factory_names:
msg.divider("Trainable Lemmatizer")
trees_train: Set[str] = gold_train_data["lemmatizer_trees"]
trees_dev: Set[str] = gold_dev_data["lemmatizer_trees"]
# This is necessary context when someone is attempting to interpret whether the
# number of trees exclusively in the dev set is meaningful.
msg.info(f"{len(trees_train)} lemmatizer trees generated from training data")
msg.info(f"{len(trees_dev)} lemmatizer trees generated from dev data")
dev_not_train = trees_dev - trees_train
if len(dev_not_train) != 0:
pct = len(dev_not_train) / len(trees_dev)
msg.info(
f"{len(dev_not_train)} lemmatizer trees ({pct*100:.1f}% of dev trees)"
" were found exclusively in the dev data."
)
else:
# Would we ever expect this case? It seems like it would be pretty rare,
# and we might actually want a warning?
msg.info("All trees in dev data present in training data.")
if gold_train_data["n_low_cardinality_lemmas"] > 0:
n = gold_train_data["n_low_cardinality_lemmas"]
msg.warn(f"{n} training docs with 0 or 1 unique lemmas.")
if gold_dev_data["n_low_cardinality_lemmas"] > 0:
n = gold_dev_data["n_low_cardinality_lemmas"]
msg.warn(f"{n} dev docs with 0 or 1 unique lemmas.")
if gold_train_data["no_lemma_annotations"] > 0:
n = gold_train_data["no_lemma_annotations"]
msg.warn(f"{n} training docs with no lemma annotations.")
else:
msg.good("All training docs have lemma annotations.")
if gold_dev_data["no_lemma_annotations"] > 0:
n = gold_dev_data["no_lemma_annotations"]
msg.warn(f"{n} dev docs with no lemma annotations.")
else:
msg.good("All dev docs have lemma annotations.")
if gold_train_data["partial_lemma_annotations"] > 0:
n = gold_train_data["partial_lemma_annotations"]
msg.info(f"{n} training docs with partial lemma annotations.")
else:
msg.good("All training docs have complete lemma annotations.")
if gold_dev_data["partial_lemma_annotations"] > 0:
n = gold_dev_data["partial_lemma_annotations"]
msg.info(f"{n} dev docs with partial lemma annotations.")
else:
msg.good("All dev docs have complete lemma annotations.")
msg.divider("Summary")
good_counts = msg.counts[MESSAGES.GOOD]
warn_counts = msg.counts[MESSAGES.WARN]
@ -731,7 +809,13 @@ def _compile_gold(
"n_cats_multilabel": 0,
"n_cats_bad_values": 0,
"texts": set(),
"lemmatizer_trees": set(),
"no_lemma_annotations": 0,
"partial_lemma_annotations": 0,
"n_low_cardinality_lemmas": 0,
}
if "trainable_lemmatizer" in factory_names:
trees = EditTrees(nlp.vocab.strings)
for eg in examples:
gold = eg.reference
doc = eg.predicted
@ -764,7 +848,7 @@ def _compile_gold(
data["boundary_cross_ents"] += 1
elif label == "-":
data["ner"]["-"] += 1
if "spancat" in factory_names:
if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
for spans_key in list(eg.reference.spans.keys()):
# Obtain the span frequency
if spans_key not in data["spancat"]:
@ -861,6 +945,25 @@ def _compile_gold(
data["n_nonproj"] += 1
if nonproj.contains_cycle(aligned_heads):
data["n_cycles"] += 1
if "trainable_lemmatizer" in factory_names:
# from EditTreeLemmatizer._labels_from_data
if all(token.lemma == 0 for token in gold):
data["no_lemma_annotations"] += 1
continue
if any(token.lemma == 0 for token in gold):
data["partial_lemma_annotations"] += 1
lemma_set = set()
for token in gold:
if token.lemma != 0:
lemma_set.add(token.lemma)
tree_id = trees.add(token.text, token.lemma_)
tree_str = trees.tree_to_str(tree_id)
data["lemmatizer_trees"].add(tree_str)
# We want to identify cases where lemmas aren't assigned
# or are all assigned the same value, as this would indicate
# an issue since we're expecting a large set of lemmas
if len(lemma_set) < 2 and len(gold) > 1:
data["n_low_cardinality_lemmas"] += 1
return data
@ -934,6 +1037,7 @@ def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]:
labels: Set[str] = set()
for pipe_name in pipe_names:
pipe = nlp.get_pipe(pipe_name)
assert isinstance(pipe, TrainablePipe)
labels.update(pipe.labels)
return labels
@ -942,7 +1046,7 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
pipe_names = [
pipe_name
for pipe_name in nlp.pipe_names
if nlp.get_pipe_meta(pipe_name).factory == "spancat"
if nlp.get_pipe_meta(pipe_name).factory in ("spancat", "spancat_singlelabel")
]
labels: Dict[str, Set[str]] = {}
for pipe_name in pipe_names:

View File

@ -1,13 +1,13 @@
from pathlib import Path
from typing import Optional
import typer
from wasabi import Printer, diff_strings, MarkdownRenderer
from pathlib import Path
from thinc.api import Config
from wasabi import MarkdownRenderer, Printer, diff_strings
from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
from ..util import load_config
from .init_config import init_config, Optimizations
from ._util import Arg, Opt, debug_cli, parse_config_overrides, show_validation_error
from .init_config import Optimizations, init_config
@debug_cli.command(

View File

@ -1,19 +1,32 @@
from typing import Dict, Any, Optional
from pathlib import Path
import itertools
from pathlib import Path
from typing import Any, Dict, Optional
import typer
from thinc.api import (
Model,
data_validation,
fix_random_seed,
set_dropout_rate,
set_gpu_allocator,
)
from wasabi import msg
from spacy.training import Example
from spacy.util import resolve_dot_names
from wasabi import msg
from thinc.api import fix_random_seed, set_dropout_rate
from thinc.api import Model, data_validation, set_gpu_allocator
import typer
from ._util import Arg, Opt, debug_cli, show_validation_error
from ._util import parse_config_overrides, string_to_list, setup_gpu
from .. import util
from ..schemas import ConfigSchemaTraining
from ..util import registry
from .. import util
from ._util import (
Arg,
Opt,
debug_cli,
parse_config_overrides,
setup_gpu,
show_validation_error,
string_to_list,
)
@debug_cli.command(

View File

@ -1,14 +1,14 @@
from typing import Optional, Sequence
import requests
import sys
from wasabi import msg
import typer
from typing import Optional, Sequence
import requests
import typer
from wasabi import msg
from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
from .. import about
from ..util import is_package, get_minor_version, run_command
from ..util import is_prerelease_version
from ..errors import OLD_MODEL_SHORTCUTS
from ..util import get_minor_version, is_package, is_prerelease_version, run_command
from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
@app.command(
@ -81,11 +81,8 @@ def download(
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
dl_tpl = "{m}-{v}/{m}-{v}{s}"
egg_tpl = "#egg={m}=={v}"
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
filename = dl_tpl.format(m=model_name, v=version, s=suffix)
if sdist:
filename += egg_tpl.format(m=model_name, v=version)
return filename

View File

@ -1,18 +1,21 @@
from typing import Optional, List, Dict, Any, Union
from wasabi import Printer
from pathlib import Path
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
import srsly
from thinc.api import fix_random_seed
from wasabi import Printer
from ..training import Corpus
from ..tokens import Doc
from ._util import app, Arg, Opt, setup_gpu, import_code
from .. import displacy, util
from ..scorer import Scorer
from .. import util
from .. import displacy
from ..tokens import Doc
from ..training import Corpus
from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu
@benchmark_cli.command(
"accuracy",
)
@app.command("evaluate")
def evaluate_cli(
# fmt: off
@ -24,6 +27,7 @@ def evaluate_cli(
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
# fmt: on
):
"""
@ -36,7 +40,7 @@ def evaluate_cli(
dependency parses in a HTML file, set as output directory as the
displacy_path argument.
DOCS: https://spacy.io/api/cli#evaluate
DOCS: https://spacy.io/api/cli#benchmark-accuracy
"""
import_code(code_path)
evaluate(
@ -47,6 +51,7 @@ def evaluate_cli(
gold_preproc=gold_preproc,
displacy_path=displacy_path,
displacy_limit=displacy_limit,
per_component=per_component,
silent=False,
)
@ -61,6 +66,7 @@ def evaluate(
displacy_limit: int = 25,
silent: bool = True,
spans_key: str = "sc",
per_component: bool = False,
) -> Dict[str, Any]:
msg = Printer(no_print=silent, pretty=not silent)
fix_random_seed()
@ -75,50 +81,61 @@ def evaluate(
corpus = Corpus(data_path, gold_preproc=gold_preproc)
nlp = util.load_model(model)
dev_dataset = list(corpus(nlp))
scores = nlp.evaluate(dev_dataset)
metrics = {
"TOK": "token_acc",
"TAG": "tag_acc",
"POS": "pos_acc",
"MORPH": "morph_acc",
"LEMMA": "lemma_acc",
"UAS": "dep_uas",
"LAS": "dep_las",
"NER P": "ents_p",
"NER R": "ents_r",
"NER F": "ents_f",
"TEXTCAT": "cats_score",
"SENT P": "sents_p",
"SENT R": "sents_r",
"SENT F": "sents_f",
"SPAN P": f"spans_{spans_key}_p",
"SPAN R": f"spans_{spans_key}_r",
"SPAN F": f"spans_{spans_key}_f",
"SPEED": "speed",
}
results = {}
data = {}
for metric, key in metrics.items():
if key in scores:
if key == "cats_score":
metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
if isinstance(scores[key], (int, float)):
if key == "speed":
results[metric] = f"{scores[key]:.0f}"
scores = nlp.evaluate(dev_dataset, per_component=per_component)
if per_component:
data = scores
if output is None:
msg.warn(
"The per-component option is enabled but there is no output JSON file provided to save the scores to."
)
else:
msg.info("Per-component scores will be saved to output JSON file.")
else:
metrics = {
"TOK": "token_acc",
"TAG": "tag_acc",
"POS": "pos_acc",
"MORPH": "morph_acc",
"LEMMA": "lemma_acc",
"UAS": "dep_uas",
"LAS": "dep_las",
"NER P": "ents_p",
"NER R": "ents_r",
"NER F": "ents_f",
"TEXTCAT": "cats_score",
"SENT P": "sents_p",
"SENT R": "sents_r",
"SENT F": "sents_f",
"SPAN P": f"spans_{spans_key}_p",
"SPAN R": f"spans_{spans_key}_r",
"SPAN F": f"spans_{spans_key}_f",
"SPEED": "speed",
}
results = {}
data = {}
for metric, key in metrics.items():
if key in scores:
if key == "cats_score":
metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
if isinstance(scores[key], (int, float)):
if key == "speed":
results[metric] = f"{scores[key]:.0f}"
else:
results[metric] = f"{scores[key]*100:.2f}"
else:
results[metric] = f"{scores[key]*100:.2f}"
else:
results[metric] = "-"
data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
results[metric] = "-"
data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
msg.table(results, title="Results")
data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
msg.table(results, title="Results")
data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
if displacy_path:
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit]))
render_deps = "parser" in factory_names
render_ents = "ner" in factory_names
render_spans = "spancat" in factory_names
render_parses(
docs,
displacy_path,
@ -126,6 +143,7 @@ def evaluate(
limit=displacy_limit,
deps=render_deps,
ents=render_ents,
spans=render_spans,
)
msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
@ -179,6 +197,7 @@ def render_parses(
limit: int = 250,
deps: bool = True,
ents: bool = True,
spans: bool = True,
):
docs[0].user_data["title"] = model_name
if ents:
@ -192,6 +211,11 @@ def render_parses(
with (output_path / "parses.html").open("w", encoding="utf8") as file_:
file_.write(html)
if spans:
html = displacy.render(docs[:limit], style="span", page=True)
with (output_path / "spans.html").open("w", encoding="utf8") as file_:
file_.write(html)
def print_prf_per_type(
msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str

233
spacy/cli/find_threshold.py Normal file
View File

@ -0,0 +1,233 @@
import functools
import logging
import operator
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import numpy
import wasabi.tables
from .. import util
from ..errors import Errors
from ..pipeline import MultiLabel_TextCategorizer, TextCategorizer
from ..training import Corpus
from ._util import Arg, Opt, app, import_code, setup_gpu
_DEFAULTS = {
"n_trials": 11,
"use_gpu": -1,
"gold_preproc": False,
}
@app.command(
"find-threshold",
context_settings={"allow_extra_args": False, "ignore_unknown_options": True},
)
def find_threshold_cli(
# fmt: off
model: str = Arg(..., help="Model name or path"),
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
pipe_name: str = Arg(..., help="Name of pipe to examine thresholds for"),
threshold_key: str = Arg(..., help="Key of threshold attribute in component's configuration"),
scores_key: str = Arg(..., help="Metric to optimize"),
n_trials: int = Opt(_DEFAULTS["n_trials"], "--n_trials", "-n", help="Number of trials to determine optimal thresholds"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
# fmt: on
):
"""
Runs prediction trials for a trained model with varying tresholds to maximize
the specified metric. The search space for the threshold is traversed linearly
from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
(the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
returns all results).
This is applicable only for components whose predictions are influenced by
thresholds - e.g. `textcat_multilabel` and `spancat`, but not `textcat`. Note
that the full path to the corresponding threshold attribute in the config has to
be provided.
DOCS: https://spacy.io/api/cli#find-threshold
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
import_code(code_path)
find_threshold(
model=model,
data_path=data_path,
pipe_name=pipe_name,
threshold_key=threshold_key,
scores_key=scores_key,
n_trials=n_trials,
use_gpu=use_gpu,
gold_preproc=gold_preproc,
silent=False,
)
def find_threshold(
model: str,
data_path: Path,
pipe_name: str,
threshold_key: str,
scores_key: str,
*,
n_trials: int = _DEFAULTS["n_trials"], # type: ignore
use_gpu: int = _DEFAULTS["use_gpu"], # type: ignore
gold_preproc: bool = _DEFAULTS["gold_preproc"], # type: ignore
silent: bool = True,
) -> Tuple[float, float, Dict[float, float]]:
"""
Runs prediction trials for models with varying tresholds to maximize the specified metric.
model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory.
data_path (Path): Path to file with DocBin with docs to use for threshold search.
pipe_name (str): Name of pipe to examine thresholds for.
threshold_key (str): Key of threshold attribute in component's configuration.
scores_key (str): Name of score to metric to optimize.
n_trials (int): Number of trials to determine optimal thresholds.
use_gpu (int): GPU ID or -1 for CPU.
gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the
tokenization, and may result in sequences of more consistent length. However, it may reduce runtime accuracy due
to train/test skew.
silent (bool): Whether to print non-error-related output to stdout.
RETURNS (Tuple[float, float, Dict[float, float]]): Best found threshold, the corresponding score, scores for all
evaluated thresholds.
"""
setup_gpu(use_gpu, silent=silent)
data_path = util.ensure_path(data_path)
if not data_path.exists():
wasabi.msg.fail("Evaluation data not found", data_path, exits=1)
nlp = util.load_model(model)
if pipe_name not in nlp.component_names:
raise AttributeError(
Errors.E001.format(name=pipe_name, opts=nlp.component_names)
)
pipe = nlp.get_pipe(pipe_name)
if not hasattr(pipe, "scorer"):
raise AttributeError(Errors.E1045)
if type(pipe) == TextCategorizer:
wasabi.msg.warn(
"The `textcat` component doesn't use a threshold as it's not applicable to the concept of "
"exclusive classes. All thresholds will yield the same results."
)
if not silent:
wasabi.msg.info(
title=f"Optimizing for {scores_key} for component '{pipe_name}' with {n_trials} "
f"trials."
)
# Load evaluation corpus.
corpus = Corpus(data_path, gold_preproc=gold_preproc)
dev_dataset = list(corpus(nlp))
config_keys = threshold_key.split(".")
def set_nested_item(
config: Dict[str, Any], keys: List[str], value: float
) -> Dict[str, Any]:
"""Set item in nested dictionary. Adapted from https://stackoverflow.com/a/54138200.
config (Dict[str, Any]): Configuration dictionary.
keys (List[Any]): Path to value to set.
value (float): Value to set.
RETURNS (Dict[str, Any]): Updated dictionary.
"""
functools.reduce(operator.getitem, keys[:-1], config)[keys[-1]] = value
return config
def filter_config(
config: Dict[str, Any], keys: List[str], full_key: str
) -> Dict[str, Any]:
"""Filters provided config dictionary so that only the specified keys path remains.
config (Dict[str, Any]): Configuration dictionary.
keys (List[Any]): Path to value to set.
full_key (str): Full user-specified key.
RETURNS (Dict[str, Any]): Filtered dictionary.
"""
if keys[0] not in config:
wasabi.msg.fail(
title=f"Failed to look up `{full_key}` in config: sub-key {[keys[0]]} not found.",
text=f"Make sure you specified {[keys[0]]} correctly. The following sub-keys are available instead: "
f"{list(config.keys())}",
exits=1,
)
return {
keys[0]: filter_config(config[keys[0]], keys[1:], full_key)
if len(keys) > 1
else config[keys[0]]
}
# Evaluate with varying threshold values.
scores: Dict[float, float] = {}
config_keys_full = ["components", pipe_name, *config_keys]
table_col_widths = (10, 10)
thresholds = numpy.linspace(0, 1, n_trials)
print(wasabi.tables.row(["Threshold", f"{scores_key}"], widths=table_col_widths))
for threshold in thresholds:
# Reload pipeline with overrides specifying the new threshold.
nlp = util.load_model(
model,
config=set_nested_item(
filter_config(
nlp.config, config_keys_full, ".".join(config_keys_full)
).copy(),
config_keys_full,
threshold,
),
)
if hasattr(pipe, "cfg"):
setattr(
nlp.get_pipe(pipe_name),
"cfg",
set_nested_item(getattr(pipe, "cfg"), config_keys, threshold),
)
eval_scores = nlp.evaluate(dev_dataset)
if scores_key not in eval_scores:
wasabi.msg.fail(
title=f"Failed to look up score `{scores_key}` in evaluation results.",
text=f"Make sure you specified the correct value for `scores_key`. The following scores are "
f"available: {list(eval_scores.keys())}",
exits=1,
)
scores[threshold] = eval_scores[scores_key]
if not isinstance(scores[threshold], (float, int)):
wasabi.msg.fail(
f"Returned score for key '{scores_key}' is not numeric. Threshold optimization only works for numeric "
f"scores.",
exits=1,
)
print(
wasabi.row(
[round(threshold, 3), round(scores[threshold], 3)],
widths=table_col_widths,
)
)
best_threshold = max(scores.keys(), key=(lambda key: scores[key]))
# If all scores are identical, emit warning.
if len(set(scores.values())) == 1:
wasabi.msg.warn(
title="All scores are identical. Verify that all settings are correct.",
text=""
if (
not isinstance(pipe, MultiLabel_TextCategorizer)
or scores_key in ("cats_macro_f", "cats_micro_f")
)
else "Use `cats_macro_f` or `cats_micro_f` when optimizing the threshold for `textcat_multilabel`.",
)
else:
if not silent:
print(
f"\nBest threshold: {round(best_threshold, ndigits=4)} with {scores_key} value of {scores[best_threshold]}."
)
return best_threshold, scores[best_threshold], scores

View File

@ -1,15 +1,15 @@
from typing import Optional, Dict, Any, Union, List
import platform
import pkg_resources
import json
import platform
from pathlib import Path
from wasabi import Printer, MarkdownRenderer
import srsly
from typing import Any, Dict, List, Optional, Union
from ._util import app, Arg, Opt, string_to_list
from .download import get_model_filename, get_latest_version
from .. import util
from .. import about
import srsly
from wasabi import MarkdownRenderer, Printer
from .. import about, util
from ..compat import importlib_metadata
from ._util import Arg, Opt, app, string_to_list
from .download import get_latest_version, get_model_filename
@app.command("info")
@ -137,15 +137,14 @@ def info_installed_model_url(model: str) -> Optional[str]:
dist-info available.
"""
try:
dist = pkg_resources.get_distribution(model)
data = json.loads(dist.get_metadata("direct_url.json"))
return data["url"]
except pkg_resources.DistributionNotFound:
# no such package
return None
dist = importlib_metadata.distribution(model)
text = dist.read_text("direct_url.json")
if isinstance(text, str):
data = json.loads(text)
return data["url"]
except Exception:
# something else, like no file or invalid JSON
return None
pass
return None
def info_model_url(model: str) -> Dict[str, Any]:

View File

@ -1,19 +1,26 @@
from typing import Optional, List, Tuple
import re
from enum import Enum
from pathlib import Path
from wasabi import Printer, diff_strings
from thinc.api import Config
from typing import List, Optional, Tuple
import srsly
import re
from jinja2 import Template
from thinc.api import Config
from wasabi import Printer, diff_strings
from .. import util
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
from ..schemas import RecommendationSchema
from ..util import SimpleFrozenList
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
from ._util import string_to_list, import_code
from ._util import (
COMMAND,
Arg,
Opt,
import_code,
init_cli,
show_validation_error,
string_to_list,
)
ROOT = Path(__file__).parent / "templates"
TEMPLATE_PATH = ROOT / "quickstart_training.jinja"

View File

@ -1,15 +1,23 @@
from typing import Optional
import logging
from pathlib import Path
from wasabi import msg
import typer
from typing import Optional
import srsly
import typer
from wasabi import msg
from .. import util
from ..training.initialize import init_nlp, convert_vectors
from ..language import Language
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu
from ..training.initialize import convert_vectors, init_nlp
from ._util import (
Arg,
Opt,
import_code,
init_cli,
parse_config_overrides,
setup_gpu,
show_validation_error,
)
@init_cli.command("vectors")
@ -24,6 +32,7 @@ def init_vectors_cli(
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
# fmt: on
):
"""Convert word vectors for use with spaCy. Will export an nlp object that
@ -42,6 +51,7 @@ def init_vectors_cli(
prune=prune,
name=name,
mode=mode,
attr=attr,
)
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
nlp.to_disk(output_dir)

View File

@ -1,18 +1,18 @@
from typing import Optional, Union, Any, Dict, List, Tuple, cast
import shutil
from pathlib import Path
from wasabi import Printer, MarkdownRenderer, get_raw_input
from thinc.api import Config
from collections import defaultdict
from catalogue import RegistryError
import srsly
import sys
import re
import shutil
import sys
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
from ..schemas import validate, ModelMetaSchema
from .. import util
from .. import about
import srsly
from catalogue import RegistryError
from thinc.api import Config
from wasabi import MarkdownRenderer, Printer, get_raw_input
from .. import about, util
from ..schemas import ModelMetaSchema, validate
from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
@app.command("package")
@ -252,7 +252,7 @@ def get_third_party_dependencies(
raise regerr from None
module_name = func_info.get("module") # type: ignore[attr-defined]
if module_name: # the code is part of a module, not a --code file
modules.add(func_info["module"].split(".")[0]) # type: ignore[index]
modules.add(func_info["module"].split(".")[0]) # type: ignore[union-attr]
dependencies = []
for module_name in modules:
if module_name in distributions:

View File

@ -1,13 +1,21 @@
from typing import Optional
from pathlib import Path
from wasabi import msg
import typer
import re
from pathlib import Path
from typing import Optional
import typer
from wasabi import msg
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu
from ..training.pretrain import pretrain
from ..util import load_config
from ._util import (
Arg,
Opt,
app,
import_code,
parse_config_overrides,
setup_gpu,
show_validation_error,
)
@app.command(
@ -23,6 +31,7 @@ def pretrain_cli(
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
# fmt: on
):
"""
@ -74,6 +83,7 @@ def pretrain_cli(
epoch_resume=epoch_resume,
use_gpu=use_gpu,
silent=False,
skip_last=skip_last,
)
msg.good("Successfully finished pretrain")

View File

@ -1,17 +1,18 @@
from typing import Optional, Sequence, Union, Iterator
import tqdm
from pathlib import Path
import srsly
import cProfile
import itertools
import pstats
import sys
import itertools
from wasabi import msg, Printer
import typer
from pathlib import Path
from typing import Iterator, Optional, Sequence, Union
import srsly
import tqdm
import typer
from wasabi import Printer, msg
from ._util import app, debug_cli, Arg, Opt, NAME
from ..language import Language
from ..util import load_model
from ._util import NAME, Arg, Opt, app, debug_cli
@debug_cli.command("profile")

View File

@ -1,16 +1,27 @@
from typing import Any, Dict, Optional
from pathlib import Path
from wasabi import msg
import os
import re
import shutil
from pathlib import Path
from typing import Any, Dict, Optional
import requests
import typer
from wasabi import msg
from ...util import ensure_path, working_dir
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
from .._util import get_checksum, download_file, git_checkout, get_git_version
from .._util import SimpleFrozenDict, parse_config_overrides
from .._util import (
PROJECT_FILE,
Arg,
Opt,
SimpleFrozenDict,
download_file,
get_checksum,
get_git_version,
git_checkout,
load_project_config,
parse_config_overrides,
project_cli,
)
# Whether assets are extra if `extra` is not set.
EXTRA_DEFAULT = False
@ -189,7 +200,11 @@ def convert_asset_url(url: str) -> str:
RETURNS (str): The converted URL.
"""
# If the asset URL is a regular GitHub URL it's likely a mistake
if re.match(r"(http(s?)):\/\/github.com", url) and "releases/download" not in url:
if (
re.match(r"(http(s?)):\/\/github.com", url)
and "releases/download" not in url
and "/raw/" not in url
):
converted = url.replace("github.com", "raw.githubusercontent.com")
converted = re.sub(r"/(tree|blob)/", "/", converted)
msg.warn(

View File

@ -1,13 +1,22 @@
from typing import Optional
from pathlib import Path
from wasabi import msg
import subprocess
import re
import subprocess
from pathlib import Path
from typing import Optional
from wasabi import msg
from ... import about
from ...util import ensure_path
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
from .._util import git_checkout, get_git_version, git_repo_branch_exists
from .._util import (
COMMAND,
PROJECT_FILE,
Arg,
Opt,
get_git_version,
git_checkout,
git_repo_branch_exists,
project_cli,
)
DEFAULT_REPO = about.__projects__
DEFAULT_PROJECTS_BRANCH = about.__projects_branch__

View File

@ -1,9 +1,9 @@
from pathlib import Path
from wasabi import msg, MarkdownRenderer
from wasabi import MarkdownRenderer, msg
from ...util import working_dir
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
from .._util import PROJECT_FILE, Arg, Opt, load_project_config, project_cli
DOCS_URL = "https://spacy.io"
INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the

View File

@ -1,15 +1,28 @@
"""This module contains helpers and subcommands for integrating spaCy projects
with Data Version Controk (DVC). https://dvc.org"""
from typing import Dict, Any, List, Optional, Iterable
import subprocess
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional
from wasabi import msg
from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli
from .._util import Arg, Opt, NAME, COMMAND
from ...util import working_dir, split_command, join_command, run_command
from ...util import SimpleFrozenList
from ...util import (
SimpleFrozenList,
join_command,
run_command,
split_command,
working_dir,
)
from .._util import (
COMMAND,
NAME,
PROJECT_FILE,
Arg,
Opt,
get_hash,
load_project_config,
project_cli,
)
DVC_CONFIG = "dvc.yaml"
DVC_DIR = ".dvc"

View File

@ -1,9 +1,9 @@
from pathlib import Path
from wasabi import msg
from .remote_storage import RemoteStorage
from .remote_storage import get_command_hash
from .._util import project_cli, Arg, logger
from .._util import load_project_config
from .._util import Arg, load_project_config, logger, project_cli
from .remote_storage import RemoteStorage, get_command_hash
from .run import update_lockfile
@ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
# in the list.
while commands:
for i, cmd in enumerate(list(commands)):
logger.debug(f"CMD: {cmd['name']}.")
logger.debug("CMD: %s.", cmd["name"])
deps = [project_dir / dep for dep in cmd.get("deps", [])]
if all(dep.exists() for dep in deps):
cmd_hash = get_command_hash("", "", deps, cmd["script"])
for output_path in cmd.get("outputs", []):
url = storage.pull(output_path, command_hash=cmd_hash)
logger.debug(
f"URL: {url} for {output_path} with command hash {cmd_hash}"
"URL: %s for %s with command hash %s",
url,
output_path,
cmd_hash,
)
yield url, output_path
@ -58,7 +61,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
commands.pop(i)
break
else:
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.")
logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
else:
# If we didn't break the for loop, break the while loop.
break

View File

@ -1,9 +1,9 @@
from pathlib import Path
from wasabi import msg
from .remote_storage import RemoteStorage
from .remote_storage import get_content_hash, get_command_hash
from .._util import load_project_config
from .._util import project_cli, Arg, logger
from .._util import Arg, load_project_config, logger, project_cli
from .remote_storage import RemoteStorage, get_command_hash, get_content_hash
@project_cli.command("push")
@ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str):
remote = config["remotes"][remote]
storage = RemoteStorage(project_dir, remote)
for cmd in config.get("commands", []):
logger.debug(f"CMD: cmd['name']")
logger.debug("CMD: %s", cmd["name"])
deps = [project_dir / dep for dep in cmd.get("deps", [])]
if any(not dep.exists() for dep in deps):
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs")
logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
continue
cmd_hash = get_command_hash(
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
)
logger.debug(f"CMD_HASH: {cmd_hash}")
logger.debug("CMD_HASH: %s", cmd_hash)
for output_path in cmd.get("outputs", []):
output_loc = project_dir / output_path
if output_loc.exists() and _is_not_empty_dir(output_loc):
@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str):
content_hash=get_content_hash(output_loc),
)
logger.debug(
f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}"
"URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
)
yield output_path, url

View File

@ -1,18 +1,28 @@
from typing import Optional, List, Dict, TYPE_CHECKING
import hashlib
import os
import site
import hashlib
import urllib.parse
import tarfile
import urllib.parse
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional
from wasabi import msg
from .._util import get_hash, get_checksum, download_file, ensure_pathy
from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
from ...git_info import GIT_VERSION
from ... import about
from ...errors import Errors
from ...git_info import GIT_VERSION
from ...util import ENV_VARS, check_bool_env_var, get_minor_version
from .._util import (
download_file,
ensure_pathy,
get_checksum,
get_hash,
make_tempdir,
upload_file,
)
if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
from pathy import FluidPath # noqa: F401
class RemoteStorage:
@ -27,7 +37,7 @@ class RemoteStorage:
self.url = ensure_pathy(url)
self.compression = compression
def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
"""Compress a file or directory within a project and upload it to a remote
storage. If an object exists at the full URL, nothing is done.
@ -48,9 +58,7 @@ class RemoteStorage:
mode_string = f"w:{self.compression}" if self.compression else "w"
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
tar_file.add(str(loc), arcname=str(path))
with tar_loc.open(mode="rb") as input_file:
with url.open(mode="wb") as output_file:
output_file.write(input_file.read())
upload_file(tar_loc, url)
return url
def pull(
@ -59,7 +67,7 @@ class RemoteStorage:
*,
command_hash: Optional[str] = None,
content_hash: Optional[str] = None,
) -> Optional["Pathy"]:
) -> Optional["FluidPath"]:
"""Retrieve a file from the remote cache. If the file already exists,
nothing is done.
@ -84,7 +92,23 @@ class RemoteStorage:
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
# This requires that the path is added correctly, relative
# to root. This is how we set things up in push()
tar_file.extractall(self.root)
# Disallow paths outside the current directory for the tar
# file (CVE-2007-4559, directory traversal vulnerability)
def is_within_directory(directory, target):
abs_directory = os.path.abspath(directory)
abs_target = os.path.abspath(target)
prefix = os.path.commonprefix([abs_directory, abs_target])
return prefix == abs_directory
def safe_extract(tar, path):
for member in tar.getmembers():
member_path = os.path.join(path, member.name)
if not is_within_directory(path, member_path):
raise ValueError(Errors.E852)
tar.extractall(path)
safe_extract(tar_file, self.root)
return url
def find(
@ -93,25 +117,37 @@ class RemoteStorage:
*,
command_hash: Optional[str] = None,
content_hash: Optional[str] = None,
) -> Optional["Pathy"]:
) -> Optional["FluidPath"]:
"""Find the best matching version of a file within the storage,
or `None` if no match can be found. If both the creation and content hash
are specified, only exact matches will be returned. Otherwise, the most
recent matching file is preferred.
"""
name = self.encode_name(str(path))
urls = []
if command_hash is not None and content_hash is not None:
url = self.make_url(path, command_hash, content_hash)
url = self.url / name / command_hash / content_hash
urls = [url] if url.exists() else []
elif command_hash is not None:
urls = list((self.url / name / command_hash).iterdir())
if (self.url / name / command_hash).exists():
urls = list((self.url / name / command_hash).iterdir())
else:
urls = list((self.url / name).iterdir())
if content_hash is not None:
urls = [url for url in urls if url.parts[-1] == content_hash]
if (self.url / name).exists():
for sub_dir in (self.url / name).iterdir():
urls.extend(sub_dir.iterdir())
if content_hash is not None:
urls = [url for url in urls if url.parts[-1] == content_hash]
if len(urls) >= 2:
try:
urls.sort(key=lambda x: x.stat().last_modified) # type: ignore
except Exception:
msg.warn(
"Unable to sort remote files by last modified. The file(s) "
"pulled from the cache may not be the most recent."
)
return urls[-1] if urls else None
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
"""Construct a URL from a subpath, a creation hash and a content hash."""
return self.url / self.encode_name(str(path)) / command_hash / content_hash

View File

@ -1,21 +1,39 @@
from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
import os.path
from pathlib import Path
import pkg_resources
from wasabi import msg
from wasabi.util import locale_escape
import sys
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
import srsly
import typer
from wasabi import msg
from wasabi.util import locale_escape
from ... import about
from ...git_info import GIT_VERSION
from ...util import working_dir, run_command, split_command, is_cwd, join_command
from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
from ...util import check_bool_env_var, SimpleFrozenDict
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides
from ...util import (
ENV_VARS,
SimpleFrozenDict,
SimpleFrozenList,
check_bool_env_var,
is_cwd,
is_minor_version_match,
join_command,
run_command,
split_command,
working_dir,
)
from .._util import (
COMMAND,
PROJECT_FILE,
PROJECT_LOCK,
Arg,
Opt,
get_checksum,
get_hash,
load_project_config,
parse_config_overrides,
project_cli,
)
@project_cli.command(
@ -53,6 +71,7 @@ def project_run(
force: bool = False,
dry: bool = False,
capture: bool = False,
skip_requirements_check: bool = False,
) -> None:
"""Run a named script defined in the project.yml. If the script is part
of the default pipeline (defined in the "run" section), DVC is used to
@ -69,6 +88,7 @@ def project_run(
sys.exit will be called with the return code. You should use capture=False
when you want to turn over execution to the command, and capture=True
when you want to run the command more like a function.
skip_requirements_check (bool): Whether to skip the requirements check.
"""
config = load_project_config(project_dir, overrides=overrides)
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
@ -76,9 +96,10 @@ def project_run(
validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
req_path = project_dir / "requirements.txt"
if config.get("check_requirements", True) and os.path.exists(req_path):
with req_path.open() as requirements_file:
_check_requirements([req.replace("\n", "") for req in requirements_file])
if not skip_requirements_check:
if config.get("check_requirements", True) and os.path.exists(req_path):
with req_path.open() as requirements_file:
_check_requirements([req.strip() for req in requirements_file])
if subcommand in workflows:
msg.info(f"Running workflow '{subcommand}'")
@ -90,6 +111,7 @@ def project_run(
force=force,
dry=dry,
capture=capture,
skip_requirements_check=True,
)
else:
cmd = commands[subcommand]
@ -97,8 +119,8 @@ def project_run(
if not (project_dir / dep).exists():
err = f"Missing dependency specified by command '{subcommand}': {dep}"
err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
err_kwargs = {"exits": 1} if not dry else {}
msg.fail(err, err_help, **err_kwargs)
err_exits = 1 if not dry else None
msg.fail(err, err_help, exits=err_exits)
check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
with working_dir(project_dir) as current_dir:
msg.divider(subcommand)
@ -327,6 +349,7 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
exist.
"""
import pkg_resources
failed_pkgs_msgs: List[str] = []
conflicting_pkgs_msgs: List[str] = []
@ -338,6 +361,12 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
failed_pkgs_msgs.append(dnf.report())
except pkg_resources.VersionConflict as vc:
conflicting_pkgs_msgs.append(vc.report())
except Exception:
msg.warn(
f"Unable to check requirement: {req} "
"Checks are currently limited to requirement specifiers "
"(PEP 508)"
)
if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs):
msg.warn(

View File

@ -1,9 +1,9 @@
{# This is a template for training configs used for the quickstart widget in
the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #}
{%- set use_transformer = hardware != "cpu" -%}
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
[paths]
train = null
dev = null
@ -24,8 +24,11 @@ gpu_allocator = null
lang = "{{ lang }}"
{%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
{%- set with_accuracy = optimize == "accuracy" -%}
{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%}
{# The BOW textcat doesn't need a source of features, so it can omit the
tok2vec/transformer. #}
{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "span_finder" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
{%- else -%}
{%- set full_pipeline = components -%}
@ -124,6 +127,30 @@ grad_factor = 1.0
@layers = "reduce_mean.v1"
{% endif -%}
{% if "span_finder" in components -%}
[components.span_finder]
factory = "span_finder"
max_length = null
min_length = null
scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
spans_key = "sc"
threshold = 0.5
[components.span_finder.model]
@architectures = "spacy.SpanFinder.v1"
[components.span_finder.model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = 2
[components.span_finder.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.span_finder.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{% endif -%}
{% if "spancat" in components -%}
[components.spancat]
factory = "spancat"
@ -156,6 +183,36 @@ grad_factor = 1.0
sizes = [1,2,3]
{% endif -%}
{% if "spancat_singlelabel" in components %}
[components.spancat_singlelabel]
factory = "spancat_singlelabel"
negative_weight = 1.0
allow_overlap = true
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
spans_key = "sc"
[components.spancat_singlelabel.model]
@architectures = "spacy.SpanCategorizer.v1"
[components.spancat_singlelabel.model.reducer]
@layers = "spacy.mean_max_reducer.v1"
hidden_size = 128
[components.spancat_singlelabel.model.scorer]
@layers = "Softmax.v2"
[components.spancat_singlelabel.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.spancat_singlelabel.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
[components.spancat_singlelabel.suggester]
@misc = "spacy.ngram_suggester.v1"
sizes = [1,2,3]
{% endif %}
{% if "trainable_lemmatizer" in components -%}
[components.trainable_lemmatizer]
factory = "trainable_lemmatizer"
@ -221,10 +278,16 @@ no_output_layer = false
{% else -%}
[components.textcat.model]
@architectures = "spacy.TextCatBOW.v2"
@architectures = "spacy.TextCatCNN.v2"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
nO = null
[components.textcat.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.textcat.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{%- endif %}
{%- endif %}
@ -252,10 +315,16 @@ no_output_layer = false
{% else -%}
[components.textcat_multilabel.model]
@architectures = "spacy.TextCatBOW.v2"
@architectures = "spacy.TextCatCNN.v2"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
nO = null
[components.textcat_multilabel.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.textcat_multilabel.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{%- endif %}
{%- endif %}
@ -286,6 +355,7 @@ maxout_pieces = 3
{% if "morphologizer" in components %}
[components.morphologizer]
factory = "morphologizer"
label_smoothing = 0.05
[components.morphologizer.model]
@architectures = "spacy.Tagger.v2"
@ -299,6 +369,7 @@ width = ${components.tok2vec.model.encode.width}
{% if "tagger" in components %}
[components.tagger]
factory = "tagger"
label_smoothing = 0.05
[components.tagger.model]
@architectures = "spacy.Tagger.v2"
@ -345,6 +416,27 @@ nO = null
width = ${components.tok2vec.model.encode.width}
{% endif %}
{% if "span_finder" in components %}
[components.span_finder]
factory = "span_finder"
max_length = null
min_length = null
scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
spans_key = "sc"
threshold = 0.5
[components.span_finder.model]
@architectures = "spacy.SpanFinder.v1"
[components.span_finder.model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = 2
[components.span_finder.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
{% endif %}
{% if "spancat" in components %}
[components.spancat]
factory = "spancat"
@ -374,6 +466,33 @@ width = ${components.tok2vec.model.encode.width}
sizes = [1,2,3]
{% endif %}
{% if "spancat_singlelabel" in components %}
[components.spancat_singlelabel]
factory = "spancat_singlelabel"
negative_weight = 1.0
allow_overlap = true
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
spans_key = "sc"
[components.spancat_singlelabel.model]
@architectures = "spacy.SpanCategorizer.v1"
[components.spancat_singlelabel.model.reducer]
@layers = "spacy.mean_max_reducer.v1"
hidden_size = 128
[components.spancat_singlelabel.model.scorer]
@layers = "Softmax.v2"
[components.spancat_singlelabel.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
[components.spancat_singlelabel.suggester]
@misc = "spacy.ngram_suggester.v1"
sizes = [1,2,3]
{% endif %}
{% if "trainable_lemmatizer" in components -%}
[components.trainable_lemmatizer]
factory = "trainable_lemmatizer"

View File

@ -37,6 +37,15 @@ bn:
accuracy:
name: sagorsarker/bangla-bert-base
size_factor: 3
ca:
word_vectors: null
transformer:
efficiency:
name: projecte-aina/roberta-base-ca-v2
size_factor: 3
accuracy:
name: projecte-aina/roberta-base-ca-v2
size_factor: 3
da:
word_vectors: da_core_news_lg
transformer:

View File

@ -1,15 +1,23 @@
from typing import Optional, Dict, Any, Union
from pathlib import Path
from wasabi import msg
import typer
import logging
import sys
from pathlib import Path
from typing import Any, Dict, Optional, Union
import typer
from wasabi import msg
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu
from ..training.loop import train as train_nlp
from ..training.initialize import init_nlp
from .. import util
from ..training.initialize import init_nlp
from ..training.loop import train as train_nlp
from ._util import (
Arg,
Opt,
app,
import_code,
parse_config_overrides,
setup_gpu,
show_validation_error,
)
@app.command(

View File

@ -1,14 +1,21 @@
from typing import Tuple
from pathlib import Path
import sys
import requests
from wasabi import msg, Printer
import warnings
from pathlib import Path
from typing import Tuple
import requests
from wasabi import Printer, msg
from ._util import app
from .. import about
from ..util import get_package_version, get_installed_models, get_minor_version
from ..util import get_package_path, get_model_meta, is_compatible_version
from ..util import (
get_installed_models,
get_minor_version,
get_model_meta,
get_package_path,
get_package_version,
is_compatible_version,
)
from ._util import app
@app.command("validate")

View File

@ -1,5 +1,6 @@
"""Helpers for Python and platform compatibility."""
import sys
from thinc.util import copy_array
try:

View File

@ -90,6 +90,8 @@ dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
# Optional callback before nlp object is saved to disk after training
before_to_disk = null
# Optional callback that is invoked at the start of each training step
before_update = null
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"

View File

@ -4,14 +4,13 @@ spaCy's built in visualization suite for dependencies and named entities.
DOCS: https://spacy.io/api/top-level#displacy
USAGE: https://spacy.io/usage/visualizers
"""
from typing import Union, Iterable, Optional, Dict, Any, Callable
import warnings
from typing import Any, Callable, Dict, Iterable, Optional, Union
from .render import DependencyRenderer, EntityRenderer, SpanRenderer
from ..tokens import Doc, Span
from ..errors import Errors, Warnings
from ..util import is_in_jupyter
from ..tokens import Doc, Span
from ..util import find_available_port, is_in_jupyter
from .render import DependencyRenderer, EntityRenderer, SpanRenderer
_html = {}
RENDER_WRAPPER = None
@ -36,7 +35,7 @@ def render(
jupyter (bool): Override Jupyter auto-detection.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
RETURNS (str): Rendered HTML markup.
RETURNS (str): Rendered SVG or HTML markup.
DOCS: https://spacy.io/api/top-level#displacy.render
USAGE: https://spacy.io/usage/visualizers
@ -67,7 +66,7 @@ def render(
if jupyter or (jupyter is None and is_in_jupyter()):
# return HTML rendered by IPython display()
# See #4840 for details on span wrapper to disable mathjax
from IPython.core.display import display, HTML
from IPython.core.display import HTML, display
return display(HTML('<span class="tex2jax_ignore">{}</span>'.format(html)))
return html
@ -82,6 +81,7 @@ def serve(
manual: bool = False,
port: int = 5000,
host: str = "0.0.0.0",
auto_select_port: bool = False,
) -> None:
"""Serve displaCy visualisation.
@ -93,12 +93,15 @@ def serve(
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
port (int): Port to serve visualisation.
host (str): Host to serve visualisation.
auto_select_port (bool): Automatically select a port if the specified port is in use.
DOCS: https://spacy.io/api/top-level#displacy.serve
USAGE: https://spacy.io/usage/visualizers
"""
from wsgiref import simple_server
port = find_available_port(port, host, auto_select_port)
if is_in_jupyter():
warnings.warn(Warnings.W011)
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
@ -120,13 +123,17 @@ def app(environ, start_response):
return [res]
def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
def parse_deps(
orig_doc: Union[Doc, Span], options: Dict[str, Any] = {}
) -> Dict[str, Any]:
"""Generate dependency parse in {'words': [], 'arcs': []} format.
orig_doc (Doc): Document to parse.
orig_doc (Union[Doc, Span]): Document to parse.
options (Dict[str, Any]): Dependency parse specific visualisation options.
RETURNS (dict): Generated dependency parse keyed by words and arcs.
"""
if isinstance(orig_doc, Span):
orig_doc = orig_doc.as_doc()
doc = Doc(orig_doc.vocab).from_bytes(
orig_doc.to_bytes(exclude=["user_data", "user_hooks"])
)
@ -228,12 +235,13 @@ def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
"kb_id": span.kb_id_ if span.kb_id_ else "",
"kb_url": kb_url_template.format(span.kb_id_) if kb_url_template else "#",
}
for span in doc.spans[spans_key]
for span in doc.spans.get(spans_key, [])
]
tokens = [token.text for token in doc]
if not spans:
warnings.warn(Warnings.W117.format(spans_key=spans_key))
keys = list(doc.spans.keys())
warnings.warn(Warnings.W117.format(spans_key=spans_key, keys=keys))
title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
settings = get_doc_settings(doc)
return {

View File

@ -1,15 +1,29 @@
from typing import Any, Dict, List, Optional, Tuple, Union
import uuid
import itertools
import uuid
from typing import Any, Dict, List, Optional, Tuple, Union
from ..errors import Errors
from ..util import escape_html, minify_html, registry
from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS
from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS
from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN
from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL
from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS
from .templates import TPL_TITLE
from .templates import (
TPL_DEP_ARCS,
TPL_DEP_SVG,
TPL_DEP_WORDS,
TPL_DEP_WORDS_LEMMA,
TPL_ENT,
TPL_ENT_RTL,
TPL_ENTS,
TPL_FIGURE,
TPL_KB_LINK,
TPL_PAGE,
TPL_SPAN,
TPL_SPAN_RTL,
TPL_SPAN_SLICE,
TPL_SPAN_SLICE_RTL,
TPL_SPAN_START,
TPL_SPAN_START_RTL,
TPL_SPANS,
TPL_TITLE,
)
DEFAULT_LANG = "en"
DEFAULT_DIR = "ltr"
@ -94,7 +108,7 @@ class SpanRenderer:
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
RETURNS (str): Rendered HTML markup.
RETURNS (str): Rendered SVG or HTML markup.
"""
rendered = []
for i, p in enumerate(parsed):
@ -510,7 +524,7 @@ class EntityRenderer:
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
RETURNS (str): Rendered HTML markup.
RETURNS (str): Rendered SVG or HTML markup.
"""
rendered = []
for i, p in enumerate(parsed):

View File

@ -1,4 +1,5 @@
import warnings
from .compat import Literal
@ -199,7 +200,7 @@ class Warnings(metaclass=ErrorsWithCodes):
W117 = ("No spans to visualize found in Doc object with spans_key: '{spans_key}'. If this is "
"surprising to you, make sure the Doc was processed using a model "
"that supports span categorization, and check the `doc.spans[spans_key]` "
"property manually if necessary.")
"property manually if necessary.\n\nAvailable keys: {keys}")
W118 = ("Term '{term}' not found in glossary. It may however be explained in documentation "
"for the corpora used to train the language. Please check "
"`nlp.meta[\"sources\"]` for any relevant links.")
@ -212,8 +213,12 @@ class Warnings(metaclass=ErrorsWithCodes):
W121 = ("Attempting to trace non-existent method '{method}' in pipe '{pipe}'")
W122 = ("Couldn't trace method '{method}' in pipe '{pipe}'. This can happen if the pipe class "
"is a Cython extension type.")
W123 = ("Argument {arg} with value {arg_value} is used instead of {config_value} as specified in the config. Be "
"aware that this might affect other components in your pipeline.")
W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
"key attribute for vectors, configure it through Vectors(attr=) or "
"'spacy init vectors --attr'")
class Errors(metaclass=ErrorsWithCodes):
@ -345,6 +350,11 @@ class Errors(metaclass=ErrorsWithCodes):
"clear the existing vectors and resize the table.")
E074 = ("Error interpreting compiled match pattern: patterns are expected "
"to end with the attribute {attr}. Got: {bad_attr}.")
E079 = ("Error computing states in beam: number of predicted beams "
"({pbeams}) does not equal number of gold beams ({gbeams}).")
E080 = ("Duplicate state found in beam: {key}.")
E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
"does not equal number of losses ({losses}).")
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
"match.")
@ -438,8 +448,7 @@ class Errors(metaclass=ErrorsWithCodes):
E133 = ("The sum of prior probabilities for alias '{alias}' should not "
"exceed 1, but found {sum}.")
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
E139 = ("Knowledge base for component '{name}' is empty. Use the methods "
"`kb.add_entity` and `kb.add_alias` to add entries.")
E139 = ("Knowledge base for component '{name}' is empty.")
E140 = ("The list of entities, prior probabilities and entity vectors "
"should be of equal length.")
E141 = ("Entity vectors should be of length {required} instead of the "
@ -544,6 +553,12 @@ class Errors(metaclass=ErrorsWithCodes):
"during training, make sure to include it in 'annotating components'")
# New errors added in v3.x
E850 = ("The PretrainVectors objective currently only supports default or "
"floret vectors, not {mode} vectors.")
E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
"but found value of '{val}'.")
E852 = ("The tar file pulled from the remote attempted an unsafe path "
"traversal.")
E853 = ("Unsupported component factory name '{name}'. The character '.' is "
"not permitted in factory names.")
E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
@ -727,8 +742,8 @@ class Errors(metaclass=ErrorsWithCodes):
"model from a shortcut, which is obsolete as of spaCy v3.0. To "
"load the model, use its full name instead:\n\n"
"nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
"models, see the models directory: https://spacy.io/models. If you "
"want to create a blank model, use spacy.blank: "
"models, see the models directory: https://spacy.io/models and if "
"you want to create a blank model, use spacy.blank: "
"nlp = spacy.blank(\"{name}\")")
E942 = ("Executing `after_{name}` callback failed. Expected the function to "
"return an initialized nlp object but got: {value}. Maybe "
@ -952,6 +967,20 @@ class Errors(metaclass=ErrorsWithCodes):
"sure it's overwritten on the subclass.")
E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
"knowledge base, use `InMemoryLookupKB`.")
E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.")
E1048 = ("Got '{unexpected}' as console progress bar type, but expected one of the following: {expected}")
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
"with `displacy.serve(doc, port=port)`")
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
"or use `auto_select_port=True` to pick an available port automatically.")
E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
E1052 = ("Unable to copy spans: the character offsets for the span at "
"index {i} in the span group do not align with the tokenization "
"in the target doc.")
E1053 = ("Both 'min_length' and 'max_length' should be larger than 0, but found"
" 'min_length': {min_length}, 'max_length': {max_length}")
E1054 = ("The text, including whitespace, must match between reference and "
"predicted docs when training {component}.")
# Deprecated model shortcuts, only used in errors and warnings

View File

@ -1,4 +1,5 @@
import warnings
from .errors import Warnings

View File

@ -1,3 +1,3 @@
from .candidate import Candidate, get_candidates, get_candidates_batch
from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB
from .candidate import Candidate, get_candidates, get_candidates_batch

View File

@ -1,6 +1,8 @@
from .kb cimport KnowledgeBase
from libcpp.vector cimport vector
from ..typedefs cimport hash_t
from .kb cimport KnowledgeBase
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
cdef class Candidate:

View File

@ -1,9 +1,12 @@
# cython: infer_types=True, profile=True
from typing import Iterable
from .kb cimport KnowledgeBase
from ..tokens import Span
cdef class Candidate:
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking

View File

@ -2,8 +2,10 @@
from cymem.cymem cimport Pool
from libc.stdint cimport int64_t
from ..vocab cimport Vocab
cdef class KnowledgeBase:
cdef Pool mem
cdef readonly Vocab vocab

View File

@ -2,12 +2,13 @@
from pathlib import Path
from typing import Iterable, Tuple, Union
from cymem.cymem cimport Pool
from .candidate import Candidate
from ..errors import Errors
from ..tokens import Span
from ..util import SimpleFrozenList
from ..errors import Errors
from .candidate import Candidate
cdef class KnowledgeBase:

View File

@ -1,11 +1,11 @@
"""Knowledge-base for entity or concept linking."""
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE
from libcpp.vector cimport vector
from preshed.maps cimport PreshMap
from ..structs cimport AliasC, KBEntryC
from ..typedefs cimport hash_t
from ..structs cimport KBEntryC, AliasC
from .kb cimport KnowledgeBase
ctypedef vector[KBEntryC] entry_vec

View File

@ -1,23 +1,28 @@
# cython: infer_types=True, profile=True
from typing import Iterable, Callable, Dict, Any, Union
from typing import Any, Callable, Dict, Iterable, Union
import srsly
from preshed.maps cimport PreshMap
from cpython.exc cimport PyErr_SetFromErrno
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
from libc.stdint cimport int32_t, int64_t
from libcpp.vector cimport vector
from pathlib import Path
from cpython.exc cimport PyErr_SetFromErrno
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport fclose, feof, fopen, fread, fseek, fwrite
from libcpp.vector cimport vector
from preshed.maps cimport PreshMap
import warnings
from pathlib import Path
from ..tokens import Span
from ..typedefs cimport hash_t
from ..errors import Errors, Warnings
from .. import util
from ..errors import Errors, Warnings
from ..util import SimpleFrozenList, ensure_path
from ..vocab cimport Vocab
from .kb cimport KnowledgeBase
from .candidate import Candidate as Candidate
@ -25,7 +30,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
"""An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
to support entity linking of named entities to real-world concepts.
DOCS: https://spacy.io/api/kb_in_memory
DOCS: https://spacy.io/api/inmemorylookupkb
"""
def __init__(self, Vocab vocab, entity_vector_length):
@ -46,6 +51,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
self._alias_index = PreshMap(nr_aliases + 1)
self._aliases_table = alias_vec(nr_aliases + 1)
def is_empty(self):
return len(self) == 0
def __len__(self):
return self.get_size_entities()

View File

@ -1,5 +1,5 @@
from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults
class AfrikaansDefaults(BaseDefaults):

View File

@ -1,12 +1,11 @@
from .stop_words import STOP_WORDS
from ...attrs import LANG
from ...language import BaseDefaults, Language
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language, BaseDefaults
from ...attrs import LANG
from ...util import update_exc
class AmharicDefaults(BaseDefaults):

View File

@ -1,5 +1,11 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
from ..char_classes import UNITS, ALPHA_UPPER
from ..char_classes import (
ALPHA_UPPER,
CURRENCY,
LIST_ELLIPSES,
LIST_PUNCT,
LIST_QUOTES,
UNITS,
)
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()

View File

@ -1,5 +1,4 @@
from ...symbols import ORTH, NORM
from ...symbols import NORM, ORTH
_exc = {}

View File

@ -1,8 +1,8 @@
from .stop_words import STOP_WORDS
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language, BaseDefaults
class ArabicDefaults(BaseDefaults):

View File

@ -1,5 +1,11 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
from ..char_classes import UNITS, ALPHA_UPPER
from ..char_classes import (
ALPHA_UPPER,
CURRENCY,
LIST_ELLIPSES,
LIST_PUNCT,
LIST_QUOTES,
UNITS,
)
_suffixes = (
LIST_PUNCT

View File

@ -1,7 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...symbols import NORM, ORTH
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}

View File

@ -1,6 +1,6 @@
from .stop_words import STOP_WORDS
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from ...language import Language, BaseDefaults
from .stop_words import STOP_WORDS
class AzerbaijaniDefaults(BaseDefaults):

View File

@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
# Eleven, twelve etc. are written separate: on bir, on iki
_num_words = [

View File

@ -1,12 +1,14 @@
from ...attrs import LANG
from ...language import BaseDefaults, Language
from ...util import update_exc
from ..punctuation import (
COMBINING_DIACRITICS_TOKENIZER_INFIXES,
COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
)
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
from ...language import Language, BaseDefaults
from ...attrs import LANG
from ...util import update_exc
class BulgarianDefaults(BaseDefaults):

View File

@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
_num_words = [
"нула",
"едно",

View File

@ -4,8 +4,7 @@ References:
(countries, occupations, fields of studies and more).
"""
from ...symbols import ORTH, NORM
from ...symbols import NORM, ORTH
_exc = {}

View File

@ -1,10 +1,12 @@
from typing import Optional, Callable
from typing import Callable, Optional
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults
from ...language import BaseDefaults, Language
from ...pipeline import Lemmatizer
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class BengaliDefaults(BaseDefaults):

View File

@ -1,6 +1,14 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS
from ..char_classes import (
ALPHA,
ALPHA_LOWER,
CONCAT_QUOTES,
HYPHENS,
LIST_ELLIPSES,
LIST_ICONS,
LIST_PUNCT,
LIST_QUOTES,
UNITS,
)
_currency = r"\$¢£€¥฿৳"
_quotes = CONCAT_QUOTES.replace("'", "")

View File

@ -1,7 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...symbols import NORM, ORTH
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}

View File

@ -1,14 +1,14 @@
from typing import Optional, Callable
from typing import Callable, Optional
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language, BaseDefaults
from ...language import BaseDefaults, Language
from .lemmatizer import CatalanLemmatizer
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class CatalanDefaults(BaseDefaults):

View File

@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
_num_words = [
"zero",
"un",

View File

@ -1,9 +1,18 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
from ..char_classes import LIST_CURRENCY
from ..char_classes import CURRENCY
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
from ..char_classes import merge_chars, _units
from ..char_classes import (
ALPHA,
ALPHA_LOWER,
ALPHA_UPPER,
CONCAT_QUOTES,
CURRENCY,
LIST_CURRENCY,
LIST_ELLIPSES,
LIST_ICONS,
LIST_PUNCT,
LIST_QUOTES,
PUNCT,
_units,
merge_chars,
)
ELISION = " ' ".strip().replace(" ", "").replace("\n", "")

View File

@ -1,7 +1,8 @@
from typing import Union, Iterator, Tuple
from ...tokens import Doc, Span
from ...symbols import NOUN, PROPN
from typing import Iterator, Tuple, Union
from ...errors import Errors
from ...symbols import NOUN, PROPN
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:

View File

@ -1,7 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...symbols import NORM, ORTH
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}

View File

@ -1,6 +1,6 @@
from .stop_words import STOP_WORDS
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from ...language import Language, BaseDefaults
from .stop_words import STOP_WORDS
class CzechDefaults(BaseDefaults):

View File

@ -1,9 +1,9 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language, BaseDefaults
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class DanishDefaults(BaseDefaults):

View File

@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
# Source http://fjern-uv.dk/tal.php
_num_words = """nul
en et to tre fire fem seks syv otte ni ti

View File

@ -1,8 +1,13 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..char_classes import (
ALPHA,
ALPHA_LOWER,
ALPHA_UPPER,
CONCAT_QUOTES,
LIST_ELLIPSES,
LIST_ICONS,
)
from ..punctuation import TOKENIZER_SUFFIXES
_quotes = CONCAT_QUOTES.replace("'", "")
_infixes = (

View File

@ -1,7 +1,8 @@
from typing import Union, Iterator, Tuple
from ...tokens import Doc, Span
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
from typing import Iterator, Tuple, Union
from ...errors import Errors
from ...symbols import AUX, NOUN, PRON, PROPN, VERB
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:

View File

@ -2,10 +2,9 @@
Tokenizer Exceptions.
Source: https://forkortelse.dk/ and various others.
"""
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...symbols import NORM, ORTH
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}

View File

@ -1,8 +1,8 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ...language import BaseDefaults, Language
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language, BaseDefaults
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class GermanDefaults(BaseDefaults):

View File

@ -1,9 +1,18 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
from ..char_classes import CURRENCY, UNITS, PUNCT
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..char_classes import (
ALPHA,
ALPHA_LOWER,
ALPHA_UPPER,
CONCAT_QUOTES,
CURRENCY,
LIST_ELLIPSES,
LIST_ICONS,
LIST_PUNCT,
LIST_QUOTES,
PUNCT,
UNITS,
)
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
_prefixes = ["``"] + BASE_TOKENIZER_PREFIXES
_suffixes = (

View File

@ -1,7 +1,7 @@
from typing import Union, Iterator, Tuple
from typing import Iterator, Tuple, Union
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span

View File

@ -1,7 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...symbols import NORM, ORTH
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {
"auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}],

View File

@ -1,6 +1,6 @@
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults
class LowerSorbianDefaults(BaseDefaults):

View File

@ -1,13 +1,14 @@
from typing import Optional, Callable
from typing import Callable, Optional
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ...language import BaseDefaults, Language
from .lemmatizer import GreekLemmatizer
from ...language import Language, BaseDefaults
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class GreekDefaults(BaseDefaults):

View File

@ -1,5 +1,6 @@
def get_pos_from_wiktionary():
import re
from gensim.corpora.wikicorpus import extract_pages
regex = re.compile(r"==={{(\w+)\|el}}===")

View File

@ -1,6 +1,16 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
from ..char_classes import CONCAT_QUOTES, CURRENCY
from ..char_classes import (
ALPHA,
ALPHA_LOWER,
ALPHA_UPPER,
CONCAT_QUOTES,
CURRENCY,
HYPHENS,
LIST_CURRENCY,
LIST_ELLIPSES,
LIST_ICONS,
LIST_PUNCT,
LIST_QUOTES,
)
_units = (
"km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "

View File

@ -1,7 +1,7 @@
from typing import Union, Iterator, Tuple
from typing import Iterator, Tuple, Union
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span

Some files were not shown because too many files have changed in this diff Show More