Merge branch 'v4' into feature/multiple-code-files

This commit is contained in:
Adriane Boyd 2023-07-18 15:06:18 +02:00 committed by GitHub
commit 28c8a577fc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
600 changed files with 11226 additions and 3076 deletions

View File

@ -1,129 +0,0 @@
parameters:
python_version: ''
architecture: 'x64'
num_build_jobs: 2
steps:
- task: UsePythonVersion@0
inputs:
versionSpec: ${{ parameters.python_version }}
architecture: ${{ parameters.architecture }}
allowUnstable: true
- bash: |
echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
displayName: 'Set variables'
- script: |
python -m pip install -U build pip setuptools
python -m pip install -U -r requirements.txt
displayName: "Install dependencies"
- script: |
python -m build --sdist
displayName: "Build sdist"
- script: |
python -m mypy spacy
displayName: 'Run mypy'
condition: ne(variables['python_version'], '3.6')
- task: DeleteFiles@1
inputs:
contents: "spacy"
displayName: "Delete source directory"
- task: DeleteFiles@1
inputs:
contents: "*.egg-info"
displayName: "Delete egg-info directory"
- script: |
python -m pip freeze > installed.txt
python -m pip uninstall -y -r installed.txt
displayName: "Uninstall all packages"
- bash: |
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
displayName: "Install from sdist"
- script: |
python -W error -c "import spacy"
displayName: "Test import"
# - script: |
# python -m spacy download ca_core_news_sm
# python -m spacy download ca_core_news_md
# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
# displayName: 'Test download CLI'
# condition: eq(variables['python_version'], '3.8')
#
# - script: |
# python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
# displayName: 'Test no warnings on load (#11713)'
# condition: eq(variables['python_version'], '3.8')
#
# - script: |
# python -m spacy download ca_core_news_sm 2>&1 | grep -q skipping
# displayName: 'Test skip re-download (#12188)'
# condition: eq(variables['python_version'], '3.8')
# - script: |
# python -W error -m spacy info ca_core_news_sm | grep -q download_url
# displayName: 'Test download_url in info CLI'
# condition: eq(variables['python_version'] '3.8')
- script: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
displayName: 'Test convert CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy init config -p ner -l ca ner.cfg
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
displayName: 'Test debug config CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
# will have errors due to sparse data, check for summary in output
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
displayName: 'Test debug data CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
displayName: 'Test train CLI'
condition: eq(variables['python_version'], '3.8')
# - script: |
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
# displayName: 'Test assemble CLI'
# condition: eq(variables['python_version'], '3.8')
#
# - script: |
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
# displayName: 'Test assemble CLI vectors warning'
# condition: eq(variables['python_version'], '3.8')
- script: |
python -m pip install -U -r requirements.txt
displayName: "Install test requirements"
- script: |
python -m pytest --pyargs spacy -W error
displayName: "Run CPU tests"
- script: |
python -m pip install 'spacy[apple]'
python -m pytest --pyargs spacy
displayName: "Run CPU tests with thinc-apple-ops"
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
- script: |
python .github/validate_universe_json.py website/meta/universe.json
displayName: 'Test website/meta/universe.json'
condition: eq(variables['python_version'], '3.8')

View File

@ -1,45 +0,0 @@
# GitHub Action that uses Black to reformat all Python code and submits a PR
# in regular intervals. Inspired by: https://github.com/cclauss/autoblack
name: autoblack
on:
workflow_dispatch: # allow manual trigger
schedule:
- cron: '0 8 * * 5' # every Friday at 8am UTC
jobs:
autoblack:
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.head_ref }}
- uses: actions/setup-python@v4
- run: pip install black -c requirements.txt
- name: Auto-format code if needed
run: black spacy
# We can't run black --check here because that returns a non-zero excit
# code and makes GitHub think the action failed
- name: Check for modified files
id: git-check
run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
- name: Create Pull Request
if: steps.git-check.outputs.modified == 'true'
uses: peter-evans/create-pull-request@v4
with:
title: Auto-format code with black
labels: meta
commit-message: Auto-format code with black
committer: GitHub <noreply@github.com>
author: explosion-bot <explosion-bot@users.noreply.github.com>
body: _This PR is auto-generated._
branch: autoblack
delete-branch: true
draft: false
- name: Check outputs
if: steps.git-check.outputs.modified == 'true'
run: |
echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"

View File

@ -8,6 +8,7 @@ on:
jobs: jobs:
explosion-bot: explosion-bot:
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Dump GitHub context - name: Dump GitHub context

View File

@ -13,6 +13,7 @@ on:
jobs: jobs:
issue-manager: issue-manager:
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: tiangolo/issue-manager@0.4.0 - uses: tiangolo/issue-manager@0.4.0

View File

@ -13,6 +13,7 @@ concurrency:
jobs: jobs:
action: action:
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: dessant/lock-threads@v4 - uses: dessant/lock-threads@v4

View File

@ -7,6 +7,7 @@ on:
jobs: jobs:
build: build:
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:

173
.github/workflows/tests.yml vendored Normal file
View File

@ -0,0 +1,173 @@
name: tests
on:
push:
branches-ignore:
- "spacy.io"
- "nightly.spacy.io"
- "v2.spacy.io"
paths-ignore:
- "*.md"
- "*.mdx"
- "website/**"
- ".github/workflows/**"
pull_request:
types: [opened, synchronize, reopened, edited]
paths-ignore:
- "*.md"
- "*.mdx"
- "website/**"
jobs:
validate:
name: Validate
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Configure Python version
uses: actions/setup-python@v4
with:
python-version: "3.8"
architecture: x64
- name: black
run: |
python -m pip install black -c requirements.txt
python -m black spacy --check
- name: isort
run: |
python -m pip install isort -c requirements.txt
python -m isort spacy --check
- name: flake8
run: |
python -m pip install flake8==5.0.4
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
tests:
name: Test
needs: Validate
strategy:
fail-fast: true
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
python_version: ["3.11"]
include:
- os: macos-latest
python_version: "3.8"
- os: ubuntu-20.04
python_version: "3.9"
- os: windows-latest
python_version: "3.10"
runs-on: ${{ matrix.os }}
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Configure Python version
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python_version }}
architecture: x64
- name: Install dependencies
run: |
python -m pip install -U build pip setuptools
python -m pip install -U -r requirements.txt
- name: Build sdist
run: |
python -m build --sdist
- name: Run mypy
run: |
python -m mypy spacy
- name: Delete source directory and .egg-info
run: |
rm -rf spacy *.egg-info
shell: bash
- name: Uninstall all packages
run: |
python -m pip freeze
python -m pip freeze --exclude pywin32 > installed.txt
python -m pip uninstall -y -r installed.txt
- name: Install from sdist
run: |
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
shell: bash
- name: Test import
run: python -W error -c "import spacy"
# - name: "Test download CLI"
# run: |
# python -m spacy download ca_core_news_sm
# python -m spacy download ca_core_news_md
# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
# if: matrix.python_version == '3.9'
#
# - name: "Test download_url in info CLI"
# run: |
# python -W error -m spacy info ca_core_news_sm | grep -q download_url
# if: matrix.python_version == '3.9'
#
# - name: "Test no warnings on load (#11713)"
# run: |
# python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
# if: matrix.python_version == '3.9'
- name: "Test convert CLI"
run: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
if: matrix.python_version == '3.9'
- name: "Test debug config CLI"
run: |
python -m spacy init config -p ner -l ca ner.cfg
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
if: matrix.python_version == '3.9'
- name: "Test debug data CLI"
run: |
# will have errors due to sparse data, check for summary in output
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
if: matrix.python_version == '3.9'
- name: "Test train CLI"
run: |
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
if: matrix.python_version == '3.9'
# - name: "Test assemble CLI"
# run: |
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
# if: matrix.python_version == '3.9'
#
# - name: "Test assemble CLI vectors warning"
# run: |
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
# if: matrix.python_version == '3.9'
- name: "Install test requirements"
run: |
python -m pip install -U -r requirements.txt
- name: "Run CPU tests"
run: |
python -m pytest --pyargs spacy -W error
if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.11')"
- name: "Run CPU tests with thinc-apple-ops"
run: |
python -m pip install 'spacy[apple]'
python -m pytest --pyargs spacy
if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11'

View File

@ -0,0 +1,33 @@
name: universe validation
on:
push:
branches-ignore:
- "spacy.io"
- "nightly.spacy.io"
- "v2.spacy.io"
paths:
- "website/meta/universe.json"
pull_request:
types: [opened, synchronize, reopened, edited]
paths:
- "website/meta/universe.json"
jobs:
validate:
name: Validate
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Configure Python version
uses: actions/setup-python@v4
with:
python-version: "3.8"
architecture: x64
- name: Validate website/meta/universe.json
run: |
python .github/validate_universe_json.py website/meta/universe.json

View File

@ -16,6 +16,9 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial model packaging, deployment and workflow management. spaCy is commercial
open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE). open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
💥 **We'd love to hear more about your experience with spaCy!**
[Fill out our survey here.](https://form.typeform.com/to/aMel9q9f)
💫 **Version 3.5 out now!** 💫 **Version 3.5 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases) [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
@ -32,19 +35,20 @@ open-source software, released under the [MIT license](https://github.com/explos
## 📖 Documentation ## 📖 Documentation
| Documentation | | | Documentation | |
| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------------------- | ---------------------------------------------------------------------- |
| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! | | ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
| 📚 **[Usage Guides]** | How to use spaCy and its features. | | 📚 **[Usage Guides]** | How to use spaCy and its features. |
| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. | | 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. | | 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
| 🎛 **[API Reference]** | The detailed reference for spaCy's API. | | 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
| 📦 **[Models]** | Download trained pipelines for spaCy. | | 📦 **[Models]** | Download trained pipelines for spaCy. |
| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. | | 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
| 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. | | ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. | | 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
| 🛠 **[Changelog]** | Changes and version history. | | 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. | | 🛠 **[Changelog]** | Changes and version history. |
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-pipelines)** | | <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-pipelines)** |
| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-analysis)** | | <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-analysis)** |
@ -54,13 +58,13 @@ open-source software, released under the [MIT license](https://github.com/explos
[api reference]: https://spacy.io/api/ [api reference]: https://spacy.io/api/
[models]: https://spacy.io/models [models]: https://spacy.io/models
[universe]: https://spacy.io/universe [universe]: https://spacy.io/universe
[spaCy VS Code Extension]: https://github.com/explosion/spacy-vscode
[videos]: https://www.youtube.com/c/ExplosionAI [videos]: https://www.youtube.com/c/ExplosionAI
[online course]: https://course.spacy.io [online course]: https://course.spacy.io
[project templates]: https://github.com/explosion/projects [project templates]: https://github.com/explosion/projects
[changelog]: https://spacy.io/usage#changelog [changelog]: https://spacy.io/usage#changelog
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
## 💬 Where to ask questions ## 💬 Where to ask questions
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about). The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).

View File

@ -1,99 +0,0 @@
trigger:
batch: true
branches:
include:
- "*"
exclude:
- "spacy.io"
- "nightly.spacy.io"
- "v2.spacy.io"
paths:
exclude:
- "website/*"
- "*.md"
- "*.mdx"
- ".github/workflows/*"
pr:
paths:
exclude:
- "*.md"
- "*.mdx"
- "website/docs/*"
- "website/src/*"
- "website/meta/*.tsx"
- "website/meta/*.mjs"
- "website/meta/languages.json"
- "website/meta/site.json"
- "website/meta/sidebars.json"
- "website/meta/type-annotations.json"
- "website/pages/*"
- ".github/workflows/*"
jobs:
# Check formatting and linting. Perform basic checks for most important errors
# (syntax etc.) Uses the config defined in setup.cfg and overwrites the
# selected codes.
- job: "Validate"
pool:
vmImage: "ubuntu-latest"
steps:
- task: UsePythonVersion@0
inputs:
versionSpec: "3.8"
- script: |
pip install black -c requirements.txt
python -m black spacy --check
displayName: "black"
- script: |
pip install flake8==5.0.4
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
displayName: "flake8"
- job: "Test"
dependsOn: "Validate"
strategy:
matrix:
# We're only running one platform per Python version to speed up builds
# Python38Linux:
# imageName: "ubuntu-latest"
# python.version: "3.8"
# Python38Windows:
# imageName: "windows-latest"
# python.version: "3.8"
Python38Mac:
imageName: "macos-latest"
python.version: "3.8"
Python39Linux:
imageName: "ubuntu-latest"
python.version: "3.9"
# Python39Windows:
# imageName: "windows-latest"
# python.version: "3.9"
# Python39Mac:
# imageName: "macos-latest"
# python.version: "3.9"
# Python310Linux:
# imageName: "ubuntu-latest"
# python.version: "3.10"
Python310Windows:
imageName: "windows-latest"
python.version: "3.10"
# Python310Mac:
# imageName: "macos-latest"
# python.version: "3.10"
Python311Linux:
imageName: 'ubuntu-latest'
python.version: '3.11'
Python311Windows:
imageName: 'windows-latest'
python.version: '3.11'
Python311Mac:
imageName: 'macos-latest'
python.version: '3.11'
maxParallel: 4
pool:
vmImage: $(imageName)
steps:
- template: .github/azure-steps.yml
parameters:
python_version: '$(python.version)'

View File

@ -1,6 +1,4 @@
# build version constraints for use with wheelwright + multibuild # build version constraints for use with wheelwright + multibuild
numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64' numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64' numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
numpy==1.19.3; python_version=='3.9' numpy==1.19.3; python_version=='3.9'

View File

@ -9,3 +9,6 @@ requires = [
"numpy>=1.15.0", "numpy>=1.15.0",
] ]
build-backend = "setuptools.build_meta" build-backend = "setuptools.build_meta"
[tool.isort]
profile = "black"

View File

@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.2.0 wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0 srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0 catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.8.0 typer>=0.3.0,<0.10.0
pathy>=0.10.0 pathy>=0.10.0
smart-open>=5.2.1,<7.0.0 smart-open>=5.2.1,<7.0.0
# Third party dependencies # Third party dependencies
@ -30,10 +30,10 @@ pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0 mock>=2.0.0,<3.0.0
flake8>=3.8.0,<6.0.0 flake8>=3.8.0,<6.0.0
hypothesis>=3.27.0,<7.0.0 hypothesis>=3.27.0,<7.0.0
mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7" mypy>=0.990,<1.1.0; platform_machine != "aarch64"
types-dataclasses>=0.1.3; python_version < "3.7"
types-mock>=0.1.1 types-mock>=0.1.1
types-setuptools>=57.0.0 types-setuptools>=57.0.0
types-requests types-requests
types-setuptools>=57.0.0 types-setuptools>=57.0.0
black==22.3.0 black==22.3.0
isort>=5.0,<6.0

View File

@ -30,6 +30,14 @@ project_urls =
zip_safe = false zip_safe = false
include_package_data = true include_package_data = true
python_requires = >=3.8 python_requires = >=3.8
setup_requires =
cython>=0.25,<3.0
numpy>=1.15.0
# We also need our Cython packages here to compile against
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=9.0.0.dev2,<9.1.0
install_requires = install_requires =
# Our libraries # Our libraries
spacy-legacy>=4.0.0.dev0,<4.1.0 spacy-legacy>=4.0.0.dev0,<4.1.0
@ -42,7 +50,7 @@ install_requires =
srsly>=2.4.3,<3.0.0 srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0 catalogue>=2.0.6,<2.1.0
# Third-party dependencies # Third-party dependencies
typer>=0.3.0,<0.8.0 typer>=0.3.0,<0.10.0
pathy>=0.10.0 pathy>=0.10.0
smart-open>=5.2.1,<7.0.0 smart-open>=5.2.1,<7.0.0
tqdm>=4.38.0,<5.0.0 tqdm>=4.38.0,<5.0.0
@ -67,41 +75,41 @@ transformers =
ray = ray =
spacy_ray>=0.1.0,<1.0.0 spacy_ray>=0.1.0,<1.0.0
cuda = cuda =
cupy>=5.0.0b4,<12.0.0 cupy>=5.0.0b4,<13.0.0
cuda80 = cuda80 =
cupy-cuda80>=5.0.0b4,<12.0.0 cupy-cuda80>=5.0.0b4,<13.0.0
cuda90 = cuda90 =
cupy-cuda90>=5.0.0b4,<12.0.0 cupy-cuda90>=5.0.0b4,<13.0.0
cuda91 = cuda91 =
cupy-cuda91>=5.0.0b4,<12.0.0 cupy-cuda91>=5.0.0b4,<13.0.0
cuda92 = cuda92 =
cupy-cuda92>=5.0.0b4,<12.0.0 cupy-cuda92>=5.0.0b4,<13.0.0
cuda100 = cuda100 =
cupy-cuda100>=5.0.0b4,<12.0.0 cupy-cuda100>=5.0.0b4,<13.0.0
cuda101 = cuda101 =
cupy-cuda101>=5.0.0b4,<12.0.0 cupy-cuda101>=5.0.0b4,<13.0.0
cuda102 = cuda102 =
cupy-cuda102>=5.0.0b4,<12.0.0 cupy-cuda102>=5.0.0b4,<13.0.0
cuda110 = cuda110 =
cupy-cuda110>=5.0.0b4,<12.0.0 cupy-cuda110>=5.0.0b4,<13.0.0
cuda111 = cuda111 =
cupy-cuda111>=5.0.0b4,<12.0.0 cupy-cuda111>=5.0.0b4,<13.0.0
cuda112 = cuda112 =
cupy-cuda112>=5.0.0b4,<12.0.0 cupy-cuda112>=5.0.0b4,<13.0.0
cuda113 = cuda113 =
cupy-cuda113>=5.0.0b4,<12.0.0 cupy-cuda113>=5.0.0b4,<13.0.0
cuda114 = cuda114 =
cupy-cuda114>=5.0.0b4,<12.0.0 cupy-cuda114>=5.0.0b4,<13.0.0
cuda115 = cuda115 =
cupy-cuda115>=5.0.0b4,<12.0.0 cupy-cuda115>=5.0.0b4,<13.0.0
cuda116 = cuda116 =
cupy-cuda116>=5.0.0b4,<12.0.0 cupy-cuda116>=5.0.0b4,<13.0.0
cuda117 = cuda117 =
cupy-cuda117>=5.0.0b4,<12.0.0 cupy-cuda117>=5.0.0b4,<13.0.0
cuda11x = cuda11x =
cupy-cuda11x>=11.0.0,<12.0.0 cupy-cuda11x>=11.0.0,<13.0.0
cuda-autodetect = cuda-autodetect =
cupy-wheel>=11.0.0,<12.0.0 cupy-wheel>=11.0.0,<13.0.0
apple = apple =
thinc-apple-ops>=0.1.0.dev0,<1.0.0 thinc-apple-ops>=0.1.0.dev0,<1.0.0
# Language tokenizers with external dependencies # Language tokenizers with external dependencies

View File

@ -1,6 +1,6 @@
from typing import Union, Iterable, Dict, Any
from pathlib import Path
import sys import sys
from pathlib import Path
from typing import Any, Dict, Iterable, Union
# set library-specific custom warning handling before doing anything else # set library-specific custom warning handling before doing anything else
from .errors import setup_default_warnings from .errors import setup_default_warnings
@ -8,20 +8,17 @@ from .errors import setup_default_warnings
setup_default_warnings() # noqa: E402 setup_default_warnings() # noqa: E402
# These are imported as part of the API # These are imported as part of the API
from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401 from thinc.api import Config, prefer_gpu, require_cpu, require_gpu # noqa: F401
from thinc.api import Config
from . import pipeline # noqa: F401 from . import pipeline # noqa: F401
from .cli.info import info # noqa: F401
from .glossary import explain # noqa: F401
from .about import __version__ # noqa: F401
from .util import registry, logger # noqa: F401
from .errors import Errors
from .language import Language
from .vocab import Vocab
from . import util from . import util
from .about import __version__ # noqa: F401
from .cli.info import info # noqa: F401
from .errors import Errors
from .glossary import explain # noqa: F401
from .language import Language
from .util import logger, registry # noqa: F401
from .vocab import Vocab
if sys.maxunicode == 65535: if sys.maxunicode == 65535:
raise SystemError(Errors.E130) raise SystemError(Errors.E130)

View File

@ -1,6 +1,6 @@
# fmt: off # fmt: off
__title__ = "spacy" __title__ = "spacy"
__version__ = "4.0.0.dev0" __version__ = "4.0.0.dev1"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download" __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects" __projects__ = "https://github.com/explosion/projects"

View File

@ -1,5 +1,6 @@
from . cimport symbols from . cimport symbols
cdef enum attr_id_t: cdef enum attr_id_t:
NULL_ATTR = 0 NULL_ATTR = 0
IS_ALPHA = symbols.IS_ALPHA IS_ALPHA = symbols.IS_ALPHA

View File

@ -1,35 +1,35 @@
from wasabi import msg from wasabi import msg
from ._util import app, setup_cli # noqa: F401 from ._util import app, setup_cli # noqa: F401
from .apply import apply # noqa: F401
from .assemble import assemble_cli # noqa: F401
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands # These are the actual functions, NOT the wrapped CLI commands. The CLI commands
# are registered automatically and won't have to be imported here. # are registered automatically and won't have to be imported here.
from .benchmark_speed import benchmark_speed_cli # noqa: F401 from .benchmark_speed import benchmark_speed_cli # noqa: F401
from .download import download # noqa: F401
from .info import info # noqa: F401
from .package import package # noqa: F401
from .profile import profile # noqa: F401
from .train import train_cli # noqa: F401
from .assemble import assemble_cli # noqa: F401
from .pretrain import pretrain # noqa: F401
from .debug_data import debug_data # noqa: F401
from .debug_config import debug_config # noqa: F401
from .debug_model import debug_model # noqa: F401
from .debug_diff import debug_diff # noqa: F401
from .evaluate import evaluate # noqa: F401
from .apply import apply # noqa: F401
from .convert import convert # noqa: F401 from .convert import convert # noqa: F401
from .init_pipeline import init_pipeline_cli # noqa: F401 from .debug_config import debug_config # noqa: F401
from .init_config import init_config, fill_config # noqa: F401 from .debug_data import debug_data # noqa: F401
from .validate import validate # noqa: F401 from .debug_diff import debug_diff # noqa: F401
from .project.clone import project_clone # noqa: F401 from .debug_model import debug_model # noqa: F401
from .project.assets import project_assets # noqa: F401 from .download import download # noqa: F401
from .project.run import project_run # noqa: F401 from .evaluate import evaluate # noqa: F401
from .project.dvc import project_update_dvc # noqa: F401
from .project.push import project_push # noqa: F401
from .project.pull import project_pull # noqa: F401
from .project.document import project_document # noqa: F401
from .find_threshold import find_threshold # noqa: F401 from .find_threshold import find_threshold # noqa: F401
from .info import info # noqa: F401
from .init_config import fill_config, init_config # noqa: F401
from .init_pipeline import init_pipeline_cli # noqa: F401
from .package import package # noqa: F401
from .pretrain import pretrain # noqa: F401
from .profile import profile # noqa: F401
from .project.assets import project_assets # noqa: F401
from .project.clone import project_clone # noqa: F401
from .project.document import project_document # noqa: F401
from .project.dvc import project_update_dvc # noqa: F401
from .project.pull import project_pull # noqa: F401
from .project.push import project_push # noqa: F401
from .project.run import project_run # noqa: F401
from .train import train_cli # noqa: F401
from .validate import validate # noqa: F401
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True) @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)

View File

@ -1,26 +1,45 @@
from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, Literal
from typing import TYPE_CHECKING, overload
import sys
import shutil
from pathlib import Path
from wasabi import msg, Printer
import srsly
import hashlib import hashlib
import os
import shutil
import sys
from configparser import InterpolationError
from contextlib import contextmanager
from pathlib import Path
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterable,
List,
Literal,
Optional,
Tuple,
Union,
overload,
)
import srsly
import typer import typer
from click import NoSuchOption from click import NoSuchOption
from click.parser import split_arg_string from click.parser import split_arg_string
from typer.main import get_command
from contextlib import contextmanager
from thinc.api import Config, ConfigValidationError, require_gpu from thinc.api import Config, ConfigValidationError, require_gpu
from thinc.util import gpu_is_available from thinc.util import gpu_is_available
from configparser import InterpolationError from typer.main import get_command
import os from wasabi import Printer, msg
from ..schemas import ProjectConfigSchema, validate
from ..util import import_file, run_command, make_tempdir, registry, logger
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
from ..errors import RENAMED_LANGUAGE_CODES
from .. import about from .. import about
from ..errors import RENAMED_LANGUAGE_CODES
from ..schemas import ProjectConfigSchema, validate
from ..util import (
ENV_VARS,
SimpleFrozenDict,
import_file,
is_compatible_version,
logger,
make_tempdir,
registry,
run_command,
)
if TYPE_CHECKING: if TYPE_CHECKING:
from pathy import FluidPath # noqa: F401 from pathy import FluidPath # noqa: F401

View File

@ -1,18 +1,15 @@
import tqdm
import srsly
from itertools import chain from itertools import chain
from pathlib import Path from pathlib import Path
from typing import Optional, List, Iterable, cast, Union from typing import Iterable, List, Optional, Union, cast
import srsly
import tqdm
from wasabi import msg from wasabi import msg
from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
from ..tokens import Doc, DocBin from ..tokens import Doc, DocBin
from ..vocab import Vocab
from ..util import ensure_path, load_model from ..util import ensure_path, load_model
from ..vocab import Vocab
from ._util import Arg, Opt, app, import_code, setup_gpu, walk_directory
path_help = """Location of the documents to predict on. path_help = """Location of the documents to predict on.
Can be a single file in .spacy format or a .jsonl file. Can be a single file in .spacy format or a .jsonl file.

View File

@ -1,13 +1,20 @@
from typing import Optional
from pathlib import Path
from wasabi import msg
import typer
import logging import logging
from pathlib import Path
from typing import Optional
import typer
from wasabi import msg
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code_paths
from .. import util from .. import util
from ..util import get_sourced_components, load_model_from_config from ..util import get_sourced_components, load_model_from_config
from ._util import (
Arg,
Opt,
app,
import_code_paths,
parse_config_overrides,
show_validation_error,
)
@app.command( @app.command(

View File

@ -1,11 +1,12 @@
from typing import Iterable, List, Optional
import random import random
from itertools import islice
import numpy
from pathlib import Path
import time import time
from tqdm import tqdm from itertools import islice
from pathlib import Path
from typing import Iterable, List, Optional
import numpy
import typer import typer
from tqdm import tqdm
from wasabi import msg from wasabi import msg
from .. import util from .. import util

View File

@ -1,18 +1,22 @@
from typing import Callable, Iterable, Mapping, Optional, Any, Union import itertools
from enum import Enum
from pathlib import Path
from wasabi import Printer
import srsly
import re import re
import sys import sys
import itertools from enum import Enum
from pathlib import Path
from typing import Any, Callable, Iterable, Mapping, Optional, Union
import srsly
from wasabi import Printer
from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
from ..training import docs_to_json
from ..tokens import Doc, DocBin from ..tokens import Doc, DocBin
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs from ..training import docs_to_json
from ..training.converters import conllu_to_docs from ..training.converters import (
conll_ner_to_docs,
conllu_to_docs,
iob_to_docs,
json_to_docs,
)
from ._util import Arg, Opt, _handle_renamed_language_codes, app, walk_directory
# Converters are matched by file extension except for ner/iob, which are # Converters are matched by file extension except for ner/iob, which are
# matched by file extension and content. To add a converter, add a new # matched by file extension and content. To add a converter, add a new

View File

@ -1,15 +1,22 @@
from typing import Optional, Dict, Any, Union, List
from pathlib import Path from pathlib import Path
from wasabi import msg, table from typing import Any, Dict, List, Optional, Union
import typer
from thinc.api import Config from thinc.api import Config
from thinc.config import VARIABLE_RE from thinc.config import VARIABLE_RE
import typer from wasabi import msg, table
from ._util import Arg, Opt, show_validation_error, parse_config_overrides from .. import util
from ._util import import_code_paths, debug_cli
from ..schemas import ConfigSchemaInit, ConfigSchemaTraining from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
from ..util import registry from ..util import registry
from .. import util from ._util import (
Arg,
Opt,
debug_cli,
import_code_paths,
parse_config_overrides,
show_validation_error,
)
@debug_cli.command( @debug_cli.command(

View File

@ -1,29 +1,49 @@
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
from typing import Literal, cast, overload
from pathlib import Path
from collections import Counter
import sys
import srsly
from wasabi import Printer, MESSAGES, msg
import typer
import math import math
import sys
from collections import Counter
from pathlib import Path
from typing import (
Any,
Dict,
Iterable,
List,
Literal,
Optional,
Sequence,
Set,
Tuple,
Union,
cast,
overload,
)
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides import numpy
from ._util import import_code_paths, debug_cli, _format_number import srsly
from ..training import Example, remove_bilu_prefix import typer
from ..training.initialize import get_sourced_components from wasabi import MESSAGES, Printer, msg
from ..schemas import ConfigSchemaTraining
from ..pipeline import TrainablePipe from .. import util
from ..language import Language
from ..morphology import Morphology
from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe
from ..pipeline._edit_tree_internals.edit_trees import EditTrees
from ..pipeline._parser_internals import nonproj from ..pipeline._parser_internals import nonproj
from ..pipeline._parser_internals.nonproj import DELIMITER from ..pipeline._parser_internals.nonproj import DELIMITER
from ..pipeline import Morphologizer, SpanCategorizer from ..schemas import ConfigSchemaTraining
from ..pipeline._edit_tree_internals.edit_trees import EditTrees from ..training import Example, remove_bilu_prefix
from ..morphology import Morphology from ..training.initialize import get_sourced_components
from ..language import Language
from ..util import registry, resolve_dot_names from ..util import registry, resolve_dot_names
from ..vectors import Mode as VectorsMode from ..vectors import Mode as VectorsMode
from .. import util from ._util import (
Arg,
Opt,
_format_number,
app,
debug_cli,
import_code_paths,
parse_config_overrides,
show_validation_error,
)
# Minimum number of expected occurrences of NER label in data to train new label # Minimum number of expected occurrences of NER label in data to train new label
NEW_LABEL_THRESHOLD = 50 NEW_LABEL_THRESHOLD = 50
@ -210,7 +230,7 @@ def debug_data(
else: else:
msg.info("No word vectors present in the package") msg.info("No word vectors present in the package")
if "spancat" in factory_names: if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
model_labels_spancat = _get_labels_from_spancat(nlp) model_labels_spancat = _get_labels_from_spancat(nlp)
has_low_data_warning = False has_low_data_warning = False
has_no_neg_warning = False has_no_neg_warning = False
@ -335,7 +355,7 @@ def debug_data(
show=verbose, show=verbose,
) )
else: else:
msg.good("Examples without ocurrences available for all labels") msg.good("Examples without occurrences available for all labels")
if "ner" in factory_names: if "ner" in factory_names:
# Get all unique NER labels present in the data # Get all unique NER labels present in the data
@ -520,9 +540,13 @@ def debug_data(
if "tagger" in factory_names: if "tagger" in factory_names:
msg.divider("Part-of-speech Tagging") msg.divider("Part-of-speech Tagging")
label_list = [label for label in gold_train_data["tags"]] label_list, counts = zip(*gold_train_data["tags"].items())
model_labels = _get_labels_from_model(nlp, "tagger")
msg.info(f"{len(label_list)} label(s) in train data") msg.info(f"{len(label_list)} label(s) in train data")
p = numpy.array(counts)
p = p / p.sum()
norm_entropy = (-p * numpy.log2(p)).sum() / numpy.log2(len(label_list))
msg.info(f"{norm_entropy} is the normalised label entropy")
model_labels = _get_labels_from_model(nlp, "tagger")
labels = set(label_list) labels = set(label_list)
missing_labels = model_labels - labels missing_labels = model_labels - labels
if missing_labels: if missing_labels:
@ -824,7 +848,7 @@ def _compile_gold(
data["boundary_cross_ents"] += 1 data["boundary_cross_ents"] += 1
elif label == "-": elif label == "-":
data["ner"]["-"] += 1 data["ner"]["-"] += 1
if "spancat" in factory_names: if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
for spans_key in list(eg.reference.spans.keys()): for spans_key in list(eg.reference.spans.keys()):
# Obtain the span frequency # Obtain the span frequency
if spans_key not in data["spancat"]: if spans_key not in data["spancat"]:
@ -1022,7 +1046,7 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
pipe_names = [ pipe_names = [
pipe_name pipe_name
for pipe_name in nlp.pipe_names for pipe_name in nlp.pipe_names
if nlp.get_pipe_meta(pipe_name).factory == "spancat" if nlp.get_pipe_meta(pipe_name).factory in ("spancat", "spancat_singlelabel")
] ]
labels: Dict[str, Set[str]] = {} labels: Dict[str, Set[str]] = {}
for pipe_name in pipe_names: for pipe_name in pipe_names:

View File

@ -1,13 +1,13 @@
from pathlib import Path
from typing import Optional from typing import Optional
import typer import typer
from wasabi import Printer, diff_strings, MarkdownRenderer
from pathlib import Path
from thinc.api import Config from thinc.api import Config
from wasabi import MarkdownRenderer, Printer, diff_strings
from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
from ..util import load_config from ..util import load_config
from .init_config import init_config, Optimizations from ._util import Arg, Opt, debug_cli, parse_config_overrides, show_validation_error
from .init_config import Optimizations, init_config
@debug_cli.command( @debug_cli.command(

View File

@ -1,19 +1,32 @@
from typing import Dict, Any, Optional
from pathlib import Path
import itertools import itertools
from pathlib import Path
from typing import Any, Dict, Optional
import typer
from thinc.api import (
Model,
data_validation,
fix_random_seed,
set_dropout_rate,
set_gpu_allocator,
)
from wasabi import msg
from spacy.training import Example from spacy.training import Example
from spacy.util import resolve_dot_names from spacy.util import resolve_dot_names
from wasabi import msg
from thinc.api import fix_random_seed, set_dropout_rate
from thinc.api import Model, data_validation, set_gpu_allocator
import typer
from ._util import Arg, Opt, debug_cli, show_validation_error from .. import util
from ._util import parse_config_overrides, string_to_list, setup_gpu
from ..schemas import ConfigSchemaTraining from ..schemas import ConfigSchemaTraining
from ..util import registry from ..util import registry
from .. import util from ._util import (
Arg,
Opt,
debug_cli,
parse_config_overrides,
setup_gpu,
show_validation_error,
string_to_list,
)
@debug_cli.command( @debug_cli.command(

View File

@ -1,14 +1,20 @@
from typing import Optional, Sequence
import requests
import sys import sys
from wasabi import msg from typing import Optional, Sequence
import typer
import requests
import typer
from wasabi import msg
from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
from .. import about from .. import about
from ..util import is_package, get_minor_version, run_command from ..util import (
from ..util import is_prerelease_version, get_installed_models get_installed_models,
from ..util import get_package_version get_minor_version,
get_package_version,
is_package,
is_prerelease_version,
run_command,
)
from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
@app.command( @app.command(
@ -83,11 +89,8 @@ def download(
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str: def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
dl_tpl = "{m}-{v}/{m}-{v}{s}" dl_tpl = "{m}-{v}/{m}-{v}{s}"
egg_tpl = "#egg={m}=={v}"
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
filename = dl_tpl.format(m=model_name, v=version, s=suffix) filename = dl_tpl.format(m=model_name, v=version, s=suffix)
if sdist:
filename += egg_tpl.format(m=model_name, v=version)
return filename return filename

View File

@ -1,16 +1,16 @@
from typing import Optional, List, Dict, Any, Union
from wasabi import Printer
from pathlib import Path
import re import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
import srsly import srsly
from thinc.api import fix_random_seed from thinc.api import fix_random_seed
from wasabi import Printer
from ..training import Corpus from .. import displacy, util
from ..tokens import Doc
from ._util import app, Arg, Opt, setup_gpu, import_code_paths, benchmark_cli
from ..scorer import Scorer from ..scorer import Scorer
from .. import util from ..tokens import Doc
from .. import displacy from ..training import Corpus
from ._util import Arg, Opt, app, benchmark_cli, import_code_paths, setup_gpu
@benchmark_cli.command( @benchmark_cli.command(
@ -27,6 +27,7 @@ def evaluate_cli(
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False), displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"), displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
# fmt: on # fmt: on
): ):
""" """
@ -50,6 +51,7 @@ def evaluate_cli(
gold_preproc=gold_preproc, gold_preproc=gold_preproc,
displacy_path=displacy_path, displacy_path=displacy_path,
displacy_limit=displacy_limit, displacy_limit=displacy_limit,
per_component=per_component,
silent=False, silent=False,
) )
@ -64,6 +66,7 @@ def evaluate(
displacy_limit: int = 25, displacy_limit: int = 25,
silent: bool = True, silent: bool = True,
spans_key: str = "sc", spans_key: str = "sc",
per_component: bool = False,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
msg = Printer(no_print=silent, pretty=not silent) msg = Printer(no_print=silent, pretty=not silent)
fix_random_seed() fix_random_seed()
@ -78,50 +81,61 @@ def evaluate(
corpus = Corpus(data_path, gold_preproc=gold_preproc) corpus = Corpus(data_path, gold_preproc=gold_preproc)
nlp = util.load_model(model) nlp = util.load_model(model)
dev_dataset = list(corpus(nlp)) dev_dataset = list(corpus(nlp))
scores = nlp.evaluate(dev_dataset) scores = nlp.evaluate(dev_dataset, per_component=per_component)
metrics = { if per_component:
"TOK": "token_acc", data = scores
"TAG": "tag_acc", if output is None:
"POS": "pos_acc", msg.warn(
"MORPH": "morph_acc", "The per-component option is enabled but there is no output JSON file provided to save the scores to."
"LEMMA": "lemma_acc", )
"UAS": "dep_uas", else:
"LAS": "dep_las", msg.info("Per-component scores will be saved to output JSON file.")
"NER P": "ents_p", else:
"NER R": "ents_r", metrics = {
"NER F": "ents_f", "TOK": "token_acc",
"TEXTCAT": "cats_score", "TAG": "tag_acc",
"SENT P": "sents_p", "POS": "pos_acc",
"SENT R": "sents_r", "MORPH": "morph_acc",
"SENT F": "sents_f", "LEMMA": "lemma_acc",
"SPAN P": f"spans_{spans_key}_p", "UAS": "dep_uas",
"SPAN R": f"spans_{spans_key}_r", "LAS": "dep_las",
"SPAN F": f"spans_{spans_key}_f", "NER P": "ents_p",
"SPEED": "speed", "NER R": "ents_r",
} "NER F": "ents_f",
results = {} "TEXTCAT": "cats_score",
data = {} "SENT P": "sents_p",
for metric, key in metrics.items(): "SENT R": "sents_r",
if key in scores: "SENT F": "sents_f",
if key == "cats_score": "SPAN P": f"spans_{spans_key}_p",
metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")" "SPAN R": f"spans_{spans_key}_r",
if isinstance(scores[key], (int, float)): "SPAN F": f"spans_{spans_key}_f",
if key == "speed": "SPEED": "speed",
results[metric] = f"{scores[key]:.0f}" }
results = {}
data = {}
for metric, key in metrics.items():
if key in scores:
if key == "cats_score":
metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
if isinstance(scores[key], (int, float)):
if key == "speed":
results[metric] = f"{scores[key]:.0f}"
else:
results[metric] = f"{scores[key]*100:.2f}"
else: else:
results[metric] = f"{scores[key]*100:.2f}" results[metric] = "-"
else: data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
results[metric] = "-"
data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
msg.table(results, title="Results") msg.table(results, title="Results")
data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent) data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
if displacy_path: if displacy_path:
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit])) docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit]))
render_deps = "parser" in factory_names render_deps = "parser" in factory_names
render_ents = "ner" in factory_names render_ents = "ner" in factory_names
render_spans = "spancat" in factory_names
render_parses( render_parses(
docs, docs,
displacy_path, displacy_path,
@ -129,6 +143,7 @@ def evaluate(
limit=displacy_limit, limit=displacy_limit,
deps=render_deps, deps=render_deps,
ents=render_ents, ents=render_ents,
spans=render_spans,
) )
msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path) msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
@ -182,6 +197,7 @@ def render_parses(
limit: int = 250, limit: int = 250,
deps: bool = True, deps: bool = True,
ents: bool = True, ents: bool = True,
spans: bool = True,
): ):
docs[0].user_data["title"] = model_name docs[0].user_data["title"] = model_name
if ents: if ents:
@ -195,6 +211,11 @@ def render_parses(
with (output_path / "parses.html").open("w", encoding="utf8") as file_: with (output_path / "parses.html").open("w", encoding="utf8") as file_:
file_.write(html) file_.write(html)
if spans:
html = displacy.render(docs[:limit], style="span", page=True)
with (output_path / "spans.html").open("w", encoding="utf8") as file_:
file_.write(html)
def print_prf_per_type( def print_prf_per_type(
msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str

View File

@ -1,17 +1,17 @@
import functools import functools
import logging
import operator import operator
from pathlib import Path from pathlib import Path
import logging from typing import Any, Dict, List, Optional, Tuple
from typing import Optional, Tuple, Any, Dict, List
import numpy import numpy
import wasabi.tables import wasabi.tables
from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer
from ..errors import Errors
from ..training import Corpus
from ._util import app, Arg, Opt, import_code, setup_gpu
from .. import util from .. import util
from ..errors import Errors
from ..pipeline import MultiLabel_TextCategorizer, TextCategorizer
from ..training import Corpus
from ._util import Arg, Opt, app, import_code, setup_gpu
_DEFAULTS = { _DEFAULTS = {
"n_trials": 11, "n_trials": 11,
@ -35,7 +35,7 @@ def find_threshold_cli(
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"), use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"), gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
# fmt: on # fmt: on
): ):
""" """

View File

@ -1,15 +1,15 @@
from typing import Optional, Dict, Any, Union, List
import platform
import json
from pathlib import Path
from wasabi import Printer, MarkdownRenderer
import srsly
import importlib.metadata import importlib.metadata
import json
import platform
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from ._util import app, Arg, Opt, string_to_list import srsly
from .download import get_model_filename, get_latest_version from wasabi import MarkdownRenderer, Printer
from .. import util
from .. import about from .. import about, util
from ._util import Arg, Opt, app, string_to_list
from .download import get_latest_version, get_model_filename
@app.command("info") @app.command("info")

View File

@ -1,19 +1,27 @@
from typing import Optional, List, Tuple import re
from enum import Enum from enum import Enum
from pathlib import Path from pathlib import Path
from wasabi import Printer, diff_strings from typing import List, Optional, Tuple
from thinc.api import Config
import srsly import srsly
import re
from jinja2 import Template from jinja2 import Template
from thinc.api import Config
from wasabi import Printer, diff_strings
from .. import util from .. import util
from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
from ..schemas import RecommendationSchema from ..schemas import RecommendationSchema
from ..util import SimpleFrozenList from ..util import SimpleFrozenList
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND from ._util import (
from ._util import string_to_list, import_code, _handle_renamed_language_codes COMMAND,
Arg,
Opt,
_handle_renamed_language_codes,
import_code,
init_cli,
show_validation_error,
string_to_list,
)
ROOT = Path(__file__).parent / "templates" ROOT = Path(__file__).parent / "templates"
TEMPLATE_PATH = ROOT / "quickstart_training.jinja" TEMPLATE_PATH = ROOT / "quickstart_training.jinja"

View File

@ -1,15 +1,24 @@
from typing import Optional
import logging import logging
from pathlib import Path from pathlib import Path
from wasabi import msg from typing import Optional
import typer
import srsly import srsly
import typer
from wasabi import msg
from .. import util from .. import util
from ..training.initialize import init_nlp, convert_vectors
from ..language import Language from ..language import Language
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error from ..training.initialize import convert_vectors, init_nlp
from ._util import import_code, setup_gpu, _handle_renamed_language_codes from ._util import (
Arg,
Opt,
_handle_renamed_language_codes,
import_code,
init_cli,
parse_config_overrides,
setup_gpu,
show_validation_error,
)
@init_cli.command("vectors") @init_cli.command("vectors")

View File

@ -1,18 +1,18 @@
from typing import Optional, Union, Any, Dict, List, Tuple, cast
import shutil
from pathlib import Path
from wasabi import Printer, MarkdownRenderer, get_raw_input
from thinc.api import Config
from collections import defaultdict
from catalogue import RegistryError
import srsly
import sys
import re import re
import shutil
import sys
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX import srsly
from ..schemas import validate, ModelMetaSchema from catalogue import RegistryError
from .. import util from thinc.api import Config
from .. import about from wasabi import MarkdownRenderer, Printer, get_raw_input
from .. import about, util
from ..schemas import ModelMetaSchema, validate
from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
@app.command("package") @app.command("package")

View File

@ -1,13 +1,21 @@
from typing import Optional
from pathlib import Path
from wasabi import msg
import typer
import re import re
from pathlib import Path
from typing import Optional
import typer
from wasabi import msg
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code_paths, setup_gpu
from ..training.pretrain import pretrain from ..training.pretrain import pretrain
from ..util import load_config from ..util import load_config
from ._util import (
Arg,
Opt,
app,
import_code_paths,
parse_config_overrides,
setup_gpu,
show_validation_error,
)
@app.command( @app.command(
@ -23,6 +31,7 @@ def pretrain_cli(
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"), resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."), epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
# fmt: on # fmt: on
): ):
""" """
@ -74,6 +83,7 @@ def pretrain_cli(
epoch_resume=epoch_resume, epoch_resume=epoch_resume,
use_gpu=use_gpu, use_gpu=use_gpu,
silent=False, silent=False,
skip_last=skip_last,
) )
msg.good("Successfully finished pretrain") msg.good("Successfully finished pretrain")

View File

@ -1,17 +1,18 @@
from typing import Optional, Sequence, Union, Iterator
import tqdm
from pathlib import Path
import srsly
import cProfile import cProfile
import itertools
import pstats import pstats
import sys import sys
import itertools from pathlib import Path
from wasabi import msg, Printer from typing import Iterator, Optional, Sequence, Union
import typer
import srsly
import tqdm
import typer
from wasabi import Printer, msg
from ._util import app, debug_cli, Arg, Opt, NAME
from ..language import Language from ..language import Language
from ..util import load_model from ..util import load_model
from ._util import NAME, Arg, Opt, app, debug_cli
@debug_cli.command("profile") @debug_cli.command("profile")

View File

@ -1,16 +1,27 @@
from typing import Any, Dict, Optional
from pathlib import Path
from wasabi import msg
import os import os
import re import re
import shutil import shutil
from pathlib import Path
from typing import Any, Dict, Optional
import requests import requests
import typer import typer
from wasabi import msg
from ...util import ensure_path, working_dir from ...util import ensure_path, working_dir
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config from .._util import (
from .._util import get_checksum, download_file, git_checkout, get_git_version PROJECT_FILE,
from .._util import SimpleFrozenDict, parse_config_overrides Arg,
Opt,
SimpleFrozenDict,
download_file,
get_checksum,
get_git_version,
git_checkout,
load_project_config,
parse_config_overrides,
project_cli,
)
# Whether assets are extra if `extra` is not set. # Whether assets are extra if `extra` is not set.
EXTRA_DEFAULT = False EXTRA_DEFAULT = False

View File

@ -1,13 +1,22 @@
from typing import Optional
from pathlib import Path
from wasabi import msg
import subprocess
import re import re
import subprocess
from pathlib import Path
from typing import Optional
from wasabi import msg
from ... import about from ... import about
from ...util import ensure_path from ...util import ensure_path
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE from .._util import (
from .._util import git_checkout, get_git_version, git_repo_branch_exists COMMAND,
PROJECT_FILE,
Arg,
Opt,
get_git_version,
git_checkout,
git_repo_branch_exists,
project_cli,
)
DEFAULT_REPO = about.__projects__ DEFAULT_REPO = about.__projects__
DEFAULT_PROJECTS_BRANCH = about.__projects_branch__ DEFAULT_PROJECTS_BRANCH = about.__projects_branch__

View File

@ -1,9 +1,9 @@
from pathlib import Path from pathlib import Path
from wasabi import msg, MarkdownRenderer
from wasabi import MarkdownRenderer, msg
from ...util import working_dir from ...util import working_dir
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config from .._util import PROJECT_FILE, Arg, Opt, load_project_config, project_cli
DOCS_URL = "https://spacy.io" DOCS_URL = "https://spacy.io"
INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the

View File

@ -1,15 +1,28 @@
"""This module contains helpers and subcommands for integrating spaCy projects """This module contains helpers and subcommands for integrating spaCy projects
with Data Version Controk (DVC). https://dvc.org""" with Data Version Controk (DVC). https://dvc.org"""
from typing import Dict, Any, List, Optional, Iterable
import subprocess import subprocess
from pathlib import Path from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional
from wasabi import msg from wasabi import msg
from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli from ...util import (
from .._util import Arg, Opt, NAME, COMMAND SimpleFrozenList,
from ...util import working_dir, split_command, join_command, run_command join_command,
from ...util import SimpleFrozenList run_command,
split_command,
working_dir,
)
from .._util import (
COMMAND,
NAME,
PROJECT_FILE,
Arg,
Opt,
get_hash,
load_project_config,
project_cli,
)
DVC_CONFIG = "dvc.yaml" DVC_CONFIG = "dvc.yaml"
DVC_DIR = ".dvc" DVC_DIR = ".dvc"

View File

@ -1,9 +1,9 @@
from pathlib import Path from pathlib import Path
from wasabi import msg from wasabi import msg
from .remote_storage import RemoteStorage
from .remote_storage import get_command_hash from .._util import Arg, load_project_config, logger, project_cli
from .._util import project_cli, Arg, logger from .remote_storage import RemoteStorage, get_command_hash
from .._util import load_project_config
from .run import update_lockfile from .run import update_lockfile

View File

@ -1,9 +1,9 @@
from pathlib import Path from pathlib import Path
from wasabi import msg from wasabi import msg
from .remote_storage import RemoteStorage
from .remote_storage import get_content_hash, get_command_hash from .._util import Arg, load_project_config, logger, project_cli
from .._util import load_project_config from .remote_storage import RemoteStorage, get_command_hash, get_content_hash
from .._util import project_cli, Arg, logger
@project_cli.command("push") @project_cli.command("push")

View File

@ -1,18 +1,25 @@
from typing import Optional, List, Dict, TYPE_CHECKING import hashlib
import os import os
import site import site
import hashlib
import urllib.parse
import tarfile import tarfile
import urllib.parse
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional
from wasabi import msg from wasabi import msg
from .._util import get_hash, get_checksum, upload_file, download_file
from .._util import ensure_pathy, make_tempdir
from ...util import get_minor_version, ENV_VARS, check_bool_env_var
from ...git_info import GIT_VERSION
from ... import about from ... import about
from ...errors import Errors from ...errors import Errors
from ...git_info import GIT_VERSION
from ...util import ENV_VARS, check_bool_env_var, get_minor_version
from .._util import (
download_file,
ensure_pathy,
get_checksum,
get_hash,
make_tempdir,
upload_file,
)
if TYPE_CHECKING: if TYPE_CHECKING:
from pathy import FluidPath # noqa: F401 from pathy import FluidPath # noqa: F401

View File

@ -1,20 +1,39 @@
from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
import os.path import os.path
from pathlib import Path
from wasabi import msg
from wasabi.util import locale_escape
import sys import sys
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
import srsly import srsly
import typer import typer
from wasabi import msg
from wasabi.util import locale_escape
from ... import about from ... import about
from ...git_info import GIT_VERSION from ...git_info import GIT_VERSION
from ...util import working_dir, run_command, split_command, is_cwd, join_command from ...util import (
from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS ENV_VARS,
from ...util import check_bool_env_var, SimpleFrozenDict SimpleFrozenDict,
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash SimpleFrozenList,
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides check_bool_env_var,
is_cwd,
is_minor_version_match,
join_command,
run_command,
split_command,
working_dir,
)
from .._util import (
COMMAND,
PROJECT_FILE,
PROJECT_LOCK,
Arg,
Opt,
get_checksum,
get_hash,
load_project_config,
parse_config_overrides,
project_cli,
)
@project_cli.command( @project_cli.command(

View File

@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #} can help generate the best possible configuration, given a user's requirements. #}
{%- set use_transformer = hardware != "cpu" and transformer_data -%} {%- set use_transformer = hardware != "cpu" and transformer_data -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%} {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%} {%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
[paths] [paths]
train = null train = null
dev = null dev = null
@ -24,8 +24,11 @@ gpu_allocator = null
lang = "{{ lang }}" lang = "{{ lang }}"
{%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%} {%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
{%- set with_accuracy = optimize == "accuracy" -%} {%- set with_accuracy = optimize == "accuracy" -%}
{%- set has_accurate_textcat = has_textcat and with_accuracy -%} {# The BOW textcat doesn't need a source of features, so it can omit the
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%} tok2vec/transformer. #}
{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "span_finder" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%} {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
{%- else -%} {%- else -%}
{%- set full_pipeline = components -%} {%- set full_pipeline = components -%}
@ -122,6 +125,30 @@ grad_factor = 1.0
@layers = "reduce_mean.v1" @layers = "reduce_mean.v1"
{% endif -%} {% endif -%}
{% if "span_finder" in components -%}
[components.span_finder]
factory = "span_finder"
max_length = null
min_length = null
scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
spans_key = "sc"
threshold = 0.5
[components.span_finder.model]
@architectures = "spacy.SpanFinder.v1"
[components.span_finder.model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = 2
[components.span_finder.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.span_finder.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{% endif -%}
{% if "spancat" in components -%} {% if "spancat" in components -%}
[components.spancat] [components.spancat]
factory = "spancat" factory = "spancat"
@ -154,6 +181,36 @@ grad_factor = 1.0
sizes = [1,2,3] sizes = [1,2,3]
{% endif -%} {% endif -%}
{% if "spancat_singlelabel" in components %}
[components.spancat_singlelabel]
factory = "spancat_singlelabel"
negative_weight = 1.0
allow_overlap = true
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
spans_key = "sc"
[components.spancat_singlelabel.model]
@architectures = "spacy.SpanCategorizer.v1"
[components.spancat_singlelabel.model.reducer]
@layers = "spacy.mean_max_reducer.v1"
hidden_size = 128
[components.spancat_singlelabel.model.scorer]
@layers = "Softmax.v2"
[components.spancat_singlelabel.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.spancat_singlelabel.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
[components.spancat_singlelabel.suggester]
@misc = "spacy.ngram_suggester.v1"
sizes = [1,2,3]
{% endif %}
{% if "trainable_lemmatizer" in components -%} {% if "trainable_lemmatizer" in components -%}
[components.trainable_lemmatizer] [components.trainable_lemmatizer]
factory = "trainable_lemmatizer" factory = "trainable_lemmatizer"
@ -219,10 +276,16 @@ no_output_layer = false
{% else -%} {% else -%}
[components.textcat.model] [components.textcat.model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatCNN.v2"
exclusive_classes = true exclusive_classes = true
ngram_size = 1 nO = null
no_output_layer = false
[components.textcat.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.textcat.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{%- endif %} {%- endif %}
{%- endif %} {%- endif %}
@ -250,10 +313,16 @@ no_output_layer = false
{% else -%} {% else -%}
[components.textcat_multilabel.model] [components.textcat_multilabel.model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatCNN.v2"
exclusive_classes = false exclusive_classes = false
ngram_size = 1 nO = null
no_output_layer = false
[components.textcat_multilabel.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.textcat_multilabel.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{%- endif %} {%- endif %}
{%- endif %} {%- endif %}
@ -284,6 +353,7 @@ maxout_pieces = 3
{% if "morphologizer" in components %} {% if "morphologizer" in components %}
[components.morphologizer] [components.morphologizer]
factory = "morphologizer" factory = "morphologizer"
label_smoothing = 0.05
[components.morphologizer.model] [components.morphologizer.model]
@architectures = "spacy.Tagger.v2" @architectures = "spacy.Tagger.v2"
@ -297,6 +367,7 @@ width = ${components.tok2vec.model.encode.width}
{% if "tagger" in components %} {% if "tagger" in components %}
[components.tagger] [components.tagger]
factory = "tagger" factory = "tagger"
label_smoothing = 0.05
[components.tagger.model] [components.tagger.model]
@architectures = "spacy.Tagger.v2" @architectures = "spacy.Tagger.v2"
@ -341,6 +412,27 @@ nO = null
width = ${components.tok2vec.model.encode.width} width = ${components.tok2vec.model.encode.width}
{% endif %} {% endif %}
{% if "span_finder" in components %}
[components.span_finder]
factory = "span_finder"
max_length = null
min_length = null
scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
spans_key = "sc"
threshold = 0.5
[components.span_finder.model]
@architectures = "spacy.SpanFinder.v1"
[components.span_finder.model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = 2
[components.span_finder.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
{% endif %}
{% if "spancat" in components %} {% if "spancat" in components %}
[components.spancat] [components.spancat]
factory = "spancat" factory = "spancat"
@ -370,6 +462,33 @@ width = ${components.tok2vec.model.encode.width}
sizes = [1,2,3] sizes = [1,2,3]
{% endif %} {% endif %}
{% if "spancat_singlelabel" in components %}
[components.spancat_singlelabel]
factory = "spancat_singlelabel"
negative_weight = 1.0
allow_overlap = true
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
spans_key = "sc"
[components.spancat_singlelabel.model]
@architectures = "spacy.SpanCategorizer.v1"
[components.spancat_singlelabel.model.reducer]
@layers = "spacy.mean_max_reducer.v1"
hidden_size = 128
[components.spancat_singlelabel.model.scorer]
@layers = "Softmax.v2"
[components.spancat_singlelabel.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
[components.spancat_singlelabel.suggester]
@misc = "spacy.ngram_suggester.v1"
sizes = [1,2,3]
{% endif %}
{% if "trainable_lemmatizer" in components -%} {% if "trainable_lemmatizer" in components -%}
[components.trainable_lemmatizer] [components.trainable_lemmatizer]
factory = "trainable_lemmatizer" factory = "trainable_lemmatizer"

View File

@ -1,15 +1,23 @@
from typing import Optional, Dict, Any, Union
from pathlib import Path
from wasabi import msg
import typer
import logging import logging
import sys import sys
from pathlib import Path
from typing import Any, Dict, Optional, Union
import typer
from wasabi import msg
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code_paths, setup_gpu
from ..training.loop import train as train_nlp
from ..training.initialize import init_nlp
from .. import util from .. import util
from ..training.initialize import init_nlp
from ..training.loop import train as train_nlp
from ._util import (
Arg,
Opt,
app,
import_code_paths,
parse_config_overrides,
setup_gpu,
show_validation_error,
)
@app.command( @app.command(

View File

@ -1,14 +1,21 @@
from typing import Tuple
from pathlib import Path
import sys import sys
import requests
from wasabi import msg, Printer
import warnings import warnings
from pathlib import Path
from typing import Tuple
import requests
from wasabi import Printer, msg
from ._util import app
from .. import about from .. import about
from ..util import get_package_version, get_installed_models, get_minor_version from ..util import (
from ..util import get_package_path, get_model_meta, is_compatible_version get_installed_models,
get_minor_version,
get_model_meta,
get_package_path,
get_package_version,
is_compatible_version,
)
from ._util import app
@app.command("validate") @app.command("validate")

View File

@ -1,5 +1,6 @@
"""Helpers for Python and platform compatibility.""" """Helpers for Python and platform compatibility."""
import sys import sys
from thinc.util import copy_array from thinc.util import copy_array
try: try:

View File

@ -4,15 +4,13 @@ spaCy's built in visualization suite for dependencies and named entities.
DOCS: https://spacy.io/api/top-level#displacy DOCS: https://spacy.io/api/top-level#displacy
USAGE: https://spacy.io/usage/visualizers USAGE: https://spacy.io/usage/visualizers
""" """
from typing import Union, Iterable, Optional, Dict, Any, Callable
import warnings import warnings
from typing import Any, Callable, Dict, Iterable, Optional, Union
from .render import DependencyRenderer, EntityRenderer, SpanRenderer
from ..tokens import Doc, Span
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..util import is_in_jupyter from ..tokens import Doc, Span
from ..util import find_available_port from ..util import find_available_port, is_in_jupyter
from .render import DependencyRenderer, EntityRenderer, SpanRenderer
_html = {} _html = {}
RENDER_WRAPPER = None RENDER_WRAPPER = None
@ -68,7 +66,7 @@ def render(
if jupyter or (jupyter is None and is_in_jupyter()): if jupyter or (jupyter is None and is_in_jupyter()):
# return HTML rendered by IPython display() # return HTML rendered by IPython display()
# See #4840 for details on span wrapper to disable mathjax # See #4840 for details on span wrapper to disable mathjax
from IPython.core.display import display, HTML from IPython.core.display import HTML, display
return display(HTML('<span class="tex2jax_ignore">{}</span>'.format(html))) return display(HTML('<span class="tex2jax_ignore">{}</span>'.format(html)))
return html return html
@ -125,13 +123,17 @@ def app(environ, start_response):
return [res] return [res]
def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: def parse_deps(
orig_doc: Union[Doc, Span], options: Dict[str, Any] = {}
) -> Dict[str, Any]:
"""Generate dependency parse in {'words': [], 'arcs': []} format. """Generate dependency parse in {'words': [], 'arcs': []} format.
orig_doc (Doc): Document to parse. orig_doc (Union[Doc, Span]): Document to parse.
options (Dict[str, Any]): Dependency parse specific visualisation options. options (Dict[str, Any]): Dependency parse specific visualisation options.
RETURNS (dict): Generated dependency parse keyed by words and arcs. RETURNS (dict): Generated dependency parse keyed by words and arcs.
""" """
if isinstance(orig_doc, Span):
orig_doc = orig_doc.as_doc()
doc = Doc(orig_doc.vocab).from_bytes( doc = Doc(orig_doc.vocab).from_bytes(
orig_doc.to_bytes(exclude=["user_data", "user_hooks"]) orig_doc.to_bytes(exclude=["user_data", "user_hooks"])
) )

View File

@ -1,15 +1,29 @@
from typing import Any, Dict, List, Optional, Tuple, Union
import uuid
import itertools import itertools
import uuid
from typing import Any, Dict, List, Optional, Tuple, Union
from ..errors import Errors from ..errors import Errors
from ..util import escape_html, minify_html, registry from ..util import escape_html, minify_html, registry
from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS from .templates import (
from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS TPL_DEP_ARCS,
from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN TPL_DEP_SVG,
from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL TPL_DEP_WORDS,
from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS TPL_DEP_WORDS_LEMMA,
from .templates import TPL_TITLE TPL_ENT,
TPL_ENT_RTL,
TPL_ENTS,
TPL_FIGURE,
TPL_KB_LINK,
TPL_PAGE,
TPL_SPAN,
TPL_SPAN_RTL,
TPL_SPAN_SLICE,
TPL_SPAN_SLICE_RTL,
TPL_SPAN_START,
TPL_SPAN_START_RTL,
TPL_SPANS,
TPL_TITLE,
)
DEFAULT_LANG = "en" DEFAULT_LANG = "en"
DEFAULT_DIR = "ltr" DEFAULT_DIR = "ltr"

View File

@ -1,5 +1,5 @@
from typing import Literal
import warnings import warnings
from typing import Literal
class ErrorsWithCodes(type): class ErrorsWithCodes(type):
@ -82,7 +82,7 @@ class Warnings(metaclass=ErrorsWithCodes):
"ignoring the duplicate entry.") "ignoring the duplicate entry.")
W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be " W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
"incorrect. Modify PhraseMatcher._terminal_hash to fix.") "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " W024 = ("Entity '{entity}' - alias '{alias}' combination already exists in "
"the Knowledge Base.") "the Knowledge Base.")
W026 = ("Unable to set all sentence boundaries from dependency parses. If " W026 = ("Unable to set all sentence boundaries from dependency parses. If "
"you are constructing a parse tree incrementally by setting " "you are constructing a parse tree incrementally by setting "
@ -209,7 +209,11 @@ class Warnings(metaclass=ErrorsWithCodes):
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.") "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.") W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
# v4 warning strings
W400 = ("`use_upper=False` is ignored, the upper layer is always enabled") W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
"lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
"to return `True` in `.supports_prior_probs`.")
class Errors(metaclass=ErrorsWithCodes): class Errors(metaclass=ErrorsWithCodes):
@ -542,6 +546,8 @@ class Errors(metaclass=ErrorsWithCodes):
"during training, make sure to include it in 'annotating components'") "during training, make sure to include it in 'annotating components'")
# New errors added in v3.x # New errors added in v3.x
E850 = ("The PretrainVectors objective currently only supports default or "
"floret vectors, not {mode} vectors.")
E851 = ("The 'textcat' component labels should only have values of 0 or 1, " E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
"but found value of '{val}'.") "but found value of '{val}'.")
E852 = ("The tar file pulled from the remote attempted an unsafe path " E852 = ("The tar file pulled from the remote attempted an unsafe path "
@ -922,7 +928,7 @@ class Errors(metaclass=ErrorsWithCodes):
E1029 = ("Edit tree cannot be applied to form.") E1029 = ("Edit tree cannot be applied to form.")
E1030 = ("Edit tree identifier out of range.") E1030 = ("Edit tree identifier out of range.")
E1031 = ("Could not find gold transition - see logs above.") E1031 = ("Could not find gold transition - see logs above.")
E1032 = ("`{var}` should not be {forbidden}, but received {value}.") E1032 = ("Span {var} {value} is out of bounds for {obj} with length {length}.")
E1033 = ("Dimension {name} invalid -- only nO, nF, nP") E1033 = ("Dimension {name} invalid -- only nO, nF, nP")
E1034 = ("Node index {i} out of bounds ({length})") E1034 = ("Node index {i} out of bounds ({length})")
E1035 = ("Token index {i} out of bounds ({length})") E1035 = ("Token index {i} out of bounds ({length})")
@ -951,6 +957,14 @@ class Errors(metaclass=ErrorsWithCodes):
"with `displacy.serve(doc, port=port)`") "with `displacy.serve(doc, port=port)`")
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` " E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
"or use `auto_select_port=True` to pick an available port automatically.") "or use `auto_select_port=True` to pick an available port automatically.")
E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
E1052 = ("Unable to copy spans: the character offsets for the span at "
"index {i} in the span group do not align with the tokenization "
"in the target doc.")
E1053 = ("Both 'min_length' and 'max_length' should be larger than 0, but found"
" 'min_length': {min_length}, 'max_length': {max_length}")
E1054 = ("The text, including whitespace, must match between reference and "
"predicted docs when training {component}.")
# v4 error strings # v4 error strings
E4000 = ("Expected a Doc as input, but got: '{type}'") E4000 = ("Expected a Doc as input, but got: '{type}'")
@ -961,6 +975,12 @@ class Errors(metaclass=ErrorsWithCodes):
"reference and predicted docs.") "reference and predicted docs.")
E4004 = ("Backprop is not supported when is_train is not set.") E4004 = ("Backprop is not supported when is_train is not set.")
E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.") E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
E4007 = ("Span {var} {value} must be {op} Span {existing_var} "
"{existing_value}.")
E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.")
RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"} RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}

View File

@ -1,4 +1,5 @@
import warnings import warnings
from .errors import Warnings from .errors import Warnings

View File

@ -1,3 +1,5 @@
from .candidate import Candidate, InMemoryCandidate
from .kb import KnowledgeBase from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB from .kb_in_memory import InMemoryLookupKB
from .candidate import Candidate, get_candidates, get_candidates_batch
__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]

View File

@ -1,12 +1,17 @@
from .kb cimport KnowledgeBase
from libcpp.vector cimport vector from libcpp.vector cimport vector
from ..typedefs cimport hash_t
# Object used by the Entity Linker that summarizes one entity-alias candidate combination. from ..typedefs cimport hash_t
from .kb_in_memory cimport InMemoryLookupKB
cdef class Candidate: cdef class Candidate:
cdef readonly KnowledgeBase kb pass
cdef hash_t entity_hash
cdef float entity_freq
cdef vector[float] entity_vector cdef class InMemoryCandidate(Candidate):
cdef hash_t alias_hash cdef readonly hash_t _entity_hash
cdef float prior_prob cdef readonly hash_t _alias_hash
cdef vector[float] _entity_vector
cdef float _prior_prob
cdef readonly InMemoryLookupKB _kb
cdef float _entity_freq

View File

@ -1,74 +1,98 @@
# cython: infer_types=True, profile=True # cython: infer_types=True, profile=True
from typing import Iterable from .kb_in_memory cimport InMemoryLookupKB
from .kb cimport KnowledgeBase
from ..tokens import Span from ..errors import Errors
cdef class Candidate: cdef class Candidate:
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved """A `Candidate` object refers to a textual mention that may or may not be resolved
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking to a specific entity from a Knowledge Base. This will be used as input for the entity linking
algorithm which will disambiguate the various candidates to the correct one. algorithm which will disambiguate the various candidates to the correct one.
Each candidate (alias, entity) pair is assigned a certain prior probability. Each candidate, which represents a possible link between one textual mention and one entity in the knowledge base,
is assigned a certain prior probability.
DOCS: https://spacy.io/api/kb/#candidate-init DOCS: https://spacy.io/api/kb/#candidate-init
""" """
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob): def __init__(self):
self.kb = kb # Make sure abstract Candidate is not instantiated.
self.entity_hash = entity_hash if self.__class__ == Candidate:
self.entity_freq = entity_freq raise TypeError(
self.entity_vector = entity_vector Errors.E1046.format(cls_name=self.__class__.__name__)
self.alias_hash = alias_hash )
self.prior_prob = prior_prob
@property @property
def entity(self) -> int: def entity_id(self) -> int:
"""RETURNS (uint64): hash of the entity's KB ID/name""" """RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID,
return self.entity_hash otherwise the hash of the entity ID string)."""
raise NotImplementedError
@property @property
def entity_(self) -> str: def entity_id_(self) -> str:
"""RETURNS (str): ID/name of this entity in the KB""" """RETURNS (str): String representation of entity ID."""
return self.kb.vocab.strings[self.entity_hash] raise NotImplementedError
@property @property
def alias(self) -> int: def entity_vector(self) -> vector[float]:
"""RETURNS (uint64): hash of the alias""" """RETURNS (vector[float]): Entity vector."""
return self.alias_hash raise NotImplementedError
cdef class InMemoryCandidate(Candidate):
"""Candidate for InMemoryLookupKB."""
def __init__(
self,
kb: InMemoryLookupKB,
entity_hash: int,
alias_hash: int,
entity_vector: vector[float],
prior_prob: float,
entity_freq: float
):
"""
kb (InMemoryLookupKB]): InMemoryLookupKB instance.
entity_id (int): Entity ID as hash that can be looked up with InMemoryKB.vocab.strings.__getitem__().
entity_freq (int): Entity frequency in KB corpus.
entity_vector (List[float]): Entity embedding.
alias_hash (int): Alias hash.
prior_prob (float): Prior probability of entity for this alias. I. e. the probability that, independent of
the context, this alias - which matches one of this entity's aliases - resolves to one this entity.
"""
super().__init__()
self._entity_hash = entity_hash
self._entity_vector = entity_vector
self._prior_prob = prior_prob
self._kb = kb
self._alias_hash = alias_hash
self._entity_freq = entity_freq
@property @property
def alias_(self) -> str: def entity_id(self) -> int:
"""RETURNS (str): ID of the original alias""" return self._entity_hash
return self.kb.vocab.strings[self.alias_hash]
@property @property
def entity_freq(self) -> float: def entity_vector(self) -> vector[float]:
return self.entity_freq return self._entity_vector
@property
def entity_vector(self) -> Iterable[float]:
return self.entity_vector
@property @property
def prior_prob(self) -> float: def prior_prob(self) -> float:
return self.prior_prob """RETURNS (float): Prior probability that this alias, which matches one of this entity's synonyms, resolves to
this entity."""
return self._prior_prob
@property
def alias(self) -> str:
"""RETURNS (str): Alias."""
return self._kb.vocab.strings[self._alias_hash]
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]: @property
""" def entity_id_(self) -> str:
Return candidate entities for a given mention and fetching appropriate entries from the index. return self._kb.vocab.strings[self._entity_hash]
kb (KnowledgeBase): Knowledge base to query.
mention (Span): Entity mention for which to identify candidates.
RETURNS (Iterable[Candidate]): Identified candidates.
"""
return kb.get_candidates(mention)
@property
def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]: def entity_freq(self) -> float:
""" """RETURNS (float): Entity frequency in KB corpus."""
Return candidate entities for the given mentions and fetching appropriate entries from the index. return self._entity_freq
kb (KnowledgeBase): Knowledge base to query.
mention (Iterable[Span]): Entity mentions for which to identify candidates.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
"""
return kb.get_candidates_batch(mentions)

View File

@ -2,8 +2,10 @@
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from libc.stdint cimport int64_t from libc.stdint cimport int64_t
from ..vocab cimport Vocab from ..vocab cimport Vocab
cdef class KnowledgeBase: cdef class KnowledgeBase:
cdef Pool mem cdef Pool mem
cdef readonly Vocab vocab cdef readonly Vocab vocab

View File

@ -2,12 +2,13 @@
from pathlib import Path from pathlib import Path
from typing import Iterable, Tuple, Union from typing import Iterable, Tuple, Union
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from .candidate import Candidate
from ..tokens import Span
from ..util import SimpleFrozenList
from ..errors import Errors from ..errors import Errors
from ..tokens import Span, SpanGroup
from ..util import SimpleFrozenList
from .candidate import Candidate
cdef class KnowledgeBase: cdef class KnowledgeBase:
@ -30,21 +31,23 @@ cdef class KnowledgeBase:
self.entity_vector_length = entity_vector_length self.entity_vector_length = entity_vector_length
self.mem = Pool() self.mem = Pool()
def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]: def get_candidates_batch(self, mentions: SpanGroup) -> Iterable[Iterable[Candidate]]:
""" """
Return candidate entities for specified texts. Each candidate defines the entity, the original alias, Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the
and the prior probability of that alias resolving to that entity. entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
If no candidate is found for a given text, an empty list is returned. probability of the specified mention text resolving to that entity - might be included.
mentions (Iterable[Span]): Mentions for which to get candidates. If no candidates are found for a given mention, an empty list is returned.
mentions (SpanGroup): Mentions for which to get candidates.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
""" """
return [self.get_candidates(span) for span in mentions] return [self.get_candidates(span) for span in mentions]
def get_candidates(self, mention: Span) -> Iterable[Candidate]: def get_candidates(self, mention: Span) -> Iterable[Candidate]:
""" """
Return candidate entities for specified text. Each candidate defines the entity, the original alias, Return candidate entities for a specific mention. Each candidate defines at least the entity and the
and the prior probability of that alias resolving to that entity. entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
If the no candidate is found for a given text, an empty list is returned. probability of the specified mention text resolving to that entity - might be included.
If no candidate is found for the given mention, an empty list is returned.
mention (Span): Mention for which to get candidates. mention (Span): Mention for which to get candidates.
RETURNS (Iterable[Candidate]): Identified candidates. RETURNS (Iterable[Candidate]): Identified candidates.
""" """
@ -106,3 +109,10 @@ cdef class KnowledgeBase:
raise NotImplementedError( raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__) Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
) )
@property
def supports_prior_probs(self) -> bool:
"""RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions."""
raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="supports_prior_probs", name=self.__name__)
)

View File

@ -1,11 +1,11 @@
"""Knowledge-base for entity or concept linking.""" """Knowledge-base for entity or concept linking."""
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE from libc.stdio cimport FILE
from libcpp.vector cimport vector
from preshed.maps cimport PreshMap
from ..structs cimport AliasC, KBEntryC
from ..typedefs cimport hash_t from ..typedefs cimport hash_t
from ..structs cimport KBEntryC, AliasC
from .kb cimport KnowledgeBase from .kb cimport KnowledgeBase
ctypedef vector[KBEntryC] entry_vec ctypedef vector[KBEntryC] entry_vec

View File

@ -1,24 +1,29 @@
# cython: infer_types=True, profile=True # cython: infer_types=True, profile=True
from typing import Iterable, Callable, Dict, Any, Union from typing import Any, Callable, Dict, Iterable, Union
import srsly import srsly
from preshed.maps cimport PreshMap
from cpython.exc cimport PyErr_SetFromErrno
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
from libc.stdint cimport int32_t, int64_t
from libcpp.vector cimport vector
from pathlib import Path from cpython.exc cimport PyErr_SetFromErrno
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport fclose, feof, fopen, fread, fseek, fwrite
from libcpp.vector cimport vector
from preshed.maps cimport PreshMap
import warnings import warnings
from pathlib import Path
from ..tokens import Span from ..tokens import Span
from ..typedefs cimport hash_t from ..typedefs cimport hash_t
from ..errors import Errors, Warnings
from .. import util from .. import util
from ..errors import Errors, Warnings
from ..util import SimpleFrozenList, ensure_path from ..util import SimpleFrozenList, ensure_path
from ..vocab cimport Vocab from ..vocab cimport Vocab
from .kb cimport KnowledgeBase from .kb cimport KnowledgeBase
from .candidate import Candidate as Candidate
from .candidate import InMemoryCandidate
cdef class InMemoryLookupKB(KnowledgeBase): cdef class InMemoryLookupKB(KnowledgeBase):
@ -226,10 +231,10 @@ cdef class InMemoryLookupKB(KnowledgeBase):
alias_entry.probs = probs alias_entry.probs = probs
self._aliases_table[alias_index] = alias_entry self._aliases_table[alias_index] = alias_entry
def get_candidates(self, mention: Span) -> Iterable[Candidate]: def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]:
return self.get_alias_candidates(mention.text) # type: ignore return self._get_alias_candidates(mention.text) # type: ignore
def get_alias_candidates(self, str alias) -> Iterable[Candidate]: def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
""" """
Return candidate entities for an alias. Each candidate defines the entity, the original alias, Return candidate entities for an alias. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity. and the prior probability of that alias resolving to that entity.
@ -241,14 +246,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
alias_index = <int64_t>self._alias_index.get(alias_hash) alias_index = <int64_t>self._alias_index.get(alias_hash)
alias_entry = self._aliases_table[alias_index] alias_entry = self._aliases_table[alias_index]
return [Candidate(kb=self, return [
entity_hash=self._entries[entry_index].entity_hash, InMemoryCandidate(
entity_freq=self._entries[entry_index].freq, kb=self,
entity_vector=self._vectors_table[self._entries[entry_index].vector_index], entity_hash=self._entries[entry_index].entity_hash,
alias_hash=alias_hash, alias_hash=alias_hash,
prior_prob=prior_prob) entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) prior_prob=prior_prob,
if entry_index != 0] entity_freq=self._entries[entry_index].freq
)
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
if entry_index != 0
]
def get_vector(self, str entity): def get_vector(self, str entity):
cdef hash_t entity_hash = self.vocab.strings[entity] cdef hash_t entity_hash = self.vocab.strings[entity]
@ -279,6 +288,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
return 0.0 return 0.0
def supports_prior_probs(self) -> bool:
return True
def to_bytes(self, **kwargs): def to_bytes(self, **kwargs):
"""Serialize the current state to a binary string. """Serialize the current state to a binary string.
""" """

View File

@ -1,5 +1,5 @@
from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults
class AfrikaansDefaults(BaseDefaults): class AfrikaansDefaults(BaseDefaults):

View File

@ -1,12 +1,11 @@
from .stop_words import STOP_WORDS from ...attrs import LANG
from ...language import BaseDefaults, Language
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language, BaseDefaults
from ...attrs import LANG
from ...util import update_exc
class AmharicDefaults(BaseDefaults): class AmharicDefaults(BaseDefaults):

View File

@ -1,5 +1,11 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY from ..char_classes import (
from ..char_classes import UNITS, ALPHA_UPPER ALPHA_UPPER,
CURRENCY,
LIST_ELLIPSES,
LIST_PUNCT,
LIST_QUOTES,
UNITS,
)
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split() _list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()

View File

@ -1,5 +1,4 @@
from ...symbols import ORTH, NORM from ...symbols import NORM, ORTH
_exc = {} _exc = {}

View File

@ -1,8 +1,8 @@
from .stop_words import STOP_WORDS from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language, BaseDefaults
class ArabicDefaults(BaseDefaults): class ArabicDefaults(BaseDefaults):

View File

@ -1,5 +1,11 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY from ..char_classes import (
from ..char_classes import UNITS, ALPHA_UPPER ALPHA_UPPER,
CURRENCY,
LIST_ELLIPSES,
LIST_PUNCT,
LIST_QUOTES,
UNITS,
)
_suffixes = ( _suffixes = (
LIST_PUNCT LIST_PUNCT

View File

@ -1,7 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import NORM, ORTH
from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {} _exc = {}

View File

@ -1,6 +1,6 @@
from .stop_words import STOP_WORDS from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language, BaseDefaults from .stop_words import STOP_WORDS
class AzerbaijaniDefaults(BaseDefaults): class AzerbaijaniDefaults(BaseDefaults):

View File

@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM from ...attrs import LIKE_NUM
# Eleven, twelve etc. are written separate: on bir, on iki # Eleven, twelve etc. are written separate: on bir, on iki
_num_words = [ _num_words = [

View File

@ -1,12 +1,14 @@
from ...attrs import LANG
from ...language import BaseDefaults, Language
from ...util import update_exc
from ..punctuation import (
COMBINING_DIACRITICS_TOKENIZER_INFIXES,
COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
)
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
from ...language import Language, BaseDefaults
from ...attrs import LANG
from ...util import update_exc
class BulgarianDefaults(BaseDefaults): class BulgarianDefaults(BaseDefaults):

View File

@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM from ...attrs import LIKE_NUM
_num_words = [ _num_words = [
"нула", "нула",
"едно", "едно",

View File

@ -4,8 +4,7 @@ References:
(countries, occupations, fields of studies and more). (countries, occupations, fields of studies and more).
""" """
from ...symbols import ORTH, NORM from ...symbols import NORM, ORTH
_exc = {} _exc = {}

View File

@ -1,10 +1,12 @@
from typing import Optional, Callable from typing import Callable, Optional
from thinc.api import Model from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults
from ...pipeline import Lemmatizer from ...pipeline import Lemmatizer
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class BengaliDefaults(BaseDefaults): class BengaliDefaults(BaseDefaults):

View File

@ -1,6 +1,14 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS from ..char_classes import (
from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS ALPHA,
ALPHA_LOWER,
CONCAT_QUOTES,
HYPHENS,
LIST_ELLIPSES,
LIST_ICONS,
LIST_PUNCT,
LIST_QUOTES,
UNITS,
)
_currency = r"\$¢£€¥฿৳" _currency = r"\$¢£€¥฿৳"
_quotes = CONCAT_QUOTES.replace("'", "") _quotes = CONCAT_QUOTES.replace("'", "")

View File

@ -1,7 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import NORM, ORTH
from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {} _exc = {}

View File

@ -1,14 +1,14 @@
from typing import Optional, Callable from typing import Callable, Optional
from thinc.api import Model from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from ...language import BaseDefaults, Language
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language, BaseDefaults
from .lemmatizer import CatalanLemmatizer from .lemmatizer import CatalanLemmatizer
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class CatalanDefaults(BaseDefaults): class CatalanDefaults(BaseDefaults):

View File

@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM from ...attrs import LIKE_NUM
_num_words = [ _num_words = [
"zero", "zero",
"un", "un",

View File

@ -1,9 +1,18 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS from ..char_classes import (
from ..char_classes import LIST_CURRENCY ALPHA,
from ..char_classes import CURRENCY ALPHA_LOWER,
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT ALPHA_UPPER,
from ..char_classes import merge_chars, _units CONCAT_QUOTES,
CURRENCY,
LIST_CURRENCY,
LIST_ELLIPSES,
LIST_ICONS,
LIST_PUNCT,
LIST_QUOTES,
PUNCT,
_units,
merge_chars,
)
ELISION = " ' ".strip().replace(" ", "").replace("\n", "") ELISION = " ' ".strip().replace(" ", "").replace("\n", "")

View File

@ -1,7 +1,8 @@
from typing import Union, Iterator, Tuple from typing import Iterator, Tuple, Union
from ...tokens import Doc, Span
from ...symbols import NOUN, PROPN
from ...errors import Errors from ...errors import Errors
from ...symbols import NOUN, PROPN
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:

View File

@ -1,7 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import NORM, ORTH
from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {} _exc = {}

View File

@ -1,6 +1,6 @@
from .stop_words import STOP_WORDS from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language, BaseDefaults from .stop_words import STOP_WORDS
class CzechDefaults(BaseDefaults): class CzechDefaults(BaseDefaults):

View File

@ -1,9 +1,9 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language, BaseDefaults from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class DanishDefaults(BaseDefaults): class DanishDefaults(BaseDefaults):

View File

@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM from ...attrs import LIKE_NUM
# Source http://fjern-uv.dk/tal.php # Source http://fjern-uv.dk/tal.php
_num_words = """nul _num_words = """nul
en et to tre fire fem seks syv otte ni ti en et to tre fire fem seks syv otte ni ti

View File

@ -1,8 +1,13 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS from ..char_classes import (
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER ALPHA,
ALPHA_LOWER,
ALPHA_UPPER,
CONCAT_QUOTES,
LIST_ELLIPSES,
LIST_ICONS,
)
from ..punctuation import TOKENIZER_SUFFIXES from ..punctuation import TOKENIZER_SUFFIXES
_quotes = CONCAT_QUOTES.replace("'", "") _quotes = CONCAT_QUOTES.replace("'", "")
_infixes = ( _infixes = (

View File

@ -1,7 +1,8 @@
from typing import Union, Iterator, Tuple from typing import Iterator, Tuple, Union
from ...tokens import Doc, Span
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
from ...errors import Errors from ...errors import Errors
from ...symbols import AUX, NOUN, PRON, PROPN, VERB
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:

View File

@ -2,10 +2,9 @@
Tokenizer Exceptions. Tokenizer Exceptions.
Source: https://forkortelse.dk/ and various others. Source: https://forkortelse.dk/ and various others.
""" """
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import NORM, ORTH
from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {} _exc = {}

View File

@ -1,8 +1,8 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from ...language import BaseDefaults, Language
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language, BaseDefaults from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class GermanDefaults(BaseDefaults): class GermanDefaults(BaseDefaults):

View File

@ -1,9 +1,18 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES from ..char_classes import (
from ..char_classes import CURRENCY, UNITS, PUNCT ALPHA,
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER ALPHA_LOWER,
ALPHA_UPPER,
CONCAT_QUOTES,
CURRENCY,
LIST_ELLIPSES,
LIST_ICONS,
LIST_PUNCT,
LIST_QUOTES,
PUNCT,
UNITS,
)
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
_prefixes = ["``"] + BASE_TOKENIZER_PREFIXES _prefixes = ["``"] + BASE_TOKENIZER_PREFIXES
_suffixes = ( _suffixes = (

View File

@ -1,7 +1,7 @@
from typing import Union, Iterator, Tuple from typing import Iterator, Tuple, Union
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span from ...tokens import Doc, Span

View File

@ -1,7 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import NORM, ORTH
from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = { _exc = {
"auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}], "auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}],

View File

@ -1,6 +1,6 @@
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults
class LowerSorbianDefaults(BaseDefaults): class LowerSorbianDefaults(BaseDefaults):

View File

@ -1,13 +1,14 @@
from typing import Optional, Callable from typing import Callable, Optional
from thinc.api import Model from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .lemmatizer import GreekLemmatizer from .lemmatizer import GreekLemmatizer
from ...language import Language, BaseDefaults from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class GreekDefaults(BaseDefaults): class GreekDefaults(BaseDefaults):

View File

@ -1,5 +1,6 @@
def get_pos_from_wiktionary(): def get_pos_from_wiktionary():
import re import re
from gensim.corpora.wikicorpus import extract_pages from gensim.corpora.wikicorpus import extract_pages
regex = re.compile(r"==={{(\w+)\|el}}===") regex = re.compile(r"==={{(\w+)\|el}}===")

View File

@ -1,6 +1,16 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY from ..char_classes import (
from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS ALPHA,
from ..char_classes import CONCAT_QUOTES, CURRENCY ALPHA_LOWER,
ALPHA_UPPER,
CONCAT_QUOTES,
CURRENCY,
HYPHENS,
LIST_CURRENCY,
LIST_ELLIPSES,
LIST_ICONS,
LIST_PUNCT,
LIST_QUOTES,
)
_units = ( _units = (
"km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft " "km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "

View File

@ -1,7 +1,7 @@
from typing import Union, Iterator, Tuple from typing import Iterator, Tuple, Union
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span from ...tokens import Doc, Span

View File

@ -1,6 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import NORM, ORTH
from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {} _exc = {}

View File

@ -1,13 +1,14 @@
from typing import Optional, Callable from typing import Callable, Optional
from thinc.api import Model from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES
from .lemmatizer import EnglishLemmatizer from .lemmatizer import EnglishLemmatizer
from ...language import Language, BaseDefaults from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class EnglishDefaults(BaseDefaults): class EnglishDefaults(BaseDefaults):

View File

@ -1,5 +1,12 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS from ..char_classes import (
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA ALPHA,
ALPHA_LOWER,
ALPHA_UPPER,
CONCAT_QUOTES,
HYPHENS,
LIST_ELLIPSES,
LIST_ICONS,
)
_infixes = ( _infixes = (
LIST_ELLIPSES LIST_ELLIPSES

View File

@ -1,7 +1,7 @@
from typing import Union, Iterator, Tuple from typing import Iterator, Tuple, Union
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span from ...tokens import Doc, Span

View File

@ -1,8 +1,8 @@
from typing import Dict, List from typing import Dict, List
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
from ...symbols import NORM, ORTH
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc: Dict[str, List[Dict]] = {} _exc: Dict[str, List[Dict]] = {}
_exclude = [ _exclude = [

View File

@ -1,12 +1,14 @@
from typing import Optional, Callable from typing import Callable, Optional
from thinc.api import Model from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .lemmatizer import SpanishLemmatizer from .lemmatizer import SpanishLemmatizer
from .syntax_iterators import SYNTAX_ITERATORS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ...language import Language, BaseDefaults from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class SpanishDefaults(BaseDefaults): class SpanishDefaults(BaseDefaults):

Some files were not shown because too many files have changed in this diff Show More