mirror of https://github.com/explosion/spaCy.git
synced 2025-07-10 08:12:24 +03:00

Merge branch 'v4' into feature/multiple-code-files
This commit is contained in commit 28c8a577fc.
.github/azure-steps.yml (vendored, 129 lines deleted)

@@ -1,129 +0,0 @@
-parameters:
-  python_version: ''
-  architecture: 'x64'
-  num_build_jobs: 2
-
-steps:
-  - task: UsePythonVersion@0
-    inputs:
-      versionSpec: ${{ parameters.python_version }}
-      architecture: ${{ parameters.architecture }}
-      allowUnstable: true
-
-  - bash: |
-      echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
-    displayName: 'Set variables'
-
-  - script: |
-      python -m pip install -U build pip setuptools
-      python -m pip install -U -r requirements.txt
-    displayName: "Install dependencies"
-
-  - script: |
-      python -m build --sdist
-    displayName: "Build sdist"
-
-  - script: |
-      python -m mypy spacy
-    displayName: 'Run mypy'
-    condition: ne(variables['python_version'], '3.6')
-
-  - task: DeleteFiles@1
-    inputs:
-      contents: "spacy"
-    displayName: "Delete source directory"
-
-  - task: DeleteFiles@1
-    inputs:
-      contents: "*.egg-info"
-    displayName: "Delete egg-info directory"
-
-  - script: |
-      python -m pip freeze > installed.txt
-      python -m pip uninstall -y -r installed.txt
-    displayName: "Uninstall all packages"
-
-  - bash: |
-      SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-      SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
-    displayName: "Install from sdist"
-
-  - script: |
-      python -W error -c "import spacy"
-    displayName: "Test import"
-
-  # - script: |
-  #     python -m spacy download ca_core_news_sm
-  #     python -m spacy download ca_core_news_md
-  #     python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-  #   displayName: 'Test download CLI'
-  #   condition: eq(variables['python_version'], '3.8')
-  #
-  # - script: |
-  #     python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-  #   displayName: 'Test no warnings on load (#11713)'
-  #   condition: eq(variables['python_version'], '3.8')
-  #
-  # - script: |
-  #     python -m spacy download ca_core_news_sm 2>&1 | grep -q skipping
-  #   displayName: 'Test skip re-download (#12188)'
-  #   condition: eq(variables['python_version'], '3.8')
-
-  # - script: |
-  #     python -W error -m spacy info ca_core_news_sm | grep -q download_url
-  #   displayName: 'Test download_url in info CLI'
-  #   condition: eq(variables['python_version'] '3.8')
-
-  - script: |
-      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
-    displayName: 'Test convert CLI'
-    condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      python -m spacy init config -p ner -l ca ner.cfg
-      python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
-    displayName: 'Test debug config CLI'
-    condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      # will have errors due to sparse data, check for summary in output
-      python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
-    displayName: 'Test debug data CLI'
-    condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
-    displayName: 'Test train CLI'
-    condition: eq(variables['python_version'], '3.8')
-
-  # - script: |
-  #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-  #     PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-  #   displayName: 'Test assemble CLI'
-  #   condition: eq(variables['python_version'], '3.8')
-  #
-  # - script: |
-  #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-  #     python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-  #   displayName: 'Test assemble CLI vectors warning'
-  #   condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      python -m pip install -U -r requirements.txt
-    displayName: "Install test requirements"
-
-  - script: |
-      python -m pytest --pyargs spacy -W error
-    displayName: "Run CPU tests"
-
-  - script: |
-      python -m pip install 'spacy[apple]'
-      python -m pytest --pyargs spacy
-    displayName: "Run CPU tests with thinc-apple-ops"
-    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
-
-  - script: |
-      python .github/validate_universe_json.py website/meta/universe.json
-    displayName: 'Test website/meta/universe.json'
-    condition: eq(variables['python_version'], '3.8')
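
[Note] The "Uninstall all packages" / "Install from sdist" pair above is a
packaging smoke test: wipe the environment, then install spaCy only from the
archive that `python -m build --sdist` produced. A minimal Python sketch of
the sdist lookup (the pipeline takes os.listdir('./dist')[-1]; the extension
filter and mtime sort here are hardening assumptions, not what the step does):

    import os

    dist_dir = "./dist"
    # Keep only sdists, then take the most recently written one.
    candidates = [f for f in os.listdir(dist_dir) if f.endswith(".tar.gz")]
    sdist = max(candidates, key=lambda f: os.path.getmtime(os.path.join(dist_dir, f)))
    print(sdist)  # e.g. spacy-4.0.0.dev1.tar.gz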
.github/workflows/autoblack.yml (vendored, 45 lines deleted)

@@ -1,45 +0,0 @@
-# GitHub Action that uses Black to reformat all Python code and submits a PR
-# in regular intervals. Inspired by: https://github.com/cclauss/autoblack
-
-name: autoblack
-on:
-  workflow_dispatch: # allow manual trigger
-  schedule:
-    - cron: '0 8 * * 5' # every Friday at 8am UTC
-
-jobs:
-  autoblack:
-    if: github.repository_owner == 'explosion'
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-        with:
-          ref: ${{ github.head_ref }}
-      - uses: actions/setup-python@v4
-      - run: pip install black -c requirements.txt
-      - name: Auto-format code if needed
-        run: black spacy
-      # We can't run black --check here because that returns a non-zero excit
-      # code and makes GitHub think the action failed
-      - name: Check for modified files
-        id: git-check
-        run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
-
-      - name: Create Pull Request
-        if: steps.git-check.outputs.modified == 'true'
-        uses: peter-evans/create-pull-request@v4
-        with:
-          title: Auto-format code with black
-          labels: meta
-          commit-message: Auto-format code with black
-          committer: GitHub <noreply@github.com>
-          author: explosion-bot <explosion-bot@users.noreply.github.com>
-          body: _This PR is auto-generated._
-          branch: autoblack
-          delete-branch: true
-          draft: false
-      - name: Check outputs
-        if: steps.git-check.outputs.modified == 'true'
-        run: |
-          echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
-          echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"
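
[Note] The deleted workflow's git-check step encodes "did black change
anything?" in the exit status of `git diff-index --quiet HEAD --` and
publishes it as a step output. A rough Python equivalent of that check
(illustrative only):

    import subprocess

    # Exit code 0 means the working tree matches HEAD, i.e. black changed nothing.
    clean = subprocess.run(["git", "diff-index", "--quiet", "HEAD", "--"]).returncode == 0
    print(f"modified={'false' if clean else 'true'}")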
.github/workflows/explosionbot.yml (vendored, 1 line added)

@@ -8,6 +8,7 @@ on:

 jobs:
   explosion-bot:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
       - name: Dump GitHub context
.github/workflows/issue-manager.yml (vendored, 1 line added)

@@ -13,6 +13,7 @@ on:

 jobs:
   issue-manager:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
       - uses: tiangolo/issue-manager@0.4.0
.github/workflows/lock.yml (vendored, 1 line added)

@@ -13,6 +13,7 @@ concurrency:

 jobs:
   action:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
       - uses: dessant/lock-threads@v4
.github/workflows/spacy_universe_alert.yml (vendored, 1 line added)

@@ -7,6 +7,7 @@ on:

 jobs:
   build:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest

     steps:
.github/workflows/tests.yml (vendored, new file, 173 lines)

@@ -0,0 +1,173 @@
+name: tests
+
+on:
+  push:
+    branches-ignore:
+      - "spacy.io"
+      - "nightly.spacy.io"
+      - "v2.spacy.io"
+    paths-ignore:
+      - "*.md"
+      - "*.mdx"
+      - "website/**"
+      - ".github/workflows/**"
+  pull_request:
+    types: [opened, synchronize, reopened, edited]
+    paths-ignore:
+      - "*.md"
+      - "*.mdx"
+      - "website/**"
+
+jobs:
+  validate:
+    name: Validate
+    if: github.repository_owner == 'explosion'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v3
+
+      - name: Configure Python version
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+          architecture: x64
+
+      - name: black
+        run: |
+          python -m pip install black -c requirements.txt
+          python -m black spacy --check
+      - name: isort
+        run: |
+          python -m pip install isort -c requirements.txt
+          python -m isort spacy --check
+      - name: flake8
+        run: |
+          python -m pip install flake8==5.0.4
+          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
+  tests:
+    name: Test
+    needs: Validate
+    strategy:
+      fail-fast: true
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-latest]
+        python_version: ["3.11"]
+        include:
+          - os: macos-latest
+            python_version: "3.8"
+          - os: ubuntu-20.04
+            python_version: "3.9"
+          - os: windows-latest
+            python_version: "3.10"
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v3
+
+      - name: Configure Python version
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python_version }}
+          architecture: x64
+
+      - name: Install dependencies
+        run: |
+          python -m pip install -U build pip setuptools
+          python -m pip install -U -r requirements.txt
+
+      - name: Build sdist
+        run: |
+          python -m build --sdist
+
+      - name: Run mypy
+        run: |
+          python -m mypy spacy
+
+      - name: Delete source directory and .egg-info
+        run: |
+          rm -rf spacy *.egg-info
+        shell: bash
+
+      - name: Uninstall all packages
+        run: |
+          python -m pip freeze
+          python -m pip freeze --exclude pywin32 > installed.txt
+          python -m pip uninstall -y -r installed.txt
+
+      - name: Install from sdist
+        run: |
+          SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+          SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
+        shell: bash
+
+      - name: Test import
+        run: python -W error -c "import spacy"
+
+      # - name: "Test download CLI"
+      #   run: |
+      #     python -m spacy download ca_core_news_sm
+      #     python -m spacy download ca_core_news_md
+      #     python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+      #   if: matrix.python_version == '3.9'
+      #
+      # - name: "Test download_url in info CLI"
+      #   run: |
+      #     python -W error -m spacy info ca_core_news_sm | grep -q download_url
+      #   if: matrix.python_version == '3.9'
+      #
+      # - name: "Test no warnings on load (#11713)"
+      #   run: |
+      #     python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+      #   if: matrix.python_version == '3.9'
+
+      - name: "Test convert CLI"
+        run: |
+          python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
+        if: matrix.python_version == '3.9'
+
+      - name: "Test debug config CLI"
+        run: |
+          python -m spacy init config -p ner -l ca ner.cfg
+          python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
+        if: matrix.python_version == '3.9'
+
+      - name: "Test debug data CLI"
+        run: |
+          # will have errors due to sparse data, check for summary in output
+          python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
+        if: matrix.python_version == '3.9'
+
+      - name: "Test train CLI"
+        run: |
+          python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
+        if: matrix.python_version == '3.9'
+
+      # - name: "Test assemble CLI"
+      #   run: |
+      #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      #     PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+      #   if: matrix.python_version == '3.9'
+      #
+      # - name: "Test assemble CLI vectors warning"
+      #   run: |
+      #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+      #     python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+      #   if: matrix.python_version == '3.9'
+
+      - name: "Install test requirements"
+        run: |
+          python -m pip install -U -r requirements.txt
+
+      - name: "Run CPU tests"
+        run: |
+          python -m pytest --pyargs spacy -W error
+        if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.11')"
+
+      - name: "Run CPU tests with thinc-apple-ops"
+        run: |
+          python -m pip install 'spacy[apple]'
+          python -m pytest --pyargs spacy
+        if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11'
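
[Note] The matrix above expands to the cross product of `os` and
`python_version`, plus the three `include` combinations — six jobs in total.
A small Python sketch of the expansion:

    from itertools import product

    base = list(product(["ubuntu-latest", "windows-latest", "macos-latest"], ["3.11"]))
    extra = [("macos-latest", "3.8"), ("ubuntu-20.04", "3.9"), ("windows-latest", "3.10")]
    for os_name, py in base + extra:
        print(f"{os_name} / Python {py}")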
.github/workflows/universe_validation.yml (vendored, new file, 33 lines)

@@ -0,0 +1,33 @@
+name: universe validation
+
+on:
+  push:
+    branches-ignore:
+      - "spacy.io"
+      - "nightly.spacy.io"
+      - "v2.spacy.io"
+    paths:
+      - "website/meta/universe.json"
+  pull_request:
+    types: [opened, synchronize, reopened, edited]
+    paths:
+      - "website/meta/universe.json"
+
+jobs:
+  validate:
+    name: Validate
+    if: github.repository_owner == 'explosion'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v3
+
+      - name: Configure Python version
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+          architecture: x64
+
+      - name: Validate website/meta/universe.json
+        run: |
+          python .github/validate_universe_json.py website/meta/universe.json
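
[Note] In outline, the validation step parses website/meta/universe.json and
fails on malformed entries. A rough sketch under the assumption that the file
keeps its entries under a "resources" key (the real
.github/validate_universe_json.py may check considerably more):

    import json
    import sys

    with open(sys.argv[1], encoding="utf-8") as f:
        universe = json.load(f)
    for entry in universe["resources"]:
        assert "id" in entry, f"resource without id: {entry}"
    print(f"OK: {len(universe['resources'])} resources")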
README.md (32 changed lines)

@@ -16,6 +16,9 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
 open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).

+💥 **We'd love to hear more about your experience with spaCy!**
+[Fill out our survey here.](https://form.typeform.com/to/aMel9q9f)
+
 💫 **Version 3.5 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)

@@ -32,19 +35,20 @@ open-source software, released under the [MIT license](https://github.com/explos
 ## 📖 Documentation

 | Documentation | |
-| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ----------------------------- | ---------------------------------------------------------------------- |
 | ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
 | 📚 **[Usage Guides]** | How to use spaCy and its features. |
 | 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
 | 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
 | 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
 | 📦 **[Models]** | Download trained pipelines for spaCy. |
 | 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
+| ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
 | 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
 | 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
 | 🛠 **[Changelog]** | Changes and version history. |
 | 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
 | <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
 | <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |

@@ -54,13 +58,13 @@ open-source software, released under the [MIT license](https://github.com/explos
 [api reference]: https://spacy.io/api/
 [models]: https://spacy.io/models
 [universe]: https://spacy.io/universe
+[spaCy VS Code Extension]: https://github.com/explosion/spacy-vscode
 [videos]: https://www.youtube.com/c/ExplosionAI
 [online course]: https://course.spacy.io
 [project templates]: https://github.com/explosion/projects
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md


 ## 💬 Where to ask questions

 The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
azure-pipelines.yml (99 lines deleted)

@@ -1,99 +0,0 @@
-trigger:
-  batch: true
-  branches:
-    include:
-      - "*"
-    exclude:
-      - "spacy.io"
-      - "nightly.spacy.io"
-      - "v2.spacy.io"
-  paths:
-    exclude:
-      - "website/*"
-      - "*.md"
-      - "*.mdx"
-      - ".github/workflows/*"
-pr:
-  paths:
-    exclude:
-      - "*.md"
-      - "*.mdx"
-      - "website/docs/*"
-      - "website/src/*"
-      - "website/meta/*.tsx"
-      - "website/meta/*.mjs"
-      - "website/meta/languages.json"
-      - "website/meta/site.json"
-      - "website/meta/sidebars.json"
-      - "website/meta/type-annotations.json"
-      - "website/pages/*"
-      - ".github/workflows/*"
-
-jobs:
-  # Check formatting and linting. Perform basic checks for most important errors
-  # (syntax etc.) Uses the config defined in setup.cfg and overwrites the
-  # selected codes.
-  - job: "Validate"
-    pool:
-      vmImage: "ubuntu-latest"
-    steps:
-      - task: UsePythonVersion@0
-        inputs:
-          versionSpec: "3.8"
-      - script: |
-          pip install black -c requirements.txt
-          python -m black spacy --check
-        displayName: "black"
-      - script: |
-          pip install flake8==5.0.4
-          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
-        displayName: "flake8"
-
-  - job: "Test"
-    dependsOn: "Validate"
-    strategy:
-      matrix:
-        # We're only running one platform per Python version to speed up builds
-        # Python38Linux:
-        #   imageName: "ubuntu-latest"
-        #   python.version: "3.8"
-        # Python38Windows:
-        #   imageName: "windows-latest"
-        #   python.version: "3.8"
-        Python38Mac:
-          imageName: "macos-latest"
-          python.version: "3.8"
-        Python39Linux:
-          imageName: "ubuntu-latest"
-          python.version: "3.9"
-        # Python39Windows:
-        #   imageName: "windows-latest"
-        #   python.version: "3.9"
-        # Python39Mac:
-        #   imageName: "macos-latest"
-        #   python.version: "3.9"
-        # Python310Linux:
-        #   imageName: "ubuntu-latest"
-        #   python.version: "3.10"
-        Python310Windows:
-          imageName: "windows-latest"
-          python.version: "3.10"
-        # Python310Mac:
-        #   imageName: "macos-latest"
-        #   python.version: "3.10"
-        Python311Linux:
-          imageName: 'ubuntu-latest'
-          python.version: '3.11'
-        Python311Windows:
-          imageName: 'windows-latest'
-          python.version: '3.11'
-        Python311Mac:
-          imageName: 'macos-latest'
-          python.version: '3.11'
-      maxParallel: 4
-    pool:
-      vmImage: $(imageName)
-    steps:
-      - template: .github/azure-steps.yml
-        parameters:
-          python_version: '$(python.version)'
build-constraints.txt

@@ -1,6 +1,4 @@
 # build version constraints for use with wheelwright + multibuild
-numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
-numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
 numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
 numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
 numpy==1.19.3; python_version=='3.9'
pyproject.toml

@@ -9,3 +9,6 @@ requires = [
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
+
+[tool.isort]
+profile = "black"
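
[Note] The new [tool.isort] table makes isort use black-compatible settings,
so the two formatters stop fighting over import layout. Schematically, for a
header like the old spacy/cli/apply.py one, `isort --profile black` puts
stdlib imports first and third-party imports second, each block alphabetised
(a contrived before/after, both valid Python):

    # before
    import tqdm
    import srsly
    from itertools import chain

    # after
    from itertools import chain

    import srsly
    import tqdm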
requirements.txt

@@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
-typer>=0.3.0,<0.8.0
+typer>=0.3.0,<0.10.0
 pathy>=0.10.0
 smart-open>=5.2.1,<7.0.0
 # Third party dependencies

@@ -30,10 +30,10 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7"
-types-dataclasses>=0.1.3; python_version < "3.7"
+mypy>=0.990,<1.1.0; platform_machine != "aarch64"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests
 types-setuptools>=57.0.0
 black==22.3.0
+isort>=5.0,<6.0
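
[Note] The mypy pin drops its `python_version >= "3.7"` clause because the
project now requires Python 3.8+ anyway (see python_requires in setup.cfg
below); the aarch64 exclusion stays. Environment markers like this can be
checked with the `packaging` library:

    from packaging.markers import Marker

    marker = Marker('platform_machine != "aarch64"')
    print(marker.evaluate({"platform_machine": "x86_64"}))   # True -> mypy is installed
    print(marker.evaluate({"platform_machine": "aarch64"}))  # False -> skipped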
setup.cfg (46 changed lines)

@@ -30,6 +30,14 @@ project_urls =
 zip_safe = false
 include_package_data = true
 python_requires = >=3.8
+setup_requires =
+    cython>=0.25,<3.0
+    numpy>=1.15.0
+    # We also need our Cython packages here to compile against
+    cymem>=2.0.2,<2.1.0
+    preshed>=3.0.2,<3.1.0
+    murmurhash>=0.28.0,<1.1.0
+    thinc>=9.0.0.dev2,<9.1.0
 install_requires =
     # Our libraries
     spacy-legacy>=4.0.0.dev0,<4.1.0

@@ -42,7 +50,7 @@ install_requires =
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     # Third-party dependencies
-    typer>=0.3.0,<0.8.0
+    typer>=0.3.0,<0.10.0
     pathy>=0.10.0
     smart-open>=5.2.1,<7.0.0
     tqdm>=4.38.0,<5.0.0

@@ -67,41 +75,41 @@ transformers =
 ray =
     spacy_ray>=0.1.0,<1.0.0
 cuda =
-    cupy>=5.0.0b4,<12.0.0
+    cupy>=5.0.0b4,<13.0.0
 cuda80 =
-    cupy-cuda80>=5.0.0b4,<12.0.0
+    cupy-cuda80>=5.0.0b4,<13.0.0
 cuda90 =
-    cupy-cuda90>=5.0.0b4,<12.0.0
+    cupy-cuda90>=5.0.0b4,<13.0.0
 cuda91 =
-    cupy-cuda91>=5.0.0b4,<12.0.0
+    cupy-cuda91>=5.0.0b4,<13.0.0
 cuda92 =
-    cupy-cuda92>=5.0.0b4,<12.0.0
+    cupy-cuda92>=5.0.0b4,<13.0.0
 cuda100 =
-    cupy-cuda100>=5.0.0b4,<12.0.0
+    cupy-cuda100>=5.0.0b4,<13.0.0
 cuda101 =
-    cupy-cuda101>=5.0.0b4,<12.0.0
+    cupy-cuda101>=5.0.0b4,<13.0.0
 cuda102 =
-    cupy-cuda102>=5.0.0b4,<12.0.0
+    cupy-cuda102>=5.0.0b4,<13.0.0
 cuda110 =
-    cupy-cuda110>=5.0.0b4,<12.0.0
+    cupy-cuda110>=5.0.0b4,<13.0.0
 cuda111 =
-    cupy-cuda111>=5.0.0b4,<12.0.0
+    cupy-cuda111>=5.0.0b4,<13.0.0
 cuda112 =
-    cupy-cuda112>=5.0.0b4,<12.0.0
+    cupy-cuda112>=5.0.0b4,<13.0.0
 cuda113 =
-    cupy-cuda113>=5.0.0b4,<12.0.0
+    cupy-cuda113>=5.0.0b4,<13.0.0
 cuda114 =
-    cupy-cuda114>=5.0.0b4,<12.0.0
+    cupy-cuda114>=5.0.0b4,<13.0.0
 cuda115 =
-    cupy-cuda115>=5.0.0b4,<12.0.0
+    cupy-cuda115>=5.0.0b4,<13.0.0
 cuda116 =
-    cupy-cuda116>=5.0.0b4,<12.0.0
+    cupy-cuda116>=5.0.0b4,<13.0.0
 cuda117 =
-    cupy-cuda117>=5.0.0b4,<12.0.0
+    cupy-cuda117>=5.0.0b4,<13.0.0
 cuda11x =
-    cupy-cuda11x>=11.0.0,<12.0.0
+    cupy-cuda11x>=11.0.0,<13.0.0
 cuda-autodetect =
-    cupy-wheel>=11.0.0,<12.0.0
+    cupy-wheel>=11.0.0,<13.0.0
 apple =
     thinc-apple-ops>=0.1.0.dev0,<1.0.0
 # Language tokenizers with external dependencies
spacy/__init__.py

@@ -1,6 +1,6 @@
-from typing import Union, Iterable, Dict, Any
-from pathlib import Path
 import sys
+from pathlib import Path
+from typing import Any, Dict, Iterable, Union

 # set library-specific custom warning handling before doing anything else
 from .errors import setup_default_warnings

@@ -8,20 +8,17 @@ from .errors import setup_default_warnings
 setup_default_warnings()  # noqa: E402

 # These are imported as part of the API
-from thinc.api import prefer_gpu, require_gpu, require_cpu  # noqa: F401
-from thinc.api import Config
-
+from thinc.api import Config, prefer_gpu, require_cpu, require_gpu  # noqa: F401

 from . import pipeline  # noqa: F401
-from .cli.info import info  # noqa: F401
-from .glossary import explain  # noqa: F401
-from .about import __version__  # noqa: F401
-from .util import registry, logger  # noqa: F401
-
-from .errors import Errors
-from .language import Language
-from .vocab import Vocab
 from . import util
+from .about import __version__  # noqa: F401
+from .cli.info import info  # noqa: F401
+from .errors import Errors
+from .glossary import explain  # noqa: F401
+from .language import Language
+from .util import logger, registry  # noqa: F401
+from .vocab import Vocab

 if sys.maxunicode == 65535:
     raise SystemError(Errors.E130)
spacy/about.py

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "4.0.0.dev0"
+__version__ = "4.0.0.dev1"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
spacy/attrs.pxd

@@ -1,5 +1,6 @@
 from . cimport symbols

+
 cdef enum attr_id_t:
     NULL_ATTR = 0
     IS_ALPHA = symbols.IS_ALPHA
spacy/cli/__init__.py

@@ -1,35 +1,35 @@
 from wasabi import msg

 from ._util import app, setup_cli  # noqa: F401
+from .apply import apply  # noqa: F401
+from .assemble import assemble_cli  # noqa: F401

 # These are the actual functions, NOT the wrapped CLI commands. The CLI commands
 # are registered automatically and won't have to be imported here.
 from .benchmark_speed import benchmark_speed_cli  # noqa: F401
-from .download import download  # noqa: F401
-from .info import info  # noqa: F401
-from .package import package  # noqa: F401
-from .profile import profile  # noqa: F401
-from .train import train_cli  # noqa: F401
-from .assemble import assemble_cli  # noqa: F401
-from .pretrain import pretrain  # noqa: F401
-from .debug_data import debug_data  # noqa: F401
-from .debug_config import debug_config  # noqa: F401
-from .debug_model import debug_model  # noqa: F401
-from .debug_diff import debug_diff  # noqa: F401
-from .evaluate import evaluate  # noqa: F401
-from .apply import apply  # noqa: F401
 from .convert import convert  # noqa: F401
-from .init_pipeline import init_pipeline_cli  # noqa: F401
-from .init_config import init_config, fill_config  # noqa: F401
-from .validate import validate  # noqa: F401
-from .project.clone import project_clone  # noqa: F401
-from .project.assets import project_assets  # noqa: F401
-from .project.run import project_run  # noqa: F401
-from .project.dvc import project_update_dvc  # noqa: F401
-from .project.push import project_push  # noqa: F401
-from .project.pull import project_pull  # noqa: F401
-from .project.document import project_document  # noqa: F401
+from .debug_config import debug_config  # noqa: F401
+from .debug_data import debug_data  # noqa: F401
+from .debug_diff import debug_diff  # noqa: F401
+from .debug_model import debug_model  # noqa: F401
+from .download import download  # noqa: F401
+from .evaluate import evaluate  # noqa: F401
 from .find_threshold import find_threshold  # noqa: F401
+from .info import info  # noqa: F401
+from .init_config import fill_config, init_config  # noqa: F401
+from .init_pipeline import init_pipeline_cli  # noqa: F401
+from .package import package  # noqa: F401
+from .pretrain import pretrain  # noqa: F401
+from .profile import profile  # noqa: F401
+from .project.assets import project_assets  # noqa: F401
+from .project.clone import project_clone  # noqa: F401
+from .project.document import project_document  # noqa: F401
+from .project.dvc import project_update_dvc  # noqa: F401
+from .project.pull import project_pull  # noqa: F401
+from .project.push import project_push  # noqa: F401
+from .project.run import project_run  # noqa: F401
+from .train import train_cli  # noqa: F401
+from .validate import validate  # noqa: F401


 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
spacy/cli/_util.py

@@ -1,26 +1,45 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, Literal
-from typing import TYPE_CHECKING, overload
-import sys
-import shutil
-from pathlib import Path
-from wasabi import msg, Printer
-import srsly
 import hashlib
+import os
+import shutil
+import sys
+from configparser import InterpolationError
+from contextlib import contextmanager
+from pathlib import Path
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    Iterable,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+    overload,
+)
+
+import srsly
 import typer
 from click import NoSuchOption
 from click.parser import split_arg_string
-from typer.main import get_command
-from contextlib import contextmanager
 from thinc.api import Config, ConfigValidationError, require_gpu
 from thinc.util import gpu_is_available
-from configparser import InterpolationError
-import os
+from typer.main import get_command
+from wasabi import Printer, msg

-from ..schemas import ProjectConfigSchema, validate
-from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
-from ..errors import RENAMED_LANGUAGE_CODES
 from .. import about
+from ..errors import RENAMED_LANGUAGE_CODES
+from ..schemas import ProjectConfigSchema, validate
+from ..util import (
+    ENV_VARS,
+    SimpleFrozenDict,
+    import_file,
+    is_compatible_version,
+    logger,
+    make_tempdir,
+    registry,
+    run_command,
+)

 if TYPE_CHECKING:
     from pathy import FluidPath  # noqa: F401
spacy/cli/apply.py

@@ -1,18 +1,15 @@
-import tqdm
-import srsly
-
 from itertools import chain
 from pathlib import Path
-from typing import Optional, List, Iterable, cast, Union
+from typing import Iterable, List, Optional, Union, cast

+import srsly
+import tqdm
 from wasabi import msg

-from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
-
 from ..tokens import Doc, DocBin
-from ..vocab import Vocab
 from ..util import ensure_path, load_model
+from ..vocab import Vocab
+from ._util import Arg, Opt, app, import_code, setup_gpu, walk_directory

 path_help = """Location of the documents to predict on.
 Can be a single file in .spacy format or a .jsonl file.
spacy/cli/assemble.py

@@ -1,13 +1,20 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import typer
 import logging
+from pathlib import Path
+from typing import Optional
+
+import typer
+from wasabi import msg

-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code_paths
 from .. import util
 from ..util import get_sourced_components, load_model_from_config
+from ._util import (
+    Arg,
+    Opt,
+    app,
+    import_code_paths,
+    parse_config_overrides,
+    show_validation_error,
+)


 @app.command(
spacy/cli/benchmark_speed.py

@@ -1,11 +1,12 @@
-from typing import Iterable, List, Optional
 import random
-from itertools import islice
-import numpy
-from pathlib import Path
 import time
-from tqdm import tqdm
+from itertools import islice
+from pathlib import Path
+from typing import Iterable, List, Optional
+
+import numpy
 import typer
+from tqdm import tqdm
 from wasabi import msg

 from .. import util
spacy/cli/convert.py

@@ -1,18 +1,22 @@
-from typing import Callable, Iterable, Mapping, Optional, Any, Union
-from enum import Enum
-from pathlib import Path
-from wasabi import Printer
-import srsly
+import itertools
 import re
 import sys
-import itertools
+from enum import Enum
+from pathlib import Path
+from typing import Any, Callable, Iterable, Mapping, Optional, Union
+
+import srsly
+from wasabi import Printer

-from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
-from ..training import docs_to_json
 from ..tokens import Doc, DocBin
-from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
-from ..training.converters import conllu_to_docs
+from ..training import docs_to_json
+from ..training.converters import (
+    conll_ner_to_docs,
+    conllu_to_docs,
+    iob_to_docs,
+    json_to_docs,
+)
+from ._util import Arg, Opt, _handle_renamed_language_codes, app, walk_directory

 # Converters are matched by file extension except for ner/iob, which are
 # matched by file extension and content. To add a converter, add a new
spacy/cli/debug_config.py

@@ -1,15 +1,22 @@
-from typing import Optional, Dict, Any, Union, List
 from pathlib import Path
-from wasabi import msg, table
+from typing import Any, Dict, List, Optional, Union
+
+import typer
 from thinc.api import Config
 from thinc.config import VARIABLE_RE
-import typer
+from wasabi import msg, table

-from ._util import Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code_paths, debug_cli
+from .. import util
 from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
 from ..util import registry
-from .. import util
+from ._util import (
+    Arg,
+    Opt,
+    debug_cli,
+    import_code_paths,
+    parse_config_overrides,
+    show_validation_error,
+)


 @debug_cli.command(
spacy/cli/debug_data.py

@@ -1,29 +1,49 @@
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
-from typing import Literal, cast, overload
-from pathlib import Path
-from collections import Counter
-import sys
-import srsly
-from wasabi import Printer, MESSAGES, msg
-import typer
 import math
+import sys
+from collections import Counter
+from pathlib import Path
+from typing import (
+    Any,
+    Dict,
+    Iterable,
+    List,
+    Literal,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Union,
+    cast,
+    overload,
+)

-from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code_paths, debug_cli, _format_number
-from ..training import Example, remove_bilu_prefix
-from ..training.initialize import get_sourced_components
-from ..schemas import ConfigSchemaTraining
-from ..pipeline import TrainablePipe
+import numpy
+import srsly
+import typer
+from wasabi import MESSAGES, Printer, msg
+
+from .. import util
+from ..language import Language
+from ..morphology import Morphology
+from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe
+from ..pipeline._edit_tree_internals.edit_trees import EditTrees
 from ..pipeline._parser_internals import nonproj
 from ..pipeline._parser_internals.nonproj import DELIMITER
-from ..pipeline import Morphologizer, SpanCategorizer
-from ..pipeline._edit_tree_internals.edit_trees import EditTrees
-from ..morphology import Morphology
-from ..language import Language
+from ..schemas import ConfigSchemaTraining
+from ..training import Example, remove_bilu_prefix
+from ..training.initialize import get_sourced_components
 from ..util import registry, resolve_dot_names
 from ..vectors import Mode as VectorsMode
-from .. import util
+from ._util import (
+    Arg,
+    Opt,
+    _format_number,
+    app,
+    debug_cli,
+    import_code_paths,
+    parse_config_overrides,
+    show_validation_error,
+)

 # Minimum number of expected occurrences of NER label in data to train new label
 NEW_LABEL_THRESHOLD = 50

@@ -210,7 +230,7 @@ def debug_data(
     else:
         msg.info("No word vectors present in the package")

-    if "spancat" in factory_names:
+    if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
         model_labels_spancat = _get_labels_from_spancat(nlp)
         has_low_data_warning = False
         has_no_neg_warning = False

@@ -335,7 +355,7 @@ def debug_data(
             show=verbose,
         )
     else:
-        msg.good("Examples without ocurrences available for all labels")
+        msg.good("Examples without occurrences available for all labels")

     if "ner" in factory_names:
         # Get all unique NER labels present in the data

@@ -520,9 +540,13 @@ def debug_data(

     if "tagger" in factory_names:
         msg.divider("Part-of-speech Tagging")
-        label_list = [label for label in gold_train_data["tags"]]
-        model_labels = _get_labels_from_model(nlp, "tagger")
+        label_list, counts = zip(*gold_train_data["tags"].items())
         msg.info(f"{len(label_list)} label(s) in train data")
+        p = numpy.array(counts)
+        p = p / p.sum()
+        norm_entropy = (-p * numpy.log2(p)).sum() / numpy.log2(len(label_list))
+        msg.info(f"{norm_entropy} is the normalised label entropy")
+        model_labels = _get_labels_from_model(nlp, "tagger")
         labels = set(label_list)
         missing_labels = model_labels - labels
         if missing_labels:
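
[Note] The new tagger diagnostic reports the normalised label entropy of the
tag distribution: 1.0 when all tags are equally frequent, close to 0 when a
single tag dominates. A standalone sketch of the same computation with
made-up counts:

    import numpy

    counts = numpy.array([5000, 3000, 1500, 500])  # hypothetical tag counts
    p = counts / counts.sum()
    norm_entropy = (-p * numpy.log2(p)).sum() / numpy.log2(len(counts))
    print(norm_entropy)  # ~0.82 for this distribution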
@@ -824,7 +848,7 @@ def _compile_gold(
                     data["boundary_cross_ents"] += 1
                 elif label == "-":
                     data["ner"]["-"] += 1
-    if "spancat" in factory_names:
+    if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
         for spans_key in list(eg.reference.spans.keys()):
             # Obtain the span frequency
             if spans_key not in data["spancat"]:

@@ -1022,7 +1046,7 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
     pipe_names = [
         pipe_name
         for pipe_name in nlp.pipe_names
-        if nlp.get_pipe_meta(pipe_name).factory == "spancat"
+        if nlp.get_pipe_meta(pipe_name).factory in ("spancat", "spancat_singlelabel")
     ]
     labels: Dict[str, Set[str]] = {}
     for pipe_name in pipe_names:
spacy/cli/debug_diff.py

@@ -1,13 +1,13 @@
+from pathlib import Path
 from typing import Optional

 import typer
-from wasabi import Printer, diff_strings, MarkdownRenderer
-from pathlib import Path
 from thinc.api import Config
+from wasabi import MarkdownRenderer, Printer, diff_strings

-from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
 from ..util import load_config
-from .init_config import init_config, Optimizations
+from ._util import Arg, Opt, debug_cli, parse_config_overrides, show_validation_error
+from .init_config import Optimizations, init_config


 @debug_cli.command(
spacy/cli/debug_model.py

@@ -1,19 +1,32 @@
-from typing import Dict, Any, Optional
-from pathlib import Path
 import itertools
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+import typer
+from thinc.api import (
+    Model,
+    data_validation,
+    fix_random_seed,
+    set_dropout_rate,
+    set_gpu_allocator,
+)
+from wasabi import msg

 from spacy.training import Example
 from spacy.util import resolve_dot_names
-from wasabi import msg
-from thinc.api import fix_random_seed, set_dropout_rate
-from thinc.api import Model, data_validation, set_gpu_allocator
-import typer

-from ._util import Arg, Opt, debug_cli, show_validation_error
-from ._util import parse_config_overrides, string_to_list, setup_gpu
+from .. import util
 from ..schemas import ConfigSchemaTraining
 from ..util import registry
-from .. import util
+from ._util import (
+    Arg,
+    Opt,
+    debug_cli,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+    string_to_list,
+)


 @debug_cli.command(
@@ -1,14 +1,20 @@
-from typing import Optional, Sequence
-import requests
 import sys
-from wasabi import msg
-import typer
+from typing import Optional, Sequence
+
+import requests
+import typer
+from wasabi import msg
 
-from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
 from .. import about
-from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version, get_installed_models
-from ..util import get_package_version
+from ..util import (
+    get_installed_models,
+    get_minor_version,
+    get_package_version,
+    is_package,
+    is_prerelease_version,
+    run_command,
+)
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
 
 
 @app.command(
@@ -83,11 +89,8 @@ def download(
 
 def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
     dl_tpl = "{m}-{v}/{m}-{v}{s}"
-    egg_tpl = "#egg={m}=={v}"
     suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
     filename = dl_tpl.format(m=model_name, v=version, s=suffix)
-    if sdist:
-        filename += egg_tpl.format(m=model_name, v=version)
    return filename

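Note on the hunk above: with the `#egg=` fragment removed, `get_model_filename` returns just the relative wheel or sdist path. A minimal sketch of the template logic with made-up inputs (the suffix value is an assumption about `WHEEL_SUFFIX`):

# Illustration only, not part of the diff: hypothetical model name/version.
dl_tpl = "{m}-{v}/{m}-{v}{s}"
wheel_suffix = "-py3-none-any.whl"  # assumed value of WHEEL_SUFFIX
print(dl_tpl.format(m="en_core_web_sm", v="3.6.0", s=wheel_suffix))
# -> en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl
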
@@ -1,16 +1,16 @@
-from typing import Optional, List, Dict, Any, Union
-from wasabi import Printer
-from pathlib import Path
 import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
 
 import srsly
 from thinc.api import fix_random_seed
+from wasabi import Printer
 
-from ..training import Corpus
-from ..tokens import Doc
-from ._util import app, Arg, Opt, setup_gpu, import_code_paths, benchmark_cli
+from .. import displacy, util
 from ..scorer import Scorer
-from .. import util
-from .. import displacy
+from ..tokens import Doc
+from ..training import Corpus
+from ._util import Arg, Opt, app, benchmark_cli, import_code_paths, setup_gpu
 
 
 @benchmark_cli.command(
@@ -27,6 +27,7 @@ def evaluate_cli(
     gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
     displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
+    per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
     # fmt: on
 ):
     """
@@ -50,6 +51,7 @@ def evaluate_cli(
         gold_preproc=gold_preproc,
         displacy_path=displacy_path,
         displacy_limit=displacy_limit,
+        per_component=per_component,
         silent=False,
     )
 
@@ -64,6 +66,7 @@ def evaluate(
     displacy_limit: int = 25,
     silent: bool = True,
     spans_key: str = "sc",
+    per_component: bool = False,
 ) -> Dict[str, Any]:
     msg = Printer(no_print=silent, pretty=not silent)
     fix_random_seed()
@@ -78,50 +81,61 @@ def evaluate(
     corpus = Corpus(data_path, gold_preproc=gold_preproc)
     nlp = util.load_model(model)
     dev_dataset = list(corpus(nlp))
-    scores = nlp.evaluate(dev_dataset)
-    metrics = {
-        "TOK": "token_acc",
-        "TAG": "tag_acc",
-        "POS": "pos_acc",
-        "MORPH": "morph_acc",
-        "LEMMA": "lemma_acc",
-        "UAS": "dep_uas",
-        "LAS": "dep_las",
-        "NER P": "ents_p",
-        "NER R": "ents_r",
-        "NER F": "ents_f",
-        "TEXTCAT": "cats_score",
-        "SENT P": "sents_p",
-        "SENT R": "sents_r",
-        "SENT F": "sents_f",
-        "SPAN P": f"spans_{spans_key}_p",
-        "SPAN R": f"spans_{spans_key}_r",
-        "SPAN F": f"spans_{spans_key}_f",
-        "SPEED": "speed",
-    }
-    results = {}
-    data = {}
-    for metric, key in metrics.items():
-        if key in scores:
-            if key == "cats_score":
-                metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
-            if isinstance(scores[key], (int, float)):
-                if key == "speed":
-                    results[metric] = f"{scores[key]:.0f}"
-                else:
-                    results[metric] = f"{scores[key]*100:.2f}"
-            else:
-                results[metric] = "-"
-            data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
+    scores = nlp.evaluate(dev_dataset, per_component=per_component)
+    if per_component:
+        data = scores
+        if output is None:
+            msg.warn(
+                "The per-component option is enabled but there is no output JSON file provided to save the scores to."
+            )
+        else:
+            msg.info("Per-component scores will be saved to output JSON file.")
+    else:
+        metrics = {
+            "TOK": "token_acc",
+            "TAG": "tag_acc",
+            "POS": "pos_acc",
+            "MORPH": "morph_acc",
+            "LEMMA": "lemma_acc",
+            "UAS": "dep_uas",
+            "LAS": "dep_las",
+            "NER P": "ents_p",
+            "NER R": "ents_r",
+            "NER F": "ents_f",
+            "TEXTCAT": "cats_score",
+            "SENT P": "sents_p",
+            "SENT R": "sents_r",
+            "SENT F": "sents_f",
+            "SPAN P": f"spans_{spans_key}_p",
+            "SPAN R": f"spans_{spans_key}_r",
+            "SPAN F": f"spans_{spans_key}_f",
+            "SPEED": "speed",
+        }
+        results = {}
+        data = {}
+        for metric, key in metrics.items():
+            if key in scores:
+                if key == "cats_score":
+                    metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
+                if isinstance(scores[key], (int, float)):
+                    if key == "speed":
+                        results[metric] = f"{scores[key]:.0f}"
+                    else:
+                        results[metric] = f"{scores[key]*100:.2f}"
+                else:
+                    results[metric] = "-"
+                data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
 
-    msg.table(results, title="Results")
-    data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
+        msg.table(results, title="Results")
+        data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
 
     if displacy_path:
         factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
         docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit]))
         render_deps = "parser" in factory_names
         render_ents = "ner" in factory_names
+        render_spans = "spancat" in factory_names
 
         render_parses(
             docs,
             displacy_path,
@@ -129,6 +143,7 @@ def evaluate(
             limit=displacy_limit,
             deps=render_deps,
             ents=render_ents,
+            spans=render_spans,
         )
         msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
 
@@ -182,6 +197,7 @@ def render_parses(
     limit: int = 250,
     deps: bool = True,
     ents: bool = True,
+    spans: bool = True,
 ):
     docs[0].user_data["title"] = model_name
     if ents:
@@ -195,6 +211,11 @@ def render_parses(
     with (output_path / "parses.html").open("w", encoding="utf8") as file_:
         file_.write(html)
 
+    if spans:
+        html = displacy.render(docs[:limit], style="span", page=True)
+        with (output_path / "spans.html").open("w", encoding="utf8") as file_:
+            file_.write(html)
+
 
 def print_prf_per_type(
     msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str

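For illustration, a sketch of how the new `per_component` option might be used from Python. The paths are hypothetical; since per-component scores are not rendered as a table, they are only useful together with an output JSON file:

from pathlib import Path

from spacy.cli.evaluate import evaluate

# Hypothetical model and corpus paths.
scores = evaluate(
    "./model-best",
    Path("./corpus/dev.spacy"),
    output=Path("./scores.json"),  # per-component scores are written here
    per_component=True,
)
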
@@ -1,17 +1,17 @@
 import functools
+import logging
 import operator
 from pathlib import Path
-import logging
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
 
 import numpy
 import wasabi.tables
 
-from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer
-from ..errors import Errors
-from ..training import Corpus
-from ._util import app, Arg, Opt, import_code, setup_gpu
 from .. import util
+from ..errors import Errors
+from ..pipeline import MultiLabel_TextCategorizer, TextCategorizer
+from ..training import Corpus
+from ._util import Arg, Opt, app, import_code, setup_gpu
 
 _DEFAULTS = {
     "n_trials": 11,
@@ -35,7 +35,7 @@ def find_threshold_cli(
     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
-    verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     # fmt: on
 ):
     """

@@ -1,15 +1,15 @@
-from typing import Optional, Dict, Any, Union, List
-import platform
-import json
-from pathlib import Path
-from wasabi import Printer, MarkdownRenderer
-import srsly
 import importlib.metadata
+import json
+import platform
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
 
-from ._util import app, Arg, Opt, string_to_list
-from .download import get_model_filename, get_latest_version
-from .. import util
-from .. import about
+import srsly
+from wasabi import MarkdownRenderer, Printer
+
+from .. import about, util
+from ._util import Arg, Opt, app, string_to_list
+from .download import get_latest_version, get_model_filename
 
 
 @app.command("info")

@@ -1,19 +1,27 @@
-from typing import Optional, List, Tuple
+import re
 from enum import Enum
 from pathlib import Path
-from wasabi import Printer, diff_strings
-from thinc.api import Config
+from typing import List, Optional, Tuple
+
 import srsly
-import re
 from jinja2 import Template
+from thinc.api import Config
+from wasabi import Printer, diff_strings
 
 from .. import util
 from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ..util import SimpleFrozenList
-from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
-from ._util import string_to_list, import_code, _handle_renamed_language_codes
+from ._util import (
+    COMMAND,
+    Arg,
+    Opt,
+    _handle_renamed_language_codes,
+    import_code,
+    init_cli,
+    show_validation_error,
+    string_to_list,
+)
 
 ROOT = Path(__file__).parent / "templates"
 TEMPLATE_PATH = ROOT / "quickstart_training.jinja"

@@ -1,15 +1,24 @@
-from typing import Optional
 import logging
 from pathlib import Path
-from wasabi import msg
-import typer
+from typing import Optional
+
 import srsly
+import typer
+from wasabi import msg
 
 from .. import util
-from ..training.initialize import init_nlp, convert_vectors
 from ..language import Language
-from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu, _handle_renamed_language_codes
+from ..training.initialize import convert_vectors, init_nlp
+from ._util import (
+    Arg,
+    Opt,
+    _handle_renamed_language_codes,
+    import_code,
+    init_cli,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)
 
 
 @init_cli.command("vectors")

@@ -1,18 +1,18 @@
-from typing import Optional, Union, Any, Dict, List, Tuple, cast
-import shutil
-from pathlib import Path
-from wasabi import Printer, MarkdownRenderer, get_raw_input
-from thinc.api import Config
-from collections import defaultdict
-from catalogue import RegistryError
-import srsly
-import sys
 import re
+import shutil
+import sys
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
-from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
-from ..schemas import validate, ModelMetaSchema
-from .. import util
-from .. import about
+import srsly
+from catalogue import RegistryError
+from thinc.api import Config
+from wasabi import MarkdownRenderer, Printer, get_raw_input
+
+from .. import about, util
+from ..schemas import ModelMetaSchema, validate
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
 
 
 @app.command("package")

@@ -1,13 +1,21 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import typer
 import re
+from pathlib import Path
+from typing import Optional
+
+import typer
+from wasabi import msg
 
-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code_paths, setup_gpu
 from ..training.pretrain import pretrain
 from ..util import load_config
+from ._util import (
+    Arg,
+    Opt,
+    app,
+    import_code_paths,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)
 
 
 @app.command(
@@ -23,6 +31,7 @@ def pretrain_cli(
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
     # fmt: on
 ):
     """
@@ -74,6 +83,7 @@ def pretrain_cli(
         epoch_resume=epoch_resume,
         use_gpu=use_gpu,
         silent=False,
+        skip_last=skip_last,
     )
     msg.good("Successfully finished pretrain")

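A rough sketch of the new `skip_last` flag used programmatically, mirroring the call `pretrain_cli` makes above; the config and output paths are hypothetical, and the `pretrain` keyword arguments are assumed from the call site:

from pathlib import Path

from spacy.training.pretrain import pretrain
from spacy.util import load_config

config = load_config(Path("config.cfg"), interpolate=False)
pretrain(
    config,
    Path("./pretrain_output"),
    use_gpu=-1,
    silent=False,
    skip_last=True,  # don't save model-last.bin at the end
)
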
@@ -1,17 +1,18 @@
-from typing import Optional, Sequence, Union, Iterator
-import tqdm
-from pathlib import Path
-import srsly
 import cProfile
+import itertools
 import pstats
 import sys
-import itertools
-from wasabi import msg, Printer
-import typer
+from pathlib import Path
+from typing import Iterator, Optional, Sequence, Union
+
+import srsly
+import tqdm
+import typer
+from wasabi import Printer, msg
 
-from ._util import app, debug_cli, Arg, Opt, NAME
 from ..language import Language
 from ..util import load_model
+from ._util import NAME, Arg, Opt, app, debug_cli
 
 
 @debug_cli.command("profile")

@@ -1,16 +1,27 @@
-from typing import Any, Dict, Optional
-from pathlib import Path
-from wasabi import msg
 import os
 import re
 import shutil
+from pathlib import Path
+from typing import Any, Dict, Optional
 
 import requests
 import typer
+from wasabi import msg
 
 from ...util import ensure_path, working_dir
-from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
-from .._util import get_checksum, download_file, git_checkout, get_git_version
-from .._util import SimpleFrozenDict, parse_config_overrides
+from .._util import (
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    SimpleFrozenDict,
+    download_file,
+    get_checksum,
+    get_git_version,
+    git_checkout,
+    load_project_config,
+    parse_config_overrides,
+    project_cli,
+)
 
 # Whether assets are extra if `extra` is not set.
 EXTRA_DEFAULT = False

@@ -1,13 +1,22 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import subprocess
 import re
+import subprocess
+from pathlib import Path
+from typing import Optional
+
+from wasabi import msg
 
 from ... import about
 from ...util import ensure_path
-from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
-from .._util import git_checkout, get_git_version, git_repo_branch_exists
+from .._util import (
+    COMMAND,
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    get_git_version,
+    git_checkout,
+    git_repo_branch_exists,
+    project_cli,
+)
 
 DEFAULT_REPO = about.__projects__
 DEFAULT_PROJECTS_BRANCH = about.__projects_branch__

@@ -1,9 +1,9 @@
 from pathlib import Path
-from wasabi import msg, MarkdownRenderer
+from wasabi import MarkdownRenderer, msg
 
 from ...util import working_dir
-from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
+from .._util import PROJECT_FILE, Arg, Opt, load_project_config, project_cli
 
 
 DOCS_URL = "https://spacy.io"
 INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the

@@ -1,15 +1,28 @@
 """This module contains helpers and subcommands for integrating spaCy projects
 with Data Version Control (DVC). https://dvc.org"""
-from typing import Dict, Any, List, Optional, Iterable
 import subprocess
 from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional
 
 from wasabi import msg
 
-from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli
-from .._util import Arg, Opt, NAME, COMMAND
-from ...util import working_dir, split_command, join_command, run_command
-from ...util import SimpleFrozenList
+from ...util import (
+    SimpleFrozenList,
+    join_command,
+    run_command,
+    split_command,
+    working_dir,
+)
+from .._util import (
+    COMMAND,
+    NAME,
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    get_hash,
+    load_project_config,
+    project_cli,
+)
 
 DVC_CONFIG = "dvc.yaml"
 DVC_DIR = ".dvc"

@@ -1,9 +1,9 @@
 from pathlib import Path
 
 from wasabi import msg
-from .remote_storage import RemoteStorage
-from .remote_storage import get_command_hash
-from .._util import project_cli, Arg, logger
-from .._util import load_project_config
+
+from .._util import Arg, load_project_config, logger, project_cli
+from .remote_storage import RemoteStorage, get_command_hash
 from .run import update_lockfile
 
 

@@ -1,9 +1,9 @@
 from pathlib import Path
 
 from wasabi import msg
-from .remote_storage import RemoteStorage
-from .remote_storage import get_content_hash, get_command_hash
-from .._util import load_project_config
-from .._util import project_cli, Arg, logger
+
+from .._util import Arg, load_project_config, logger, project_cli
+from .remote_storage import RemoteStorage, get_command_hash, get_content_hash
 
 
 @project_cli.command("push")

@@ -1,18 +1,25 @@
-from typing import Optional, List, Dict, TYPE_CHECKING
+import hashlib
 import os
 import site
-import hashlib
-import urllib.parse
 import tarfile
+import urllib.parse
 from pathlib import Path
+from typing import TYPE_CHECKING, Dict, List, Optional
 
 from wasabi import msg
 
-from .._util import get_hash, get_checksum, upload_file, download_file
-from .._util import ensure_pathy, make_tempdir
-from ...util import get_minor_version, ENV_VARS, check_bool_env_var
-from ...git_info import GIT_VERSION
 from ... import about
 from ...errors import Errors
+from ...git_info import GIT_VERSION
+from ...util import ENV_VARS, check_bool_env_var, get_minor_version
+from .._util import (
+    download_file,
+    ensure_pathy,
+    get_checksum,
+    get_hash,
+    make_tempdir,
+    upload_file,
+)
 
 if TYPE_CHECKING:
     from pathy import FluidPath  # noqa: F401

@@ -1,20 +1,39 @@
-from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
 import os.path
-from pathlib import Path
 
-from wasabi import msg
-from wasabi.util import locale_escape
 import sys
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
 
 import srsly
 import typer
+from wasabi import msg
+from wasabi.util import locale_escape
 
 from ... import about
 from ...git_info import GIT_VERSION
-from ...util import working_dir, run_command, split_command, is_cwd, join_command
-from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
-from ...util import check_bool_env_var, SimpleFrozenDict
-from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
-from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides
+from ...util import (
+    ENV_VARS,
+    SimpleFrozenDict,
+    SimpleFrozenList,
+    check_bool_env_var,
+    is_cwd,
+    is_minor_version_match,
+    join_command,
+    run_command,
+    split_command,
+    working_dir,
+)
+from .._util import (
+    COMMAND,
+    PROJECT_FILE,
+    PROJECT_LOCK,
+    Arg,
+    Opt,
+    get_checksum,
+    get_hash,
+    load_project_config,
+    parse_config_overrides,
+    project_cli,
+)
 
 
 @project_cli.command(

@@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
 {%- set use_transformer = hardware != "cpu" and transformer_data -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
-{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
+{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
 [paths]
 train = null
 dev = null
@@ -24,8 +24,11 @@ gpu_allocator = null
 lang = "{{ lang }}"
 {%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
 {%- set with_accuracy = optimize == "accuracy" -%}
-{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
-{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%}
+{# The BOW textcat doesn't need a source of features, so it can omit the
+tok2vec/transformer. #}
+{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
+{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
+{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "span_finder" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
 {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
 {%- else -%}
 {%- set full_pipeline = components -%}
@@ -122,6 +125,30 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"
 {% endif -%}
 
+{% if "span_finder" in components -%}
+[components.span_finder]
+factory = "span_finder"
+max_length = null
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.span_finder.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{% endif -%}
+
 {% if "spancat" in components -%}
 [components.spancat]
 factory = "spancat"
@@ -154,6 +181,36 @@ grad_factor = 1.0
 sizes = [1,2,3]
 {% endif -%}
 
+{% if "spancat_singlelabel" in components %}
+[components.spancat_singlelabel]
+factory = "spancat_singlelabel"
+negative_weight = 1.0
+allow_overlap = true
+scorer = {"@scorers":"spacy.spancat_scorer.v1"}
+spans_key = "sc"
+
+[components.spancat_singlelabel.model]
+@architectures = "spacy.SpanCategorizer.v1"
+
+[components.spancat_singlelabel.model.reducer]
+@layers = "spacy.mean_max_reducer.v1"
+hidden_size = 128
+
+[components.spancat_singlelabel.model.scorer]
+@layers = "Softmax.v2"
+
+[components.spancat_singlelabel.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.spancat_singlelabel.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+
+[components.spancat_singlelabel.suggester]
+@misc = "spacy.ngram_suggester.v1"
+sizes = [1,2,3]
+{% endif %}
+
 {% if "trainable_lemmatizer" in components -%}
 [components.trainable_lemmatizer]
 factory = "trainable_lemmatizer"
@@ -219,10 +276,16 @@ no_output_layer = false
 
 {% else -%}
 [components.textcat.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatCNN.v2"
 exclusive_classes = true
-ngram_size = 1
-no_output_layer = false
+nO = null
+
+[components.textcat.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
 {%- endif %}
 {%- endif %}
 
@@ -250,10 +313,16 @@ no_output_layer = false
 
 {% else -%}
 [components.textcat_multilabel.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatCNN.v2"
 exclusive_classes = false
-ngram_size = 1
-no_output_layer = false
+nO = null
+
+[components.textcat_multilabel.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat_multilabel.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
 {%- endif %}
 {%- endif %}
 
@@ -284,6 +353,7 @@ maxout_pieces = 3
 {% if "morphologizer" in components %}
 [components.morphologizer]
 factory = "morphologizer"
+label_smoothing = 0.05
 
 [components.morphologizer.model]
 @architectures = "spacy.Tagger.v2"
@@ -297,6 +367,7 @@ width = ${components.tok2vec.model.encode.width}
 {% if "tagger" in components %}
 [components.tagger]
 factory = "tagger"
+label_smoothing = 0.05
 
 [components.tagger.model]
 @architectures = "spacy.Tagger.v2"
@@ -341,6 +412,27 @@ nO = null
 width = ${components.tok2vec.model.encode.width}
 {% endif %}
 
+{% if "span_finder" in components %}
+[components.span_finder]
+factory = "span_finder"
+max_length = null
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+{% endif %}
+
 {% if "spancat" in components %}
 [components.spancat]
 factory = "spancat"
@@ -370,6 +462,33 @@ width = ${components.tok2vec.model.encode.width}
 sizes = [1,2,3]
 {% endif %}
 
+{% if "spancat_singlelabel" in components %}
+[components.spancat_singlelabel]
+factory = "spancat_singlelabel"
+negative_weight = 1.0
+allow_overlap = true
+scorer = {"@scorers":"spacy.spancat_scorer.v1"}
+spans_key = "sc"
+
+[components.spancat_singlelabel.model]
+@architectures = "spacy.SpanCategorizer.v1"
+
+[components.spancat_singlelabel.model.reducer]
+@layers = "spacy.mean_max_reducer.v1"
+hidden_size = 128
+
+[components.spancat_singlelabel.model.scorer]
+@layers = "Softmax.v2"
+
+[components.spancat_singlelabel.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+
+[components.spancat_singlelabel.suggester]
+@misc = "spacy.ngram_suggester.v1"
+sizes = [1,2,3]
+{% endif %}
+
 {% if "trainable_lemmatizer" in components -%}
 [components.trainable_lemmatizer]
 factory = "trainable_lemmatizer"

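As a rough illustration of what the new template blocks configure, the same components can be assembled directly in Python; this sketch assumes the shared default spans key "sc" used above:

import spacy

nlp = spacy.blank("en")
# span_finder proposes candidate spans under spans_key; spancat labels them.
nlp.add_pipe("span_finder", config={"spans_key": "sc"})
nlp.add_pipe("spancat", config={"spans_key": "sc"})
# "spancat_singlelabel" is the exclusive-label variant configured above.
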
@@ -1,15 +1,23 @@
-from typing import Optional, Dict, Any, Union
-from pathlib import Path
-from wasabi import msg
-import typer
 import logging
 import sys
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+import typer
+from wasabi import msg
 
-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code_paths, setup_gpu
-from ..training.loop import train as train_nlp
-from ..training.initialize import init_nlp
 from .. import util
+from ..training.initialize import init_nlp
+from ..training.loop import train as train_nlp
+from ._util import (
+    Arg,
+    Opt,
+    app,
+    import_code_paths,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)
 
 
 @app.command(

@@ -1,14 +1,21 @@
-from typing import Tuple
-from pathlib import Path
 import sys
-import requests
-from wasabi import msg, Printer
 import warnings
+from pathlib import Path
+from typing import Tuple
+
+import requests
+from wasabi import Printer, msg
 
-from ._util import app
 from .. import about
-from ..util import get_package_version, get_installed_models, get_minor_version
-from ..util import get_package_path, get_model_meta, is_compatible_version
+from ..util import (
+    get_installed_models,
+    get_minor_version,
+    get_model_meta,
+    get_package_path,
+    get_package_version,
+    is_compatible_version,
+)
+from ._util import app
 
 
 @app.command("validate")

@@ -1,5 +1,6 @@
 """Helpers for Python and platform compatibility."""
 import sys
+
 from thinc.util import copy_array
 
 try:

@@ -4,15 +4,13 @@ spaCy's built in visualization suite for dependencies and named entities.
 DOCS: https://spacy.io/api/top-level#displacy
 USAGE: https://spacy.io/usage/visualizers
 """
-from typing import Union, Iterable, Optional, Dict, Any, Callable
 import warnings
+from typing import Any, Callable, Dict, Iterable, Optional, Union
 
-from .render import DependencyRenderer, EntityRenderer, SpanRenderer
-from ..tokens import Doc, Span
 from ..errors import Errors, Warnings
-from ..util import is_in_jupyter
-from ..util import find_available_port
+from ..tokens import Doc, Span
+from ..util import find_available_port, is_in_jupyter
+from .render import DependencyRenderer, EntityRenderer, SpanRenderer
 
 _html = {}
 RENDER_WRAPPER = None
@@ -68,7 +66,7 @@ def render(
     if jupyter or (jupyter is None and is_in_jupyter()):
         # return HTML rendered by IPython display()
         # See #4840 for details on span wrapper to disable mathjax
-        from IPython.core.display import display, HTML
+        from IPython.core.display import HTML, display
 
         return display(HTML('<span class="tex2jax_ignore">{}</span>'.format(html)))
     return html
@@ -125,13 +123,17 @@ def app(environ, start_response):
     return [res]
 
 
-def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
+def parse_deps(
+    orig_doc: Union[Doc, Span], options: Dict[str, Any] = {}
+) -> Dict[str, Any]:
     """Generate dependency parse in {'words': [], 'arcs': []} format.
 
-    orig_doc (Doc): Document to parse.
+    orig_doc (Union[Doc, Span]): Document to parse.
     options (Dict[str, Any]): Dependency parse specific visualisation options.
     RETURNS (dict): Generated dependency parse keyed by words and arcs.
     """
+    if isinstance(orig_doc, Span):
+        orig_doc = orig_doc.as_doc()
     doc = Doc(orig_doc.vocab).from_bytes(
         orig_doc.to_bytes(exclude=["user_data", "user_hooks"])
     )

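Since `parse_deps` now accepts a `Span` and converts it via `as_doc()`, a single sentence can be visualized without copying it manually. A short sketch (assumes a small English pipeline is installed):

import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence. This is another one.")
sent = next(doc.sents)  # a Span, now accepted directly
parsed = displacy.parse_deps(sent)
print(parsed["words"], parsed["arcs"])
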
@@ -1,15 +1,29 @@
-from typing import Any, Dict, List, Optional, Tuple, Union
-import uuid
 import itertools
+import uuid
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 from ..errors import Errors
 from ..util import escape_html, minify_html, registry
-from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS
-from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS
-from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN
-from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL
-from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS
-from .templates import TPL_TITLE
+from .templates import (
+    TPL_DEP_ARCS,
+    TPL_DEP_SVG,
+    TPL_DEP_WORDS,
+    TPL_DEP_WORDS_LEMMA,
+    TPL_ENT,
+    TPL_ENT_RTL,
+    TPL_ENTS,
+    TPL_FIGURE,
+    TPL_KB_LINK,
+    TPL_PAGE,
+    TPL_SPAN,
+    TPL_SPAN_RTL,
+    TPL_SPAN_SLICE,
+    TPL_SPAN_SLICE_RTL,
+    TPL_SPAN_START,
+    TPL_SPAN_START_RTL,
+    TPL_SPANS,
+    TPL_TITLE,
+)
 
 DEFAULT_LANG = "en"
 DEFAULT_DIR = "ltr"

@@ -1,5 +1,5 @@
-from typing import Literal
 import warnings
+from typing import Literal
 
 
 class ErrorsWithCodes(type):
@@ -82,7 +82,7 @@ class Warnings(metaclass=ErrorsWithCodes):
             "ignoring the duplicate entry.")
     W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
             "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
-    W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
+    W024 = ("Entity '{entity}' - alias '{alias}' combination already exists in "
             "the Knowledge Base.")
     W026 = ("Unable to set all sentence boundaries from dependency parses. If "
             "you are constructing a parse tree incrementally by setting "
@@ -209,7 +209,11 @@ class Warnings(metaclass=ErrorsWithCodes):
             "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
     W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
 
+    # v4 warning strings
     W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
+    W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
+            "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
+            "to return `True` in `.supports_prior_probs`.")
 
 
 class Errors(metaclass=ErrorsWithCodes):
@@ -542,6 +546,8 @@ class Errors(metaclass=ErrorsWithCodes):
             "during training, make sure to include it in 'annotating components'")
 
     # New errors added in v3.x
+    E850 = ("The PretrainVectors objective currently only supports default or "
+            "floret vectors, not {mode} vectors.")
     E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
             "but found value of '{val}'.")
     E852 = ("The tar file pulled from the remote attempted an unsafe path "
@@ -922,7 +928,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E1029 = ("Edit tree cannot be applied to form.")
     E1030 = ("Edit tree identifier out of range.")
    E1031 = ("Could not find gold transition - see logs above.")
-    E1032 = ("`{var}` should not be {forbidden}, but received {value}.")
+    E1032 = ("Span {var} {value} is out of bounds for {obj} with length {length}.")
     E1033 = ("Dimension {name} invalid -- only nO, nF, nP")
     E1034 = ("Node index {i} out of bounds ({length})")
     E1035 = ("Token index {i} out of bounds ({length})")
@@ -951,6 +957,14 @@ class Errors(metaclass=ErrorsWithCodes):
             "with `displacy.serve(doc, port=port)`")
     E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
             "or use `auto_select_port=True` to pick an available port automatically.")
+    E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
+    E1052 = ("Unable to copy spans: the character offsets for the span at "
+             "index {i} in the span group do not align with the tokenization "
+             "in the target doc.")
+    E1053 = ("Both 'min_length' and 'max_length' should be larger than 0, but found"
+             " 'min_length': {min_length}, 'max_length': {max_length}")
+    E1054 = ("The text, including whitespace, must match between reference and "
+             "predicted docs when training {component}.")
 
     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")
@@ -961,6 +975,12 @@ class Errors(metaclass=ErrorsWithCodes):
             "reference and predicted docs.")
     E4004 = ("Backprop is not supported when is_train is not set.")
     E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
+    E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
+    E4007 = ("Span {var} {value} must be {op} Span {existing_var} "
+             "{existing_value}.")
+    E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
+    E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.")
 
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}

@@ -1,4 +1,5 @@
 import warnings
+
 from .errors import Warnings
 
 

@@ -1,3 +1,5 @@
+from .candidate import Candidate, InMemoryCandidate
 from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
-from .candidate import Candidate, get_candidates, get_candidates_batch
+
+__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]

@@ -1,12 +1,17 @@
-from .kb cimport KnowledgeBase
 from libcpp.vector cimport vector
-from ..typedefs cimport hash_t
 
-# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
+from ..typedefs cimport hash_t
+from .kb_in_memory cimport InMemoryLookupKB
+
+
 cdef class Candidate:
-    cdef readonly KnowledgeBase kb
-    cdef hash_t entity_hash
-    cdef float entity_freq
-    cdef vector[float] entity_vector
-    cdef hash_t alias_hash
-    cdef float prior_prob
+    pass
+
+
+cdef class InMemoryCandidate(Candidate):
+    cdef readonly hash_t _entity_hash
+    cdef readonly hash_t _alias_hash
+    cdef vector[float] _entity_vector
+    cdef float _prior_prob
+    cdef readonly InMemoryLookupKB _kb
+    cdef float _entity_freq

@@ -1,74 +1,98 @@
 # cython: infer_types=True, profile=True

-from typing import Iterable
-from .kb cimport KnowledgeBase
-from ..tokens import Span
+from .kb_in_memory cimport InMemoryLookupKB
+from ..errors import Errors


 cdef class Candidate:
-    """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
-    to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
+    """A `Candidate` object refers to a textual mention that may or may not be resolved
+    to a specific entity from a Knowledge Base. This will be used as input for the entity linking
     algorithm which will disambiguate the various candidates to the correct one.
-    Each candidate (alias, entity) pair is assigned a certain prior probability.
+    Each candidate, which represents a possible link between one textual mention and one entity in the knowledge base,
+    is assigned a certain prior probability.

     DOCS: https://spacy.io/api/kb/#candidate-init
     """

-    def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
-        self.kb = kb
-        self.entity_hash = entity_hash
-        self.entity_freq = entity_freq
-        self.entity_vector = entity_vector
-        self.alias_hash = alias_hash
-        self.prior_prob = prior_prob
+    def __init__(self):
+        # Make sure abstract Candidate is not instantiated.
+        if self.__class__ == Candidate:
+            raise TypeError(
+                Errors.E1046.format(cls_name=self.__class__.__name__)
+            )

     @property
-    def entity(self) -> int:
-        """RETURNS (uint64): hash of the entity's KB ID/name"""
-        return self.entity_hash
+    def entity_id(self) -> int:
+        """RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID,
+        otherwise the hash of the entity ID string)."""
+        raise NotImplementedError

     @property
-    def entity_(self) -> str:
-        """RETURNS (str): ID/name of this entity in the KB"""
-        return self.kb.vocab.strings[self.entity_hash]
+    def entity_id_(self) -> str:
+        """RETURNS (str): String representation of entity ID."""
+        raise NotImplementedError

     @property
-    def alias(self) -> int:
-        """RETURNS (uint64): hash of the alias"""
-        return self.alias_hash
+    def entity_vector(self) -> vector[float]:
+        """RETURNS (vector[float]): Entity vector."""
+        raise NotImplementedError
+
+
+cdef class InMemoryCandidate(Candidate):
+    """Candidate for InMemoryLookupKB."""
+
+    def __init__(
+        self,
+        kb: InMemoryLookupKB,
+        entity_hash: int,
+        alias_hash: int,
+        entity_vector: vector[float],
+        prior_prob: float,
+        entity_freq: float
+    ):
+        """
+        kb (InMemoryLookupKB): InMemoryLookupKB instance.
+        entity_id (int): Entity ID as hash that can be looked up with InMemoryKB.vocab.strings.__getitem__().
+        entity_freq (int): Entity frequency in KB corpus.
+        entity_vector (List[float]): Entity embedding.
+        alias_hash (int): Alias hash.
+        prior_prob (float): Prior probability of entity for this alias, i.e. the probability that, independent of
+            the context, this alias - which matches one of this entity's aliases - resolves to this entity.
+        """
+        super().__init__()
+
+        self._entity_hash = entity_hash
+        self._entity_vector = entity_vector
+        self._prior_prob = prior_prob
+        self._kb = kb
+        self._alias_hash = alias_hash
+        self._entity_freq = entity_freq

     @property
-    def alias_(self) -> str:
-        """RETURNS (str): ID of the original alias"""
-        return self.kb.vocab.strings[self.alias_hash]
+    def entity_id(self) -> int:
+        return self._entity_hash

     @property
-    def entity_freq(self) -> float:
-        return self.entity_freq
-
-    @property
-    def entity_vector(self) -> Iterable[float]:
-        return self.entity_vector
+    def entity_vector(self) -> vector[float]:
+        return self._entity_vector

     @property
     def prior_prob(self) -> float:
-        return self.prior_prob
+        """RETURNS (float): Prior probability that this alias, which matches one of this entity's synonyms, resolves to
+        this entity."""
+        return self._prior_prob

+    @property
+    def alias(self) -> str:
+        """RETURNS (str): Alias."""
+        return self._kb.vocab.strings[self._alias_hash]

-def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
-    """
-    Return candidate entities for a given mention and fetching appropriate entries from the index.
-    kb (KnowledgeBase): Knowledge base to query.
-    mention (Span): Entity mention for which to identify candidates.
-    RETURNS (Iterable[Candidate]): Identified candidates.
-    """
-    return kb.get_candidates(mention)
-
-
-def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
-    """
-    Return candidate entities for the given mentions and fetching appropriate entries from the index.
-    kb (KnowledgeBase): Knowledge base to query.
-    mention (Iterable[Span]): Entity mentions for which to identify candidates.
-    RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
-    """
-    return kb.get_candidates_batch(mentions)
+    @property
+    def entity_id_(self) -> str:
+        return self._kb.vocab.strings[self._entity_hash]
+
+    @property
+    def entity_freq(self) -> float:
+        """RETURNS (float): Entity frequency in KB corpus."""
+        return self._entity_freq
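With `Candidate` now abstract, each knowledge base ships its own candidate type, and third-party KBs subclass `Candidate` directly. A minimal sketch of such a subclass against this branch's API (the class name, constructor, and stored fields below are hypothetical, not part of the diff):

    import spacy
    from spacy.kb import Candidate

    class FixedVectorCandidate(Candidate):
        """Hypothetical candidate storing a precomputed entity vector."""

        def __init__(self, vocab, entity_id_str, vector):
            super().__init__()  # TypeError is only raised for Candidate itself
            self._vocab = vocab
            self._entity_hash = vocab.strings.add(entity_id_str)
            self._vector = vector

        @property
        def entity_id(self):
            return self._entity_hash

        @property
        def entity_id_(self):
            return self._vocab.strings[self._entity_hash]

        @property
        def entity_vector(self):
            return self._vector

    vocab = spacy.blank("en").vocab
    cand = FixedVectorCandidate(vocab, "Q42", [0.1, 0.2, 0.3])
    assert cand.entity_id_ == "Q42"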
@@ -2,8 +2,10 @@
 from cymem.cymem cimport Pool
 from libc.stdint cimport int64_t

 from ..vocab cimport Vocab


 cdef class KnowledgeBase:
     cdef Pool mem
     cdef readonly Vocab vocab
@@ -2,12 +2,13 @@
 from pathlib import Path
 from typing import Iterable, Tuple, Union

 from cymem.cymem cimport Pool

-from .candidate import Candidate
-from ..tokens import Span
-from ..util import SimpleFrozenList
 from ..errors import Errors
+from ..tokens import Span, SpanGroup
+from ..util import SimpleFrozenList
+from .candidate import Candidate


 cdef class KnowledgeBase:
@@ -30,21 +31,23 @@ cdef class KnowledgeBase:
         self.entity_vector_length = entity_vector_length
         self.mem = Pool()

-    def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
+    def get_candidates_batch(self, mentions: SpanGroup) -> Iterable[Iterable[Candidate]]:
         """
-        Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
-        and the prior probability of that alias resolving to that entity.
-        If no candidate is found for a given text, an empty list is returned.
-        mentions (Iterable[Span]): Mentions for which to get candidates.
+        Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the
+        entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
+        probability of the specified mention text resolving to that entity - might be included.
+        If no candidates are found for a given mention, an empty list is returned.
+        mentions (SpanGroup): Mentions for which to get candidates.
         RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
         """
         return [self.get_candidates(span) for span in mentions]

     def get_candidates(self, mention: Span) -> Iterable[Candidate]:
         """
-        Return candidate entities for specified text. Each candidate defines the entity, the original alias,
-        and the prior probability of that alias resolving to that entity.
-        If the no candidate is found for a given text, an empty list is returned.
+        Return candidate entities for a specific mention. Each candidate defines at least the entity and the
+        entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
+        probability of the specified mention text resolving to that entity - might be included.
+        If no candidate is found for the given mention, an empty list is returned.
         mention (Span): Mention for which to get candidates.
         RETURNS (Iterable[Candidate]): Identified candidates.
         """
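The batch API now takes a `SpanGroup` rather than `Iterable[Span]`. A short usage sketch for the updated signature; it assumes a trained pipeline with an `entity_linker` component whose KB is populated (the pipeline name is a placeholder):

    import spacy
    from spacy.tokens import SpanGroup

    nlp = spacy.load("my_el_pipeline")  # placeholder name
    kb = nlp.get_pipe("entity_linker").kb
    doc = nlp("Douglas Adams wrote The Hitchhiker's Guide to the Galaxy.")

    # v4: mentions are passed as a SpanGroup
    mentions = SpanGroup(doc, spans=list(doc.ents))
    for mention, candidates in zip(mentions, kb.get_candidates_batch(mentions)):
        print(mention.text, [c.entity_id_ for c in candidates])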
@@ -106,3 +109,10 @@ cdef class KnowledgeBase:
         raise NotImplementedError(
             Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
         )
+
+    @property
+    def supports_prior_probs(self) -> bool:
+        """RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions."""
+        raise NotImplementedError(
+            Errors.E1045.format(parent="KnowledgeBase", method="supports_prior_probs", name=self.__name__)
+        )
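`supports_prior_probs` gives consumers such as the entity linker a uniform way to check whether `Candidate.prior_prob` is meaningful for a given KB before relying on it. A hedged sketch of that pattern (the helper below is hypothetical, not part of the diff):

    def rank_candidates(kb, candidates):
        # Hypothetical helper: prefer prior probabilities when the KB
        # supports them, otherwise keep the KB's own ordering.
        if kb.supports_prior_probs:
            return sorted(candidates, key=lambda c: c.prior_prob, reverse=True)
        return list(candidates)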
@@ -1,11 +1,11 @@
 """Knowledge-base for entity or concept linking."""
-from preshed.maps cimport PreshMap
-from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
 from libc.stdio cimport FILE
+from libcpp.vector cimport vector
+from preshed.maps cimport PreshMap

+from ..structs cimport AliasC, KBEntryC
 from ..typedefs cimport hash_t
-from ..structs cimport KBEntryC, AliasC
 from .kb cimport KnowledgeBase

 ctypedef vector[KBEntryC] entry_vec
@@ -1,24 +1,29 @@
 # cython: infer_types=True, profile=True
-from typing import Iterable, Callable, Dict, Any, Union
+from typing import Any, Callable, Dict, Iterable, Union

 import srsly
-from preshed.maps cimport PreshMap
-from cpython.exc cimport PyErr_SetFromErrno
-from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
-from libc.stdint cimport int32_t, int64_t
-from libcpp.vector cimport vector

-from pathlib import Path
+from cpython.exc cimport PyErr_SetFromErrno
+from libc.stdint cimport int32_t, int64_t
+from libc.stdio cimport fclose, feof, fopen, fread, fseek, fwrite
+from libcpp.vector cimport vector
+from preshed.maps cimport PreshMap

 import warnings
+from pathlib import Path

 from ..tokens import Span

 from ..typedefs cimport hash_t
-from ..errors import Errors, Warnings
 from .. import util
+from ..errors import Errors, Warnings
 from ..util import SimpleFrozenList, ensure_path

 from ..vocab cimport Vocab
 from .kb cimport KnowledgeBase
-from .candidate import Candidate as Candidate
+from .candidate import InMemoryCandidate


 cdef class InMemoryLookupKB(KnowledgeBase):
@@ -226,10 +231,10 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         alias_entry.probs = probs
         self._aliases_table[alias_index] = alias_entry

-    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
-        return self.get_alias_candidates(mention.text)  # type: ignore
+    def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]:
+        return self._get_alias_candidates(mention.text)  # type: ignore

-    def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
+    def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
         """
         Return candidate entities for an alias. Each candidate defines the entity, the original alias,
         and the prior probability of that alias resolving to that entity.
@@ -241,14 +246,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         alias_entry = self._aliases_table[alias_index]

-        return [Candidate(kb=self,
-                          entity_hash=self._entries[entry_index].entity_hash,
-                          entity_freq=self._entries[entry_index].freq,
-                          entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
-                          alias_hash=alias_hash,
-                          prior_prob=prior_prob)
-                for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
-                if entry_index != 0]
+        return [
+            InMemoryCandidate(
+                kb=self,
+                entity_hash=self._entries[entry_index].entity_hash,
+                alias_hash=alias_hash,
+                entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
+                prior_prob=prior_prob,
+                entity_freq=self._entries[entry_index].freq
+            )
+            for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
+            if entry_index != 0
+        ]

     def get_vector(self, str entity):
         cdef hash_t entity_hash = self.vocab.strings[entity]
@@ -279,6 +288,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):

         return 0.0

+    def supports_prior_probs(self) -> bool:
+        return True
+
     def to_bytes(self, **kwargs):
         """Serialize the current state to a binary string.
         """
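Taken together, the public `InMemoryLookupKB` workflow is unchanged apart from the candidate type and the renamed properties. A small end-to-end sketch against this branch:

    import spacy
    from spacy.kb import InMemoryLookupKB

    nlp = spacy.blank("en")
    kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)
    kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
    kb.add_alias(alias="Adams", entities=["Q42"], probabilities=[0.9])

    doc = nlp("Adams wrote a famous guide.")
    for cand in kb.get_candidates(doc[0:1]):  # InMemoryCandidate objects
        print(cand.entity_id_, cand.alias, cand.prior_prob, cand.entity_freq)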
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
 from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults


 class AfrikaansDefaults(BaseDefaults):
@@ -1,12 +1,11 @@
-from .stop_words import STOP_WORDS
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc


 class AmharicDefaults(BaseDefaults):

@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA_UPPER,
+    CURRENCY,
+    LIST_ELLIPSES,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    UNITS,
+)

 _list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()

@@ -1,5 +1,4 @@
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH


 _exc = {}
@@ -1,8 +1,8 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults


 class ArabicDefaults(BaseDefaults):

@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA_UPPER,
+    CURRENCY,
+    LIST_ELLIPSES,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    UNITS,
+)

 _suffixes = (
     LIST_PUNCT

@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {}
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS


 class AzerbaijaniDefaults(BaseDefaults):

@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM

-
 # Eleven, twelve etc. are written separate: on bir, on iki

 _num_words = [
@@ -1,12 +1,14 @@
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...util import update_exc
+from ..punctuation import (
+    COMBINING_DIACRITICS_TOKENIZER_INFIXES,
+    COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
+)
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc


 class BulgarianDefaults(BaseDefaults):

@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM

-
 _num_words = [
     "нула",
     "едно",

@@ -4,8 +4,7 @@ References:
 (countries, occupations, fields of studies and more).
 """

-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH


 _exc = {}
@@ -1,10 +1,12 @@
-from typing import Optional, Callable
+from typing import Callable, Optional

 from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
-from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
+
+from ...language import BaseDefaults, Language
 from ...pipeline import Lemmatizer
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class BengaliDefaults(BaseDefaults):

@@ -1,6 +1,14 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    CONCAT_QUOTES,
+    HYPHENS,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    UNITS,
+)

 _currency = r"\$¢£€¥฿৳"
 _quotes = CONCAT_QUOTES.replace("'", "")

@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {}
@@ -1,14 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional

 from thinc.api import Model

-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from ...language import BaseDefaults, Language
 from .lemmatizer import CatalanLemmatizer
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class CatalanDefaults(BaseDefaults):

@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM

-
 _num_words = [
     "zero",
     "un",

@@ -1,9 +1,18 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import LIST_CURRENCY
-from ..char_classes import CURRENCY
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
-from ..char_classes import merge_chars, _units
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    CURRENCY,
+    LIST_CURRENCY,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    PUNCT,
+    _units,
+    merge_chars,
+)

 ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")

@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN
+from typing import Iterator, Tuple, Union
+
 from ...errors import Errors
+from ...symbols import NOUN, PROPN
+from ...tokens import Doc, Span


 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:

@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {}
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS


 class CzechDefaults(BaseDefaults):
@@ -1,9 +1,9 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from ...language import BaseDefaults, Language
+from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class DanishDefaults(BaseDefaults):

@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM

-
 # Source http://fjern-uv.dk/tal.php
 _num_words = """nul
en et to tre fire fem seks syv otte ni ti

@@ -1,8 +1,13 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+)
 from ..punctuation import TOKENIZER_SUFFIXES


 _quotes = CONCAT_QUOTES.replace("'", "")

 _infixes = (

@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN, PRON, VERB, AUX
+from typing import Iterator, Tuple, Union
+
 from ...errors import Errors
+from ...symbols import AUX, NOUN, PRON, PROPN, VERB
+from ...tokens import Doc, Span


 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:

@@ -2,10 +2,9 @@
 Tokenizer Exceptions.
 Source: https://forkortelse.dk/ and various others.
 """
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {}
@@ -1,8 +1,8 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class GermanDefaults(BaseDefaults):

@@ -1,9 +1,18 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
-from ..char_classes import CURRENCY, UNITS, PUNCT
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    CURRENCY,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    PUNCT,
+    UNITS,
+)
 from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES


 _prefixes = ["``"] + BASE_TOKENIZER_PREFIXES

 _suffixes = (

@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union

-from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
 from ...tokens import Doc, Span

@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {
     "auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}],
@@ -1,6 +1,6 @@
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults


 class LowerSorbianDefaults(BaseDefaults):
@@ -1,13 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional

 from thinc.api import Model

-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
 from .lemmatizer import GreekLemmatizer
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class GreekDefaults(BaseDefaults):

@@ -1,5 +1,6 @@
 def get_pos_from_wiktionary():
     import re
+
     from gensim.corpora.wikicorpus import extract_pages

     regex = re.compile(r"==={{(\w+)\|el}}===")

@@ -1,6 +1,16 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
-from ..char_classes import CONCAT_QUOTES, CURRENCY
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    CURRENCY,
+    HYPHENS,
+    LIST_CURRENCY,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+)

 _units = (
     "km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "

@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union

-from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
 from ...tokens import Doc, Span

@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {}
@@ -1,13 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional

 from thinc.api import Model

-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from .punctuation import TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
 from .lemmatizer import EnglishLemmatizer
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class EnglishDefaults(BaseDefaults):

@@ -1,5 +1,12 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    HYPHENS,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+)

 _infixes = (
     LIST_ELLIPSES

@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union

-from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
 from ...tokens import Doc, Span

@@ -1,8 +1,8 @@
 from typing import Dict, List
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
-from ...util import update_exc

+from ...symbols import NORM, ORTH
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc: Dict[str, List[Dict]] = {}
 _exclude = [
@@ -1,12 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional

 from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
+
+from ...language import BaseDefaults, Language
 from .lemmatizer import SpanishLemmatizer
-from .syntax_iterators import SYNTAX_ITERATORS
+from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class SpanishDefaults(BaseDefaults):
Some files were not shown because too many files have changed in this diff.