Mirror of https://github.com/explosion/spaCy.git (synced 2025-08-03 11:50:19 +03:00)
Merge branch 'master' into feature/projects-multiprocessing
Commit cfaa90203e
76  .github/azure-steps.yml (vendored)
@@ -1,31 +1,30 @@
 parameters:
   python_version: ''
-  architecture: ''
-  prefix: ''
-  gpu: false
-  num_build_jobs: 1
+  architecture: 'x64'
+  num_build_jobs: 2

 steps:
   - task: UsePythonVersion@0
     inputs:
       versionSpec: ${{ parameters.python_version }}
       architecture: ${{ parameters.architecture }}
+      allowUnstable: true

   - bash: |
       echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
     displayName: 'Set variables'

   - script: |
-      ${{ parameters.prefix }} python -m pip install -U pip setuptools
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+      python -m pip install -U build pip setuptools
+      python -m pip install -U -r requirements.txt
     displayName: "Install dependencies"

   - script: |
-      ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
-      ${{ parameters.prefix }} python setup.py sdist --formats=gztar
-    displayName: "Compile and build sdist"
+      python -m build --sdist
+    displayName: "Build sdist"

-  - script: python -m mypy spacy
+  - script: |
+      python -m mypy spacy
     displayName: 'Run mypy'
     condition: ne(variables['python_version'], '3.6')

@@ -34,35 +33,24 @@ steps:
     contents: "spacy"
     displayName: "Delete source directory"

   - task: DeleteFiles@1
     inputs:
       contents: "*.egg-info"
     displayName: "Delete egg-info directory"

   - script: |
-      ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
-      ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
+      python -m pip freeze > installed.txt
+      python -m pip uninstall -y -r installed.txt
     displayName: "Uninstall all packages"

   - bash: |
-      ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-      ${{ parameters.prefix }} SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
+      SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+      SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
     displayName: "Install from sdist"

-  - script: |
-      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
-    displayName: "Install test requirements"
-
-  - script: |
-      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
-      ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
-    displayName: "Install GPU requirements"
-    condition: eq(${{ parameters.gpu }}, true)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error
-    displayName: "Run CPU tests"
-    condition: eq(${{ parameters.gpu }}, false)
-
-  - script: |
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy -W error -p spacy.tests.enable_gpu
-    displayName: "Run GPU tests"
-    condition: eq(${{ parameters.gpu }}, true)
+  - script: |
+      python -W error -c "import spacy"
+    displayName: "Test import"

   - script: |
       python -m spacy download ca_core_news_sm

@@ -71,6 +59,11 @@ steps:
     displayName: 'Test download CLI'
     condition: eq(variables['python_version'], '3.8')

+  - script: |
+      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+    displayName: 'Test no warnings on load (#11713)'
+    condition: eq(variables['python_version'], '3.8')
+
   - script: |
       python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
     displayName: 'Test convert CLI'

@@ -105,13 +98,22 @@ steps:
     displayName: 'Test assemble CLI vectors warning'
     condition: eq(variables['python_version'], '3.8')

+  - script: |
+      python -m pip install -U -r requirements.txt
+    displayName: "Install test requirements"
+
+  - script: |
+      python -m pytest --pyargs spacy -W error
+    displayName: "Run CPU tests"
+
+  - script: |
+      python -m pip install 'spacy[apple]'
+      python -m pytest --pyargs spacy
+    displayName: "Run CPU tests with thinc-apple-ops"
+    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
+
   - script: |
       python .github/validate_universe_json.py website/meta/universe.json
     displayName: 'Test website/meta/universe.json'
     condition: eq(variables['python_version'], '3.8')

-  - script: |
-      ${{ parameters.prefix }} python -m pip install --pre thinc-apple-ops
-      ${{ parameters.prefix }} python -m pytest --pyargs spacy
-    displayName: "Run CPU tests with thinc-apple-ops"
-    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10'))
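The 'Set variables' step above works by echoing an Azure Pipelines logging command: the agent scans stdout for `##vso[...]` directives and turns them into pipeline variables. A minimal sketch of the same mechanism from Python (the variable name mirrors the step above; any process output works, not just bash):

# Azure Pipelines parses "##vso[...]" logging commands from stdout, so a
# Python script can set the python_version variable just like the bash step.
print("##vso[task.setvariable variable=python_version]3.10")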
9  .github/workflows/autoblack.yml (vendored)
@@ -12,10 +12,10 @@ jobs:
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
        with:
          ref: ${{ github.head_ref }}
-      - uses: actions/setup-python@v2
+      - uses: actions/setup-python@v4
      - run: pip install black
      - name: Auto-format code if needed
        run: black spacy

@@ -23,10 +23,11 @@ jobs:
      # code and makes GitHub think the action failed
      - name: Check for modified files
        id: git-check
-        run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi)
+        run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT

      - name: Create Pull Request
        if: steps.git-check.outputs.modified == 'true'
-        uses: peter-evans/create-pull-request@v3
+        uses: peter-evans/create-pull-request@v4
        with:
          title: Auto-format code with black
          labels: meta
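GitHub deprecated the `::set-output` workflow command in late 2022; steps now publish outputs by appending `name=value` lines to the file referenced by the `GITHUB_OUTPUT` environment variable, which is exactly what the updated `run:` line does. A hedged Python equivalent of the same step (the output name is illustrative):

import os

# Inside a GitHub Actions step, outputs are written to the file the runner
# exposes via GITHUB_OUTPUT instead of the deprecated ::set-output command.
with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf8") as fh:
    fh.write("modified=true\n")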
6  .github/workflows/explosionbot.yml (vendored)
@@ -8,14 +8,14 @@ on:

 jobs:
   explosion-bot:
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-latest
     steps:
       - name: Dump GitHub context
         env:
           GITHUB_CONTEXT: ${{ toJson(github) }}
         run: echo "$GITHUB_CONTEXT"
-      - uses: actions/checkout@v1
-      - uses: actions/setup-python@v1
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
       - name: Install and run explosion-bot
         run: |
           pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot
8  .github/workflows/lock.yml (vendored)
@@ -15,11 +15,11 @@ jobs:
   action:
     runs-on: ubuntu-latest
     steps:
-      - uses: dessant/lock-threads@v3
+      - uses: dessant/lock-threads@v4
        with:
          process-only: 'issues'
          issue-inactive-days: '30'
-          issue-comment: >
-            This thread has been automatically locked since there
-            has not been any recent activity after it was closed.
+          issue-comment: >
+            This thread has been automatically locked since there
+            has not been any recent activity after it was closed.
+            Please open a new issue for related bugs.
6  .github/workflows/slowtests.yml (vendored)
@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v3
        with:
          ref: ${{ matrix.branch }}
      - name: Get commits from past 24 hours

@@ -23,9 +23,9 @@ jobs:
          today=$(date '+%Y-%m-%d %H:%M:%S')
          yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
          if git log --after="$yesterday" --before="$today" | grep commit ; then
-            echo "::set-output name=run_tests::true"
+            echo run_tests=true >> $GITHUB_OUTPUT
          else
-            echo "::set-output name=run_tests::false"
+            echo run_tests=false >> $GITHUB_OUTPUT
          fi

      - name: Trigger buildkite build
6  .github/workflows/spacy_universe_alert.yml (vendored)
@@ -17,8 +17,10 @@ jobs:
        run: |
          echo "$GITHUB_CONTEXT"

-      - uses: actions/checkout@v1
-      - uses: actions/setup-python@v1
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
      - name: Install Bernadette app dependency and send an alert
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
10  .gitignore (vendored)
@@ -10,16 +10,6 @@ spacy/tests/package/setup.cfg
 spacy/tests/package/pyproject.toml
 spacy/tests/package/requirements.txt

-# Website
-website/.cache/
-website/public/
-website/node_modules
-website/.npm
-website/logs
-*.log
-npm-debug.log*
-quickstart-training-generator.js
-
 # Cython / C extensions
 cythonize.json
 spacy/*.html
.pre-commit-config.yaml

@@ -5,7 +5,7 @@ repos:
      - id: black
        language_version: python3.7
        additional_dependencies: ['click==8.0.4']
-  - repo: https://gitlab.com/pycqa/flake8
+  - repo: https://github.com/pycqa/flake8
    rev: 5.0.4
    hooks:
      - id: flake8
10  README.md
@@ -8,15 +8,15 @@ be used in real products.

 spaCy comes with
 [pretrained pipelines](https://spacy.io/models) and
-currently supports tokenization and training for **60+ languages**. It features
+currently supports tokenization and training for **70+ languages**. It features
 state-of-the-art speed and **neural network models** for tagging,
 parsing, **named entity recognition**, **text classification** and more,
 multi-task learning with pretrained **transformers** like BERT, as well as a
 production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
-open-source software, released under the MIT license.
+open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).

-💫 **Version 3.4.0 out now!**
+💫 **Version 3.5 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)

 [](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)

@@ -46,6 +46,7 @@ open-source software, released under the MIT license.
 | 🛠 **[Changelog]** | Changes and version history. |
 | 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
 | <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
+| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |

 [spacy 101]: https://spacy.io/usage/spacy-101
 [new in v3.0]: https://spacy.io/usage/v3

@@ -59,6 +60,7 @@ open-source software, released under the MIT license.
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md

 ## 💬 Where to ask questions

 The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).

@@ -79,7 +81,7 @@ more people can benefit from it.

 ## Features

-- Support for **60+ languages**
+- Support for **70+ languages**
 - **Trained pipelines** for different languages and tasks
 - Multi-task learning with pretrained **transformers** like BERT
 - Support for pretrained **word vectors** and embeddings
azure-pipelines.yml

@@ -41,7 +41,7 @@ jobs:
      matrix:
        # We're only running one platform per Python version to speed up builds
        Python36Linux:
-          imageName: "ubuntu-latest"
+          imageName: "ubuntu-20.04"
          python.version: "3.6"
        # Python36Windows:
        #   imageName: "windows-latest"

@@ -50,7 +50,7 @@ jobs:
        #   imageName: "macos-latest"
        #   python.version: "3.6"
        # Python37Linux:
-        #   imageName: "ubuntu-latest"
+        #   imageName: "ubuntu-20.04"
        #   python.version: "3.7"
        Python37Windows:
          imageName: "windows-latest"

@@ -76,15 +76,24 @@ jobs:
        # Python39Mac:
        #   imageName: "macos-latest"
        #   python.version: "3.9"
-        Python310Linux:
-          imageName: "ubuntu-latest"
-          python.version: "3.10"
+        # Python310Linux:
+        #   imageName: "ubuntu-latest"
+        #   python.version: "3.10"
        Python310Windows:
          imageName: "windows-latest"
          python.version: "3.10"
-        Python310Mac:
-          imageName: "macos-latest"
-          python.version: "3.10"
+        # Python310Mac:
+        #   imageName: "macos-latest"
+        #   python.version: "3.10"
+        Python311Linux:
+          imageName: 'ubuntu-latest'
+          python.version: '3.11'
+        Python311Windows:
+          imageName: 'windows-latest'
+          python.version: '3.11'
+        Python311Mac:
+          imageName: 'macos-latest'
+          python.version: '3.11'
      maxParallel: 4
    pool:
      vmImage: $(imageName)

@@ -92,20 +101,3 @@ jobs:
    - template: .github/azure-steps.yml
      parameters:
        python_version: '$(python.version)'
-        architecture: 'x64'
-
-  # - job: "TestGPU"
-  #   dependsOn: "Validate"
-  #   strategy:
-  #     matrix:
-  #       Python38LinuxX64_GPU:
-  #         python.version: '3.8'
-  #   pool:
-  #     name: "LinuxX64_GPU"
-  #   steps:
-  #     - template: .github/azure-steps.yml
-  #       parameters:
-  #         python_version: '$(python.version)'
-  #         architecture: 'x64'
-  #         gpu: true
-  #         num_build_jobs: 24
build-constraints.txt

@@ -5,4 +5,5 @@ numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
 numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
 numpy==1.19.3; python_version=='3.9'
 numpy==1.21.3; python_version=='3.10'
-numpy; python_version>='3.11'
+numpy==1.23.2; python_version=='3.11'
+numpy; python_version>='3.12'
requirements.txt

@@ -1,27 +1,28 @@
 # Our libraries
-spacy-legacy>=3.0.10,<3.1.0
+spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 thinc>=8.1.0,<8.2.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.9.1,<1.1.0
+wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
-typer>=0.3.0,<0.5.0
-pathy>=0.3.5
+typer>=0.3.0,<0.8.0
+pathy>=0.10.0
+smart-open>=5.2.1,<7.0.0
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
-pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
+pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
 jinja2
 langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
-typing_extensions>=3.7.4.1,<4.2.0; python_version < "3.8"
+typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
 # Development dependencies
 pre-commit>=2.13.0
 cython>=0.25,<3.0

@@ -30,7 +31,7 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.980,<0.990; platform_machine != "aarch64" and python_version >= "3.7"
+mypy>=0.990,<0.1000; platform_machine != "aarch64" and python_version >= "3.7"
 types-dataclasses>=0.1.3; python_version < "3.7"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
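Several of these pins use PEP 508 environment markers (the `; python_version < "3.8"` suffix), so a single requirements file can express per-interpreter constraints. A small sketch of how such a marker evaluates, using the `packaging` library (already a transitive spaCy dependency):

from packaging.requirements import Requirement

req = Requirement('typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"')
# marker.evaluate() checks the marker against the running interpreter,
# so this prints False on Python 3.8 and newer.
print(req.marker.evaluate())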
16  setup.cfg
@@ -22,6 +22,7 @@ classifiers =
     Programming Language :: Python :: 3.8
     Programming Language :: Python :: 3.9
     Programming Language :: Python :: 3.10
+    Programming Language :: Python :: 3.11
     Topic :: Scientific/Engineering
 project_urls =
     Release notes = https://github.com/explosion/spaCy/releases

@@ -41,27 +42,28 @@ setup_requires =
     thinc>=8.1.0,<8.2.0
 install_requires =
     # Our libraries
-    spacy-legacy>=3.0.10,<3.1.0
+    spacy-legacy>=3.0.11,<3.1.0
     spacy-loggers>=1.0.0,<2.0.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     thinc>=8.1.0,<8.2.0
-    wasabi>=0.9.1,<1.1.0
+    wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     # Third-party dependencies
-    typer>=0.3.0,<0.5.0
-    pathy>=0.3.5
+    typer>=0.3.0,<0.8.0
+    pathy>=0.10.0
+    smart-open>=5.2.1,<7.0.0
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
-    pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
+    pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
     jinja2
     # Official Python utilities
     setuptools
     packaging>=20.0
-    typing_extensions>=3.7.4,<4.2.0; python_version < "3.8"
+    typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
     langcodes>=3.2.0,<4.0.0

 [options.entry_points]

@@ -72,7 +74,7 @@ console_scripts =
 lookups =
     spacy_lookups_data>=1.0.3,<1.1.0
 transformers =
-    spacy_transformers>=1.1.2,<1.2.0
+    spacy_transformers>=1.1.2,<1.3.0
 ray =
     spacy_ray>=0.1.0,<1.0.0
 cuda =
4  setup.py
@@ -30,7 +30,9 @@ MOD_NAMES = [
     "spacy.lexeme",
     "spacy.vocab",
     "spacy.attrs",
-    "spacy.kb",
+    "spacy.kb.candidate",
+    "spacy.kb.kb",
+    "spacy.kb.kb_in_memory",
     "spacy.ml.parser_model",
     "spacy.morphology",
     "spacy.pipeline.dep_parser",
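The compiled `spacy.kb` extension is split here into a package with three Cython submodules. Assuming `spacy/kb/__init__.py` keeps the usual re-exports (that file is not shown in this diff), existing import paths should keep working; a hedged sketch:

# Old-style import, expected to keep working through the package __init__
# (assumption: KnowledgeBase and InMemoryLookupKB are re-exported there):
from spacy.kb import KnowledgeBase, InMemoryLookupKB

# The in-memory implementation itself now lives in a dedicated submodule:
from spacy.kb.kb_in_memory import InMemoryLookupKB as _InMemoryLookupKB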
spacy/about.py

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.4.1"
+__version__ = "3.5.0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
spacy/cli/__init__.py

@@ -4,6 +4,7 @@ from ._util import app, setup_cli  # noqa: F401

 # These are the actual functions, NOT the wrapped CLI commands. The CLI commands
 # are registered automatically and won't have to be imported here.
+from .benchmark_speed import benchmark_speed_cli  # noqa: F401
 from .download import download  # noqa: F401
 from .info import info  # noqa: F401
 from .package import package  # noqa: F401

@@ -16,6 +17,7 @@ from .debug_config import debug_config  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
 from .debug_diff import debug_diff  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
+from .apply import apply  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .init_config import init_config, fill_config  # noqa: F401

@@ -27,6 +29,7 @@ from .project.dvc import project_update_dvc  # noqa: F401
 from .project.push import project_push  # noqa: F401
 from .project.pull import project_pull  # noqa: F401
 from .project.document import project_document  # noqa: F401
+from .find_threshold import find_threshold  # noqa: F401


 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
spacy/cli/_util.py

@@ -27,7 +27,7 @@ from ..util import is_minor_version_match
 from .. import about

 if TYPE_CHECKING:
-    from pathy import Pathy  # noqa: F401
+    from pathy import FluidPath  # noqa: F401


 SDIST_SUFFIX = ".tar.gz"

@@ -50,6 +50,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
 commands to check and validate your config files, training and evaluation data,
 and custom model implementations.
 """
+BENCHMARK_HELP = """Commands for benchmarking pipelines."""
 INIT_HELP = """Commands for initializing configs and pipeline packages."""

 # Wrappers for Typer's annotations. Initially created to set defaults and to

@@ -58,12 +59,14 @@ Arg = typer.Argument
 Opt = typer.Option

 app = typer.Typer(name=NAME, help=HELP)
+benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
 project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
 debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
 init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)

 app.add_typer(project_cli)
 app.add_typer(debug_cli)
+app.add_typer(benchmark_cli)
 app.add_typer(init_cli)


@@ -163,15 +166,15 @@ def load_project_config(
     validate_project_version(config)
     validate_max_parallel_processes(config)
     validate_project_commands(config)
+    if interpolate:
+        err = f"{PROJECT_FILE} validation error"
+        with show_validation_error(title=err, hint_fill=False):
+            config = substitute_project_variables(config, overrides)
     # Make sure directories defined in config exist
     for subdir in config.get("directories", []):
         dir_path = path / subdir
         if not dir_path.exists():
             dir_path.mkdir(parents=True)
-    if interpolate:
-        err = f"{PROJECT_FILE} validation error"
-        with show_validation_error(title=err, hint_fill=False):
-            config = substitute_project_variables(config, overrides)
     return config


@@ -373,7 +376,7 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
     msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)


-def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
+def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None:
     """Upload a file.

     src (Path): The source path.

@@ -381,13 +384,20 @@ def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
     """
     import smart_open

+    # Create parent directories for local paths
+    if isinstance(dest, Path):
+        if not dest.parent.exists():
+            dest.parent.mkdir(parents=True)
+
     dest = str(dest)
     with smart_open.open(dest, mode="wb") as output_file:
         with src.open(mode="rb") as input_file:
             output_file.write(input_file.read())


-def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None:
+def download_file(
+    src: Union[str, "FluidPath"], dest: Path, *, force: bool = False
+) -> None:
     """Download a file using smart_open.

     url (str): The URL of the file.

@@ -400,7 +410,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
     if dest.exists() and not force:
         return None
     src = str(src)
-    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
+    with smart_open.open(src, mode="rb", compression="disable") as input_file:
         with dest.open(mode="wb") as output_file:
             shutil.copyfileobj(input_file, output_file)
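Newer smart_open releases removed the `ignore_ext` keyword in favour of `compression`, which is why `download_file` now passes `compression="disable"` to copy bytes verbatim instead of transparently decompressing by extension. A minimal sketch of the same call (the file name is illustrative):

import smart_open

# compression="disable" keeps smart_open from decompressing .gz/.bz2 files
# based on their extension, so an sdist or model archive is copied as-is.
with smart_open.open("assets/model.tar.gz", mode="rb", compression="disable") as fh:
    data = fh.read()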
@@ -410,7 +420,7 @@ def ensure_pathy(path):
     slow and annoying Google Cloud warning)."""
     from pathy import Pathy  # noqa: F811

-    return Pathy(path)
+    return Pathy.fluid(path)


 def git_checkout(

@@ -690,8 +700,8 @@ def check_deps(cmd: Dict, cmd_name: str, project_dir: Path, dry: bool):
     if not (project_dir / dep).exists():
         err = f"Missing dependency specified by command '{cmd_name}': {dep}"
         err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
-        err_kwargs = {"exits": 1} if not dry else {}
-        msg.fail(err, err_help, **err_kwargs)
+        err_exits = 1 if not dry else None
+        msg.fail(err, err_help, exits=err_exits)


 def _get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]:

@@ -735,6 +745,33 @@ def _get_fileinfo(
     return data


+def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
+    """Given a directory and a suffix, recursively find all files matching the suffix.
+    Directories or files with names beginning with a . are ignored, but hidden flags on
+    filesystems are not checked.
+    When provided with a suffix `None`, there is no suffix-based filtering."""
+    if not path.is_dir():
+        return [path]
+    paths = [path]
+    locs = []
+    seen = set()
+    for path in paths:
+        if str(path) in seen:
+            continue
+        seen.add(str(path))
+        if path.parts[-1].startswith("."):
+            continue
+        elif path.is_dir():
+            paths.extend(path.iterdir())
+        elif suffix is not None and not path.parts[-1].endswith(suffix):
+            continue
+        else:
+            locs.append(path)
+    # It's good to sort these, in case the ordering messes up cache.
+    locs.sort()
+    return locs
+
+
 def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
     """Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s,
     as happens with `round(number, ndigits)`"""
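`walk_directory` moves into `_util` with suffix filtering generalized (the converter-specific copy in `convert.py` is deleted further down). A quick illustrative use, assuming a local `corpus/` directory:

from pathlib import Path
from spacy.cli._util import walk_directory

# All files under corpus/, skipping names that start with a dot:
all_files = walk_directory(Path("corpus"))
# Only .jsonl files; the filter is a plain endswith() check on the name:
jsonl_files = walk_directory(Path("corpus"), suffix="jsonl")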
143  spacy/cli/apply.py (new file)
@@ -0,0 +1,143 @@
import tqdm
import srsly

from itertools import chain
from pathlib import Path
from typing import Optional, List, Iterable, cast, Union

from wasabi import msg

from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory

from ..tokens import Doc, DocBin
from ..vocab import Vocab
from ..util import ensure_path, load_model


path_help = """Location of the documents to predict on.
Can be a single file in .spacy format or a .jsonl file.
Files with other extensions are treated as single plain text documents.
If a directory is provided it is traversed recursively to grab
all files to be processed.
The files can be a mixture of .spacy, .jsonl and text files.
If .jsonl is provided the specified field is going
to be grabbed ("text" by default)."""

out_help = "Path to save the resulting .spacy file"
code_help = (
    "Path to Python file with additional "
    "code (registered functions) to be imported"
)
gold_help = "Use gold preprocessing provided in the .spacy files"
force_msg = (
    "The provided output file already exists. "
    "To force overwriting the output file, set the --force or -F flag."
)


DocOrStrStream = Union[Iterable[str], Iterable[Doc]]


def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
    """
    Stream Doc objects from a DocBin.
    """
    docbin = DocBin().from_disk(path)
    for doc in docbin.get_docs(vocab):
        yield doc


def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
    """
    Stream the "text" field from JSONL. If the field "text" is
    not found, an error is raised.
    """
    for entry in srsly.read_jsonl(path):
        if field not in entry:
            msg.fail(f"{path} does not contain the required '{field}' field.", exits=1)
        else:
            yield entry[field]


def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
    """
    Yields strings from text files in paths.
    """
    for path in paths:
        with open(path, "r") as fin:
            text = fin.read()
            yield text


@app.command("apply")
def apply_cli(
    # fmt: off
    model: str = Arg(..., help="Model name or path"),
    data_path: Path = Arg(..., help=path_help, exists=True),
    output_file: Path = Arg(..., help=out_help, dir_okay=False),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
    text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"),
    force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
    batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
    n_process: int = Opt(1, "--n-process", "-n", help="Number of processors to use.")
    # fmt: on
):
    """
    Apply a trained pipeline to documents to get predictions.
    Expects a loadable spaCy pipeline and a path to the data, which
    can be a directory or a file.
    The data files can be provided in multiple formats:
    1. .spacy files
    2. .jsonl files with a specified "field" to read the text from
    3. Files with any other extension are assumed to contain a single document
    DOCS: https://spacy.io/api/cli#apply
    """
    data_path = ensure_path(data_path)
    output_file = ensure_path(output_file)
    code_path = ensure_path(code_path)
    if output_file.exists() and not force_overwrite:
        msg.fail(force_msg, exits=1)
    if not data_path.exists():
        msg.fail(f"Couldn't find data path: {data_path}", exits=1)
    import_code(code_path)
    setup_gpu(use_gpu)
    apply(data_path, output_file, model, text_key, batch_size, n_process)


def apply(
    data_path: Path,
    output_file: Path,
    model: str,
    json_field: str,
    batch_size: int,
    n_process: int,
):
    docbin = DocBin(store_user_data=True)
    paths = walk_directory(data_path)
    if len(paths) == 0:
        docbin.to_disk(output_file)
        msg.warn(
            "Did not find data to process,"
            f" {data_path} seems to be an empty directory."
        )
        return
    nlp = load_model(model)
    msg.good(f"Loaded model {model}")
    vocab = nlp.vocab
    streams: List[DocOrStrStream] = []
    text_files = []
    for path in paths:
        if path.suffix == ".spacy":
            streams.append(_stream_docbin(path, vocab))
        elif path.suffix == ".jsonl":
            streams.append(_stream_jsonl(path, json_field))
        else:
            text_files.append(path)
    if len(text_files) > 0:
        streams.append(_stream_texts(text_files))
    datagen = cast(DocOrStrStream, chain(*streams))
    for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
        docbin.add(doc)
    if output_file.suffix == "":
        output_file = output_file.with_suffix(".spacy")
    docbin.to_disk(output_file)
174  spacy/cli/benchmark_speed.py (new file)
@@ -0,0 +1,174 @@
from typing import Iterable, List, Optional
import random
from itertools import islice
import numpy
from pathlib import Path
import time
from tqdm import tqdm
import typer
from wasabi import msg

from .. import util
from ..language import Language
from ..tokens import Doc
from ..training import Corpus
from ._util import Arg, Opt, benchmark_cli, setup_gpu


@benchmark_cli.command(
    "speed",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def benchmark_speed_cli(
    # fmt: off
    ctx: typer.Context,
    model: str = Arg(..., help="Model name or path"),
    data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
    batch_size: Optional[int] = Opt(None, "--batch-size", "-b", min=1, help="Override the pipeline batch size"),
    no_shuffle: bool = Opt(False, "--no-shuffle", help="Do not shuffle benchmark data"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
    warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
    # fmt: on
):
    """
    Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
    data in the binary .spacy format.
    """
    setup_gpu(use_gpu=use_gpu, silent=False)

    nlp = util.load_model(model)
    batch_size = batch_size if batch_size is not None else nlp.batch_size
    corpus = Corpus(data_path)
    docs = [eg.predicted for eg in corpus(nlp)]

    if len(docs) == 0:
        msg.fail("Cannot benchmark speed using an empty corpus.", exits=1)

    print(f"Warming up for {warmup_epochs} epochs...")
    warmup(nlp, docs, warmup_epochs, batch_size)

    print()
    print(f"Benchmarking {n_batches} batches...")
    wps = benchmark(nlp, docs, n_batches, batch_size, not no_shuffle)

    print()
    print_outliers(wps)
    print_mean_with_ci(wps)


# Lowercased, behaves as a context manager function.
class time_context:
    """Register the running time of a context."""

    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, type, value, traceback):
        self.elapsed = time.perf_counter() - self.start


class Quartiles:
    """Calculate the q1, q2, q3 quartiles and the inter-quartile range (iqr)
    of a sample."""

    q1: float
    q2: float
    q3: float
    iqr: float

    def __init__(self, sample: numpy.ndarray) -> None:
        self.q1 = numpy.quantile(sample, 0.25)
        self.q2 = numpy.quantile(sample, 0.5)
        self.q3 = numpy.quantile(sample, 0.75)
        self.iqr = self.q3 - self.q1


def annotate(
    nlp: Language, docs: List[Doc], batch_size: Optional[int]
) -> numpy.ndarray:
    docs = nlp.pipe(tqdm(docs, unit="doc"), batch_size=batch_size)
    wps = []
    while True:
        with time_context() as elapsed:
            batch_docs = list(
                islice(docs, batch_size if batch_size else nlp.batch_size)
            )
        if len(batch_docs) == 0:
            break
        n_tokens = count_tokens(batch_docs)
        wps.append(n_tokens / elapsed.elapsed)

    return numpy.array(wps)


def benchmark(
    nlp: Language,
    docs: List[Doc],
    n_batches: int,
    batch_size: int,
    shuffle: bool,
) -> numpy.ndarray:
    if shuffle:
        bench_docs = [
            nlp.make_doc(random.choice(docs).text)
            for _ in range(n_batches * batch_size)
        ]
    else:
        bench_docs = [
            nlp.make_doc(docs[i % len(docs)].text)
            for i in range(n_batches * batch_size)
        ]

    return annotate(nlp, bench_docs, batch_size)


def bootstrap(x, statistic=numpy.mean, iterations=10000) -> numpy.ndarray:
    """Apply a statistic to repeated random samples of an array."""
    return numpy.fromiter(
        (
            statistic(numpy.random.choice(x, len(x), replace=True))
            for _ in range(iterations)
        ),
        numpy.float64,
    )


def count_tokens(docs: Iterable[Doc]) -> int:
    return sum(len(doc) for doc in docs)


def print_mean_with_ci(sample: numpy.ndarray):
    mean = numpy.mean(sample)
    bootstrap_means = bootstrap(sample)
    bootstrap_means.sort()

    # 95% confidence interval
    low = bootstrap_means[int(len(bootstrap_means) * 0.025)]
    high = bootstrap_means[int(len(bootstrap_means) * 0.975)]

    print(f"Mean: {mean:.1f} words/s (95% CI: {low-mean:.1f} +{high-mean:.1f})")


def print_outliers(sample: numpy.ndarray):
    quartiles = Quartiles(sample)

    n_outliers = numpy.sum(
        (sample < (quartiles.q1 - 1.5 * quartiles.iqr))
        | (sample > (quartiles.q3 + 1.5 * quartiles.iqr))
    )
    n_extreme_outliers = numpy.sum(
        (sample < (quartiles.q1 - 3.0 * quartiles.iqr))
        | (sample > (quartiles.q3 + 3.0 * quartiles.iqr))
    )
    print(
        f"Outliers: {(100 * n_outliers) / len(sample):.1f}%, extreme outliers: {(100 * n_extreme_outliers) / len(sample)}%"
    )


def warmup(
    nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
) -> numpy.ndarray:
    docs = warmup_epochs * docs
    return annotate(nlp, docs, batch_size)
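`print_mean_with_ci` derives its 95% interval nonparametrically: `bootstrap` resamples the measured words-per-second values with replacement, and the 2.5th/97.5th percentiles of the resampled means form the interval. A self-contained sketch of the same computation (the sample values are made up):

import numpy

wps = numpy.array([9800.0, 10150.0, 10020.0, 9900.0, 10230.0, 9985.0])
# Resample with replacement and collect the mean of each resample.
means = numpy.fromiter(
    (numpy.mean(numpy.random.choice(wps, len(wps), replace=True))
     for _ in range(10000)),
    numpy.float64,
)
means.sort()
low, high = means[int(len(means) * 0.025)], means[int(len(means) * 0.975)]
print(f"Mean: {wps.mean():.1f} words/s (95% CI: {low - wps.mean():.1f} +{high - wps.mean():.1f})")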
spacy/cli/convert.py

@@ -1,4 +1,4 @@
-from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
+from typing import Callable, Iterable, Mapping, Optional, Any, Union
 from enum import Enum
 from pathlib import Path
 from wasabi import Printer

@@ -7,7 +7,7 @@ import re
 import sys
 import itertools

-from ._util import app, Arg, Opt
+from ._util import app, Arg, Opt, walk_directory
 from ..training import docs_to_json
 from ..tokens import Doc, DocBin
 from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs

@@ -28,6 +28,8 @@ CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = {
     "json": json_to_docs,
 }

+AUTO = "auto"
+
 # File types that can be written to stdout
 FILE_TYPES_STDOUT = ("json",)

@@ -49,7 +51,7 @@ def convert_cli(
     model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
     morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
     merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
-    converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
+    converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
     ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
     lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
     concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),

@@ -70,8 +72,8 @@ def convert_cli(
     output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
     silent = output_dir == "-"
     msg = Printer(no_print=silent)
-    verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
     converter = _get_converter(msg, converter, input_path)
+    verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
     convert(
         input_path,
         output_dir,

@@ -100,7 +102,7 @@ def convert(
     model: Optional[str] = None,
     morphology: bool = False,
     merge_subtokens: bool = False,
-    converter: str = "auto",
+    converter: str,
     ner_map: Optional[Path] = None,
     lang: Optional[str] = None,
     concatenate: bool = False,

@@ -189,33 +191,6 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
     return None


-def walk_directory(path: Path, converter: str) -> List[Path]:
-    if not path.is_dir():
-        return [path]
-    paths = [path]
-    locs = []
-    seen = set()
-    for path in paths:
-        if str(path) in seen:
-            continue
-        seen.add(str(path))
-        if path.parts[-1].startswith("."):
-            continue
-        elif path.is_dir():
-            paths.extend(path.iterdir())
-        elif converter == "json" and not path.parts[-1].endswith("json"):
-            continue
-        elif converter == "conll" and not path.parts[-1].endswith("conll"):
-            continue
-        elif converter == "iob" and not path.parts[-1].endswith("iob"):
-            continue
-        else:
-            locs.append(path)
-    # It's good to sort these, in case the ordering messes up cache.
-    locs.sort()
-    return locs
-
-
 def verify_cli_args(
     msg: Printer,
     input_path: Path,

@@ -239,18 +214,22 @@ def verify_cli_args(
-    input_locs = walk_directory(input_path, converter)
-    if len(input_locs) == 0:
-        msg.fail("No input files in directory", input_path, exits=1)
-    file_types = list(set([loc.suffix[1:] for loc in input_locs]))
-    if converter == "auto" and len(file_types) >= 2:
-        file_types_str = ",".join(file_types)
-        msg.fail("All input files must be same type", file_types_str, exits=1)
-    if converter != "auto" and converter not in CONVERTERS:
+    if converter not in CONVERTERS:
         msg.fail(f"Can't find converter for {converter}", exits=1)


 def _get_converter(msg, converter, input_path: Path):
     if input_path.is_dir():
-        input_path = walk_directory(input_path, converter)[0]
-    if converter == "auto":
+        if converter == AUTO:
+            input_locs = walk_directory(input_path, suffix=None)
+            file_types = list(set([loc.suffix[1:] for loc in input_locs]))
+            if len(file_types) >= 2:
+                file_types_str = ",".join(file_types)
+                msg.fail("All input files must be same type", file_types_str, exits=1)
+            input_path = input_locs[0]
+        else:
+            input_path = walk_directory(input_path, suffix=converter)[0]
+    if converter == AUTO:
         converter = input_path.suffix[1:]
     if converter == "ner" or converter == "iob":
         with input_path.open(encoding="utf8") as file_:
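With the `AUTO` sentinel, converter resolution happens once in `_get_converter` before the CLI arguments are verified: for a directory, all files must share one extension, and `auto` then falls back to the first file's suffix. A condensed sketch of that resolution order (simplified from the function above; paths are illustrative):

from pathlib import Path

def resolve_converter(converter: str, input_path: Path) -> str:
    # Mirrors the final fallback in _get_converter: "auto" resolves to the
    # input file's suffix, so data/train.iob selects the "iob" converter.
    if converter == "auto":
        return input_path.suffix[1:]
    return converter

print(resolve_converter("auto", Path("data/train.iob")))  # -> iob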
spacy/cli/debug_data.py

@@ -13,6 +13,7 @@ from ._util import import_code, debug_cli, _format_number
 from ..training import Example, remove_bilu_prefix
 from ..training.initialize import get_sourced_components
 from ..schemas import ConfigSchemaTraining
+from ..pipeline import TrainablePipe
 from ..pipeline._parser_internals import nonproj
 from ..pipeline._parser_internals.nonproj import DELIMITER
 from ..pipeline import Morphologizer, SpanCategorizer

@@ -934,6 +935,7 @@ def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]:
     labels: Set[str] = set()
     for pipe_name in pipe_names:
         pipe = nlp.get_pipe(pipe_name)
+        assert isinstance(pipe, TrainablePipe)
         labels.update(pipe.labels)
     return labels
spacy/cli/evaluate.py

@@ -7,12 +7,15 @@ from thinc.api import fix_random_seed

 from ..training import Corpus
 from ..tokens import Doc
-from ._util import app, Arg, Opt, setup_gpu, import_code
+from ._util import app, Arg, Opt, setup_gpu, import_code, benchmark_cli
 from ..scorer import Scorer
 from .. import util
 from .. import displacy


+@benchmark_cli.command(
+    "accuracy",
+)
 @app.command("evaluate")
 def evaluate_cli(
     # fmt: off

@@ -36,7 +39,7 @@ def evaluate_cli(
     dependency parses in a HTML file, set as output directory as the
     displacy_path argument.

-    DOCS: https://spacy.io/api/cli#evaluate
+    DOCS: https://spacy.io/api/cli#benchmark-accuracy
     """
     import_code(code_path)
     evaluate(
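Stacking the two decorators registers the same function both as `spacy evaluate` (kept for backwards compatibility) and as the new `spacy benchmark accuracy` sub-command; Typer's `command()` decorator registers the callback and returns the function unchanged, so the decorators compose. A minimal standalone sketch of the pattern:

import typer

app = typer.Typer(name="spacy")
benchmark_cli = typer.Typer(name="benchmark", no_args_is_help=True)
app.add_typer(benchmark_cli)

@benchmark_cli.command("accuracy")
@app.command("evaluate")
def evaluate_cli():
    """Reachable as both `evaluate` and `benchmark accuracy`."""
    typer.echo("evaluating...")

if __name__ == "__main__":
    app()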
233  spacy/cli/find_threshold.py (new file)
@@ -0,0 +1,233 @@
import functools
import operator
from pathlib import Path
import logging
from typing import Optional, Tuple, Any, Dict, List

import numpy
import wasabi.tables

from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer
from ..errors import Errors
from ..training import Corpus
from ._util import app, Arg, Opt, import_code, setup_gpu
from .. import util

_DEFAULTS = {
    "n_trials": 11,
    "use_gpu": -1,
    "gold_preproc": False,
}


@app.command(
    "find-threshold",
    context_settings={"allow_extra_args": False, "ignore_unknown_options": True},
)
def find_threshold_cli(
    # fmt: off
    model: str = Arg(..., help="Model name or path"),
    data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
    pipe_name: str = Arg(..., help="Name of pipe to examine thresholds for"),
    threshold_key: str = Arg(..., help="Key of threshold attribute in component's configuration"),
    scores_key: str = Arg(..., help="Metric to optimize"),
    n_trials: int = Opt(_DEFAULTS["n_trials"], "--n_trials", "-n", help="Number of trials to determine optimal thresholds"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
    verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"),
    # fmt: on
):
    """
    Runs prediction trials for a trained model with varying thresholds to maximize
    the specified metric. The search space for the threshold is traversed linearly
    from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
    (the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
    returns all results).

    This is applicable only for components whose predictions are influenced by
    thresholds - e.g. `textcat_multilabel` and `spancat`, but not `textcat`. Note
    that the full path to the corresponding threshold attribute in the config has to
    be provided.

    DOCS: https://spacy.io/api/cli#find-threshold
    """

    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    import_code(code_path)
    find_threshold(
        model=model,
        data_path=data_path,
        pipe_name=pipe_name,
        threshold_key=threshold_key,
        scores_key=scores_key,
        n_trials=n_trials,
        use_gpu=use_gpu,
        gold_preproc=gold_preproc,
        silent=False,
    )


def find_threshold(
    model: str,
    data_path: Path,
    pipe_name: str,
    threshold_key: str,
    scores_key: str,
    *,
    n_trials: int = _DEFAULTS["n_trials"],  # type: ignore
    use_gpu: int = _DEFAULTS["use_gpu"],  # type: ignore
    gold_preproc: bool = _DEFAULTS["gold_preproc"],  # type: ignore
    silent: bool = True,
) -> Tuple[float, float, Dict[float, float]]:
    """
    Runs prediction trials for models with varying thresholds to maximize the specified metric.
    model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory.
    data_path (Path): Path to file with DocBin with docs to use for threshold search.
    pipe_name (str): Name of pipe to examine thresholds for.
    threshold_key (str): Key of threshold attribute in component's configuration.
    scores_key (str): Name of the score metric to optimize.
    n_trials (int): Number of trials to determine optimal thresholds.
    use_gpu (int): GPU ID or -1 for CPU.
    gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the
        tokenization, and may result in sequences of more consistent length. However, it may reduce runtime accuracy due
        to train/test skew.
    silent (bool): Whether to print non-error-related output to stdout.
    RETURNS (Tuple[float, float, Dict[float, float]]): Best found threshold, the corresponding score, scores for all
        evaluated thresholds.
    """

    setup_gpu(use_gpu, silent=silent)
    data_path = util.ensure_path(data_path)
    if not data_path.exists():
        wasabi.msg.fail("Evaluation data not found", data_path, exits=1)
    nlp = util.load_model(model)

    if pipe_name not in nlp.component_names:
        raise AttributeError(
            Errors.E001.format(name=pipe_name, opts=nlp.component_names)
        )
    pipe = nlp.get_pipe(pipe_name)
    if not hasattr(pipe, "scorer"):
        raise AttributeError(Errors.E1045)

    if type(pipe) == TextCategorizer:
        wasabi.msg.warn(
            "The `textcat` component doesn't use a threshold as it's not applicable to the concept of "
            "exclusive classes. All thresholds will yield the same results."
        )

    if not silent:
        wasabi.msg.info(
            title=f"Optimizing for {scores_key} for component '{pipe_name}' with {n_trials} "
            f"trials."
        )

    # Load evaluation corpus.
    corpus = Corpus(data_path, gold_preproc=gold_preproc)
    dev_dataset = list(corpus(nlp))
    config_keys = threshold_key.split(".")

    def set_nested_item(
        config: Dict[str, Any], keys: List[str], value: float
    ) -> Dict[str, Any]:
        """Set item in nested dictionary. Adapted from https://stackoverflow.com/a/54138200.
        config (Dict[str, Any]): Configuration dictionary.
        keys (List[Any]): Path to value to set.
        value (float): Value to set.
        RETURNS (Dict[str, Any]): Updated dictionary.
        """
        functools.reduce(operator.getitem, keys[:-1], config)[keys[-1]] = value
        return config

    def filter_config(
        config: Dict[str, Any], keys: List[str], full_key: str
    ) -> Dict[str, Any]:
        """Filters provided config dictionary so that only the specified keys path remains.
        config (Dict[str, Any]): Configuration dictionary.
        keys (List[Any]): Path to value to set.
        full_key (str): Full user-specified key.
        RETURNS (Dict[str, Any]): Filtered dictionary.
        """
        if keys[0] not in config:
            wasabi.msg.fail(
                title=f"Failed to look up `{full_key}` in config: sub-key {[keys[0]]} not found.",
                text=f"Make sure you specified {[keys[0]]} correctly. The following sub-keys are available instead: "
                f"{list(config.keys())}",
                exits=1,
            )
        return {
            keys[0]: filter_config(config[keys[0]], keys[1:], full_key)
            if len(keys) > 1
            else config[keys[0]]
        }

    # Evaluate with varying threshold values.
    scores: Dict[float, float] = {}
    config_keys_full = ["components", pipe_name, *config_keys]
    table_col_widths = (10, 10)
    thresholds = numpy.linspace(0, 1, n_trials)
    print(wasabi.tables.row(["Threshold", f"{scores_key}"], widths=table_col_widths))
    for threshold in thresholds:
        # Reload pipeline with overrides specifying the new threshold.
        nlp = util.load_model(
            model,
            config=set_nested_item(
                filter_config(
                    nlp.config, config_keys_full, ".".join(config_keys_full)
                ).copy(),
                config_keys_full,
                threshold,
            ),
        )
        if hasattr(pipe, "cfg"):
            setattr(
                nlp.get_pipe(pipe_name),
                "cfg",
                set_nested_item(getattr(pipe, "cfg"), config_keys, threshold),
            )

        eval_scores = nlp.evaluate(dev_dataset)
        if scores_key not in eval_scores:
            wasabi.msg.fail(
                title=f"Failed to look up score `{scores_key}` in evaluation results.",
                text=f"Make sure you specified the correct value for `scores_key`. The following scores are "
                f"available: {list(eval_scores.keys())}",
                exits=1,
            )
        scores[threshold] = eval_scores[scores_key]

        if not isinstance(scores[threshold], (float, int)):
            wasabi.msg.fail(
                f"Returned score for key '{scores_key}' is not numeric. Threshold optimization only works for numeric "
                f"scores.",
                exits=1,
            )
        print(
            wasabi.row(
                [round(threshold, 3), round(scores[threshold], 3)],
                widths=table_col_widths,
            )
        )

    best_threshold = max(scores.keys(), key=(lambda key: scores[key]))

    # If all scores are identical, emit warning.
    if len(set(scores.values())) == 1:
        wasabi.msg.warn(
            title="All scores are identical. Verify that all settings are correct.",
            text=""
            if (
                not isinstance(pipe, MultiLabel_TextCategorizer)
                or scores_key in ("cats_macro_f", "cats_micro_f")
            )
            else "Use `cats_macro_f` or `cats_micro_f` when optimizing the threshold for `textcat_multilabel`.",
        )

    else:
        if not silent:
            print(
                f"\nBest threshold: {round(best_threshold, ndigits=4)} with {scores_key} value of {scores[best_threshold]}."
            )

    return best_threshold, scores[best_threshold], scores
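`find_threshold` sweeps `n_trials` evenly spaced cutoffs over [0, 1], reloads the pipeline with each override and keeps the best-scoring one. Calling it from Python returns the full sweep, not just the winner; a hedged sketch (model and data paths are illustrative):

from spacy.cli.find_threshold import find_threshold

best, best_score, all_scores = find_threshold(
    model="training/model-best",   # illustrative path
    data_path="corpus/dev.spacy",  # illustrative path
    pipe_name="textcat_multilabel",
    threshold_key="threshold",
    scores_key="cats_macro_f",
    n_trials=11,
)
print(best, best_score)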
spacy/cli/project/assets.py

@@ -189,7 +189,11 @@ def convert_asset_url(url: str) -> str:
     RETURNS (str): The converted URL.
     """
     # If the asset URL is a regular GitHub URL it's likely a mistake
-    if re.match(r"(http(s?)):\/\/github.com", url) and "releases/download" not in url:
+    if (
+        re.match(r"(http(s?)):\/\/github.com", url)
+        and "releases/download" not in url
+        and "/raw/" not in url
+    ):
         converted = url.replace("github.com", "raw.githubusercontent.com")
         converted = re.sub(r"/(tree|blob)/", "/", converted)
         msg.warn(
@@ -25,6 +25,7 @@ def project_update_dvc_cli(
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
    quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
    # fmt: on
):

@@ -36,7 +37,7 @@ def project_update_dvc_cli(

    DOCS: https://spacy.io/api/cli#project-dvc
    """
    project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
    project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)


def project_update_dvc(

@@ -44,6 +45,7 @@ def project_update_dvc(
    workflow: Optional[str] = None,
    *,
    verbose: bool = False,
    quiet: bool = False,
    force: bool = False,
) -> None:
    """Update the auto-generated Data Version Control (DVC) config file. A DVC

@@ -54,11 +56,12 @@ def project_update_dvc(
    workflow (Optional[str]): Optional name of workflow defined in project.yml.
        If not set, the first workflow will be used.
    verbose (bool): Print more info.
    quiet (bool): Print less info.
    force (bool): Force update DVC config.
    """
    config = load_project_config(project_dir)
    updated = update_dvc_config(
        project_dir, config, workflow, verbose=verbose, force=force
        project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
    )
    help_msg = "To execute the workflow with DVC, run: dvc repro"
    if updated:

@@ -72,7 +75,7 @@ def update_dvc_config(
    config: Dict[str, Any],
    workflow: Optional[str] = None,
    verbose: bool = False,
    silent: bool = False,
    quiet: bool = False,
    force: bool = False,
) -> bool:
    """Re-run the DVC commands in dry mode and update dvc.yaml file in the

@@ -83,7 +86,7 @@ def update_dvc_config(
    path (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project.yml.
    verbose (bool): Whether to print additional info (via DVC).
    silent (bool): Don't output anything (via DVC).
    quiet (bool): Don't output anything (via DVC).
    force (bool): Force update, even if hashes match.
    RETURNS (bool): Whether the DVC config file was updated.
    """

@@ -105,6 +108,14 @@ def update_dvc_config(
        dvc_config_path.unlink()
    dvc_commands = []
    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}

    # some flags that apply to every command
    flags = []
    if verbose:
        flags.append("--verbose")
    if quiet:
        flags.append("--quiet")

    for name in workflows[workflow]:
        if isinstance(name, dict) and "parallel" in name:
            msg.fail(

@@ -123,19 +134,26 @@ def update_dvc_config(
        deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
        outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
        outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
        dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]

        dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
        if command.get("no_skip"):
            dvc_cmd.append("--always-changed")
        full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
        dvc_commands.append(join_command(full_cmd))
    if len(dvc_commands) == 0:

    if not dvc_commands:
        # If we don't check for this, then there will be an error when reading the
        # config, since DVC wouldn't create it.
        msg.fail(
            f"A DVC workflow must have at least one dependency or output",
            "No usable commands for DVC found. This can happen if none of your "
            "commands have dependencies or outputs.",
            exits=1,
        )

    with working_dir(path):
        dvc_flags = {"--verbose": verbose, "--quiet": silent}
        run_dvc_commands(dvc_commands, flags=dvc_flags)
        for c in dvc_commands:
            dvc_command = "dvc " + c
            run_command(dvc_command)
    with dvc_config_path.open("r+", encoding="utf8") as f:
        content = f.read()
        f.seek(0, 0)

@@ -143,26 +161,6 @@ def update_dvc_config(
    return True


def run_dvc_commands(
    commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
) -> None:
    """Run a sequence of DVC commands in a subprocess, in order.

    commands (List[str]): The string commands without the leading "dvc".
    flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
        easier to pass flags like --quiet that depend on a variable or
        command-line setting while avoiding lots of nested conditionals.
    """
    for c in commands:
        command = split_command(c)
        dvc_command = ["dvc", *command]
        # Add the flags if they are set to True
        for flag, is_active in flags.items():
            if is_active:
                dvc_command.append(flag)
        run_command(dvc_command)


def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
    """Validate workflows provided in project.yml and check that a given
    workflow can be used to generate a DVC config.
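
To make the flag handling above concrete, here is a self-contained sketch of how the shared flags end up in every generated command line (shlex.join stands in for the join_command helper used by the real module; the command name and paths are hypothetical):

import shlex

verbose, quiet = False, True
flags = []
if verbose:
    flags.append("--verbose")
if quiet:
    flags.append("--quiet")

# Hypothetical command with one dependency and one output.
dvc_cmd = ["run", *flags, "-n", "preprocess", "-w", ".", "--no-exec"]
full_cmd = [*dvc_cmd, "-d", "assets/raw.txt", "-o", "corpus/train.spacy",
            "python", "scripts/preprocess.py"]
print("dvc " + shlex.join(full_cmd))
# One line: dvc run --quiet -n preprocess -w . --no-exec -d assets/raw.txt
#           -o corpus/train.spacy python scripts/preprocess.py
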
@@ -178,7 +178,7 @@ class _ParallelCommand:
    def change_state(self, new_state: str) -> None:
        if new_state not in self.state.transitions:
            raise RuntimeError(
                Errors.E1044.format(old_state=self.state.name, new_state=new_state)
                Errors.E1051.format(old_state=self.state.name, new_state=new_state)
            )
        self.state = _ParallelCommand.state_dict[new_state]
@@ -5,14 +5,17 @@ import hashlib
import urllib.parse
import tarfile
from pathlib import Path
from wasabi import msg

from .._util import get_hash, get_checksum, download_file, ensure_pathy
from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
from .._util import get_hash, get_checksum, upload_file, download_file
from .._util import ensure_pathy, make_tempdir
from ...util import get_minor_version, ENV_VARS, check_bool_env_var
from ...git_info import GIT_VERSION
from ... import about
from ...errors import Errors

if TYPE_CHECKING:
    from pathy import Pathy  # noqa: F401
    from pathy import FluidPath  # noqa: F401


class RemoteStorage:

@@ -27,7 +30,7 @@ class RemoteStorage:
        self.url = ensure_pathy(url)
        self.compression = compression

    def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
    def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
        """Compress a file or directory within a project and upload it to a remote
        storage. If an object exists at the full URL, nothing is done.

@@ -48,9 +51,7 @@ class RemoteStorage:
            mode_string = f"w:{self.compression}" if self.compression else "w"
            with tarfile.open(tar_loc, mode=mode_string) as tar_file:
                tar_file.add(str(loc), arcname=str(path))
            with tar_loc.open(mode="rb") as input_file:
                with url.open(mode="wb") as output_file:
                    output_file.write(input_file.read())
            upload_file(tar_loc, url)
        return url

    def pull(

@@ -59,7 +60,7 @@ class RemoteStorage:
        *,
        command_hash: Optional[str] = None,
        content_hash: Optional[str] = None,
    ) -> Optional["Pathy"]:
    ) -> Optional["FluidPath"]:
        """Retrieve a file from the remote cache. If the file already exists,
        nothing is done.

@@ -84,7 +85,23 @@ class RemoteStorage:
                with tarfile.open(tar_loc, mode=mode_string) as tar_file:
                    # This requires that the path is added correctly, relative
                    # to root. This is how we set things up in push()
                    tar_file.extractall(self.root)

                    # Disallow paths outside the current directory for the tar
                    # file (CVE-2007-4559, directory traversal vulnerability)
                    def is_within_directory(directory, target):
                        abs_directory = os.path.abspath(directory)
                        abs_target = os.path.abspath(target)
                        prefix = os.path.commonprefix([abs_directory, abs_target])
                        return prefix == abs_directory

                    def safe_extract(tar, path):
                        for member in tar.getmembers():
                            member_path = os.path.join(path, member.name)
                            if not is_within_directory(path, member_path):
                                raise ValueError(Errors.E852)
                        tar.extractall(path)

                    safe_extract(tar_file, self.root)
        return url

    def find(

@@ -93,25 +110,37 @@ class RemoteStorage:
        *,
        command_hash: Optional[str] = None,
        content_hash: Optional[str] = None,
    ) -> Optional["Pathy"]:
    ) -> Optional["FluidPath"]:
        """Find the best matching version of a file within the storage,
        or `None` if no match can be found. If both the creation and content hash
        are specified, only exact matches will be returned. Otherwise, the most
        recent matching file is preferred.
        """
        name = self.encode_name(str(path))
        urls = []
        if command_hash is not None and content_hash is not None:
            url = self.make_url(path, command_hash, content_hash)
            url = self.url / name / command_hash / content_hash
            urls = [url] if url.exists() else []
        elif command_hash is not None:
            urls = list((self.url / name / command_hash).iterdir())
            if (self.url / name / command_hash).exists():
                urls = list((self.url / name / command_hash).iterdir())
        else:
            urls = list((self.url / name).iterdir())
            if content_hash is not None:
                urls = [url for url in urls if url.parts[-1] == content_hash]
            if (self.url / name).exists():
                for sub_dir in (self.url / name).iterdir():
                    urls.extend(sub_dir.iterdir())
                if content_hash is not None:
                    urls = [url for url in urls if url.parts[-1] == content_hash]
        if len(urls) >= 2:
            try:
                urls.sort(key=lambda x: x.stat().last_modified)  # type: ignore
            except Exception:
                msg.warn(
                    "Unable to sort remote files by last modified. The file(s) "
                    "pulled from the cache may not be the most recent."
                )
        return urls[-1] if urls else None

    def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
    def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
        """Construct a URL from a subpath, a creation hash and a content hash."""
        return self.url / self.encode_name(str(path)) / command_hash / content_hash
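
The pull() change above guards tar extraction against directory traversal (CVE-2007-4559). The same check as a standalone sketch that can be tested outside spaCy (the archive name is hypothetical):

import os
import tarfile

def is_within_directory(directory: str, target: str) -> bool:
    abs_directory = os.path.abspath(directory)
    abs_target = os.path.abspath(target)
    return os.path.commonprefix([abs_directory, abs_target]) == abs_directory

def safe_extract(tar: tarfile.TarFile, path: str) -> None:
    # Reject any member whose resolved path would escape `path`,
    # e.g. names like "../../etc/passwd".
    for member in tar.getmembers():
        if not is_within_directory(path, os.path.join(path, member.name)):
            raise ValueError("tar file attempted an unsafe path traversal")
    tar.extractall(path)

with tarfile.open("cache.tar.gz", mode="r:gz") as tar_file:
    safe_extract(tar_file, "unpacked")
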
@@ -51,6 +51,7 @@ def project_run(
    force: bool = False,
    dry: bool = False,
    capture: bool = False,
    skip_requirements_check: bool = False,
) -> None:
    """Run a named script defined in the project.yml. If the script is part
    of the default pipeline (defined in the "run" section), DVC is used to

@@ -67,6 +68,7 @@ def project_run(
    sys.exit will be called with the return code. You should use capture=False
    when you want to turn over execution to the command, and capture=True
    when you want to run the command more like a function.
    skip_requirements_check (bool): Whether to skip the requirements check.
    """
    config = load_project_config(project_dir, overrides=overrides)
    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}

@@ -74,26 +76,28 @@ def project_run(
    validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)

    req_path = project_dir / "requirements.txt"
    if config.get("check_requirements", True) and os.path.exists(req_path):
        with req_path.open() as requirements_file:
            _check_requirements([req.replace("\n", "") for req in requirements_file])
    if not skip_requirements_check:
        if config.get("check_requirements", True) and os.path.exists(req_path):
            with req_path.open() as requirements_file:
                _check_requirements([req.strip() for req in requirements_file])

    if subcommand in workflows:
        msg.info(f"Running workflow '{subcommand}'")
        for workflow_item in workflows[subcommand]:
            if isinstance(workflow_item, str):
        for cmd in workflows[subcommand]:
            if isinstance(cmd, str):
                project_run(
                    project_dir,
                    workflow_item,
                    cmd,
                    overrides=overrides,
                    force=force,
                    dry=dry,
                    capture=capture,
                    skip_requirements_check=True,
                )
            else:
                assert isinstance(workflow_item, dict)
                assert len(workflow_item) == 1
                cmds = workflow_item["parallel"]
                assert isinstance(cmd, dict)
                assert len(cmd) == 1
                cmds = cmd["parallel"]
                assert isinstance(cmds[0], str)
                project_run_parallel_group(
                    project_dir,

@@ -289,6 +293,12 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
            failed_pkgs_msgs.append(dnf.report())
        except pkg_resources.VersionConflict as vc:
            conflicting_pkgs_msgs.append(vc.report())
        except Exception:
            msg.warn(
                f"Unable to check requirement: {req} "
                "Checks are currently limited to requirement specifiers "
                "(PEP 508)"
            )

    if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs):
        msg.warn(
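
For reference, a sketch of the loaded project.yml structure that the workflow loop above dispatches on: plain strings are run sequentially via a recursive project_run() call (with the requirements check skipped), while dicts with a "parallel" key are handed to project_run_parallel_group(). The command names are invented for illustration:

config = {
    "workflows": {
        "all": [
            "preprocess",                                  # str: sequential
            {"parallel": ["train", "evaluate-baseline"]},  # dict: parallel group
            "package",
        ]
    }
}

for cmd in config["workflows"]["all"]:
    if isinstance(cmd, str):
        print(f"run sequentially: {cmd}")
    else:
        print(f"run in parallel: {', '.join(cmd['parallel'])}")
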
@@ -1,7 +1,7 @@
{# This is a template for training configs used for the quickstart widget in
the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #}
{%- set use_transformer = hardware != "cpu" -%}
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
[paths]
@@ -37,6 +37,15 @@ bn:
  accuracy:
    name: sagorsarker/bangla-bert-base
    size_factor: 3
ca:
  word_vectors: null
  transformer:
    efficiency:
      name: projecte-aina/roberta-base-ca-v2
      size_factor: 3
    accuracy:
      name: projecte-aina/roberta-base-ca-v2
      size_factor: 3
da:
  word_vectors: da_core_news_lg
  transformer:
@@ -90,6 +90,8 @@ dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
# Optional callback before nlp object is saved to disk after training
before_to_disk = null
# Optional callback that is invoked at the start of each training step
before_update = null

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
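
A hedged sketch of wiring the new before_update hook from user code; the registered name and the logging logic are invented for illustration, and the exact payload passed to the callback (a dict with the current step and epoch) is an assumption based on the training loop:

import spacy

@spacy.registry.callbacks("customize_before_update.v1")
def make_before_update():
    def before_update(nlp, args):
        # Runs at the start of every training step; `args` carries
        # the current step and epoch.
        if args["step"] == 0:
            print(f"starting epoch {args['epoch']}")
    return before_update

In the config it would then be referenced as [training.before_update] with @callbacks = "customize_before_update.v1".
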
@@ -11,6 +11,7 @@ from .render import DependencyRenderer, EntityRenderer, SpanRenderer
from ..tokens import Doc, Span
from ..errors import Errors, Warnings
from ..util import is_in_jupyter
from ..util import find_available_port


_html = {}

@@ -36,7 +37,7 @@ def render(
    jupyter (bool): Override Jupyter auto-detection.
    options (dict): Visualiser-specific options, e.g. colors.
    manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
    RETURNS (str): Rendered HTML markup.
    RETURNS (str): Rendered SVG or HTML markup.

    DOCS: https://spacy.io/api/top-level#displacy.render
    USAGE: https://spacy.io/usage/visualizers

@@ -82,6 +83,7 @@ def serve(
    manual: bool = False,
    port: int = 5000,
    host: str = "0.0.0.0",
    auto_select_port: bool = False,
) -> None:
    """Serve displaCy visualisation.

@@ -93,12 +95,15 @@ def serve(
    manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
    port (int): Port to serve visualisation.
    host (str): Host to serve visualisation.
    auto_select_port (bool): Automatically select a port if the specified port is in use.

    DOCS: https://spacy.io/api/top-level#displacy.serve
    USAGE: https://spacy.io/usage/visualizers
    """
    from wsgiref import simple_server

    port = find_available_port(port, host, auto_select_port)

    if is_in_jupyter():
        warnings.warn(Warnings.W011)
    render(docs, style=style, page=page, minify=minify, options=options, manual=manual)

@@ -228,12 +233,13 @@ def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
            "kb_id": span.kb_id_ if span.kb_id_ else "",
            "kb_url": kb_url_template.format(span.kb_id_) if kb_url_template else "#",
        }
        for span in doc.spans[spans_key]
        for span in doc.spans.get(spans_key, [])
    ]
    tokens = [token.text for token in doc]

    if not spans:
        warnings.warn(Warnings.W117.format(spans_key=spans_key))
        keys = list(doc.spans.keys())
        warnings.warn(Warnings.W117.format(spans_key=spans_key, keys=keys))
    title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
    settings = get_doc_settings(doc)
    return {
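
Usage of the new auto_select_port option added above: if the requested port is taken, displaCy falls back to the nearest free port and emits W124 instead of failing with E1050:

import spacy
from spacy import displacy

nlp = spacy.blank("en")
doc = nlp("This is a sentence.")
# Blocks and serves on port 5000, or the nearest available port.
displacy.serve(doc, style="dep", port=5000, auto_select_port=True)
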
@@ -94,7 +94,7 @@ class SpanRenderer:
        parsed (list): Dependency parses to render.
        page (bool): Render parses wrapped as full HTML page.
        minify (bool): Minify HTML markup.
        RETURNS (str): Rendered HTML markup.
        RETURNS (str): Rendered SVG or HTML markup.
        """
        rendered = []
        for i, p in enumerate(parsed):

@@ -510,7 +510,7 @@ class EntityRenderer:
        parsed (list): Dependency parses to render.
        page (bool): Render parses wrapped as full HTML page.
        minify (bool): Minify HTML markup.
        RETURNS (str): Rendered HTML markup.
        RETURNS (str): Rendered SVG or HTML markup.
        """
        rendered = []
        for i, p in enumerate(parsed):
@@ -199,7 +199,7 @@ class Warnings(metaclass=ErrorsWithCodes):
    W117 = ("No spans to visualize found in Doc object with spans_key: '{spans_key}'. If this is "
            "surprising to you, make sure the Doc was processed using a model "
            "that supports span categorization, and check the `doc.spans[spans_key]` "
            "property manually if necessary.")
            "property manually if necessary.\n\nAvailable keys: {keys}")
    W118 = ("Term '{term}' not found in glossary. It may however be explained in documentation "
            "for the corpora used to train the language. Please check "
            "`nlp.meta[\"sources\"]` for any relevant links.")

@@ -212,8 +212,9 @@ class Warnings(metaclass=ErrorsWithCodes):
    W121 = ("Attempting to trace non-existent method '{method}' in pipe '{pipe}'")
    W122 = ("Couldn't trace method '{method}' in pipe '{pipe}'. This can happen if the pipe class "
            "is a Cython extension type.")
    W123 = ("Argument {arg} with value {arg_value} is used instead of {config_value} as specified in the config. Be "
            "aware that this might affect other components in your pipeline.")
    W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
            "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
    W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")


class Errors(metaclass=ErrorsWithCodes):

@@ -345,6 +346,11 @@ class Errors(metaclass=ErrorsWithCodes):
            "clear the existing vectors and resize the table.")
    E074 = ("Error interpreting compiled match pattern: patterns are expected "
            "to end with the attribute {attr}. Got: {bad_attr}.")
    E079 = ("Error computing states in beam: number of predicted beams "
            "({pbeams}) does not equal number of gold beams ({gbeams}).")
    E080 = ("Duplicate state found in beam: {key}.")
    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
            "does not equal number of losses ({losses}).")
    E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
            "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
            "match.")

@@ -540,8 +546,14 @@ class Errors(metaclass=ErrorsWithCodes):
    E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
    E200 = ("Can't set {attr} from Span.")
    E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
    E203 = ("If the {name} embedding layer is not updated "
            "during training, make sure to include it in 'annotating components'")

    # New errors added in v3.x
    E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
            "but found value of '{val}'.")
    E852 = ("The tar file pulled from the remote attempted an unsafe path "
            "traversal.")
    E853 = ("Unsupported component factory name '{name}'. The character '.' is "
            "not permitted in factory names.")
    E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "

@@ -711,9 +723,9 @@ class Errors(metaclass=ErrorsWithCodes):
            "`nlp.enable_pipe` instead.")
    E927 = ("Can't write to frozen list. Maybe you're trying to modify a computed "
            "property or default function argument?")
    E928 = ("A KnowledgeBase can only be serialized to/from a directory, "
    E928 = ("An InMemoryLookupKB can only be serialized to/from a directory, "
            "but the provided argument {loc} points to a file.")
    E929 = ("Couldn't read KnowledgeBase from {loc}. The path does not seem to exist.")
    E929 = ("Couldn't read InMemoryLookupKB from {loc}. The path does not seem to exist.")
    E930 = ("Received invalid get_examples callback in `{method}`. "
            "Expected function that returns an iterable of Example objects but "
            "got: {obj}")

@@ -944,8 +956,20 @@ class Errors(metaclass=ErrorsWithCodes):
            "case pass an empty list for the previously not specified argument to avoid this error.")
    E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
             "{value}.")
    E1044 = ("Illegal transition from {old_state} to {new_state}.")

    E1044 = ("Expected `candidates_batch_size` to be >= 1, but got: {value}")
    E1045 = ("Encountered {parent} subclass without `{parent}.{method}` "
             "method in '{name}'. If you want to use this method, make "
             "sure it's overwritten on the subclass.")
    E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
             "knowledge base, use `InMemoryLookupKB`.")
    E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.")
    E1048 = ("Got '{unexpected}' as console progress bar type, but expected one of the following: {expected}")
    E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
             "with `displacy.serve(doc, port=port)`")
    E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
             "or use `auto_select_port=True` to pick an available port automatically.")
    E1051 = ("Illegal transition from {old_state} to {new_state}.")


# Deprecated model shortcuts, only used in errors and warnings
OLD_MODEL_SHORTCUTS = {
spacy/kb/__init__.py (new file, 3 lines)

@@ -0,0 +1,3 @@
from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB
from .candidate import Candidate, get_candidates, get_candidates_batch
spacy/kb/candidate.pxd (new file, 12 lines)

@@ -0,0 +1,12 @@
from .kb cimport KnowledgeBase
from libcpp.vector cimport vector
from ..typedefs cimport hash_t

# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
cdef class Candidate:
    cdef readonly KnowledgeBase kb
    cdef hash_t entity_hash
    cdef float entity_freq
    cdef vector[float] entity_vector
    cdef hash_t alias_hash
    cdef float prior_prob
spacy/kb/candidate.pyx (new file, 74 lines)

@@ -0,0 +1,74 @@
# cython: infer_types=True, profile=True

from typing import Iterable
from .kb cimport KnowledgeBase
from ..tokens import Span

cdef class Candidate:
    """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
    to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
    algorithm which will disambiguate the various candidates to the correct one.
    Each candidate (alias, entity) pair is assigned a certain prior probability.

    DOCS: https://spacy.io/api/kb/#candidate-init
    """

    def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
        self.kb = kb
        self.entity_hash = entity_hash
        self.entity_freq = entity_freq
        self.entity_vector = entity_vector
        self.alias_hash = alias_hash
        self.prior_prob = prior_prob

    @property
    def entity(self) -> int:
        """RETURNS (uint64): hash of the entity's KB ID/name"""
        return self.entity_hash

    @property
    def entity_(self) -> str:
        """RETURNS (str): ID/name of this entity in the KB"""
        return self.kb.vocab.strings[self.entity_hash]

    @property
    def alias(self) -> int:
        """RETURNS (uint64): hash of the alias"""
        return self.alias_hash

    @property
    def alias_(self) -> str:
        """RETURNS (str): ID of the original alias"""
        return self.kb.vocab.strings[self.alias_hash]

    @property
    def entity_freq(self) -> float:
        return self.entity_freq

    @property
    def entity_vector(self) -> Iterable[float]:
        return self.entity_vector

    @property
    def prior_prob(self) -> float:
        return self.prior_prob


def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
    """
    Return candidate entities for a given mention by fetching appropriate entries from the index.
    kb (KnowledgeBase): Knowledge base to query.
    mention (Span): Entity mention for which to identify candidates.
    RETURNS (Iterable[Candidate]): Identified candidates.
    """
    return kb.get_candidates(mention)


def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
    """
    Return candidate entities for the given mentions by fetching appropriate entries from the index.
    kb (KnowledgeBase): Knowledge base to query.
    mentions (Iterable[Span]): Entity mentions for which to identify candidates.
    RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
    """
    return kb.get_candidates_batch(mentions)
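
A small usage sketch of the module-level helpers above, paired with the in-memory KB added in this commit; the entity ID and alias are invented for illustration:

import spacy
from spacy.kb import InMemoryLookupKB, get_candidates

nlp = spacy.blank("en")
kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[0.8])

doc = nlp("Douglas wrote books.")
for candidate in get_candidates(kb, doc[0:1]):  # mention is a Span
    print(candidate.entity_, candidate.alias_, candidate.prior_prob)
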
spacy/kb/kb.pxd (new file, 10 lines)

@@ -0,0 +1,10 @@
"""Knowledge-base for entity or concept linking."""

from cymem.cymem cimport Pool
from libc.stdint cimport int64_t
from ..vocab cimport Vocab

cdef class KnowledgeBase:
    cdef Pool mem
    cdef readonly Vocab vocab
    cdef readonly int64_t entity_vector_length
spacy/kb/kb.pyx (new file, 108 lines)

@@ -0,0 +1,108 @@
# cython: infer_types=True, profile=True

from pathlib import Path
from typing import Iterable, Tuple, Union
from cymem.cymem cimport Pool

from .candidate import Candidate
from ..tokens import Span
from ..util import SimpleFrozenList
from ..errors import Errors


cdef class KnowledgeBase:
    """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
    to support entity linking of named entities to real-world concepts.
    This is an abstract class and requires its operations to be implemented.

    DOCS: https://spacy.io/api/kb
    """

    def __init__(self, vocab: Vocab, entity_vector_length: int):
        """Create a KnowledgeBase."""
        # Make sure abstract KB is not instantiated.
        if self.__class__ == KnowledgeBase:
            raise TypeError(
                Errors.E1046.format(cls_name=self.__class__.__name__)
            )

        self.vocab = vocab
        self.entity_vector_length = entity_vector_length
        self.mem = Pool()

    def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
        """
        Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
        and the prior probability of that alias resolving to that entity.
        If no candidate is found for a given text, an empty list is returned.
        mentions (Iterable[Span]): Mentions for which to get candidates.
        RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
        """
        return [self.get_candidates(span) for span in mentions]

    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
        """
        Return candidate entities for specified text. Each candidate defines the entity, the original alias,
        and the prior probability of that alias resolving to that entity.
        If no candidate is found for a given text, an empty list is returned.
        mention (Span): Mention for which to get candidates.
        RETURNS (Iterable[Candidate]): Identified candidates.
        """
        raise NotImplementedError(
            Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
        )

    def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
        """
        Return vectors for entities.
        entities (Iterable[str]): Entity names/IDs.
        RETURNS (Iterable[Iterable[float]]): Vectors for specified entities.
        """
        return [self.get_vector(entity) for entity in entities]

    def get_vector(self, str entity) -> Iterable[float]:
        """
        Return vector for entity.
        entity (str): Entity name/ID.
        RETURNS (Iterable[float]): Vector for specified entity.
        """
        raise NotImplementedError(
            Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__)
        )

    def to_bytes(self, **kwargs) -> bytes:
        """Serialize the current state to a binary string.
        RETURNS (bytes): Current state as binary string.
        """
        raise NotImplementedError(
            Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__)
        )

    def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
        """Load state from a binary string.
        bytes_data (bytes): KB state.
        exclude (Tuple[str]): Properties to exclude when restoring KB.
        """
        raise NotImplementedError(
            Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__)
        )

    def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
        """
        Write KnowledgeBase content to disk.
        path (Union[str, Path]): Target file path.
        exclude (Iterable[str]): List of components to exclude.
        """
        raise NotImplementedError(
            Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__)
        )

    def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
        """
        Load KnowledgeBase content from disk.
        path (Union[str, Path]): Target file path.
        exclude (Iterable[str]): List of components to exclude.
        """
        raise NotImplementedError(
            Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
        )
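
What E1045/E1046 imply in practice: custom knowledge bases subclass KnowledgeBase and override the operations they support, while instantiating the abstract base raises. A minimal sketch (the subclass and its lookup logic are hypothetical):

from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab

class MyRemoteKB(KnowledgeBase):
    def get_candidates(self, mention):
        # Query an external index here; an empty list means "no candidates".
        return []

kb = MyRemoteKB(Vocab(), entity_vector_length=64)
print(kb.get_candidates_batch([]))  # default batch impl delegates per mention
# KnowledgeBase(Vocab(), 64) would raise TypeError with E1046
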
@@ -1,14 +1,12 @@
"""Knowledge-base for entity or concept linking."""
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE

from .vocab cimport Vocab
from .typedefs cimport hash_t
from .structs cimport KBEntryC, AliasC

from ..typedefs cimport hash_t
from ..structs cimport KBEntryC, AliasC
from .kb cimport KnowledgeBase

ctypedef vector[KBEntryC] entry_vec
ctypedef vector[AliasC] alias_vec

@@ -16,21 +14,7 @@ ctypedef vector[float] float_vec
ctypedef vector[float_vec] float_matrix


# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
cdef class Candidate:
    cdef readonly KnowledgeBase kb
    cdef hash_t entity_hash
    cdef float entity_freq
    cdef vector[float] entity_vector
    cdef hash_t alias_hash
    cdef float prior_prob


cdef class KnowledgeBase:
    cdef Pool mem
    cdef readonly Vocab vocab
    cdef int64_t entity_vector_length

cdef class InMemoryLookupKB(KnowledgeBase):
    # This maps 64bit keys (hash of unique entity string)
    # to 64bit values (position of the _KBEntryC struct in the _entries vector).
    # The PreshMap is pretty space efficient, as it uses open addressing. So
@@ -1,8 +1,7 @@
# cython: infer_types=True, profile=True
from typing import Iterator, Iterable, Callable, Dict, Any
from typing import Iterable, Callable, Dict, Any, Union

import srsly
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from cpython.exc cimport PyErr_SetFromErrno
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek

@@ -12,85 +11,28 @@ from libcpp.vector cimport vector
from pathlib import Path
import warnings

from .typedefs cimport hash_t
from .errors import Errors, Warnings
from . import util
from .util import SimpleFrozenList, ensure_path

cdef class Candidate:
    """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
    to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
    algorithm which will disambiguate the various candidates to the correct one.
    Each candidate (alias, entity) pair is assigned to a certain prior probability.

    DOCS: https://spacy.io/api/kb/#candidate_init
    """

    def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
        self.kb = kb
        self.entity_hash = entity_hash
        self.entity_freq = entity_freq
        self.entity_vector = entity_vector
        self.alias_hash = alias_hash
        self.prior_prob = prior_prob

    @property
    def entity(self):
        """RETURNS (uint64): hash of the entity's KB ID/name"""
        return self.entity_hash

    @property
    def entity_(self):
        """RETURNS (str): ID/name of this entity in the KB"""
        return self.kb.vocab.strings[self.entity_hash]

    @property
    def alias(self):
        """RETURNS (uint64): hash of the alias"""
        return self.alias_hash

    @property
    def alias_(self):
        """RETURNS (str): ID of the original alias"""
        return self.kb.vocab.strings[self.alias_hash]

    @property
    def entity_freq(self):
        return self.entity_freq

    @property
    def entity_vector(self):
        return self.entity_vector

    @property
    def prior_prob(self):
        return self.prior_prob

from ..tokens import Span
from ..typedefs cimport hash_t
from ..errors import Errors, Warnings
from .. import util
from ..util import SimpleFrozenList, ensure_path
from ..vocab cimport Vocab
from .kb cimport KnowledgeBase
from .candidate import Candidate as Candidate


def get_candidates(KnowledgeBase kb, span) -> Iterator[Candidate]:
    """
    Return candidate entities for a given span by using the text of the span as the alias
    and fetching appropriate entries from the index.
    This particular function is optimized to work with the built-in KB functionality,
    but any other custom candidate generation method can be used in combination with the KB as well.
    """
    return kb.get_alias_candidates(span.text)


cdef class KnowledgeBase:
    """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
cdef class InMemoryLookupKB(KnowledgeBase):
    """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
    to support entity linking of named entities to real-world concepts.

    DOCS: https://spacy.io/api/kb
    DOCS: https://spacy.io/api/inmemorylookupkb
    """

    def __init__(self, Vocab vocab, entity_vector_length):
        """Create a KnowledgeBase."""
        self.mem = Pool()
        self.entity_vector_length = entity_vector_length
        """Create an InMemoryLookupKB."""
        super().__init__(vocab, entity_vector_length)
        self._entry_index = PreshMap()
        self._alias_index = PreshMap()
        self.vocab = vocab
        self._create_empty_vectors(dummy_hash=self.vocab.strings[""])

    def _initialize_entities(self, int64_t nr_entities):

@@ -104,11 +46,6 @@ cdef class KnowledgeBase:
        self._alias_index = PreshMap(nr_aliases + 1)
        self._aliases_table = alias_vec(nr_aliases + 1)

    @property
    def entity_vector_length(self):
        """RETURNS (uint64): length of the entity vectors"""
        return self.entity_vector_length

    def __len__(self):
        return self.get_size_entities()

@@ -286,7 +223,10 @@ cdef class KnowledgeBase:
        alias_entry.probs = probs
        self._aliases_table[alias_index] = alias_entry

    def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
        return self.get_alias_candidates(mention.text)  # type: ignore

    def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
        """
        Return candidate entities for an alias. Each candidate defines the entity, the original alias,
        and the prior probability of that alias resolving to that entity.
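
A minimal round trip with the renamed class, to show the preserved API surface; whether to_disk() creates the target directory itself is an assumption here (E928 only requires that the path is a directory, not a file):

import spacy
from spacy.kb import InMemoryLookupKB

nlp = spacy.blank("en")
kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=2)
kb.add_entity(entity="Q1", freq=5, entity_vector=[0.1, 0.2])
kb.add_alias(alias="earth", entities=["Q1"], probabilities=[0.9])

print([c.entity_ for c in kb.get_alias_candidates("earth")])  # ['Q1']

kb.to_disk("kb_dir")  # directory path, per E928
kb2 = InMemoryLookupKB(nlp.vocab, entity_vector_length=2)
kb2.from_disk("kb_dir")
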
@@ -72,10 +72,10 @@ class CatalanLemmatizer(Lemmatizer):
                    oov_forms.append(form)
        if not forms:
            forms.extend(oov_forms)
        if not forms and string in lookup_table.keys():
            forms.append(self.lookup_lemmatize(token)[0])

        # use lookups, and fall back to the token itself
        if not forms:
            forms.append(string)
            forms.append(lookup_table.get(string, [string])[0])
        forms = list(dict.fromkeys(forms))
        self.cache[cache_key] = forms
        return forms
@@ -280,7 +280,7 @@ _currency = (
_punct = (
    r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ۔ ؛ ٪"
)
_quotes = r'\' " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉'
_quotes = r'\' " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉 〈 〉 ⟦ ⟧'
_hyphens = "- – — -- --- —— ~"

# Various symbols like dingbats, but also emoji
@@ -53,11 +53,16 @@ class FrenchLemmatizer(Lemmatizer):
        rules = rules_table.get(univ_pos, [])
        string = string.lower()
        forms = []
        # first try lookup in table based on upos
        if string in index:
            forms.append(string)
            self.cache[cache_key] = forms
            return forms

        # then add anything in the exceptions table
        forms.extend(exceptions.get(string, []))

        # if nothing found yet, use the rules
        oov_forms = []
        if not forms:
            for old, new in rules:

@@ -69,12 +74,14 @@ class FrenchLemmatizer(Lemmatizer):
                    forms.append(form)
                else:
                    oov_forms.append(form)

        # if still nothing, add the oov forms from rules
        if not forms:
            forms.extend(oov_forms)
        if not forms and string in lookup_table.keys():
            forms.append(self.lookup_lemmatize(token)[0])

        # use lookups, which fall back to the token itself
        if not forms:
            forms.append(string)
            forms.append(lookup_table.get(string, [string])[0])
        forms = list(dict.fromkeys(forms))
        self.cache[cache_key] = forms
        return forms
@@ -1,11 +1,15 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ...language import Language, BaseDefaults


class AncientGreekDefaults(BaseDefaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS
spacy/lang/grc/punctuation.py (new file, 46 lines)

@@ -0,0 +1,46 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
from ..char_classes import CONCAT_QUOTES

_prefixes = (
    [
        "†",
        "⸏",
    ]
    + LIST_PUNCT
    + LIST_ELLIPSES
    + LIST_QUOTES
    + LIST_CURRENCY
    + LIST_ICONS
)

_suffixes = (
    LIST_PUNCT
    + LIST_ELLIPSES
    + LIST_QUOTES
    + LIST_ICONS
    + [
        "†",
        "⸎",
        r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])[\-\.⸏]",
    ]
)

_infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])—",
    ]
)

TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes
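
These rule lists are consumed like any other language's punctuation module: spaCy compiles them into prefix/suffix/infix regexes when the tokenizer is built. A quick check of the new suffix rules:

from spacy.lang.grc import AncientGreek
from spacy.lang.grc.punctuation import TOKENIZER_SUFFIXES
from spacy.util import compile_suffix_regex

suffix_re = compile_suffix_regex(TOKENIZER_SUFFIXES)
print(bool(suffix_re.search("λόγος⸎")))  # the "⸎" suffix rule matches

nlp = AncientGreek()
print([t.text for t in nlp("λόγος⸎")])
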
@@ -15,7 +15,7 @@

STOP_WORDS = set(
    """
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaangde aangezien achter achterna
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaande aangezien achter achterna
afgelopen aldus alhoewel anderzijds

ben bij bijna bijvoorbeeld behalve beide beiden beneden bent bepaald beter betere betreffende binnen binnenin boven
@@ -28,7 +28,7 @@ class Russian(Language):
    assigns=["token.lemma"],
    default_config={
        "model": None,
        "mode": "pymorphy2",
        "mode": "pymorphy3",
        "overwrite": False,
        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
    },
@@ -19,33 +19,48 @@ class RussianLemmatizer(Lemmatizer):
        model: Optional[Model],
        name: str = "lemmatizer",
        *,
        mode: str = "pymorphy2",
        mode: str = "pymorphy3",
        overwrite: bool = False,
        scorer: Optional[Callable] = lemmatizer_score,
    ) -> None:
        if mode == "pymorphy2":
        if mode in {"pymorphy2", "pymorphy2_lookup"}:
            try:
                from pymorphy2 import MorphAnalyzer
            except ImportError:
                raise ImportError(
                    "The Russian lemmatizer mode 'pymorphy2' requires the "
                    "pymorphy2 library. Install it with: pip install pymorphy2"
                    "The lemmatizer mode 'pymorphy2' requires the "
                    "pymorphy2 library and dictionaries. Install them with: "
                    "pip install pymorphy2 "
                    "# for Ukrainian dictionaries: "
                    "pip install pymorphy2-dicts-uk"
                ) from None
            if getattr(self, "_morph", None) is None:
                self._morph = MorphAnalyzer()
                self._morph = MorphAnalyzer(lang="ru")
        elif mode in {"pymorphy3", "pymorphy3_lookup"}:
            try:
                from pymorphy3 import MorphAnalyzer
            except ImportError:
                raise ImportError(
                    "The lemmatizer mode 'pymorphy3' requires the "
                    "pymorphy3 library and dictionaries. Install them with: "
                    "pip install pymorphy3 "
                    "# for Ukrainian dictionaries: "
                    "pip install pymorphy3-dicts-uk"
                ) from None
            if getattr(self, "_morph", None) is None:
                self._morph = MorphAnalyzer(lang="ru")
        super().__init__(
            vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
        )

    def pymorphy2_lemmatize(self, token: Token) -> List[str]:
    def _pymorphy_lemmatize(self, token: Token) -> List[str]:
        string = token.text
        univ_pos = token.pos_
        morphology = token.morph.to_dict()
        if univ_pos == "PUNCT":
            return [PUNCT_RULES.get(string, string)]
        if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
            # Skip unchangeable pos
            return [string.lower()]
            return self._pymorphy_lookup_lemmatize(token)
        analyses = self._morph.parse(string)
        filtered_analyses = []
        for analysis in analyses:

@@ -53,8 +68,10 @@ class RussianLemmatizer(Lemmatizer):
                # Skip suggested parse variant for unknown word for pymorphy
                continue
            analysis_pos, _ = oc2ud(str(analysis.tag))
            if analysis_pos == univ_pos or (
                analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
            if (
                analysis_pos == univ_pos
                or (analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN"))
                or ((analysis_pos == "PRON") and (univ_pos == "DET"))
            ):
                filtered_analyses.append(analysis)
        if not len(filtered_analyses):

@@ -97,13 +114,28 @@ class RussianLemmatizer(Lemmatizer):
            dict.fromkeys([analysis.normal_form for analysis in filtered_analyses])
        )

    def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
    def _pymorphy_lookup_lemmatize(self, token: Token) -> List[str]:
        string = token.text
        analyses = self._morph.parse(string)
        if len(analyses) == 1:
            return [analyses[0].normal_form]
        # often multiple forms would derive from the same normal form
        # thus check _unique_ normal forms
        normal_forms = set([an.normal_form for an in analyses])
        if len(normal_forms) == 1:
            return [next(iter(normal_forms))]
        return [string]

    def pymorphy2_lemmatize(self, token: Token) -> List[str]:
        return self._pymorphy_lemmatize(token)

    def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
        return self._pymorphy_lookup_lemmatize(token)

    def pymorphy3_lemmatize(self, token: Token) -> List[str]:
        return self._pymorphy_lemmatize(token)

    def pymorphy3_lookup_lemmatize(self, token: Token) -> List[str]:
        return self._pymorphy_lookup_lemmatize(token)


def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
    gram_map = {
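
Usage with the new default mode; pymorphy3 (and, for Ukrainian, the extra dictionaries) must be installed, exactly as the ImportError messages above spell out. Without a tagger in the pipeline, the lemmatizer falls back to the lookup path, per the pos check above:

import spacy

nlp = spacy.blank("ru")
# mode="pymorphy3" is now the default; shown explicitly for clarity.
nlp.add_pipe("lemmatizer", config={"mode": "pymorphy3"})
nlp.initialize()
doc = nlp("собаки бежали")
print([token.lemma_ for token in doc])
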
@@ -61,6 +61,11 @@ for abbr in [
    {ORTH: "2к23", NORM: "2023"},
    {ORTH: "2к24", NORM: "2024"},
    {ORTH: "2к25", NORM: "2025"},
    {ORTH: "2к26", NORM: "2026"},
    {ORTH: "2к27", NORM: "2027"},
    {ORTH: "2к28", NORM: "2028"},
    {ORTH: "2к29", NORM: "2029"},
    {ORTH: "2к30", NORM: "2030"},
]:
    _exc[abbr[ORTH]] = [abbr]

@@ -268,8 +273,8 @@ for abbr in [
    {ORTH: "з-ка", NORM: "заимка"},
    {ORTH: "п-к", NORM: "починок"},
    {ORTH: "киш.", NORM: "кишлак"},
    {ORTH: "п. ст. ", NORM: "поселок станция"},
    {ORTH: "п. ж/д ст. ", NORM: "поселок при железнодорожной станции"},
    {ORTH: "п. ст.", NORM: "поселок станция"},
    {ORTH: "п. ж/д ст.", NORM: "поселок при железнодорожной станции"},
    {ORTH: "ж/д бл-ст", NORM: "железнодорожный блокпост"},
    {ORTH: "ж/д б-ка", NORM: "железнодорожная будка"},
    {ORTH: "ж/д в-ка", NORM: "железнодорожная ветка"},

@@ -280,12 +285,12 @@ for abbr in [
    {ORTH: "ж/д п.п.", NORM: "железнодорожный путевой пост"},
    {ORTH: "ж/д о.п.", NORM: "железнодорожный остановочный пункт"},
    {ORTH: "ж/д рзд.", NORM: "железнодорожный разъезд"},
    {ORTH: "ж/д ст. ", NORM: "железнодорожная станция"},
    {ORTH: "ж/д ст.", NORM: "железнодорожная станция"},
    {ORTH: "м-ко", NORM: "местечко"},
    {ORTH: "д.", NORM: "деревня"},
    {ORTH: "с.", NORM: "село"},
    {ORTH: "сл.", NORM: "слобода"},
    {ORTH: "ст. ", NORM: "станция"},
    {ORTH: "ст.", NORM: "станция"},
    {ORTH: "ст-ца", NORM: "станица"},
    {ORTH: "у.", NORM: "улус"},
    {ORTH: "х.", NORM: "хутор"},

@@ -388,8 +393,9 @@ for abbr in [
    {ORTH: "прим.", NORM: "примечание"},
    {ORTH: "прим.ред.", NORM: "примечание редакции"},
    {ORTH: "см. также", NORM: "смотри также"},
    {ORTH: "кв.м.", NORM: "квадрантный метр"},
    {ORTH: "м2", NORM: "квадрантный метр"},
    {ORTH: "см.", NORM: "смотри"},
    {ORTH: "кв.м.", NORM: "квадратный метр"},
    {ORTH: "м2", NORM: "квадратный метр"},
    {ORTH: "б/у", NORM: "бывший в употреблении"},
    {ORTH: "сокр.", NORM: "сокращение"},
    {ORTH: "чел.", NORM: "человек"},
@@ -1,9 +1,17 @@
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language, BaseDefaults


class SlovenianDefaults(BaseDefaults):
    stop_words = STOP_WORDS
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    lex_attr_getters = LEX_ATTRS


class Slovenian(Language):
145
spacy/lang/sl/lex_attrs.py
Normal file
145
spacy/lang/sl/lex_attrs.py
Normal file
|
@ -0,0 +1,145 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
from ...attrs import IS_CURRENCY
|
||||
import unicodedata
|
||||
|
||||
|
||||
_num_words = set(
|
||||
"""
|
||||
nula ničla nič ena dva tri štiri pet šest sedem osem
|
||||
devet deset enajst dvanajst trinajst štirinajst petnajst
|
||||
šestnajst sedemnajst osemnajst devetnajst dvajset trideset štirideset
|
||||
petdeset šestdest sedemdeset osemdeset devedeset sto tisoč
|
||||
milijon bilijon trilijon kvadrilijon nešteto
|
||||
|
||||
en eden enega enemu ennem enim enih enima enimi ene eni eno
|
||||
dveh dvema dvem dvoje trije treh trem tremi troje štirje štirih štirim štirimi
|
||||
petih petim petimi šestih šestim šestimi sedmih sedmim sedmimi osmih osmim osmimi
|
||||
devetih devetim devetimi desetih desetim desetimi enajstih enajstim enajstimi
|
||||
dvanajstih dvanajstim dvanajstimi trinajstih trinajstim trinajstimi
|
||||
šestnajstih šestnajstim šestnajstimi petnajstih petnajstim petnajstimi
|
||||
sedemnajstih sedemnajstim sedemnajstimi osemnajstih osemnajstim osemnajstimi
|
||||
devetnajstih devetnajstim devetnajstimi dvajsetih dvajsetim dvajsetimi
|
||||
""".split()
|
||||
)
|
||||
|
||||
_ordinal_words = set(
|
||||
"""
|
||||
prvi drugi tretji četrti peti šesti sedmi osmi
|
||||
deveti deseti enajsti dvanajsti trinajsti štirinajsti
|
||||
petnajsti šestnajsti sedemnajsti osemnajsti devetnajsti
|
||||
dvajseti trideseti štirideseti petdeseti šestdeseti sedemdeseti
|
||||
osemdeseti devetdeseti stoti tisoči milijonti bilijonti
|
||||
trilijonti kvadrilijonti nešteti
|
||||
|
||||
prva druga tretja četrta peta šesta sedma osma
|
||||
deveta deseta enajsta dvanajsta trinajsta štirnajsta
|
||||
petnajsta šestnajsta sedemnajsta osemnajsta devetnajsta
|
||||
dvajseta trideseta štirideseta petdeseta šestdeseta sedemdeseta
|
||||
osemdeseta devetdeseta stota tisoča milijonta bilijonta
|
||||
trilijonta kvadrilijonta nešteta
|
||||
|
||||
prvo drugo tretje četrto peto šestro sedmo osmo
|
||||
deveto deseto enajsto dvanajsto trinajsto štirnajsto
|
||||
petnajsto šestnajsto sedemnajsto osemnajsto devetnajsto
|
||||
dvajseto trideseto štirideseto petdeseto šestdeseto sedemdeseto
|
||||
osemdeseto devetdeseto stoto tisočo milijonto bilijonto
|
||||
trilijonto kvadrilijonto nešteto
|
||||
|
||||
prvega drugega tretjega četrtega petega šestega sedmega osmega
|
||||
devega desetega enajstega dvanajstega trinajstega štirnajstega
|
||||
petnajstega šestnajstega sedemnajstega osemnajstega devetnajstega
|
||||
dvajsetega tridesetega štiridesetega petdesetega šestdesetega sedemdesetega
|
||||
osemdesetega devetdesetega stotega tisočega milijontega bilijontega
|
||||
trilijontega kvadrilijontega neštetega
|
||||
|
||||
prvemu drugemu tretjemu četrtemu petemu šestemu sedmemu osmemu devetemu desetemu
|
||||
enajstemu dvanajstemu trinajstemu štirnajstemu petnajstemu šestnajstemu sedemnajstemu
|
||||
osemnajstemu devetnajstemu dvajsetemu tridesetemu štiridesetemu petdesetemu šestdesetemu
|
||||
sedemdesetemu osemdesetemu devetdesetemu stotemu tisočemu milijontemu bilijontemu
|
||||
trilijontemu kvadrilijontemu neštetemu
|
||||
|
||||
prvem drugem tretjem četrtem petem šestem sedmem osmem devetem desetem
|
||||
enajstem dvanajstem trinajstem štirnajstem petnajstem šestnajstem sedemnajstem
|
||||
osemnajstem devetnajstem dvajsetem tridesetem štiridesetem petdesetem šestdesetem
|
||||
sedemdesetem osemdesetem devetdesetem stotem tisočem milijontem bilijontem
|
||||
trilijontem kvadrilijontem neštetem
|
||||
|
||||
prvim drugim tretjim četrtim petim šestim sedtim osmim devetim desetim
|
||||
enajstim dvanajstim trinajstim štirnajstim petnajstim šestnajstim sedemnajstim
|
||||
osemnajstim devetnajstim dvajsetim tridesetim štiridesetim petdesetim šestdesetim
|
||||
sedemdesetim osemdesetim devetdesetim stotim tisočim milijontim bilijontim
|
||||
trilijontim kvadrilijontim neštetim
|
||||
|
||||
prvih drugih tretjih četrthih petih šestih sedmih osmih deveth desetih
|
||||
enajstih dvanajstih trinajstih štirnajstih petnajstih šestnajstih sedemnajstih
|
||||
osemnajstih devetnajstih dvajsetih tridesetih štiridesetih petdesetih šestdesetih
|
||||
sedemdesetih osemdesetih devetdesetih stotih tisočih milijontih bilijontih
|
||||
trilijontih kvadrilijontih nešteth
|
||||
|
||||
prvima drugima tretjima četrtima petima šestima sedmima osmima devetima desetima
|
||||
enajstima dvanajstima trinajstima štirnajstima petnajstima šestnajstima sedemnajstima
|
||||
osemnajstima devetnajstima dvajsetima tridesetima štiridesetima petdesetima šestdesetima
|
||||
sedemdesetima osemdesetima devetdesetima stotima tisočima milijontima bilijontima
|
||||
trilijontima kvadrilijontima neštetima
|
||||
|
||||
prve druge četrte pete šeste sedme osme devete desete
|
||||
enajste dvanajste trinajste štirnajste petnajste šestnajste sedemnajste
|
||||
osemnajste devetnajste dvajsete tridesete štiridesete petdesete šestdesete
|
||||
sedemdesete osemdesete devetdesete stote tisoče milijonte bilijonte
|
||||
trilijonte kvadrilijonte neštete
|
||||
|
||||
prvimi drugimi tretjimi četrtimi petimi šestimi sedtimi osmimi devetimi desetimi
|
||||
enajstimi dvanajstimi trinajstimi štirnajstimi petnajstimi šestnajstimi sedemnajstimi
|
||||
osemnajstimi devetnajstimi dvajsetimi tridesetimi štiridesetimi petdesetimi šestdesetimi
|
||||
sedemdesetimi osemdesetimi devetdesetimi stotimi tisočimi milijontimi bilijontimi
|
||||
trilijontimi kvadrilijontimi neštetimi
|
||||
""".split()
|
||||
)
|
||||
|
||||
_currency_words = set(
|
||||
"""
|
||||
evro evra evru evrom evrov evroma evrih evrom evre evri evr eur
|
||||
cent centa centu cenom centov centoma centih centom cente centi
|
||||
dolar dolarja dolarji dolarju dolarjem dolarjev dolarjema dolarjih dolarje usd
|
||||
tolar tolarja tolarji tolarju tolarjem tolarjev tolarjema tolarjih tolarje tol
|
||||
dinar dinarja dinarji dinarju dinarjem dinarjev dinarjema dinarjih dinarje din
|
||||
funt funta funti funtu funtom funtov funtoma funtih funte gpb
|
||||
forint forinta forinti forintu forintom forintov forintoma forintih forinte
|
||||
zlot zlota zloti zlotu zlotom zlotov zlotoma zlotih zlote
|
||||
rupij rupija rupiji rupiju rupijem rupijev rupijema rupijih rupije
|
||||
jen jena jeni jenu jenom jenov jenoma jenih jene
|
||||
kuna kuni kune kuno kun kunama kunah kunam kunami
|
||||
marka marki marke markama markah markami
|
||||
""".split()
|
||||
)
|
||||
|
||||
|
||||
def like_num(text):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    text_lower = text.lower()
    if text_lower in _num_words:
        return True
    if text_lower in _ordinal_words:
        return True
    return False


def is_currency(text):
    text_lower = text.lower()
    if text_lower in _currency_words:
        return True
    for char in text:
        if unicodedata.category(char) != "Sc":
            return False
    return True


LEX_ATTRS = {LIKE_NUM: like_num, IS_CURRENCY: is_currency}
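A quick sanity check of the two lexical attributes above (a minimal sketch; the blank "sl" pipeline and the example sentence are assumptions, not part of this diff):

    import spacy

    nlp = spacy.blank("sl")
    doc = nlp("Plačal je sto evrov oziroma 100 €.")
    # like_num flags digits plus the number/ordinal words listed above;
    # is_currency flags the currency words and "Sc"-category symbols.
    print([(t.text, t.like_num, t.is_currency) for t in doc])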

84  spacy/lang/sl/punctuation.py  Normal file
@@ -0,0 +1,84 @@
from ..char_classes import (
    LIST_ELLIPSES,
    LIST_ICONS,
    HYPHENS,
    LIST_PUNCT,
    LIST_QUOTES,
    CURRENCY,
    UNITS,
    PUNCT,
    LIST_CURRENCY,
    CONCAT_QUOTES,
)
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
from ..char_classes import merge_chars
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES


INCLUDE_SPECIAL = ["\\+", "\\/", "\\•", "\\¯", "\\=", "\\×"] + HYPHENS.split("|")

_prefixes = INCLUDE_SPECIAL + BASE_TOKENIZER_PREFIXES

_suffixes = (
    INCLUDE_SPECIAL
    + LIST_PUNCT
    + LIST_ELLIPSES
    + LIST_QUOTES
    + LIST_ICONS
    + [
        r"(?<=°[FfCcKk])\.",
        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
        r"(?<=[0-9])(?:{u})".format(u=UNITS),
        r"(?<=[{al}{e}{p}(?:{q})])\.".format(
            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
        ),
        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
        # split initials like J.K. Rowling
        r"(?<=[A-Z]\.)(?:[A-Z].)",
    ]
)

# A list of all suffixes following a hyphen that shouldn't be split (e.g. BTC-jev).
# Source: Obeliks tokenizer - https://github.com/clarinsi/obeliks/blob/master/obeliks/res/TokRulesPart1.txt
CONCAT_QUOTES = CONCAT_QUOTES.replace("'", "")
HYPHENS_PERMITTED = (
    "((a)|(evemu)|(evskega)|(i)|(jevega)|(jevska)|(jevskimi)|(jinemu)|(oma)|(ovim)|"
    "(ovski)|(e)|(evi)|(evskem)|(ih)|(jevem)|(jevske)|(jevsko)|(jini)|(ov)|(ovima)|"
    "(ovskih)|(em)|(evih)|(evskemu)|(ja)|(jevemu)|(jevskega)|(ji)|(jinih)|(ova)|"
    "(ovimi)|(ovskim)|(ema)|(evim)|(evski)|(je)|(jevi)|(jevskem)|(jih)|(jinim)|"
    "(ove)|(ovo)|(ovskima)|(ev)|(evima)|(evskih)|(jem)|(jevih)|(jevskemu)|(jin)|"
    "(jinima)|(ovega)|(ovska)|(ovskimi)|(eva)|(evimi)|(evskim)|(jema)|(jevim)|"
    "(jevski)|(jina)|(jinimi)|(ovem)|(ovske)|(ovsko)|(eve)|(evo)|(evskima)|(jev)|"
    "(jevima)|(jevskih)|(jine)|(jino)|(ovemu)|(ovskega)|(u)|(evega)|(evska)|"
    "(evskimi)|(jeva)|(jevimi)|(jevskim)|(jinega)|(ju)|(ovi)|(ovskem)|(evem)|"
    "(evske)|(evsko)|(jeve)|(jevo)|(jevskima)|(jinem)|(om)|(ovih)|(ovskemu)|"
    "(ovec)|(ovca)|(ovcu)|(ovcem)|(ovcev)|(ovcema)|(ovcih)|(ovci)|(ovce)|(ovcimi)|"
    "(evec)|(evca)|(evcu)|(evcem)|(evcev)|(evcema)|(evcih)|(evci)|(evce)|(evcimi)|"
    "(jevec)|(jevca)|(jevcu)|(jevcem)|(jevcev)|(jevcema)|(jevcih)|(jevci)|(jevce)|"
    "(jevcimi)|(ovka)|(ovke)|(ovki)|(ovko)|(ovk)|(ovkama)|(ovkah)|(ovkam)|(ovkami)|"
    "(evka)|(evke)|(evki)|(evko)|(evk)|(evkama)|(evkah)|(evkam)|(evkami)|(jevka)|"
    "(jevke)|(jevki)|(jevko)|(jevk)|(jevkama)|(jevkah)|(jevkam)|(jevkami)|(timi)|"
    "(im)|(ima)|(a)|(imi)|(e)|(o)|(ega)|(ti)|(em)|(tih)|(emu)|(tim)|(i)|(tima)|"
    "(ih)|(ta)|(te)|(to)|(tega)|(tem)|(temu))"
)

_infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}0-9])(?:{h})(?!{hp}$)(?=[{a}])".format(
            a=ALPHA, h=HYPHENS, hp=HYPHENS_PERMITTED
        ),
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    ]
)


TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes
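To see the hyphen rule in action (a minimal sketch; the example words are assumptions, not part of this diff): the infix regex only splits on a hyphen when the text after it does not fully match HYPHENS_PERMITTED, so a token like "BTC-jev" stays whole while other letter-hyphen-letter sequences are split.

    import spacy

    nlp = spacy.blank("sl")
    for text in ["BTC-jev", "rdeče-bela"]:
        # "jev" is a permitted suffix, so the first stays one token;
        # "bela" is not, so the second splits at the hyphen.
        print(text, [t.text for t in nlp(text)])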

spacy/lang/sl/stop_words.py
@@ -1,326 +1,84 @@
# Source: https://github.com/stopwords-iso/stopwords-sl
# Removed various words that are not normally considered stop words, such as months.

STOP_WORDS = set(
    """
a
ali
b
bi
bil
bila
bile
bili
bilo
biti
blizu
bo
bodo
bolj
bom
bomo
boste
bova
boš
brez
c
cel
cela
celi
celo
d
da
daleč
dan
danes
do
dober
dobra
dobri
dobro
dokler
dol
dovolj
e
eden
en
ena
ene
eni
enkrat
eno
etc.
a ali

b bi bil bila bile bili bilo biti blizu bo bodo bojo bolj bom bomo
boste bova boš brez

c cel cela celi celo

č če često četrta četrtek četrti četrto čez čigav

d da daleč dan danes datum deset deseta deseti deseto devet
deveta deveti deveto do dober dobra dobri dobro dokler dol dolg
dolga dolgi dovolj drug druga drugi drugo dva dve

e eden en ena ene eni enkrat eno etc.

f
g
g.
ga
ga.
gor
gospa
gospod
h
halo
i
idr.
ii
iii
in
iv
ix
iz
j
jaz
je
ji
jih
jim
jo
k
kadarkoli
kaj
kajti
kako
kakor
kamor
kamorkoli
kar
karkoli
katerikoli
kdaj
kdo
kdorkoli
ker
ki
kje
kjer
kjerkoli
ko
koderkoli
koga
komu
kot
l
le
lep
lepa
lepe
lepi
lepo
m
manj
me
med
medtem
mene
mi
midva
midve
mnogo
moj
moja
moje
mora
morajo
moram
moramo
morate
moraš
morem
mu
n
na
nad
naj
najina
najino
najmanj
naju
največ
nam
nas
nato
nazaj
naš
naša
naše
ne
nedavno
nek
neka
nekaj
nekatere
nekateri
nekatero
nekdo
neke
nekega
neki
nekje
neko
nekoga
nekoč
ni
nikamor
nikdar
nikjer
nikoli
nič
nje
njega
njegov
njegova
njegovo
njej
njemu
njen
njena
njeno
nji
njih
njihov
njihova
njihovo
njiju
njim
njo
njun
njuna
njuno
no
nocoj
npr.
o
ob
oba
obe
oboje
od
okoli
on
onadva
one
oni
onidve
oz.
p
pa
po
pod
pogosto
poleg
ponavadi
ponovno
potem
povsod
prbl.
precej
pred
prej
preko
pri
pribl.
približno
proti
r
redko
res
s
saj
sam
sama
same
sami
samo
se
sebe
sebi
sedaj
sem
seveda
si
sicer
skoraj
skozi
smo
so
spet
sta
ste
sva
t
ta
tak
taka
take
taki
tako
takoj
tam
te
tebe
tebi
tega
ti
tista
tiste
tisti
tisto
tj.
tja
to
toda
tu
tudi
tukaj
tvoj
tvoja
tvoje

g g. ga ga. gor gospa gospod

h halo

i idr. ii iii in iv ix iz

j jaz je ji jih jim jo jutri

k kadarkoli kaj kajti kako kakor kamor kamorkoli kar karkoli
katerikoli kdaj kdo kdorkoli ker ki kje kjer kjerkoli
ko koder koderkoli koga komu kot kratek kratka kratke kratki

l lahka lahke lahki lahko le lep lepa lepe lepi lepo leto

m majhen majhna majhni malce malo manj me med medtem mene
mesec mi midva midve mnogo moj moja moje mora morajo moram
moramo morate moraš morem mu

n na nad naj najina najino najmanj naju največ nam narobe
nas nato nazaj naš naša naše ne nedavno nedelja nek neka
nekaj nekatere nekateri nekatero nekdo neke nekega neki
nekje neko nekoga nekoč ni nikamor nikdar nikjer nikoli
nič nje njega njegov njegova njegovo njej njemu njen
njena njeno nji njih njihov njihova njihovo njiju njim
njo njun njuna njuno no nocoj npr.

o ob oba obe oboje od odprt odprta odprti okoli on
onadva one oni onidve osem osma osmi osmo oz.

p pa pet peta petek peti peto po pod pogosto poleg poln
polna polni polno ponavadi ponedeljek ponovno potem
povsod pozdravljen pozdravljeni prav prava prave pravi
pravo prazen prazna prazno prbl. precej pred prej preko
pri pribl. približno primer pripravljen pripravljena
pripravljeni proti prva prvi prvo

r ravno redko res reč

s saj sam sama same sami samo se sebe sebi sedaj sedem
sedma sedmi sedmo sem seveda si sicer skoraj skozi slab smo
so sobota spet sreda srednja srednji sta ste stran stvar sva

š šest šesta šesti šesto štiri

t ta tak taka take taki tako takoj tam te tebe tebi tega
težak težka težki težko ti tista tiste tisti tisto tj.
tja to toda torek tretja tretje tretji tri tu tudi tukaj
tvoj tvoja tvoje

u
v
vaju
vam
vas
vaš
vaša
vaše
ve
vedno
vendar
ves
več
vi
vidva
vii
viii
vsa
vsaj
vsak
vsaka
vsakdo
vsake
vsaki
vsakomur
vse
vsega
vsi
vso
včasih
x
z
za
zadaj
zadnji
zakaj
zdaj
zelo
zunaj
č
če
često
čez
čigav
š
ž
že

v vaju vam vas vaš vaša vaše ve vedno velik velika veliki
veliko vendar ves več vi vidva vii viii visok visoka visoke
visoki vsa vsaj vsak vsaka vsakdo vsake vsaki vsakomur vse
vsega vsi vso včasih včeraj

x

z za zadaj zadnji zakaj zaprta zaprti zaprto zdaj zelo zunaj

ž že
""".split()
)
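A one-line check that the new stop list is wired in (a hedged sketch; the word choice is an assumption):

    import spacy

    nlp = spacy.blank("sl")
    assert nlp.vocab["ali"].is_stop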

272  spacy/lang/sl/tokenizer_exceptions.py  Normal file
@@ -0,0 +1,272 @@
from typing import Dict, List
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc

_exc: Dict[str, List[Dict]] = {}

_other_exc = {
    "t.i.": [{ORTH: "t.", NORM: "tako"}, {ORTH: "i.", NORM: "imenovano"}],
    "t.j.": [{ORTH: "t.", NORM: "to"}, {ORTH: "j.", NORM: "je"}],
    "T.j.": [{ORTH: "T.", NORM: "to"}, {ORTH: "j.", NORM: "je"}],
    "d.o.o.": [
        {ORTH: "d.", NORM: "družba"},
        {ORTH: "o.", NORM: "omejeno"},
        {ORTH: "o.", NORM: "odgovornostjo"},
    ],
    "D.O.O.": [
        {ORTH: "D.", NORM: "družba"},
        {ORTH: "O.", NORM: "omejeno"},
        {ORTH: "O.", NORM: "odgovornostjo"},
    ],
    "d.n.o.": [
        {ORTH: "d.", NORM: "družba"},
        {ORTH: "n.", NORM: "neomejeno"},
        {ORTH: "o.", NORM: "odgovornostjo"},
    ],
    "D.N.O.": [
        {ORTH: "D.", NORM: "družba"},
        {ORTH: "N.", NORM: "neomejeno"},
        {ORTH: "O.", NORM: "odgovornostjo"},
    ],
    "d.d.": [{ORTH: "d.", NORM: "delniška"}, {ORTH: "d.", NORM: "družba"}],
    "D.D.": [{ORTH: "D.", NORM: "delniška"}, {ORTH: "D.", NORM: "družba"}],
    "s.p.": [{ORTH: "s.", NORM: "samostojni"}, {ORTH: "p.", NORM: "podjetnik"}],
    "S.P.": [{ORTH: "S.", NORM: "samostojni"}, {ORTH: "P.", NORM: "podjetnik"}],
    "l.r.": [{ORTH: "l.", NORM: "lastno"}, {ORTH: "r.", NORM: "ročno"}],
    "le-te": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "te"}],
    "Le-te": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "te"}],
    "le-ti": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "ti"}],
    "Le-ti": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "ti"}],
    "le-to": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "to"}],
    "Le-to": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "to"}],
    "le-ta": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "ta"}],
    "Le-ta": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "ta"}],
    "le-tega": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "tega"}],
    "Le-tega": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "tega"}],
}

_exc.update(_other_exc)


for exc_data in [
    {ORTH: "adm.", NORM: "administracija"},
    {ORTH: "aer.", NORM: "aeronavtika"},
    {ORTH: "agr.", NORM: "agronomija"},
    {ORTH: "amer.", NORM: "ameriško"},
    {ORTH: "anat.", NORM: "anatomija"},
    {ORTH: "angl.", NORM: "angleški"},
    {ORTH: "ant.", NORM: "antonim"},
    {ORTH: "antr.", NORM: "antropologija"},
    {ORTH: "apr.", NORM: "april"},
    {ORTH: "arab.", NORM: "arabsko"},
    {ORTH: "arheol.", NORM: "arheologija"},
    {ORTH: "arhit.", NORM: "arhitektura"},
    {ORTH: "avg.", NORM: "avgust"},
    {ORTH: "avstr.", NORM: "avstrijsko"},
    {ORTH: "avt.", NORM: "avtomobilizem"},
    {ORTH: "bibl.", NORM: "biblijsko"},
    {ORTH: "biokem.", NORM: "biokemija"},
    {ORTH: "biol.", NORM: "biologija"},
    {ORTH: "bolg.", NORM: "bolgarski"},
    {ORTH: "bot.", NORM: "botanika"},
    {ORTH: "cit.", NORM: "citat"},
    {ORTH: "daj.", NORM: "dajalnik"},
    {ORTH: "del.", NORM: "deležnik"},
    {ORTH: "ed.", NORM: "ednina"},
    {ORTH: "etn.", NORM: "etnografija"},
    {ORTH: "farm.", NORM: "farmacija"},
    {ORTH: "filat.", NORM: "filatelija"},
    {ORTH: "filoz.", NORM: "filozofija"},
    {ORTH: "fin.", NORM: "finančništvo"},
    {ORTH: "fiz.", NORM: "fizika"},
    {ORTH: "fot.", NORM: "fotografija"},
    {ORTH: "fr.", NORM: "francoski"},
    {ORTH: "friz.", NORM: "frizerstvo"},
    {ORTH: "gastr.", NORM: "gastronomija"},
    {ORTH: "geogr.", NORM: "geografija"},
    {ORTH: "geol.", NORM: "geologija"},
    {ORTH: "geom.", NORM: "geometrija"},
    {ORTH: "germ.", NORM: "germanski"},
    {ORTH: "gl.", NORM: "glej"},
    {ORTH: "glag.", NORM: "glagolski"},
    {ORTH: "glasb.", NORM: "glasba"},
    {ORTH: "gled.", NORM: "gledališče"},
    {ORTH: "gost.", NORM: "gostinstvo"},
    {ORTH: "gozd.", NORM: "gozdarstvo"},
    {ORTH: "gr.", NORM: "grški"},
    {ORTH: "grad.", NORM: "gradbeništvo"},
    {ORTH: "hebr.", NORM: "hebrejsko"},
    {ORTH: "hrv.", NORM: "hrvaško"},
    {ORTH: "ide.", NORM: "indoevropsko"},
    {ORTH: "igr.", NORM: "igre"},
    {ORTH: "im.", NORM: "imenovalnik"},
    {ORTH: "iron.", NORM: "ironično"},
    {ORTH: "it.", NORM: "italijanski"},
    {ORTH: "itd.", NORM: "in tako dalje"},
    {ORTH: "itn.", NORM: "in tako naprej"},
    {ORTH: "ipd.", NORM: "in podobno"},
    {ORTH: "jap.", NORM: "japonsko"},
    {ORTH: "jul.", NORM: "julij"},
    {ORTH: "jun.", NORM: "junij"},
    {ORTH: "kit.", NORM: "kitajsko"},
    {ORTH: "knj.", NORM: "knjižno"},
    {ORTH: "knjiž.", NORM: "knjižno"},
    {ORTH: "kor.", NORM: "koreografija"},
    {ORTH: "lat.", NORM: "latinski"},
    {ORTH: "les.", NORM: "lesna stroka"},
    {ORTH: "lingv.", NORM: "lingvistika"},
    {ORTH: "lit.", NORM: "literarni"},
    {ORTH: "ljubk.", NORM: "ljubkovalno"},
    {ORTH: "lov.", NORM: "lovstvo"},
    {ORTH: "m.", NORM: "moški"},
    {ORTH: "mak.", NORM: "makedonski"},
    {ORTH: "mar.", NORM: "marec"},
    {ORTH: "mat.", NORM: "matematika"},
    {ORTH: "med.", NORM: "medicina"},
    {ORTH: "meh.", NORM: "mehiško"},
    {ORTH: "mest.", NORM: "mestnik"},
    {ORTH: "mdr.", NORM: "med drugim"},
    {ORTH: "min.", NORM: "mineralogija"},
    {ORTH: "mitol.", NORM: "mitologija"},
    {ORTH: "mn.", NORM: "množina"},
    {ORTH: "mont.", NORM: "montanistika"},
    {ORTH: "muz.", NORM: "muzikologija"},
    {ORTH: "nam.", NORM: "namenilnik"},
    {ORTH: "nar.", NORM: "narečno"},
    {ORTH: "nav.", NORM: "navadno"},
    {ORTH: "nedol.", NORM: "nedoločnik"},
    {ORTH: "nedov.", NORM: "nedovršni"},
    {ORTH: "neprav.", NORM: "nepravilno"},
    {ORTH: "nepreh.", NORM: "neprehodno"},
    {ORTH: "neskl.", NORM: "nesklonljiv(o)"},
    {ORTH: "nestrok.", NORM: "nestrokovno"},
    {ORTH: "num.", NORM: "numizmatika"},
    {ORTH: "npr.", NORM: "na primer"},
    {ORTH: "obrt.", NORM: "obrtništvo"},
    {ORTH: "okt.", NORM: "oktober"},
    {ORTH: "or.", NORM: "orodnik"},
    {ORTH: "os.", NORM: "oseba"},
    {ORTH: "otr.", NORM: "otroško"},
    {ORTH: "oz.", NORM: "oziroma"},
    {ORTH: "pal.", NORM: "paleontologija"},
    {ORTH: "papir.", NORM: "papirništvo"},
    {ORTH: "ped.", NORM: "pedagogika"},
    {ORTH: "pisar.", NORM: "pisarniško"},
    {ORTH: "pog.", NORM: "pogovorno"},
    {ORTH: "polit.", NORM: "politika"},
    {ORTH: "polj.", NORM: "poljsko"},
    {ORTH: "poljud.", NORM: "poljudno"},
    {ORTH: "preg.", NORM: "pregovor"},
    {ORTH: "preh.", NORM: "prehodno"},
    {ORTH: "pren.", NORM: "preneseno"},
    {ORTH: "prid.", NORM: "pridevnik"},
    {ORTH: "prim.", NORM: "primerjaj"},
    {ORTH: "prisl.", NORM: "prislov"},
    {ORTH: "psih.", NORM: "psihologija"},
    {ORTH: "psiht.", NORM: "psihiatrija"},
    {ORTH: "rad.", NORM: "radiotehnika"},
    {ORTH: "rač.", NORM: "računalništvo"},
    {ORTH: "rib.", NORM: "ribištvo"},
    {ORTH: "rod.", NORM: "rodilnik"},
    {ORTH: "rus.", NORM: "rusko"},
    {ORTH: "s.", NORM: "srednji"},
    {ORTH: "sam.", NORM: "samostalniški"},
    {ORTH: "sed.", NORM: "sedanjik"},
    {ORTH: "sep.", NORM: "september"},
    {ORTH: "slabš.", NORM: "slabšalno"},
    {ORTH: "slovan.", NORM: "slovansko"},
    {ORTH: "slovaš.", NORM: "slovaško"},
    {ORTH: "srb.", NORM: "srbsko"},
    {ORTH: "star.", NORM: "starinsko"},
    {ORTH: "stil.", NORM: "stilno"},
    {ORTH: "sv.", NORM: "svet(i)"},
    {ORTH: "teh.", NORM: "tehnika"},
    {ORTH: "tisk.", NORM: "tiskarstvo"},
    {ORTH: "tj.", NORM: "to je"},
    {ORTH: "tož.", NORM: "tožilnik"},
    {ORTH: "trg.", NORM: "trgovina"},
    {ORTH: "ukr.", NORM: "ukrajinski"},
    {ORTH: "um.", NORM: "umetnost"},
    {ORTH: "vel.", NORM: "velelnik"},
    {ORTH: "vet.", NORM: "veterina"},
    {ORTH: "vez.", NORM: "veznik"},
    {ORTH: "vn.", NORM: "visokonemško"},
    {ORTH: "voj.", NORM: "vojska"},
    {ORTH: "vrtn.", NORM: "vrtnarstvo"},
    {ORTH: "vulg.", NORM: "vulgarno"},
    {ORTH: "vznes.", NORM: "vzneseno"},
    {ORTH: "zal.", NORM: "založništvo"},
    {ORTH: "zastar.", NORM: "zastarelo"},
    {ORTH: "zgod.", NORM: "zgodovina"},
    {ORTH: "zool.", NORM: "zoologija"},
    {ORTH: "čeb.", NORM: "čebelarstvo"},
    {ORTH: "češ.", NORM: "češki"},
    {ORTH: "člov.", NORM: "človeškost"},
    {ORTH: "šah.", NORM: "šahovski"},
    {ORTH: "šalj.", NORM: "šaljivo"},
    {ORTH: "šp.", NORM: "španski"},
    {ORTH: "špan.", NORM: "špansko"},
    {ORTH: "šport.", NORM: "športni"},
    {ORTH: "štev.", NORM: "števnik"},
    {ORTH: "šved.", NORM: "švedsko"},
    {ORTH: "švic.", NORM: "švicarsko"},
    {ORTH: "ž.", NORM: "ženski"},
    {ORTH: "žarg.", NORM: "žargonsko"},
    {ORTH: "žel.", NORM: "železnica"},
    {ORTH: "živ.", NORM: "živost"},
]:
    _exc[exc_data[ORTH]] = [exc_data]


abbrv = """
Co. Ch. DIPL. DR. Dr. Ev. Inc. Jr. Kr. Mag. M. MR. Mr. Mt. Murr. Npr. OZ.
Opr. Osn. Prim. Roj. ST. Sim. Sp. Sred. St. Sv. Škofl. Tel. UR. Zb.
a. aa. ab. abc. abit. abl. abs. abt. acc. accel. add. adj. adv. aet. afr. akad. al. alban. all. alleg.
alp. alt. alter. alžir. am. an. andr. ang. anh. anon. ans. antrop. apoc. app. approx. apt. ar. arc. arch.
arh. arr. as. asist. assist. assoc. asst. astr. attn. aug. avstral. az. b. bab. bal. bbl. bd. belg. bioinf.
biomed. bk. bl. bn. borg. bp. br. braz. brit. bros. broš. bt. bu. c. ca. cal. can. cand. cantab. cap. capt.
cat. cath. cc. cca. cd. cdr. cdre. cent. cerkv. cert. cf. cfr. ch. chap. chem. chr. chs. cic. circ. civ. cl.
cm. cmd. cnr. co. cod. col. coll. colo. com. comp. con. conc. cond. conn. cons. cont. coop. corr. cost. cp.
cpl. cr. crd. cres. cresc. ct. cu. d. dan. dat. davč. ddr. dec. ded. def. dem. dent. dept. dia. dip. dipl.
dir. disp. diss. div. do. doc. dok. dol. doo. dop. dott. dr. dram. druž. družb. drž. dt. duh. dur. dvr. dwt. e.
ea. ecc. eccl. eccles. econ. edn. egipt. egr. ekon. eksp. el. em. enc. eng. eo. ep. err. esp. esq. est.
et. etc. etnogr. etnol. ev. evfem. evr. ex. exc. excl. exp. expl. ext. exx. f. fa. facs. fak. faks. fas.
fasc. fco. fcp. feb. febr. fec. fed. fem. ff. fff. fid. fig. fil. film. fiziol. fiziot. flam. fm. fo. fol. folk.
frag. fran. franc. fsc. g. ga. gal. gdč. ge. gen. geod. geog. geotehnol. gg. gimn. glas. glav. gnr. go. gor.
gosp. gp. graf. gram. gren. grš. gs. h. hab. hf. hist. ho. hort. i. ia. ib. ibid. id. idr. idridr. ill. imen.
imp. impf. impr. in. inc. incl. ind. indus. inf. inform. ing. init. ins. int. inv. inšp. inštr. inž. is. islam.
ist. ital. iur. iz. izbr. izd. izg. izgr. izr. izv. j. jak. jam. jan. jav. je. jez. jr. jsl. jud. jug.
jugoslovan. jur. juž. jv. jz. k. kal. kan. kand. kat. kdo. kem. kip. kmet. kol. kom. komp. konf. kont. kost. kov.
kp. kpfw. kr. kraj. krat. kub. kult. kv. kval. l. la. lab. lb. ld. let. lib. lik. litt. lj. ljud. ll. loc. log.
loč. lt. ma. madž. mag. manag. manjš. masc. mass. mater. max. maxmax. mb. md. mech. medic. medij. medn.
mehč. mem. menedž. mes. mess. metal. meteor. meteorol. mex. mi. mikr. mil. minn. mio. misc. miss. mit. mk.
mkt. ml. mlad. mlle. mlr. mm. mme. množ. mo. moj. moš. možn. mr. mrd. mrs. ms. msc. msgr. mt. murr. mus. mut.
n. na. nad. nadalj. nadom. nagl. nakl. namer. nan. naniz. nasl. nat. navt. nač. ned. nem. nik. nizoz. nm. nn.
no. nom. norv. notr. nov. novogr. ns. o. ob. obd. obj. oblač. obl. oblik. obr. obraz. obs. obst. obt. obč. oc.
oct. od. odd. odg. odn. odst. odv. oec. off. ok. okla. okr. ont. oo. op. opis. opp. opr. orch. ord. ore. oreg.
org. orient. orig. ork. ort. oseb. osn. ot. ozir. ošk. p. pag. par. para. parc. parl. part. past. pat. pdk.
pen. perf. pert. perz. pesn. pet. pev. pf. pfc. ph. pharm. phil. pis. pl. po. pod. podr. podaljš. pogl. pogoj. pojm.
pok. pokr. pol. poljed. poljub. polu. pom. pomen. pon. ponov. pop. por. port. pos. posl. posn. pov. pp. ppl. pr.
praet. prav. pravopis. pravosl. preb. pred. predl. predm. predp. preds. pref. pregib. prel. prem. premen. prep.
pres. pret. prev. pribl. prih. pril. primerj. primor. prip. pripor. prir. prist. priv. proc. prof. prog. proiz.
prom. pron. prop. prot. protest. prov. ps. pss. pt. publ. pz. q. qld. qu. quad. que. r. racc. rastl. razgl.
razl. razv. rd. red. ref. reg. rel. relig. rep. repr. rer. resp. rest. ret. rev. revol. rež. rim. rist. rkp. rm.
roj. rom. romun. rp. rr. rt. rud. ruš. ry. sal. samogl. san. sc. scen. sci. scr. sdv. seg. sek. sen. sept. ser.
sev. sg. sgt. sh. sig. sigg. sign. sim. sin. sing. sinh. skand. skl. sklad. sklanj. sklep. skr. sl. slik. slov.
slovak. slovn. sn. so. sob. soc. sociol. sod. sopomen. sopr. sor. sov. sovj. sp. spec. spl. spr. spreg. sq. sr.
sre. sred. sredoz. srh. ss. ssp. st. sta. stan. stanstar. stcsl. ste. stim. stol. stom. str. stroj. strok. stsl.
stud. sup. supl. suppl. svet. sz. t. tab. tech. ted. tehn. tehnol. tek. teks. tekst. tel. temp. ten. teol. ter.
term. test. th. theol. tim. tip. tisočl. tit. tl. tol. tolmač. tom. tor. tov. tr. trad. traj. trans. tren.
trib. tril. trop. trp. trž. ts. tt. tu. tur. turiz. tvor. tvorb. tč. u. ul. umet. un. univ. up. upr. ur. urad.
us. ust. utr. v. va. val. var. varn. ven. ver. verb. vest. vezal. vic. vis. viv. viz. viš. vod. vok. vol. vpr.
vrst. vrstil. vs. vv. vzd. vzg. vzh. vzor. w. wed. wg. wk. x. y. z. zah. zaim. zak. zap. zasl. zavar. zač. zb.
združ. zg. zn. znan. znanstv. zoot. zun. zv. zvd. á. é. ć. č. čas. čet. čl. člen. čustv. đ. ľ. ł. ş. ŠT. š. šir.
škofl. škot. šol. št. števil. štud. ů. ű. žen. žival.
""".split()

for orth in abbrv:
    _exc[orth] = [{ORTH: orth}]


TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
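A minimal sketch of what these exceptions do at runtime (the example sentence is an assumption, not part of this diff):

    import spacy

    nlp = spacy.blank("sl")
    doc = nlp("Ustanovili so podjetje d.o.o. itd.")
    # "d.o.o." is split into three subtokens with expanded norms, while
    # abbreviations from abbrv like "itd." keep their trailing period.
    print([(t.text, t.norm_) for t in doc])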

spacy/lang/uk/__init__.py
@@ -29,7 +29,7 @@ class Ukrainian(Language):
         assigns=["token.lemma"],
         default_config={
             "model": None,
-            "mode": "pymorphy2",
+            "mode": "pymorphy3",
             "overwrite": False,
             "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
         },

spacy/lang/uk/lemmatizer.py
@@ -14,11 +14,11 @@ class UkrainianLemmatizer(RussianLemmatizer):
         model: Optional[Model],
         name: str = "lemmatizer",
         *,
-        mode: str = "pymorphy2",
+        mode: str = "pymorphy3",
         overwrite: bool = False,
         scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
-        if mode == "pymorphy2":
+        if mode in {"pymorphy2", "pymorphy2_lookup"}:
             try:
                 from pymorphy2 import MorphAnalyzer
             except ImportError:

@@ -29,6 +29,17 @@ class UkrainianLemmatizer(RussianLemmatizer):
                 ) from None
             if getattr(self, "_morph", None) is None:
                 self._morph = MorphAnalyzer(lang="uk")
+        elif mode in {"pymorphy3", "pymorphy3_lookup"}:
+            try:
+                from pymorphy3 import MorphAnalyzer
+            except ImportError:
+                raise ImportError(
+                    "The Ukrainian lemmatizer mode 'pymorphy3' requires the "
+                    "pymorphy3 library and dictionaries. Install them with: "
+                    "pip install pymorphy3 pymorphy3-dicts-uk"
+                ) from None
+            if getattr(self, "_morph", None) is None:
+                self._morph = MorphAnalyzer(lang="uk")
         super().__init__(
             vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
         )
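Opting in to the new backend from user code looks roughly like this (a hedged sketch; the blank pipeline is an assumption):

    import spacy

    nlp = spacy.blank("uk")
    # requires: pip install pymorphy3 pymorphy3-dicts-uk
    nlp.add_pipe("lemmatizer", config={"mode": "pymorphy3"})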

spacy/language.py
@@ -43,8 +43,7 @@ from .lookups import load_lookups
 from .compat import Literal
-

 if TYPE_CHECKING:
     from .pipeline import Pipe  # noqa: F401
+PipeCallable = Callable[[Doc], Doc]


 # This is the base config with all settings (training etc.)

@@ -181,7 +180,7 @@ class Language:
         self.vocab: Vocab = vocab
         if self.lang is None:
             self.lang = self.vocab.lang
-        self._components: List[Tuple[str, "Pipe"]] = []
+        self._components: List[Tuple[str, PipeCallable]] = []
         self._disabled: Set[str] = set()
         self.max_length = max_length
         # Create the default tokenizer from the default config

@@ -303,7 +302,7 @@ class Language:
         return SimpleFrozenList(names)

     @property
-    def components(self) -> List[Tuple[str, "Pipe"]]:
+    def components(self) -> List[Tuple[str, PipeCallable]]:
         """Get all (name, component) tuples in the pipeline, including the
         currently disabled components.
         """

@@ -322,12 +321,12 @@ class Language:
         return SimpleFrozenList(names, error=Errors.E926.format(attr="component_names"))

     @property
-    def pipeline(self) -> List[Tuple[str, "Pipe"]]:
+    def pipeline(self) -> List[Tuple[str, PipeCallable]]:
         """The processing pipeline consisting of (name, component) tuples. The
         components are called on the Doc in order as it passes through the
         pipeline.

-        RETURNS (List[Tuple[str, Pipe]]): The pipeline.
+        RETURNS (List[Tuple[str, Callable[[Doc], Doc]]]): The pipeline.
         """
         pipes = [(n, p) for n, p in self._components if n not in self._disabled]
         return SimpleFrozenList(pipes, error=Errors.E926.format(attr="pipeline"))

@@ -527,7 +526,7 @@ class Language:
         assigns: Iterable[str] = SimpleFrozenList(),
         requires: Iterable[str] = SimpleFrozenList(),
         retokenizes: bool = False,
-        func: Optional["Pipe"] = None,
+        func: Optional[PipeCallable] = None,
     ) -> Callable[..., Any]:
         """Register a new pipeline component. Can be used for stateless function
         components that don't require a separate factory. Can be used as a

@@ -542,7 +541,7 @@ class Language:
             e.g. "token.ent_id". Used for pipeline analysis.
         retokenizes (bool): Whether the component changes the tokenization.
             Used for pipeline analysis.
-        func (Optional[Callable]): Factory function if not used as a decorator.
+        func (Optional[Callable[[Doc], Doc]]): Factory function if not used as a decorator.

         DOCS: https://spacy.io/api/language#component
         """

@@ -553,11 +552,11 @@ class Language:
             raise ValueError(Errors.E853.format(name=name))
         component_name = name if name is not None else util.get_object_name(func)

-        def add_component(component_func: "Pipe") -> Callable:
+        def add_component(component_func: PipeCallable) -> Callable:
             if isinstance(func, type):  # function is a class
                 raise ValueError(Errors.E965.format(name=component_name))

-            def factory_func(nlp, name: str) -> "Pipe":
+            def factory_func(nlp, name: str) -> PipeCallable:
                 return component_func

             internal_name = cls.get_factory_name(name)

@@ -607,7 +606,7 @@ class Language:
         print_pipe_analysis(analysis, keys=keys)
         return analysis

-    def get_pipe(self, name: str) -> "Pipe":
+    def get_pipe(self, name: str) -> PipeCallable:
         """Get a pipeline component for a given component name.

         name (str): Name of pipeline component to get.

@@ -628,7 +627,7 @@ class Language:
         config: Dict[str, Any] = SimpleFrozenDict(),
         raw_config: Optional[Config] = None,
         validate: bool = True,
-    ) -> "Pipe":
+    ) -> PipeCallable:
         """Create a pipeline component. Mostly used internally. To create and
         add a component to the pipeline, you can use nlp.add_pipe.

@@ -640,7 +639,7 @@ class Language:
         raw_config (Optional[Config]): Internals: the non-interpolated config.
         validate (bool): Whether to validate the component config against the
             arguments and types expected by the factory.
-        RETURNS (Pipe): The pipeline component.
+        RETURNS (Callable[[Doc], Doc]): The pipeline component.

         DOCS: https://spacy.io/api/language#create_pipe
         """

@@ -695,24 +694,18 @@ class Language:

     def create_pipe_from_source(
         self, source_name: str, source: "Language", *, name: str
-    ) -> Tuple["Pipe", str]:
+    ) -> Tuple[PipeCallable, str]:
         """Create a pipeline component by copying it from an existing model.

         source_name (str): Name of the component in the source pipeline.
         source (Language): The source nlp object to copy from.
         name (str): Optional alternative name to use in current pipeline.
-        RETURNS (Tuple[Callable, str]): The component and its factory name.
+        RETURNS (Tuple[Callable[[Doc], Doc], str]): The component and its factory name.
         """
         # Check source type
         if not isinstance(source, Language):
             raise ValueError(Errors.E945.format(name=source_name, source=type(source)))
-        # Check vectors, with faster checks first
-        if (
-            self.vocab.vectors.shape != source.vocab.vectors.shape
-            or self.vocab.vectors.key2row != source.vocab.vectors.key2row
-            or self.vocab.vectors.to_bytes(exclude=["strings"])
-            != source.vocab.vectors.to_bytes(exclude=["strings"])
-        ):
+        if self.vocab.vectors != source.vocab.vectors:
             warnings.warn(Warnings.W113.format(name=source_name))
         if source_name not in source.component_names:
             raise KeyError(

@@ -746,7 +739,7 @@ class Language:
         config: Dict[str, Any] = SimpleFrozenDict(),
         raw_config: Optional[Config] = None,
         validate: bool = True,
-    ) -> "Pipe":
+    ) -> PipeCallable:
         """Add a component to the processing pipeline. Valid components are
         callables that take a `Doc` object, modify it and return it. Only one
         of before/after/first/last can be set. Default behaviour is "last".

@@ -769,7 +762,7 @@ class Language:
         raw_config (Optional[Config]): Internals: the non-interpolated config.
         validate (bool): Whether to validate the component config against the
             arguments and types expected by the factory.
-        RETURNS (Pipe): The pipeline component.
+        RETURNS (Callable[[Doc], Doc]): The pipeline component.

         DOCS: https://spacy.io/api/language#add_pipe
         """

@@ -790,14 +783,6 @@ class Language:
                 factory_name, source, name=name
             )
         else:
-            if not self.has_factory(factory_name):
-                err = Errors.E002.format(
-                    name=factory_name,
-                    opts=", ".join(self.factory_names),
-                    method="add_pipe",
-                    lang=util.get_object_name(self),
-                    lang_code=self.lang,
-                )
             pipe_component = self.create_pipe(
                 factory_name,
                 name=name,

@@ -883,7 +868,7 @@ class Language:
         *,
         config: Dict[str, Any] = SimpleFrozenDict(),
         validate: bool = True,
-    ) -> "Pipe":
+    ) -> PipeCallable:
         """Replace a component in the pipeline.

         name (str): Name of the component to replace.

@@ -892,7 +877,7 @@ class Language:
             component. Will be merged with default config, if available.
         validate (bool): Whether to validate the component config against the
             arguments and types expected by the factory.
-        RETURNS (Pipe): The new pipeline component.
+        RETURNS (Callable[[Doc], Doc]): The new pipeline component.

         DOCS: https://spacy.io/api/language#replace_pipe
         """

@@ -944,11 +929,11 @@ class Language:
         init_cfg = self._config["initialize"]["components"].pop(old_name)
         self._config["initialize"]["components"][new_name] = init_cfg

-    def remove_pipe(self, name: str) -> Tuple[str, "Pipe"]:
+    def remove_pipe(self, name: str) -> Tuple[str, PipeCallable]:
         """Remove a component from the pipeline.

         name (str): Name of the component to remove.
-        RETURNS (tuple): A `(name, component)` tuple of the removed component.
+        RETURNS (Tuple[str, Callable[[Doc], Doc]]): A `(name, component)` tuple of the removed component.

         DOCS: https://spacy.io/api/language#remove_pipe
         """

@@ -1363,15 +1348,15 @@ class Language:

     def set_error_handler(
         self,
-        error_handler: Callable[[str, "Pipe", List[Doc], Exception], NoReturn],
+        error_handler: Callable[[str, PipeCallable, List[Doc], Exception], NoReturn],
     ):
-        """Set an error handler object for all the components in the pipeline that implement
-        a set_error_handler function.
+        """Set an error handler object for all the components in the pipeline
+        that implement a set_error_handler function.

-        error_handler (Callable[[str, Pipe, List[Doc], Exception], NoReturn]):
-            Function that deals with a failing batch of documents. This callable function should take in
-            the component's name, the component itself, the offending batch of documents, and the exception
-            that was thrown.
+        error_handler (Callable[[str, Callable[[Doc], Doc], List[Doc], Exception], NoReturn]):
+            Function that deals with a failing batch of documents. This callable
+            function should take in the component's name, the component itself,
+            the offending batch of documents, and the exception that was thrown.
         DOCS: https://spacy.io/api/language#set_error_handler
         """
         self.default_error_handler = error_handler
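A hedged sketch of the signature in use (the existing `nlp` object and the failure scenario are assumptions):

    from typing import List, NoReturn
    from spacy.tokens import Doc

    def on_batch_error(name: str, component, docs: List[Doc], e: Exception) -> NoReturn:
        # Report which component failed on which batch, then re-raise.
        print(f"{name} failed on a batch of {len(docs)} docs")
        raise e

    nlp.set_error_handler(on_batch_error)  # nlp: an existing Language object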

@@ -1879,31 +1864,22 @@ class Language:
         if isinstance(exclude, str):
             exclude = [exclude]

-        def fetch_pipes_status(value: Iterable[str], key: str) -> Iterable[str]:
-            """Fetch value for `enable` or `disable` w.r.t. the specified config and passed arguments passed to
-            .load(). If both arguments and config specified values for this field, the passed arguments take precedence
-            and a warning is printed.
-            value (Iterable[str]): Passed value for `enable` or `disable`.
-            key (str): Key for field in config (either "enabled" or "disabled").
-            RETURN (Iterable[str]):
-            """
-            # We assume that no argument was passed if the value is the specified default value.
-            if id(value) == id(_DEFAULT_EMPTY_PIPES):
-                return config["nlp"].get(key, [])
-            else:
-                if len(config["nlp"].get(key, [])):
-                    warnings.warn(
-                        Warnings.W123.format(
-                            arg=key[:-1],
-                            arg_value=value,
-                            config_value=config["nlp"][key],
-                        )
+        # `enable` should not be merged with `enabled` (the opposite is true for `disable`/`disabled`). If the config
+        # specifies values for `enabled` not included in `enable`, emit warning.
+        if id(enable) != id(_DEFAULT_EMPTY_PIPES):
+            enabled = config["nlp"].get("enabled", [])
+            if len(enabled) and not set(enabled).issubset(enable):
+                warnings.warn(
+                    Warnings.W123.format(
+                        enable=enable,
+                        enabled=enabled,
                     )
-                return value
+                )

         # Ensure sets of disabled/enabled pipe names are not contradictory.
         disabled_pipes = cls._resolve_component_status(
-            fetch_pipes_status(disable, "disabled"),
-            fetch_pipes_status(enable, "enabled"),
+            list({*disable, *config["nlp"].get("disabled", [])}),
+            enable,
             config["nlp"]["pipeline"],
         )
         nlp._disabled = set(p for p in disabled_pipes if p not in exclude)

@@ -2084,10 +2060,12 @@ class Language:
         if enable:
             if isinstance(enable, str):
                 enable = [enable]
-            to_disable = [
-                pipe_name for pipe_name in pipe_names if pipe_name not in enable
-            ]
-            if disable and disable != to_disable:
+            to_disable = {
+                *[pipe_name for pipe_name in pipe_names if pipe_name not in enable],
+                *disable,
+            }
+            # If any pipe to be enabled is in to_disable, the specification is inconsistent.
+            if len(set(enable) & to_disable):
                 raise ValueError(Errors.E1042.format(enable=enable, disable=disable))

         return tuple(to_disable)
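The new consistency check in practice (a hedged sketch; the model name is an assumption):

    import spacy

    # Asking for a pipe to be both enabled and disabled is now rejected (E1042):
    nlp = spacy.load("en_core_web_sm", enable=["ner"], disable=["ner"])  # ValueError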

spacy/matcher/levenshtein.pyx
@@ -4,6 +4,8 @@ from libc.stdint cimport int64_t

 from typing import Optional

+from ..util import registry
+

 cdef extern from "polyleven.c":
     int64_t polyleven(PyObject *o1, PyObject *o2, int64_t k)

@@ -13,3 +15,18 @@ cpdef int64_t levenshtein(a: str, b: str, k: Optional[int] = None):
     if k is None:
         k = -1
     return polyleven(<PyObject*>a, <PyObject*>b, k)
+
+
+cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
+    if fuzzy >= 0:
+        max_edits = fuzzy
+    else:
+        # allow at least two edits (to allow at least one transposition) and up
+        # to 30% of the pattern string length
+        max_edits = max(2, round(0.3 * len(pattern_text)))
+    return levenshtein(input_text, pattern_text, max_edits) <= max_edits
+
+
+@registry.misc("spacy.levenshtein_compare.v1")
+def make_levenshtein_compare():
+    return levenshtein_compare
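A quick illustration of the default edit budget (a hedged sketch; the example strings are assumptions):

    from spacy.matcher.levenshtein import levenshtein_compare

    # pattern "kathrin" has length 7, so max_edits = max(2, round(0.3 * 7)) = 2
    print(levenshtein_compare("katherine", "kathrin"))  # 2 edits -> True
    print(levenshtein_compare("karolina", "kathrin"))   # more than 2 edits -> False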

spacy/matcher/matcher.pxd
@@ -77,3 +77,4 @@ cdef class Matcher:
     cdef public object _extensions
     cdef public object _extra_predicates
     cdef public object _seen_attrs
+    cdef public object _fuzzy_compare

spacy/matcher/matcher.pyi
@@ -5,7 +5,12 @@ from ..vocab import Vocab
 from ..tokens import Doc, Span

 class Matcher:
-    def __init__(self, vocab: Vocab, validate: bool = ...) -> None: ...
+    def __init__(
+        self,
+        vocab: Vocab,
+        validate: bool = ...,
+        fuzzy_compare: Callable[[str, str, int], bool] = ...,
+    ) -> None: ...
     def __reduce__(self) -> Any: ...
     def __len__(self) -> int: ...
     def __contains__(self, key: str) -> bool: ...

spacy/matcher/matcher.pyx
@@ -1,4 +1,4 @@
-# cython: infer_types=True, cython: profile=True
+# cython: binding=True, infer_types=True, profile=True
 from typing import List, Iterable

 from libcpp.vector cimport vector

@@ -20,10 +20,12 @@ from ..tokens.token cimport Token
 from ..tokens.morphanalysis cimport MorphAnalysis
 from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB

+from .levenshtein import levenshtein_compare
 from ..schemas import validate_token_pattern
 from ..errors import Errors, MatchPatternError, Warnings
 from ..strings import get_string_id
 from ..attrs import IDS
+from ..util import registry


 DEF PADDING = 5

@@ -36,11 +38,13 @@ cdef class Matcher:
     USAGE: https://spacy.io/usage/rule-based-matching
     """

-    def __init__(self, vocab, validate=True):
+    def __init__(self, vocab, validate=True, *, fuzzy_compare=levenshtein_compare):
         """Create the Matcher.

         vocab (Vocab): The vocabulary object, which must be shared with the
             documents the matcher will operate on.
         validate (bool): Validate all patterns added to this matcher.
+        fuzzy_compare (Callable[[str, str, int], bool]): The comparison method
+            for the FUZZY operators.
         """
         self._extra_predicates = []
         self._patterns = {}

@@ -51,9 +55,10 @@ cdef class Matcher:
         self.vocab = vocab
         self.mem = Pool()
         self.validate = validate
+        self._fuzzy_compare = fuzzy_compare

     def __reduce__(self):
-        data = (self.vocab, self._patterns, self._callbacks)
+        data = (self.vocab, self._patterns, self._callbacks, self.validate, self._fuzzy_compare)
         return (unpickle_matcher, data, None, None)

     def __len__(self):

@@ -128,7 +133,7 @@ cdef class Matcher:
         for pattern in patterns:
             try:
                 specs = _preprocess_pattern(pattern, self.vocab,
-                                            self._extensions, self._extra_predicates)
+                                            self._extensions, self._extra_predicates, self._fuzzy_compare)
                 self.patterns.push_back(init_pattern(self.mem, key, specs))
                 for spec in specs:
                     for attr, _ in spec[1]:

@@ -326,8 +331,8 @@ cdef class Matcher:
     return key


-def unpickle_matcher(vocab, patterns, callbacks):
-    matcher = Matcher(vocab)
+def unpickle_matcher(vocab, patterns, callbacks, validate, fuzzy_compare):
+    matcher = Matcher(vocab, validate=validate, fuzzy_compare=fuzzy_compare)
     for key, pattern in patterns.items():
         callback = callbacks.get(key, None)
         matcher.add(key, pattern, on_match=callback)
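Since validate and fuzzy_compare are now part of the pickle payload, a matcher survives a round trip with its settings intact (a minimal sketch):

    import pickle
    from spacy.lang.en import English
    from spacy.matcher import Matcher

    nlp = English()
    matcher = Matcher(nlp.vocab, validate=True)
    matcher.add("HELLO", [[{"LOWER": "hello"}]])
    restored = pickle.loads(pickle.dumps(matcher))
    assert len(restored) == 1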

@@ -754,7 +759,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
     return id_attr.value


-def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
+def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, fuzzy_compare):
     """This function interprets the pattern, converting the various bits of
     syntactic sugar before we compile it into a struct with init_pattern.

@@ -781,7 +786,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
         ops = _get_operators(spec)
         attr_values = _get_attr_values(spec, string_store)
         extensions = _get_extensions(spec, string_store, extensions_table)
-        predicates = _get_extra_predicates(spec, extra_predicates, vocab)
+        predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare)
         for op in ops:
             tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx))
     return tokens

@@ -826,16 +831,45 @@ def _get_attr_values(spec, string_store):
 # These predicate helper classes are used to match the REGEX, IN, >= etc
 # extensions to the matcher introduced in #3173.

+class _FuzzyPredicate:
+    operators = ("FUZZY", "FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5",
+                 "FUZZY6", "FUZZY7", "FUZZY8", "FUZZY9")
+
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
+                 regex=False, fuzzy=None, fuzzy_compare=None):
+        self.i = i
+        self.attr = attr
+        self.value = value
+        self.predicate = predicate
+        self.is_extension = is_extension
+        if self.predicate not in self.operators:
+            raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
+        fuzz = self.predicate[len("FUZZY"):]  # number after prefix
+        self.fuzzy = int(fuzz) if fuzz else -1
+        self.fuzzy_compare = fuzzy_compare
+        self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
+
+    def __call__(self, Token token):
+        if self.is_extension:
+            value = token._.get(self.attr)
+        else:
+            value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
+        if self.value == value:
+            return True
+        return self.fuzzy_compare(value, self.value, self.fuzzy)
+
+
 class _RegexPredicate:
     operators = ("REGEX",)

-    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
+                 regex=False, fuzzy=None, fuzzy_compare=None):
         self.i = i
         self.attr = attr
         self.value = re.compile(value)
         self.predicate = predicate
         self.is_extension = is_extension
-        self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
+        self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
         if self.predicate not in self.operators:
             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))

@@ -850,18 +884,28 @@ class _RegexPredicate:
 class _SetPredicate:
     operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS")

-    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
+                 regex=False, fuzzy=None, fuzzy_compare=None):
         self.i = i
         self.attr = attr
         self.vocab = vocab
+        self.regex = regex
+        self.fuzzy = fuzzy
+        self.fuzzy_compare = fuzzy_compare
         if self.attr == MORPH:
             # normalize morph strings
             self.value = set(self.vocab.morphology.add(v) for v in value)
         else:
-            self.value = set(get_string_id(v) for v in value)
+            if self.regex:
+                self.value = set(re.compile(v) for v in value)
+            elif self.fuzzy is not None:
+                # add to string store
+                self.value = set(self.vocab.strings.add(v) for v in value)
+            else:
+                self.value = set(get_string_id(v) for v in value)
         self.predicate = predicate
         self.is_extension = is_extension
-        self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
+        self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
         if self.predicate not in self.operators:
             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))

@@ -889,9 +933,29 @@ class _SetPredicate:
             return False

         if self.predicate == "IN":
-            return value in self.value
+            if self.regex:
+                value = self.vocab.strings[value]
+                return any(bool(v.search(value)) for v in self.value)
+            elif self.fuzzy is not None:
+                value = self.vocab.strings[value]
+                return any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy)
+                           for v in self.value)
+            elif value in self.value:
+                return True
+            else:
+                return False
         elif self.predicate == "NOT_IN":
-            return value not in self.value
+            if self.regex:
+                value = self.vocab.strings[value]
+                return not any(bool(v.search(value)) for v in self.value)
+            elif self.fuzzy is not None:
+                value = self.vocab.strings[value]
+                return not any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy)
+                               for v in self.value)
+            elif value in self.value:
+                return False
+            else:
+                return True
         elif self.predicate == "IS_SUBSET":
             return value <= self.value
         elif self.predicate == "IS_SUPERSET":

@@ -906,13 +970,14 @@ class _SetPredicate:
 class _ComparisonPredicate:
     operators = ("==", "!=", ">=", "<=", ">", "<")

-    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
+                 regex=False, fuzzy=None, fuzzy_compare=None):
         self.i = i
         self.attr = attr
         self.value = value
         self.predicate = predicate
         self.is_extension = is_extension
-        self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
+        self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
         if self.predicate not in self.operators:
             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))

@@ -935,7 +1000,7 @@ class _ComparisonPredicate:
             return value < self.value


-def _get_extra_predicates(spec, extra_predicates, vocab):
+def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare):
     predicate_types = {
         "REGEX": _RegexPredicate,
         "IN": _SetPredicate,

@@ -949,6 +1014,16 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
         "<=": _ComparisonPredicate,
         ">": _ComparisonPredicate,
         "<": _ComparisonPredicate,
+        "FUZZY": _FuzzyPredicate,
+        "FUZZY1": _FuzzyPredicate,
+        "FUZZY2": _FuzzyPredicate,
+        "FUZZY3": _FuzzyPredicate,
+        "FUZZY4": _FuzzyPredicate,
+        "FUZZY5": _FuzzyPredicate,
+        "FUZZY6": _FuzzyPredicate,
+        "FUZZY7": _FuzzyPredicate,
+        "FUZZY8": _FuzzyPredicate,
+        "FUZZY9": _FuzzyPredicate,
     }
     seen_predicates = {pred.key: pred.i for pred in extra_predicates}
     output = []
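What the new predicates enable at the pattern level (a hedged sketch; the pattern and text are assumptions):

    from spacy.lang.en import English
    from spacy.matcher import Matcher

    nlp = English()
    matcher = Matcher(nlp.vocab)
    # FUZZY uses the default budget (max(2, 30% of the pattern length));
    # FUZZY1..FUZZY9 pin the number of allowed edits explicitly.
    matcher.add("NAME", [[{"LOWER": {"FUZZY": "kathrin"}}]])
    doc = nlp("I spoke to Katherine yesterday.")
    print([doc[s:e].text for _, s, e in matcher(doc)])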

@@ -966,22 +1041,47 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
             attr = "ORTH"
         attr = IDS.get(attr.upper())
         if isinstance(value, dict):
-            processed = False
-            value_with_upper_keys = {k.upper(): v for k, v in value.items()}
-            for type_, cls in predicate_types.items():
-                if type_ in value_with_upper_keys:
-                    predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab)
-                    # Don't create a redundant predicates.
-                    # This helps with efficiency, as we're caching the results.
-                    if predicate.key in seen_predicates:
-                        output.append(seen_predicates[predicate.key])
-                    else:
-                        extra_predicates.append(predicate)
-                        output.append(predicate.i)
-                        seen_predicates[predicate.key] = predicate.i
-                    processed = True
-            if not processed:
-                warnings.warn(Warnings.W035.format(pattern=value))
+            output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
+                                                     extra_predicates, seen_predicates, fuzzy_compare=fuzzy_compare))
     return output


+def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
+                               extra_predicates, seen_predicates, regex=False, fuzzy=None, fuzzy_compare=None):
+    output = []
+    for type_, value in value_dict.items():
+        type_ = type_.upper()
+        cls = predicate_types.get(type_)
+        if cls is None:
+            warnings.warn(Warnings.W035.format(pattern=value_dict))
+            # ignore unrecognized predicate type
+            continue
+        elif cls == _RegexPredicate:
+            if isinstance(value, dict):
+                # add predicates inside regex operator
+                output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
+                                                         extra_predicates, seen_predicates,
+                                                         regex=True))
+                continue
+        elif cls == _FuzzyPredicate:
+            if isinstance(value, dict):
+                # add predicates inside fuzzy operator
+                fuzz = type_[len("FUZZY"):]  # number after prefix
+                fuzzy_val = int(fuzz) if fuzz else -1
+                output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
+                                                         extra_predicates, seen_predicates,
+                                                         fuzzy=fuzzy_val, fuzzy_compare=fuzzy_compare))
+                continue
+        predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab,
+                        regex=regex, fuzzy=fuzzy, fuzzy_compare=fuzzy_compare)
+        # Don't create redundant predicates.
+        # This helps with efficiency, as we're caching the results.
+        if predicate.key in seen_predicates:
+            output.append(seen_predicates[predicate.key])
+        else:
+            extra_predicates.append(predicate)
+            output.append(predicate.i)
+            seen_predicates[predicate.key] = predicate.i
+    return output

spacy/ml/models/entity_linker.py
@@ -1,11 +1,12 @@
 from pathlib import Path
 from typing import Optional, Callable, Iterable, List, Tuple
 from thinc.types import Floats2d
-from thinc.api import chain, clone, list2ragged, reduce_mean, residual
-from thinc.api import Model, Maxout, Linear, noop, tuplify, Ragged
+from thinc.api import chain, list2ragged, reduce_mean, residual
+from thinc.api import Model, Maxout, Linear, tuplify, Ragged

 from ...util import registry
-from ...kb import KnowledgeBase, Candidate, get_candidates
+from ...kb import KnowledgeBase, InMemoryLookupKB
+from ...kb import Candidate, get_candidates, get_candidates_batch
 from ...vocab import Vocab
 from ...tokens import Span, Doc
 from ..extract_spans import extract_spans

@@ -70,17 +71,18 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callab
             cands.append((start_token, end_token))

         candidates.append(ops.asarray2i(cands))
-    candlens = ops.asarray1i([len(cands) for cands in candidates])
-    candidates = ops.xp.concatenate(candidates)
-    outputs = Ragged(candidates, candlens)
+    lengths = model.ops.asarray1i([len(cands) for cands in candidates])
+    out = Ragged(model.ops.flatten(candidates), lengths)
     # because this is just rearranging docs, the backprop does nothing
-    return outputs, lambda x: []
+    return out, lambda x: []


 @registry.misc("spacy.KBFromFile.v1")
-def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
-    def kb_from_file(vocab):
-        kb = KnowledgeBase(vocab, entity_vector_length=1)
+def load_kb(
+    kb_path: Path,
+) -> Callable[[Vocab], KnowledgeBase]:
+    def kb_from_file(vocab: Vocab):
+        kb = InMemoryLookupKB(vocab, entity_vector_length=1)
         kb.from_disk(kb_path)
         return kb

@@ -88,9 +90,11 @@ def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:


 @registry.misc("spacy.EmptyKB.v1")
-def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
-    def empty_kb_factory(vocab):
-        return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length)
+def empty_kb(
+    entity_vector_length: int,
+) -> Callable[[Vocab], KnowledgeBase]:
+    def empty_kb_factory(vocab: Vocab):
+        return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)

     return empty_kb_factory
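These registry functions now build the concrete InMemoryLookupKB rather than the abstract KnowledgeBase base class; standalone usage looks roughly like this (a hedged sketch; the entity ID, alias, and vector values are assumptions):

    from spacy.kb import InMemoryLookupKB
    from spacy.vocab import Vocab

    kb = InMemoryLookupKB(Vocab(), entity_vector_length=3)
    kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
    kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[1.0])
    print(kb.get_alias_candidates("Douglas"))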
|
||||
|
||||
|
@@ -98,3 +102,10 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
@registry.misc("spacy.CandidateGenerator.v1")
def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
return get_candidates


@registry.misc("spacy.CandidateBatchGenerator.v1")
def create_candidates_batch() -> Callable[
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
]:
return get_candidates_batch

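As a usage note: the registry functions above are what pipeline configs resolve. A small sketch of building an InMemoryLookupKB and saving it so that spacy.KBFromFile.v1 can load it back (the entity ID, alias, and path are our own placeholders):

from spacy.kb import InMemoryLookupKB
from spacy.vocab import Vocab

vocab = Vocab()
kb = InMemoryLookupKB(vocab, entity_vector_length=64)
kb.add_entity(entity="Q42", freq=12, entity_vector=[0.0] * 64)
kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[1.0])
# placeholder path; load back via {"@misc": "spacy.KBFromFile.v1", "kb_path": "./my_kb"}
kb.to_disk("./my_kb")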
@@ -1,13 +1,12 @@
from typing import cast, Any, Callable, Dict, Iterable, List, Optional
from typing import Sequence, Tuple, Union
from typing import Tuple
from collections import Counter
from copy import deepcopy
from itertools import islice
import numpy as np

import srsly
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
from thinc.types import Floats2d, Ints1d, Ints2d
from thinc.api import Config, Model, SequenceCategoricalCrossentropy, NumpyOps
from thinc.types import Floats2d, Ints2d

from ._edit_tree_internals.edit_trees import EditTrees
from ._edit_tree_internals.schemas import validate_edit_tree
@@ -21,6 +20,10 @@ from ..vocab import Vocab
from .. import util


# The cutoff value of *top_k* above which an alternative method is used to process guesses.
TOP_K_GUARDRAIL = 20


default_model_config = """
[model]
@architectures = "spacy.Tagger.v2"
@@ -116,6 +119,7 @@ class EditTreeLemmatizer(TrainablePipe):

self.cfg: Dict[str, Any] = {"labels": []}
self.scorer = scorer
self.numpy_ops = NumpyOps()

def get_loss(
self, examples: Iterable[Example], scores: List[Floats2d]
@@ -129,7 +133,7 @@ class EditTreeLemmatizer(TrainablePipe):
for (predicted, gold_lemma) in zip(
eg.predicted, eg.get_aligned("LEMMA", as_string=True)
):
if gold_lemma is None:
if gold_lemma is None or gold_lemma == "":
label = -1
else:
tree_id = self.trees.add(predicted.text, gold_lemma)
@@ -145,31 +149,73 @@ class EditTreeLemmatizer(TrainablePipe):
return float(loss), d_scores

def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
if self.top_k == 1:
scores2guesses = self._scores2guesses_top_k_equals_1
elif self.top_k <= TOP_K_GUARDRAIL:
scores2guesses = self._scores2guesses_top_k_greater_1
else:
scores2guesses = self._scores2guesses_top_k_guardrail
# The behaviour of *_scores2guesses_top_k_greater_1()* is efficient for values
# of *top_k>1* that are likely to be useful when the edit tree lemmatizer is used
# for its principal purpose of lemmatizing tokens. However, the code could also
# be used for other purposes, and with very large values of *top_k* the method
# becomes inefficient. In such cases, *_scores2guesses_top_k_guardrail()* is used
# instead.
n_docs = len(list(docs))
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
n_labels = len(self.cfg["labels"])
guesses: List[Ints2d] = [
self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
]
guesses: List[Ints2d] = [self.model.ops.alloc2i(0, n_labels) for _ in docs]
assert len(guesses) == n_docs
return guesses
scores = self.model.predict(docs)
assert len(scores) == n_docs
guesses = self._scores2guesses(docs, scores)
guesses = scores2guesses(docs, scores)
assert len(guesses) == n_docs
return guesses

def _scores2guesses(self, docs, scores):
def _scores2guesses_top_k_equals_1(self, docs, scores):
guesses = []
for doc, doc_scores in zip(docs, scores):
if self.top_k == 1:
doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1)
else:
doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
doc_guesses = doc_scores.argmax(axis=1)
doc_guesses = self.numpy_ops.asarray(doc_guesses)

if not isinstance(doc_guesses, np.ndarray):
doc_guesses = doc_guesses.get()
doc_compat_guesses = []
for i, token in enumerate(doc):
tree_id = self.cfg["labels"][doc_guesses[i]]
if self.trees.apply(tree_id, token.text) is not None:
doc_compat_guesses.append(tree_id)
else:
doc_compat_guesses.append(-1)
guesses.append(np.array(doc_compat_guesses))

return guesses

def _scores2guesses_top_k_greater_1(self, docs, scores):
guesses = []
top_k = min(self.top_k, len(self.labels))
for doc, doc_scores in zip(docs, scores):
doc_scores = self.numpy_ops.asarray(doc_scores)
doc_compat_guesses = []
for i, token in enumerate(doc):
for _ in range(top_k):
candidate = int(doc_scores[i].argmax())
candidate_tree_id = self.cfg["labels"][candidate]
if self.trees.apply(candidate_tree_id, token.text) is not None:
doc_compat_guesses.append(candidate_tree_id)
break
doc_scores[i, candidate] = np.finfo(np.float32).min
else:
doc_compat_guesses.append(-1)
guesses.append(np.array(doc_compat_guesses))

return guesses

def _scores2guesses_top_k_guardrail(self, docs, scores):
guesses = []
for doc, doc_scores in zip(docs, scores):
doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
doc_guesses = self.numpy_ops.asarray(doc_guesses)

doc_compat_guesses = []
for token, candidates in zip(doc, doc_guesses):
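The guardrail above chooses between repeated argmax-with-masking (cheap when top_k is small) and a full argsort (better when top_k is large). A standalone numpy sketch of the two strategies, written by us purely for illustration:

import numpy as np

def top_k_by_masking(scores: np.ndarray, k: int) -> list:
    # repeatedly take the argmax and mask it out; roughly O(k * n) per row
    scores = scores.copy()
    out = []
    for _ in range(k):
        i = int(scores.argmax())
        out.append(i)
        scores[i] = np.finfo(scores.dtype).min
    return out

def top_k_by_argsort(scores: np.ndarray, k: int) -> list:
    # one full sort; O(n log n), which wins once k grows large
    return np.argsort(scores)[: -k - 1 : -1].tolist()

row = np.asarray([0.1, 0.7, 0.2, 0.9], dtype=np.float32)
assert top_k_by_masking(row, 2) == top_k_by_argsort(row, 2) == [3, 1]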
@@ -331,9 +377,9 @@ class EditTreeLemmatizer(TrainablePipe):

tree = dict(tree)
if "orig" in tree:
tree["orig"] = self.vocab.strings[tree["orig"]]
tree["orig"] = self.vocab.strings.add(tree["orig"])
if "orig" in tree:
tree["subst"] = self.vocab.strings[tree["subst"]]
tree["subst"] = self.vocab.strings.add(tree["subst"])

trees.append(tree)

@@ -53,9 +53,11 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
"incl_context": True,
"entity_vector_length": 64,
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
"overwrite": True,
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
"use_gold_ents": True,
"candidates_batch_size": 1,
"threshold": None,
},
default_score_weights={
@@ -75,9 +77,13 @@ def make_entity_linker(
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
get_candidates_batch: Callable[
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
],
overwrite: bool,
scorer: Optional[Callable],
use_gold_ents: bool,
candidates_batch_size: int,
threshold: Optional[float] = None,
):
"""Construct an EntityLinker component.
@@ -90,17 +96,21 @@ def make_entity_linker(
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
incl_context (bool): Whether or not to include the local context in the model.
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
get_candidates_batch (
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
scorer (Optional[Callable]): The scoring method.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
component must provide entity annotations.
candidates_batch_size (int): Size of batches for entity candidate generation.
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
prediction is discarded. If None, predictions are not filtered by any threshold.
"""

if not model.attrs.get("include_span_maker", False):
# The only difference in arguments here is that use_gold_ents is not available
# The only difference in arguments here is that use_gold_ents and threshold aren't available.
return EntityLinker_v1(
nlp.vocab,
model,
@@ -124,9 +134,11 @@ def make_entity_linker(
incl_context=incl_context,
entity_vector_length=entity_vector_length,
get_candidates=get_candidates,
get_candidates_batch=get_candidates_batch,
overwrite=overwrite,
scorer=scorer,
use_gold_ents=use_gold_ents,
candidates_batch_size=candidates_batch_size,
threshold=threshold,
)

@@ -160,9 +172,13 @@ class EntityLinker(TrainablePipe):
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
get_candidates_batch: Callable[
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
],
overwrite: bool = BACKWARD_OVERWRITE,
scorer: Optional[Callable] = entity_linker_score,
use_gold_ents: bool,
candidates_batch_size: int,
threshold: Optional[float] = None,
) -> None:
"""Initialize an entity linker.
@@ -178,10 +194,14 @@ class EntityLinker(TrainablePipe):
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_links.
get_candidates_batch (
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
Iterable[Candidate]]
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
component must provide entity annotations.
candidates_batch_size (int): Size of batches for entity candidate generation.
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
DOCS: https://spacy.io/api/entitylinker#init
@@ -204,22 +224,27 @@ class EntityLinker(TrainablePipe):
self.incl_prior = incl_prior
self.incl_context = incl_context
self.get_candidates = get_candidates
self.get_candidates_batch = get_candidates_batch
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
self.distance = CosineDistance(normalize=False)
# how many neighbour sentences to take into account
# create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
# create an empty KB by default
self.kb = empty_kb(entity_vector_length)(self.vocab)
self.scorer = scorer
self.use_gold_ents = use_gold_ents
self.candidates_batch_size = candidates_batch_size
self.threshold = threshold

if candidates_batch_size < 1:
raise ValueError(Errors.E1044)

def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
"""Define the KB of this pipe by providing a function that will
create it using this object's vocab."""
if not callable(kb_loader):
raise ValueError(Errors.E885.format(arg_type=type(kb_loader)))

self.kb = kb_loader(self.vocab)
self.kb = kb_loader(self.vocab)  # type: ignore

def validate_kb(self) -> None:
# Raise an error if the knowledge base is not initialized.
@@ -241,8 +266,8 @@ class EntityLinker(TrainablePipe):
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance.
Note that providing this argument, will overwrite all data accumulated in the current KB.
kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab
instance. Note that providing this argument will overwrite all data accumulated in the current KB.
Use this only when loading a KB as-such from file.

DOCS: https://spacy.io/api/entitylinker#initialize
@@ -419,66 +444,93 @@ class EntityLinker(TrainablePipe):
if len(doc) == 0:
continue
sentences = [s for s in doc.sents]
# Looping through each entity (TODO: rewrite)
for ent in doc.ents:
sent_index = sentences.index(ent.sent)
assert sent_index >= 0

if self.incl_context:
# get n_neighbour sentences, clipped to the length of the document
start_sentence = max(0, sent_index - self.n_sents)
end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
start_token = sentences[start_sentence].start
end_token = sentences[end_sentence].end
sent_doc = doc[start_token:end_token].as_doc()
# currently, the context is the same for each entity in a sentence (should be refined)
sentence_encoding = self.model.predict([sent_doc])[0]
sentence_encoding_t = sentence_encoding.T
sentence_norm = xp.linalg.norm(sentence_encoding_t)
entity_count += 1
if ent.label_ in self.labels_discard:
# ignoring this entity - setting to NIL
final_kb_ids.append(self.NIL)
else:
candidates = list(self.get_candidates(self.kb, ent))
if not candidates:
# no prediction possible for this entity - setting to NIL
final_kb_ids.append(self.NIL)
elif len(candidates) == 1 and self.threshold is None:
# shortcut for efficiency reasons: take the 1 candidate
final_kb_ids.append(candidates[0].entity_)
else:
random.shuffle(candidates)
# set all prior probabilities to 0 if incl_prior=False
prior_probs = xp.asarray([c.prior_prob for c in candidates])
if not self.incl_prior:
prior_probs = xp.asarray([0.0 for _ in candidates])
scores = prior_probs
# add in similarity from the context
if self.incl_context:
entity_encodings = xp.asarray(
[c.entity_vector for c in candidates]
)
entity_norm = xp.linalg.norm(entity_encodings, axis=1)
if len(entity_encodings) != len(prior_probs):
raise RuntimeError(
Errors.E147.format(
method="predict",
msg="vectors not of equal length",
)
)
# cosine similarity
sims = xp.dot(entity_encodings, sentence_encoding_t) / (
sentence_norm * entity_norm
)
if sims.shape != prior_probs.shape:
raise ValueError(Errors.E161)
scores = prior_probs + sims - (prior_probs * sims)
final_kb_ids.append(
candidates[scores.argmax().item()].entity_
if self.threshold is None or scores.max() >= self.threshold
else EntityLinker.NIL
# Loop over entities in batches.
for ent_idx in range(0, len(doc.ents), self.candidates_batch_size):
ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size]

# Look up candidate entities.
valid_ent_idx = [
idx
for idx in range(len(ent_batch))
if ent_batch[idx].label_ not in self.labels_discard
]

batch_candidates = list(
self.get_candidates_batch(
self.kb, [ent_batch[idx] for idx in valid_ent_idx]
)
if self.candidates_batch_size > 1
else [
self.get_candidates(self.kb, ent_batch[idx])
for idx in valid_ent_idx
]
)

# Looping through each entity in batch (TODO: rewrite)
for j, ent in enumerate(ent_batch):
sent_index = sentences.index(ent.sent)
assert sent_index >= 0

if self.incl_context:
# get n_neighbour sentences, clipped to the length of the document
start_sentence = max(0, sent_index - self.n_sents)
end_sentence = min(
len(sentences) - 1, sent_index + self.n_sents
)
start_token = sentences[start_sentence].start
end_token = sentences[end_sentence].end
sent_doc = doc[start_token:end_token].as_doc()
# currently, the context is the same for each entity in a sentence (should be refined)
sentence_encoding = self.model.predict([sent_doc])[0]
sentence_encoding_t = sentence_encoding.T
sentence_norm = xp.linalg.norm(sentence_encoding_t)
entity_count += 1
if ent.label_ in self.labels_discard:
# ignoring this entity - setting to NIL
final_kb_ids.append(self.NIL)
else:
candidates = list(batch_candidates[j])
if not candidates:
# no prediction possible for this entity - setting to NIL
final_kb_ids.append(self.NIL)
elif len(candidates) == 1 and self.threshold is None:
# shortcut for efficiency reasons: take the 1 candidate
final_kb_ids.append(candidates[0].entity_)
else:
random.shuffle(candidates)
# set all prior probabilities to 0 if incl_prior=False
prior_probs = xp.asarray([c.prior_prob for c in candidates])
if not self.incl_prior:
prior_probs = xp.asarray([0.0 for _ in candidates])
scores = prior_probs
# add in similarity from the context
if self.incl_context:
entity_encodings = xp.asarray(
[c.entity_vector for c in candidates]
)
entity_norm = xp.linalg.norm(entity_encodings, axis=1)
if len(entity_encodings) != len(prior_probs):
raise RuntimeError(
Errors.E147.format(
method="predict",
msg="vectors not of equal length",
)
)
# cosine similarity
sims = xp.dot(entity_encodings, sentence_encoding_t) / (
sentence_norm * entity_norm
)
if sims.shape != prior_probs.shape:
raise ValueError(Errors.E161)
scores = prior_probs + sims - (prior_probs * sims)
final_kb_ids.append(
candidates[scores.argmax().item()].entity_
if self.threshold is None
or scores.max() >= self.threshold
else EntityLinker.NIL
)

if not (len(final_kb_ids) == entity_count):
err = Errors.E147.format(
method="predict", msg="result variables not of equal length"
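One step worth spelling out: the candidate score above combines the KB prior and the context similarity as prior_probs + sims - (prior_probs * sims), a probabilistic OR (equivalently 1 - (1 - p)(1 - s) when both lie in [0, 1]). A toy check with our own numbers:

prior, sim = 0.6, 0.5
score = prior + sim - prior * sim
assert abs(score - 0.8) < 1e-9   # 1 - (1 - 0.6) * (1 - 0.5)
assert score >= max(prior, sim)  # either signal alone can only raise the score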
@@ -11,6 +11,7 @@ from ..errors import Errors, Warnings
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry
from ..tokens import Doc, Span
from ..matcher import Matcher, PhraseMatcher
from ..matcher.levenshtein import levenshtein_compare
from ..scorer import get_ner_prf

@@ -23,6 +24,7 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
default_config={
"phrase_matcher_attr": None,
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
"validate": False,
"overwrite_ents": False,
"ent_id_sep": DEFAULT_ENT_ID_SEP,
@@ -39,6 +41,7 @@ def make_entity_ruler(
nlp: Language,
name: str,
phrase_matcher_attr: Optional[Union[int, str]],
matcher_fuzzy_compare: Callable,
validate: bool,
overwrite_ents: bool,
ent_id_sep: str,
@@ -48,6 +51,7 @@ def make_entity_ruler(
nlp,
name,
phrase_matcher_attr=phrase_matcher_attr,
matcher_fuzzy_compare=matcher_fuzzy_compare,
validate=validate,
overwrite_ents=overwrite_ents,
ent_id_sep=ent_id_sep,
@@ -81,6 +85,7 @@ class EntityRuler(Pipe):
name: str = "entity_ruler",
*,
phrase_matcher_attr: Optional[Union[int, str]] = None,
matcher_fuzzy_compare: Callable = levenshtein_compare,
validate: bool = False,
overwrite_ents: bool = False,
ent_id_sep: str = DEFAULT_ENT_ID_SEP,
@@ -99,7 +104,10 @@ class EntityRuler(Pipe):
added. Used to disable the current entity ruler while creating
phrase patterns with the nlp object.
phrase_matcher_attr (int / str): Token attribute to match on, passed
to the internal PhraseMatcher as `attr`
to the internal PhraseMatcher as `attr`.
matcher_fuzzy_compare (Callable): The fuzzy comparison method for the
internal Matcher. Defaults to
spacy.matcher.levenshtein.levenshtein_compare.
validate (bool): Whether patterns should be validated, passed to
Matcher and PhraseMatcher as `validate`
patterns (iterable): Optional patterns to load in.
@@ -117,7 +125,10 @@ class EntityRuler(Pipe):
self.token_patterns = defaultdict(list)  # type: ignore
self.phrase_patterns = defaultdict(list)  # type: ignore
self._validate = validate
self.matcher = Matcher(nlp.vocab, validate=validate)
self.matcher_fuzzy_compare = matcher_fuzzy_compare
self.matcher = Matcher(
nlp.vocab, validate=validate, fuzzy_compare=self.matcher_fuzzy_compare
)
self.phrase_matcher_attr = phrase_matcher_attr
self.phrase_matcher = PhraseMatcher(
nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
@@ -337,7 +348,11 @@ class EntityRuler(Pipe):
self.token_patterns = defaultdict(list)
self.phrase_patterns = defaultdict(list)
self._ent_ids = defaultdict(tuple)
self.matcher = Matcher(self.nlp.vocab, validate=self._validate)
self.matcher = Matcher(
self.nlp.vocab,
validate=self._validate,
fuzzy_compare=self.matcher_fuzzy_compare,
)
self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
)
@@ -431,7 +446,8 @@ class EntityRuler(Pipe):
self.overwrite = cfg.get("overwrite", False)
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
self.phrase_matcher = PhraseMatcher(
self.nlp.vocab, attr=self.phrase_matcher_attr
self.nlp.vocab,
attr=self.phrase_matcher_attr,
)
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
else:
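With the fuzzy_compare plumbing above in place, entity_ruler patterns can use the fuzzy operator end to end. A sketch with our own example strings (the default FUZZY tolerance is the bounded Levenshtein distance implemented by levenshtein_compare):

import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([{"label": "ORG", "pattern": [{"TEXT": {"FUZZY": "Explosion"}}]}])
doc = nlp("Explosin made spaCy")  # one typo away from the pattern
print([(ent.text, ent.label_) for ent in doc.ents])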
@@ -68,8 +68,7 @@ class EntityLinker_v1(TrainablePipe):
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_links.
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
DOCS: https://spacy.io/api/entitylinker#init
"""
self.vocab = vocab
@@ -115,7 +114,7 @@ class EntityLinker_v1(TrainablePipe):
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance.
kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates an InMemoryLookupKB from a Vocab instance.
Note that providing this argument, will overwrite all data accumulated in the current KB.
Use this only when loading a KB as-such from file.

@@ -13,6 +13,7 @@ from ..util import ensure_path, SimpleFrozenList, registry
from ..tokens import Doc, Span
from ..scorer import Scorer
from ..matcher import Matcher, PhraseMatcher
from ..matcher.levenshtein import levenshtein_compare
from .. import util

PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
@@ -28,6 +29,7 @@ DEFAULT_SPANS_KEY = "ruler"
"overwrite_ents": False,
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
"ent_id_sep": "__unused__",
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
},
default_score_weights={
"ents_f": 1.0,
@@ -40,6 +42,7 @@ def make_entity_ruler(
nlp: Language,
name: str,
phrase_matcher_attr: Optional[Union[int, str]],
matcher_fuzzy_compare: Callable,
validate: bool,
overwrite_ents: bool,
scorer: Optional[Callable],
@@ -57,6 +60,7 @@ def make_entity_ruler(
annotate_ents=True,
ents_filter=ents_filter,
phrase_matcher_attr=phrase_matcher_attr,
matcher_fuzzy_compare=matcher_fuzzy_compare,
validate=validate,
overwrite=False,
scorer=scorer,
@@ -72,6 +76,7 @@ def make_entity_ruler(
"annotate_ents": False,
"ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"},
"phrase_matcher_attr": None,
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
"validate": False,
"overwrite": True,
"scorer": {
@@ -94,6 +99,7 @@ def make_span_ruler(
annotate_ents: bool,
ents_filter: Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]],
phrase_matcher_attr: Optional[Union[int, str]],
matcher_fuzzy_compare: Callable,
validate: bool,
overwrite: bool,
scorer: Optional[Callable],
@@ -106,6 +112,7 @@ def make_span_ruler(
annotate_ents=annotate_ents,
ents_filter=ents_filter,
phrase_matcher_attr=phrase_matcher_attr,
matcher_fuzzy_compare=matcher_fuzzy_compare,
validate=validate,
overwrite=overwrite,
scorer=scorer,
@@ -170,7 +177,7 @@ def prioritize_existing_ents_filter(


@registry.misc("spacy.prioritize_existing_ents_filter.v1")
def make_preverse_existing_ents_filter():
def make_preserve_existing_ents_filter():
return prioritize_existing_ents_filter

@@ -216,6 +223,7 @@ class SpanRuler(Pipe):
[Iterable[Span], Iterable[Span]], Iterable[Span]
] = util.filter_chain_spans,
phrase_matcher_attr: Optional[Union[int, str]] = None,
matcher_fuzzy_compare: Callable = levenshtein_compare,
validate: bool = False,
overwrite: bool = False,
scorer: Optional[Callable] = partial(
@@ -246,6 +254,9 @@ class SpanRuler(Pipe):
phrase_matcher_attr (Optional[Union[int, str]]): Token attribute to
match on, passed to the internal PhraseMatcher as `attr`. Defaults
to `None`.
matcher_fuzzy_compare (Callable): The fuzzy comparison method for the
internal Matcher. Defaults to
spacy.matcher.levenshtein.levenshtein_compare.
validate (bool): Whether patterns should be validated, passed to
Matcher and PhraseMatcher as `validate`.
overwrite (bool): Whether to remove any existing spans under this spans
@@ -266,6 +277,7 @@ class SpanRuler(Pipe):
self.spans_filter = spans_filter
self.ents_filter = ents_filter
self.scorer = scorer
self.matcher_fuzzy_compare = matcher_fuzzy_compare
self._match_label_id_map: Dict[int, Dict[str, str]] = {}
self.clear()

@@ -451,7 +463,11 @@ class SpanRuler(Pipe):
DOCS: https://spacy.io/api/spanruler#clear
"""
self._patterns: List[PatternType] = []
self.matcher: Matcher = Matcher(self.nlp.vocab, validate=self.validate)
self.matcher: Matcher = Matcher(
self.nlp.vocab,
validate=self.validate,
fuzzy_compare=self.matcher_fuzzy_compare,
)
self.phrase_matcher: PhraseMatcher = PhraseMatcher(
self.nlp.vocab,
attr=self.phrase_matcher_attr,
@@ -1,7 +1,7 @@
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
from thinc.api import Optimizer
from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
from thinc.types import Ragged, Ints2d, Floats2d

import numpy

@@ -26,17 +26,17 @@ scorer = {"@layers": "spacy.LinearLogistic.v1"}
hidden_size = 128

[model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"
@architectures = "spacy.Tok2Vec.v2"

[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
@architectures = "spacy.MultiHashEmbed.v2"
width = 96
rows = [5000, 2000, 1000, 1000]
attrs = ["ORTH", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false

[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
@@ -133,6 +133,9 @@ def make_spancat(
spans_key (str): Key of the doc.spans dict to save the spans under. During
initialization and training, the component will look for spans on the
reference document under the same key.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
spans allowed.
threshold (float): Minimum probability to consider a prediction positive.
Spans with a positive prediction will be saved on the Doc. Defaults to
0.5.
@@ -269,7 +272,10 @@ class SpanCategorizer(TrainablePipe):
DOCS: https://spacy.io/api/spancategorizer#predict
"""
indices = self.suggester(docs, ops=self.model.ops)
scores = self.model.predict((docs, indices))  # type: ignore
if indices.lengths.sum() == 0:
scores = self.model.ops.alloc2f(0, 0)
else:
scores = self.model.predict((docs, indices))  # type: ignore
return indices, scores

def set_candidates(
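The guard added above sidesteps calling the model on an empty suggestion set. A minimal illustration of the zero-row scores array it returns instead, using Thinc Ops as the code above does:

from thinc.api import NumpyOps

ops = NumpyOps()
scores = ops.alloc2f(0, 0)  # shape (0, 0): no candidate spans, so nothing to score
assert scores.shape == (0, 0)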
@@ -24,8 +24,8 @@ single_label_default_config = """
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 64
rows = [2000, 2000, 1000, 1000, 1000, 1000]
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
rows = [2000, 2000, 500, 1000, 500]
attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false

[model.tok2vec.encode]
@@ -72,9 +72,9 @@ subword_features = true
"textcat",
assigns=["doc.cats"],
default_config={
"threshold": 0.5,
"threshold": 0.0,
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_scorer.v1"},
"scorer": {"@scorers": "spacy.textcat_scorer.v2"},
},
default_score_weights={
"cats_score": 1.0,
@@ -87,7 +87,6 @@ subword_features = true
"cats_macro_f": None,
"cats_macro_auc": None,
"cats_f_per_type": None,
"cats_macro_auc_per_type": None,
},
)
def make_textcat(
@@ -118,7 +117,7 @@ def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
)


@registry.scorers("spacy.textcat_scorer.v1")
@registry.scorers("spacy.textcat_scorer.v2")
def make_textcat_scorer():
return textcat_score

@@ -144,7 +143,8 @@ class TextCategorizer(TrainablePipe):
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
threshold (float): Cutoff to consider a prediction "positive".
threshold (float): Unused, not needed for single-label (exclusive
classes) classification.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_cats for the attribute "cats".

@@ -154,7 +154,11 @@ class TextCategorizer(TrainablePipe):
self.model = model
self.name = name
self._rehearsal_model = None
cfg = {"labels": [], "threshold": threshold, "positive_label": None}
cfg: Dict[str, Any] = {
"labels": [],
"threshold": threshold,
"positive_label": None,
}
self.cfg = dict(cfg)
self.scorer = scorer

@@ -396,5 +400,9 @@ class TextCategorizer(TrainablePipe):
def _validate_categories(self, examples: Iterable[Example]):
"""Check whether the provided examples all have single-label cats annotations."""
for ex in examples:
if list(ex.reference.cats.values()).count(1.0) > 1:
vals = list(ex.reference.cats.values())
if vals.count(1.0) > 1:
raise ValueError(Errors.E895.format(value=ex.reference.cats))
for val in vals:
if not (val == 1.0 or val == 0.0):
raise ValueError(Errors.E851.format(val=val))

@@ -19,17 +19,17 @@ multi_label_default_config = """
@architectures = "spacy.TextCatEnsemble.v2"

[model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"
@architectures = "spacy.Tok2Vec.v2"

[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 64
rows = [2000, 2000, 1000, 1000, 1000, 1000]
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
rows = [2000, 2000, 500, 1000, 500]
attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false

[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
@@ -74,7 +74,7 @@ subword_features = true
default_config={
"threshold": 0.5,
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
},
default_score_weights={
"cats_score": 1.0,
@@ -87,7 +87,6 @@ subword_features = true
"cats_macro_f": None,
"cats_macro_auc": None,
"cats_f_per_type": None,
"cats_macro_auc_per_type": None,
},
)
def make_multilabel_textcat(
@@ -96,8 +95,8 @@ def make_multilabel_textcat(
model: Model[List[Doc], List[Floats2d]],
threshold: float,
scorer: Optional[Callable],
) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories
) -> "MultiLabel_TextCategorizer":
"""Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered
to be non-mutually exclusive, which means that there can be zero or more labels
per doc).
@@ -105,6 +104,7 @@ def make_multilabel_textcat(
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
scores for each category.
threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method.
"""
return MultiLabel_TextCategorizer(
nlp.vocab, model, name, threshold=threshold, scorer=scorer
@@ -120,7 +120,7 @@ def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str,
)


@registry.scorers("spacy.textcat_multilabel_scorer.v1")
@registry.scorers("spacy.textcat_multilabel_scorer.v2")
def make_textcat_multilabel_scorer():
return textcat_multilabel_score

@@ -147,6 +147,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
name (str): The component instance name, used to add entries to the
losses during training.
threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method.

DOCS: https://spacy.io/api/textcategorizer#init
"""
@@ -190,6 +191,8 @@ class MultiLabel_TextCategorizer(TextCategorizer):
for label in labels:
self.add_label(label)
subbatch = list(islice(get_examples(), 10))
self._validate_categories(subbatch)

doc_sample = [eg.reference for eg in subbatch]
label_sample, _ = self._examples_to_truth(subbatch)
self._require_labels()
@@ -200,4 +203,8 @@ class MultiLabel_TextCategorizer(TextCategorizer):
def _validate_categories(self, examples: Iterable[Example]):
"""This component allows any type of single- or multi-label annotations.
This method overwrites the more strict one from 'textcat'."""
pass
# check that annotation values are valid
for ex in examples:
for val in ex.reference.cats.values():
if not (val == 1.0 or val == 0.0):
raise ValueError(Errors.E851.format(val=val))

@@ -123,9 +123,6 @@ class Tok2Vec(TrainablePipe):
width = self.model.get_dim("nO")
return [self.model.ops.alloc((0, width)) for doc in docs]
tokvecs = self.model.predict(docs)
batch_id = Tok2VecListener.get_batch_id(docs)
for listener in self.listeners:
listener.receive(batch_id, tokvecs, _empty_backprop)
return tokvecs

def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None:
@@ -286,8 +283,19 @@ class Tok2VecListener(Model):
def forward(model: Tok2VecListener, inputs, is_train: bool):
"""Supply the outputs from the upstream Tok2Vec component."""
if is_train:
model.verify_inputs(inputs)
return model._outputs, model._backprop
# This might occur during training when the tok2vec layer is frozen / hasn't been updated.
# In that case, it should be set to "annotating" so we can retrieve the embeddings from the doc.
if model._batch_id is None:
outputs = []
for doc in inputs:
if doc.tensor.size == 0:
raise ValueError(Errors.E203.format(name="tok2vec"))
else:
outputs.append(doc.tensor)
return outputs, _empty_backprop
else:
model.verify_inputs(inputs)
return model._outputs, model._backprop
else:
# This is pretty grim, but it's hard to do better :(.
# It's hard to avoid relying on the doc.tensor attribute, because the
@@ -306,7 +314,7 @@ def forward(model: Tok2VecListener, inputs, is_train: bool):
outputs.append(model.ops.alloc2f(len(doc), width))
else:
outputs.append(doc.tensor)
return outputs, lambda dX: []
return outputs, _empty_backprop


def _empty_backprop(dX):  # for pickling
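The doc.tensor fallback above matters when the upstream tok2vec is frozen during training but still runs as an annotating component. Using the two training keys defined in ConfigSchemaTraining (see the schemas change below), the corresponding training config would look roughly like:

[training]
frozen_components = ["tok2vec"]
annotating_components = ["tok2vec"]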
@@ -156,12 +156,40 @@ def validate_token_pattern(obj: list) -> List[str]:


class TokenPatternString(BaseModel):
REGEX: Optional[StrictStr] = Field(None, alias="regex")
REGEX: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="regex")
IN: Optional[List[StrictStr]] = Field(None, alias="in")
NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
FUZZY: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy")
FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy1"
)
FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy2"
)
FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy3"
)
FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy4"
)
FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy5"
)
FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy6"
)
FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy7"
)
FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy8"
)
FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy9"
)

class Config:
extra = "forbid"
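Given the schema fields added above, fuzzy token patterns now pass validation. A quick sketch against the public validator (the pattern contents are our own):

from spacy.schemas import validate_token_pattern

pattern = [{"TEXT": {"FUZZY2": "spacy"}}, {"LOWER": {"IN": ["rocks", "rules"]}}]
assert validate_token_pattern(pattern) == []  # an empty list means no validation errors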
@@ -181,12 +209,12 @@ class TokenPatternNumber(BaseModel):
IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=")
GT: Union[StrictInt, StrictFloat] = Field(None, alias=">")
LT: Union[StrictInt, StrictFloat] = Field(None, alias="<")
EQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="==")
NEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="!=")
GEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">=")
LEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<=")
GT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">")
LT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<")

class Config:
extra = "forbid"
@@ -329,6 +357,7 @@ class ConfigSchemaTraining(BaseModel):
frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
annotating_components: List[str] = Field(..., title="Pipeline components that should set annotations during training")
before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk")
before_update: Optional[Callable[["Language", Dict[str, Any]], None]] = Field(..., title="Optional callback that is invoked at the start of each training step")
# fmt: on

class Config:
@@ -430,7 +459,7 @@ class ProjectConfigAssetURL(BaseModel):
# fmt: off
dest: StrictStr = Field(..., title="Destination of downloaded asset")
url: Optional[StrictStr] = Field(None, title="URL of asset")
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
description: StrictStr = Field("", title="Description of asset")
# fmt: on

@@ -438,7 +467,7 @@ class ProjectConfigAssetURL(BaseModel):
class ProjectConfigAssetGit(BaseModel):
# fmt: off
git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
description: Optional[StrictStr] = Field(None, title="Description of asset")
# fmt: on

@@ -509,9 +538,9 @@ class DocJSONSchema(BaseModel):
None, title="Indices of sentences' start and end indices"
)
text: StrictStr = Field(..., title="Document text")
spans: Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]] = Field(
None, title="Span information - end/start indices, label, KB ID"
)
spans: Optional[
Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]]
] = Field(None, title="Span information - end/start indices, label, KB ID")
tokens: List[Dict[StrictStr, Union[StrictStr, StrictInt]]] = Field(
..., title="Token information - ID, start, annotations"
)
@@ -520,9 +549,9 @@ class DocJSONSchema(BaseModel):
title="Any custom data stored in the document's _ attribute",
alias="_",
)
underscore_token: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
underscore_token: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
None, title="Any custom data stored in the token's _ attribute"
)
underscore_span: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
underscore_span: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
None, title="Any custom data stored in the span's _ attribute"
)

@@ -174,7 +174,7 @@ class Scorer:
prf_score.score_set(pred_spans, gold_spans)
if len(acc_score) > 0:
return {
"token_acc": acc_score.fscore,
"token_acc": acc_score.precision,
"token_p": prf_score.precision,
"token_r": prf_score.recall,
"token_f": prf_score.fscore,
@@ -446,7 +446,7 @@ class Scorer:
labels (Iterable[str]): The set of possible labels. Defaults to [].
multi_label (bool): Whether the attribute allows multiple labels.
Defaults to True. When set to False (exclusive labels), missing
gold labels are interpreted as 0.0.
gold labels are interpreted as 0.0 and the threshold is set to 0.0.
positive_label (str): The positive label for a binary task with
exclusive classes. Defaults to None.
threshold (float): Cutoff to consider a prediction "positive". Defaults
@@ -471,17 +471,17 @@ class Scorer:
"""
if threshold is None:
threshold = 0.5 if multi_label else 0.0
if not multi_label:
threshold = 0.0
f_per_type = {label: PRFScore() for label in labels}
auc_per_type = {label: ROCAUCScore() for label in labels}
labels = set(labels)
if labels:
for eg in examples:
labels.update(eg.predicted.cats.keys())
labels.update(eg.reference.cats.keys())
for example in examples:
# Through this loop, None in the gold_cats indicates missing label.
pred_cats = getter(example.predicted, attr)
pred_cats = {k: v for k, v in pred_cats.items() if k in labels}
gold_cats = getter(example.reference, attr)
gold_cats = {k: v for k, v in gold_cats.items() if k in labels}

for label in labels:
pred_score = pred_cats.get(label, 0.0)
|
|||
# Get the highest-scoring for each.
|
||||
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
|
||||
gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
|
||||
if pred_label == gold_label and pred_score >= threshold:
|
||||
if pred_label == gold_label:
|
||||
f_per_type[pred_label].tp += 1
|
||||
else:
|
||||
f_per_type[gold_label].fn += 1
|
||||
if pred_score >= threshold:
|
||||
f_per_type[pred_label].fp += 1
|
||||
f_per_type[pred_label].fp += 1
|
||||
elif gold_cats:
|
||||
gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
|
||||
if gold_score > 0:
|
||||
f_per_type[gold_label].fn += 1
|
||||
elif pred_cats:
|
||||
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
|
||||
if pred_score >= threshold:
|
||||
f_per_type[pred_label].fp += 1
|
||||
f_per_type[pred_label].fp += 1
|
||||
micro_prf = PRFScore()
|
||||
for label_prf in f_per_type.values():
|
||||
micro_prf.tp += label_prf.tp
|
||||
|
|
|
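To make the behavioural change above concrete: with exclusive labels, the argmax class is now scored regardless of the threshold, so a low-confidence but correct prediction counts as a true positive rather than a miss. A toy illustration with our own numbers:

pred_cats = {"POS": 0.4, "NEG": 0.3}
gold_cats = {"POS": 1.0, "NEG": 0.0}
pred_label = max(pred_cats, key=pred_cats.get)
gold_label = max(gold_cats, key=gold_cats.get)
# previously 0.4 < 0.5 would have turned this into a false negative
assert pred_label == gold_label == "POS"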
@@ -333,16 +333,24 @@ def ro_tokenizer():

@pytest.fixture(scope="session")
def ru_tokenizer():
pytest.importorskip("pymorphy2")
pytest.importorskip("pymorphy3")
return get_lang_class("ru")().tokenizer


@pytest.fixture
@pytest.fixture(scope="session")
def ru_lemmatizer():
pytest.importorskip("pymorphy2")
pytest.importorskip("pymorphy3")
return get_lang_class("ru")().add_pipe("lemmatizer")


@pytest.fixture(scope="session")
def ru_lookup_lemmatizer():
pytest.importorskip("pymorphy3")
return get_lang_class("ru")().add_pipe(
"lemmatizer", config={"mode": "pymorphy3_lookup"}
)


@pytest.fixture(scope="session")
def sa_tokenizer():
return get_lang_class("sa")().tokenizer
@@ -411,17 +419,26 @@ def ky_tokenizer():

@pytest.fixture(scope="session")
def uk_tokenizer():
pytest.importorskip("pymorphy2")
pytest.importorskip("pymorphy3")
return get_lang_class("uk")().tokenizer


@pytest.fixture
@pytest.fixture(scope="session")
def uk_lemmatizer():
pytest.importorskip("pymorphy2")
pytest.importorskip("pymorphy2_dicts_uk")
pytest.importorskip("pymorphy3")
pytest.importorskip("pymorphy3_dicts_uk")
return get_lang_class("uk")().add_pipe("lemmatizer")


@pytest.fixture(scope="session")
def uk_lookup_lemmatizer():
pytest.importorskip("pymorphy3")
pytest.importorskip("pymorphy3_dicts_uk")
return get_lang_class("uk")().add_pipe(
"lemmatizer", config={"mode": "pymorphy3_lookup"}
)


@pytest.fixture(scope="session")
def ur_tokenizer():
return get_lang_class("ur")().tokenizer
@@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):

# head before start
arr = doc.to_array(["HEAD"])
arr[0] = -1
arr[0] = numpy.int32(-1).astype(numpy.uint64)
doc_from_array = Doc(en_vocab, words=words)
with pytest.raises(ValueError):
doc_from_array.from_array(["HEAD"], arr)

# head after end
arr = doc.to_array(["HEAD"])
arr[0] = 5
arr[0] = numpy.int32(5).astype(numpy.uint64)
doc_from_array = Doc(en_vocab, words=words)
with pytest.raises(ValueError):
doc_from_array.from_array(["HEAD"], arr)
@@ -128,7 +128,9 @@ def test_doc_to_json_with_token_span_attributes(doc):
doc._.json_test1 = "hello world"
doc._.json_test2 = [1, 2, 3]
doc[0:1]._.span_test = "span_attribute"
doc[0:2]._.span_test = "span_attribute_2"
doc[0]._.token_test = 117
doc[1]._.token_test = 118
doc.spans["span_group"] = [doc[0:1]]
json_doc = doc.to_json(
underscore=["json_test1", "json_test2", "token_test", "span_test"]
@@ -139,8 +141,10 @@ def test_doc_to_json_with_token_span_attributes(doc):
assert json_doc["_"]["json_test2"] == [1, 2, 3]
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
assert json_doc["underscore_token"]["token_test"]["value"] == 117
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
assert json_doc["underscore_token"]["token_test"][1]["value"] == 118
assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
assert json_doc["underscore_span"]["span_test"][1]["value"] == "span_attribute_2"
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc

@@ -161,8 +165,8 @@ def test_doc_to_json_with_custom_user_data(doc):
assert json_doc["_"]["json_test"] == "hello world"
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
assert json_doc["underscore_token"]["token_test"]["value"] == 117
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc

@@ -181,8 +185,8 @@ def test_doc_to_json_with_token_span_same_identifier(doc):
assert json_doc["_"]["my_ext"] == "hello world"
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
assert json_doc["underscore_token"]["my_ext"]["value"] == 117
assert json_doc["underscore_span"]["my_ext"]["value"] == "span_attribute"
assert json_doc["underscore_token"]["my_ext"][0]["value"] == 117
assert json_doc["underscore_span"]["my_ext"][0]["value"] == "span_attribute"
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc

@@ -195,10 +199,9 @@ def test_doc_to_json_with_token_attributes_missing(doc):
doc[0]._.token_test = 117
json_doc = doc.to_json(underscore=["span_test"])

assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
assert "token_test" not in json_doc["underscore_token"]
assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
assert "underscore_token" not in json_doc
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0


@@ -283,7 +286,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
doc._.json_test1 = "hello world"
doc._.json_test2 = [1, 2, 3]
doc[0:1]._.span_test = "span_attribute"
doc[0:2]._.span_test = "span_attribute_2"
doc[0]._.token_test = 117
doc[1]._.token_test = 118

json_doc = doc.to_json(
underscore=["json_test1", "json_test2", "token_test", "span_test"]
@@ -295,7 +300,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
assert new_doc._.json_test1 == "hello world"
assert new_doc._.json_test2 == [1, 2, 3]
assert new_doc[0]._.token_test == 117
assert new_doc[1]._.token_test == 118
assert new_doc[0:1]._.span_test == "span_attribute"
assert new_doc[0:2]._.span_test == "span_attribute_2"
assert new_doc.user_data == doc.user_data
assert new_doc.to_bytes(exclude=["user_data"]) == doc.to_bytes(
exclude=["user_data"]
@@ -363,3 +370,12 @@ def test_json_to_doc_validation_error(doc):
doc_json.pop("tokens")
with pytest.raises(ValueError):
Doc(doc.vocab).from_json(doc_json, validate=True)


def test_to_json_underscore_doc_getters(doc):
def get_text_length(doc):
return len(doc.text)

Doc.set_extension("text_length", getter=get_text_length)
doc_json = doc.to_json(underscore=["text_length"])
assert doc_json["_"]["text_length"] == get_text_length(doc)
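The assertions above pin down the new list-valued layout of underscore_token and underscore_span, which is needed because several tokens or spans can carry the same extension. A sketched excerpt of the resulting JSON shape (the per-entry field names besides "value" are illustrative, not confirmed by this diff):

json_doc_excerpt = {
    "underscore_token": {"token_test": [{"token_index": 0, "value": 117},
                                        {"token_index": 1, "value": 118}]},
    "underscore_span": {"span_test": [{"start": 0, "end": 5, "value": "span_attribute"}]},
}
assert json_doc_excerpt["underscore_token"]["token_test"][0]["value"] == 117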
@@ -1,7 +1,10 @@
from typing import List

import pytest
from random import Random
from spacy.matcher import Matcher
from spacy.tokens import Span, SpanGroup
from spacy.tokens import Span, SpanGroup, Doc
from spacy.util import filter_spans


@pytest.fixture
@ -240,3 +243,13 @@ def test_span_group_extend(doc):
|
|||
def test_span_group_dealloc(span_group):
|
||||
with pytest.raises(AttributeError):
|
||||
print(span_group.doc)
|
||||
|
||||
|
||||
@pytest.mark.issue(11975)
|
||||
def test_span_group_typing(doc: Doc):
|
||||
"""Tests whether typing of `SpanGroup` as `Iterable[Span]`-like object is accepted by mypy."""
|
||||
span_group: SpanGroup = doc.spans["SPANS"]
|
||||
spans: List[Span] = list(span_group)
|
||||
for i, span in enumerate(span_group):
|
||||
assert span == span_group[i] == spans[i]
|
||||
filter_spans(span_group)
|
||||
|
|
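For context, a SpanGroup behaves as an Iterable[Span], so it can be passed anywhere a list of spans is accepted. A small illustrative sketch (the "events" span key is made up):

import spacy
from spacy.tokens import Span
from spacy.util import filter_spans

nlp = spacy.blank("en")
doc = nlp("Amsterdam hosts a conference")
# Store overlapping candidate spans under a custom key; assignment creates a SpanGroup.
doc.spans["events"] = [Span(doc, 0, 1, "GPE"), Span(doc, 0, 2, "EVENT")]
# A SpanGroup iterates like a list of Span objects...
assert [span.text for span in doc.spans["events"]] == ["Amsterdam", "Amsterdam hosts"]
# ...so utilities such as filter_spans accept it directly.
longest = filter_spans(doc.spans["events"])
assert [span.text for span in longest] == ["Amsterdam hosts"]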
18
spacy/tests/lang/grc/test_tokenizer.py
Normal file

@@ -0,0 +1,18 @@
import pytest


# fmt: off
GRC_TOKEN_EXCEPTION_TESTS = [
    ("τὸ 〈τῆς〉 φιλοσοφίας ἔργον ἔνιοί φασιν ἀπὸ ⟦βαρβάρων⟧ ἄρξαι.", ["τὸ", "〈", "τῆς", "〉", "φιλοσοφίας", "ἔργον", "ἔνιοί", "φασιν", "ἀπὸ", "⟦", "βαρβάρων", "⟧", "ἄρξαι", "."]),
    ("τὴν δὲ τῶν Αἰγυπτίων φιλοσοφίαν εἶναι τοιαύτην περί τε †θεῶν† καὶ ὑπὲρ δικαιοσύνης.", ["τὴν", "δὲ", "τῶν", "Αἰγυπτίων", "φιλοσοφίαν", "εἶναι", "τοιαύτην", "περί", "τε", "†", "θεῶν", "†", "καὶ", "ὑπὲρ", "δικαιοσύνης", "."]),
    ("⸏πόσις δ' Ἐρεχθεύς ἐστί μοι σεσωσμένος⸏", ["⸏", "πόσις", "δ'", "Ἐρεχθεύς", "ἐστί", "μοι", "σεσωσμένος", "⸏"]),
    ("⸏ὔπνον ἴδωμεν⸎", ["⸏", "ὔπνον", "ἴδωμεν", "⸎"]),
]
# fmt: on


@pytest.mark.parametrize("text,expected_tokens", GRC_TOKEN_EXCEPTION_TESTS)
def test_grc_tokenizer(grc_tokenizer, text, expected_tokens):
    tokens = grc_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
@@ -78,3 +78,32 @@ def test_ru_lemmatizer_punct(ru_lemmatizer):
    assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
    doc = Doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
    assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']


def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
    assert ru_lookup_lemmatizer.mode == "pymorphy3_lookup"
    words = ["мама", "мыла", "раму"]
    pos = ["NOUN", "VERB", "NOUN"]
    morphs = [
        "Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
        "Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
        "Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
    ]
    doc = Doc(ru_lookup_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
    doc = ru_lookup_lemmatizer(doc)
    lemmas = [token.lemma_ for token in doc]
    assert lemmas == ["мама", "мыла", "раму"]


@pytest.mark.parametrize(
    "word,lemma",
    (
        ("бременем", "бремя"),
        ("будешь", "быть"),
        ("какая-то", "какой-то"),
    ),
)
def test_ru_lookup_lemmatizer(ru_lookup_lemmatizer, word, lemma):
    assert ru_lookup_lemmatizer.mode == "pymorphy3_lookup"
    doc = Doc(ru_lookup_lemmatizer.vocab, words=[word])
    assert ru_lookup_lemmatizer(doc)[0].lemma_ == lemma
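As the tests above show, the Russian (and, below, Ukrainian) lemmatizers gained pymorphy3-backed modes. A hedged sketch of enabling the context-free lookup mode on a blank pipeline (assumes the pymorphy3 package is installed):

import spacy

nlp = spacy.blank("ru")
# "pymorphy3_lookup" is the lookup mode asserted in the tests above.
nlp.add_pipe("lemmatizer", config={"mode": "pymorphy3_lookup"})
nlp.initialize()
doc = nlp("мама мыла раму")
print([token.lemma_ for token in doc])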
@@ -20,7 +20,6 @@ od katerih so te svoboščine odvisne,
    assert len(tokens) == 116


@pytest.mark.xfail
def test_ordinal_number(sl_tokenizer):
    text = "10. decembra 1948"
    tokens = sl_tokenizer(text)

@@ -8,4 +8,20 @@ pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_uk_lemmatizer(uk_lemmatizer):
    """Check that the default uk lemmatizer runs."""
    doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
    assert uk_lemmatizer.mode == "pymorphy3"
    uk_lemmatizer(doc)
    assert [token.lemma for token in doc]


@pytest.mark.parametrize(
    "word,lemma",
    (
        ("якийсь", "якийсь"),
        ("розповідають", "розповідати"),
        ("розповіси", "розповісти"),
    ),
)
def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer, word, lemma):
    assert uk_lookup_lemmatizer.mode == "pymorphy3_lookup"
    doc = Doc(uk_lookup_lemmatizer.vocab, words=[word])
    assert uk_lookup_lemmatizer(doc)[0].lemma_ == lemma
@@ -1,5 +1,6 @@
import pytest
from spacy.matcher import levenshtein
from spacy.matcher.levenshtein import levenshtein_compare


# empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests

@@ -42,3 +43,31 @@ from spacy.matcher import levenshtein
)
def test_levenshtein(dist, a, b):
    assert levenshtein(a, b) == dist


@pytest.mark.parametrize(
    "a,b,fuzzy,expected",
    [
        ("a", "a", 1, True),
        ("a", "a", 0, True),
        ("a", "a", -1, True),
        ("a", "ab", 1, True),
        ("a", "ab", 0, False),
        ("a", "ab", -1, True),
        ("ab", "ac", 1, True),
        ("ab", "ac", -1, True),
        ("abc", "cde", 4, True),
        ("abc", "cde", -1, False),
        ("abcdef", "cdefgh", 4, True),
        ("abcdef", "cdefgh", 3, False),
        ("abcdef", "cdefgh", -1, False),  # default (2 for length 6)
        ("abcdefgh", "cdefghijk", 5, True),
        ("abcdefgh", "cdefghijk", 4, False),
        ("abcdefgh", "cdefghijk", -1, False),  # default (2)
        ("abcdefgh", "cdefghijkl", 6, True),
        ("abcdefgh", "cdefghijkl", 5, False),
        ("abcdefgh", "cdefghijkl", -1, False),  # default (2)
    ],
)
def test_levenshtein_compare(a, b, fuzzy, expected):
    assert levenshtein_compare(a, b, fuzzy) == expected
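Both helpers exercised above can also be used directly: levenshtein(a, b) returns the edit distance, and levenshtein_compare(a, b, fuzzy) checks it against an allowed budget, where a negative budget falls back to a default derived from the string length (2 for the lengths used in these tests). A short sketch:

from spacy.matcher import levenshtein
from spacy.matcher.levenshtein import levenshtein_compare

# Classic example: three edits turn "kitten" into "sitting".
assert levenshtein("kitten", "sitting") == 3
# Within an explicit budget of three edits.
assert levenshtein_compare("kitten", "sitting", 3)
# The default budget (negative fuzzy) is stricter than three edits here.
assert not levenshtein_compare("kitten", "sitting", -1)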
@@ -118,6 +118,155 @@ def test_matcher_match_multi(matcher):
    ]


@pytest.mark.parametrize(
    "rules,match_locs",
    [
        (
            {
                "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
            },
            [(2, 4)],
        ),
        (
            {
                "Java": [[{"LOWER": {"FUZZY": "java"}}]],
            },
            [(5, 6)],
        ),
        (
            {
                "JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]],
                "GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
                "Java": [[{"LOWER": {"FUZZY": "java"}}]],
            },
            [(2, 4), (5, 6), (8, 9)],
        ),
        # only the second pattern matches (check that predicate keys used for
        # caching don't collide)
        (
            {
                "A": [[{"ORTH": {"FUZZY": "Javascripts"}}]],
                "B": [[{"ORTH": {"FUZZY5": "Javascripts"}}]],
            },
            [(8, 9)],
        ),
    ],
)
def test_matcher_match_fuzzy(en_vocab, rules, match_locs):
    words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
    doc = Doc(en_vocab, words=words)

    matcher = Matcher(en_vocab)
    for key, patterns in rules.items():
        matcher.add(key, patterns)
    assert match_locs == [(start, end) for m_id, start, end in matcher(doc)]


@pytest.mark.parametrize("set_op", ["IN", "NOT_IN"])
def test_matcher_match_fuzzy_set_op_longest(en_vocab, set_op):
    rules = {
        "GoogleNow": [[{"ORTH": {"FUZZY": {set_op: ["Google", "Now"]}}, "OP": "+"}]]
    }
    matcher = Matcher(en_vocab)
    for key, patterns in rules.items():
        matcher.add(key, patterns, greedy="LONGEST")

    words = ["They", "like", "Goggle", "Noo"]
    doc = Doc(en_vocab, words=words)
    assert len(matcher(doc)) == 1


def test_matcher_match_fuzzy_set_multiple(en_vocab):
    rules = {
        "GoogleNow": [
            [
                {
                    "ORTH": {"FUZZY": {"IN": ["Google", "Now"]}, "NOT_IN": ["Goggle"]},
                    "OP": "+",
                }
            ]
        ]
    }
    matcher = Matcher(en_vocab)
    for key, patterns in rules.items():
        matcher.add(key, patterns, greedy="LONGEST")

    words = ["They", "like", "Goggle", "Noo"]
    doc = Doc(matcher.vocab, words=words)
    assert matcher(doc) == [
        (doc.vocab.strings["GoogleNow"], 3, 4),
    ]


@pytest.mark.parametrize("fuzzyn", range(1, 10))
def test_matcher_match_fuzzyn_all_insertions(en_vocab, fuzzyn):
    matcher = Matcher(en_vocab)
    matcher.add("GoogleNow", [[{"ORTH": {f"FUZZY{fuzzyn}": "GoogleNow"}}]])
    # words with increasing edit distance
    words = ["GoogleNow" + "a" * i for i in range(0, 10)]
    doc = Doc(en_vocab, words)
    assert len(matcher(doc)) == fuzzyn + 1


@pytest.mark.parametrize("fuzzyn", range(1, 6))
def test_matcher_match_fuzzyn_various_edits(en_vocab, fuzzyn):
    matcher = Matcher(en_vocab)
    matcher.add("GoogleNow", [[{"ORTH": {f"FUZZY{fuzzyn}": "GoogleNow"}}]])
    # words with increasing edit distance of different edit types
    words = [
        "GoogleNow",
        "GoogleNuw",
        "GoogleNuew",
        "GoogleNoweee",
        "GiggleNuw3",
        "gouggle5New",
    ]
    doc = Doc(en_vocab, words)
    assert len(matcher(doc)) == fuzzyn + 1


@pytest.mark.parametrize("greedy", ["FIRST", "LONGEST"])
@pytest.mark.parametrize("set_op", ["IN", "NOT_IN"])
def test_matcher_match_fuzzyn_set_op_longest(en_vocab, greedy, set_op):
    rules = {
        "GoogleNow": [[{"ORTH": {"FUZZY2": {set_op: ["Google", "Now"]}}, "OP": "+"}]]
    }
    matcher = Matcher(en_vocab)
    for key, patterns in rules.items():
        matcher.add(key, patterns, greedy=greedy)

    words = ["They", "like", "Goggle", "Noo"]
    doc = Doc(matcher.vocab, words=words)
    spans = matcher(doc, as_spans=True)
    assert len(spans) == 1
    if set_op == "IN":
        assert spans[0].text == "Goggle Noo"
    else:
        assert spans[0].text == "They like"


def test_matcher_match_fuzzyn_set_multiple(en_vocab):
    rules = {
        "GoogleNow": [
            [
                {
                    "ORTH": {"FUZZY1": {"IN": ["Google", "Now"]}, "NOT_IN": ["Goggle"]},
                    "OP": "+",
                }
            ]
        ]
    }
    matcher = Matcher(en_vocab)
    for key, patterns in rules.items():
        matcher.add(key, patterns, greedy="LONGEST")

    words = ["They", "like", "Goggle", "Noo"]
    doc = Doc(matcher.vocab, words=words)
    assert matcher(doc) == [
        (doc.vocab.strings["GoogleNow"], 3, 4),
    ]


def test_matcher_empty_dict(en_vocab):
    """Test matcher allows empty token specs, meaning match on any token."""
    matcher = Matcher(en_vocab)
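The FUZZY predicates tested above enable approximate token matching, and FUZZYn caps the edit distance at n. A small self-contained sketch of the pattern syntax (spaCy 3.5+ is assumed):

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# FUZZY uses a length-dependent default edit-distance budget;
# FUZZY1 would cap the allowed edits at exactly one.
matcher.add("GoogleNow", [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]])
doc = nlp("They like Goggle Now")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)  # "Goggle Now"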
@@ -437,6 +586,30 @@ def test_matcher_regex(en_vocab):
    assert len(matches) == 0


def test_matcher_regex_set_in(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": {"REGEX": {"IN": [r"(?:a)", r"(?:an)"]}}}]
    matcher.add("A_OR_AN", [pattern])
    doc = Doc(en_vocab, words=["an", "a", "hi"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["bye"])
    matches = matcher(doc)
    assert len(matches) == 0


def test_matcher_regex_set_not_in(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": {"REGEX": {"NOT_IN": [r"(?:a)", r"(?:an)"]}}}]
    matcher.add("A_OR_AN", [pattern])
    doc = Doc(en_vocab, words=["an", "a", "hi"])
    matches = matcher(doc)
    assert len(matches) == 1
    doc = Doc(en_vocab, words=["bye"])
    matches = matcher(doc)
    assert len(matches) == 1


def test_matcher_regex_shape(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"SHAPE": {"REGEX": r"^[^x]+$"}}]
@@ -60,20 +60,56 @@ def test_initialize_from_labels():
    nlp2 = Language()
    lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer")
    lemmatizer2.initialize(
        get_examples=lambda: train_examples,
        # We want to check that the strings in replacement nodes are
        # added to the string store. Avoid that they get added through
        # the examples.
        get_examples=lambda: train_examples[:1],
        labels=lemmatizer.label_data,
    )
    assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3}
    assert lemmatizer2.label_data == {
        "trees": [
            {"orig": "S", "subst": "s"},
            {
                "prefix_len": 1,
                "suffix_len": 0,
                "prefix_tree": 0,
                "suffix_tree": 4294967295,
            },
            {"orig": "s", "subst": ""},
            {
                "prefix_len": 0,
                "suffix_len": 1,
                "prefix_tree": 4294967295,
                "suffix_tree": 2,
            },
            {
                "prefix_len": 0,
                "suffix_len": 0,
                "prefix_tree": 4294967295,
                "suffix_tree": 4294967295,
            },
            {"orig": "E", "subst": "e"},
            {
                "prefix_len": 1,
                "suffix_len": 0,
                "prefix_tree": 5,
                "suffix_tree": 4294967295,
            },
        ],
        "labels": (1, 3, 4, 6),
    }


def test_no_data():
@pytest.mark.parametrize("top_k", (1, 5, 30))
def test_no_data(top_k):
    # Test that the lemmatizer provides a nice error when there's no tagging data / labels
    TEXTCAT_DATA = [
        ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
        ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
    ]
    nlp = English()
    nlp.add_pipe("trainable_lemmatizer")
    nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
    nlp.add_pipe("textcat")

    train_examples = []

@@ -84,10 +120,11 @@ def test_no_data():
    nlp.initialize(get_examples=lambda: train_examples)


def test_incomplete_data():
@pytest.mark.parametrize("top_k", (1, 5, 30))
def test_incomplete_data(top_k):
    # Test that the lemmatizer works with incomplete information
    nlp = English()
    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
    lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
    lemmatizer.min_tree_freq = 1
    train_examples = []
    for t in PARTIAL_DATA:

@@ -104,10 +141,25 @@ def test_incomplete_data():
    assert doc[1].lemma_ == "like"
    assert doc[2].lemma_ == "blue"

    # Check that incomplete annotations are ignored.
    scores, _ = lemmatizer.model([eg.predicted for eg in train_examples], is_train=True)
    _, dX = lemmatizer.get_loss(train_examples, scores)
    xp = lemmatizer.model.ops.xp

def test_overfitting_IO():
    # Missing annotations.
    assert xp.count_nonzero(dX[0][0]) == 0
    assert xp.count_nonzero(dX[0][3]) == 0
    assert xp.count_nonzero(dX[1][0]) == 0
    assert xp.count_nonzero(dX[1][3]) == 0

    # Misaligned annotations.
    assert xp.count_nonzero(dX[1][1]) == 0


@pytest.mark.parametrize("top_k", (1, 5, 30))
def test_overfitting_IO(top_k):
    nlp = English()
    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
    lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
    lemmatizer.min_tree_freq = 1
    train_examples = []
    for t in TRAIN_DATA:

@@ -140,7 +192,7 @@ def test_overfitting_IO():
    # Check model after a {to,from}_bytes roundtrip
    nlp_bytes = nlp.to_bytes()
    nlp3 = English()
    nlp3.add_pipe("trainable_lemmatizer")
    nlp3.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
    nlp3.from_bytes(nlp_bytes)
    doc3 = nlp3(test_text)
    assert doc3[0].lemma_ == "she"
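The hunks above thread a new top_k setting through the edit-tree lemmatizer tests. A hedged sketch of the configuration surface they exercise, grounded only in the calls shown in the diff:

from spacy.lang.en import English

nlp = English()
# top_k controls how many candidate edit trees are tried per token;
# 1 keeps only the highest-scoring tree.
lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": 5})
# min_tree_freq prunes edit trees seen too rarely in the training data.
lemmatizer.min_tree_freq = 1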
@@ -6,9 +6,10 @@ from numpy.testing import assert_equal
from spacy import registry, util
from spacy.attrs import ENT_KB_ID
from spacy.compat import pickle
from spacy.kb import Candidate, KnowledgeBase, get_candidates
from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
from spacy.lang.en import English
from spacy.ml import load_kb
from spacy.ml.models.entity_linker import build_span_maker
from spacy.pipeline import EntityLinker
from spacy.pipeline.legacy import EntityLinker_v1
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL

@@ -34,7 +35,7 @@ def assert_almost_equal(a, b):
def test_issue4674():
    """Test that setting entities with overlapping identifiers does not mess up IO"""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
    vector1 = [0.9, 1.1, 1.01]
    vector2 = [1.8, 2.25, 2.01]
    with pytest.warns(UserWarning):

@@ -51,7 +52,7 @@ def test_issue4674():
    dir_path.mkdir()
    file_path = dir_path / "kb"
    kb.to_disk(str(file_path))
    kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    kb2 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
    kb2.from_disk(str(file_path))
    assert kb2.get_size_entities() == 1

@@ -59,9 +60,9 @@ def test_issue4674():
@pytest.mark.issue(6730)
def test_issue6730(en_vocab):
    """Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
    from spacy.kb import KnowledgeBase
    from spacy.kb.kb_in_memory import InMemoryLookupKB

    kb = KnowledgeBase(en_vocab, entity_vector_length=3)
    kb = InMemoryLookupKB(en_vocab, entity_vector_length=3)
    kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])

    with pytest.raises(ValueError):

@@ -127,7 +128,7 @@ def test_issue7065_b():

    def create_kb(vocab):
        # create artificial KB
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7])
        mykb.add_alias(
            alias="No. 8",

@@ -190,7 +191,7 @@ def test_no_entities():

    def create_kb(vocab):
        # create artificial KB
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
        return mykb

@@ -231,7 +232,7 @@ def test_partial_links():

    def create_kb(vocab):
        # create artificial KB
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
        return mykb

@@ -263,7 +264,7 @@ def test_partial_links():

def test_kb_valid_entities(nlp):
    """Test the valid construction of a KB with 3 entities and two aliases"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)

    # adding entities
    mykb.add_entity(entity="Q1", freq=19, entity_vector=[8, 4, 3])

@@ -292,7 +293,7 @@ def test_kb_valid_entities(nlp):

def test_kb_invalid_entities(nlp):
    """Test the invalid construction of a KB with an alias linked to a non-existing entity"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)

    # adding entities
    mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])

@@ -308,7 +309,7 @@ def test_kb_invalid_entities(nlp):

def test_kb_invalid_probabilities(nlp):
    """Test the invalid construction of a KB with wrong prior probabilities"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)

    # adding entities
    mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])

@@ -322,7 +323,7 @@ def test_kb_invalid_probabilities(nlp):

def test_kb_invalid_combination(nlp):
    """Test the invalid construction of a KB with non-matching entity and probability lists"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)

    # adding entities
    mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])

@@ -338,7 +339,7 @@ def test_kb_invalid_combination(nlp):

def test_kb_invalid_entity_vector(nlp):
    """Test the invalid construction of a KB with non-matching entity vector lengths"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)

    # adding entities
    mykb.add_entity(entity="Q1", freq=19, entity_vector=[1, 2, 3])

@@ -376,7 +377,7 @@ def test_kb_initialize_empty(nlp):

def test_kb_serialize(nlp):
    """Test serialization of the KB"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
    with make_tempdir() as d:
        # normal read-write behaviour
        mykb.to_disk(d / "kb")

@@ -393,12 +394,12 @@ def test_kb_serialize(nlp):
@pytest.mark.issue(9137)
def test_kb_serialize_2(nlp):
    v = [5, 6, 7, 8]
    kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
    kb1 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
    kb1.set_entities(["E1"], [1], [v])
    assert kb1.get_vector("E1") == v
    with make_tempdir() as d:
        kb1.to_disk(d / "kb")
        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
        kb2 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
        kb2.from_disk(d / "kb")
        assert kb2.get_vector("E1") == v

@@ -408,7 +409,7 @@ def test_kb_set_entities(nlp):
    v = [5, 6, 7, 8]
    v1 = [1, 1, 1, 0]
    v2 = [2, 2, 2, 3]
    kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
    kb1 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
    kb1.set_entities(["E0"], [1], [v])
    assert kb1.get_entity_strings() == ["E0"]
    kb1.set_entities(["E1", "E2"], [1, 9], [v1, v2])

@@ -417,7 +418,7 @@ def test_kb_set_entities(nlp):
    assert kb1.get_vector("E2") == v2
    with make_tempdir() as d:
        kb1.to_disk(d / "kb")
        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
        kb2 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
        kb2.from_disk(d / "kb")
        assert set(kb2.get_entity_strings()) == {"E1", "E2"}
        assert kb2.get_vector("E1") == v1

@@ -428,7 +429,7 @@ def test_kb_serialize_vocab(nlp):
    """Test serialization of the KB and custom strings"""
    entity = "MyFunnyID"
    assert entity not in nlp.vocab.strings
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
    assert not mykb.contains_entity(entity)
    mykb.add_entity(entity, freq=342, entity_vector=[3])
    assert mykb.contains_entity(entity)

@@ -436,14 +437,14 @@ def test_kb_serialize_vocab(nlp):
    with make_tempdir() as d:
        # normal read-write behaviour
        mykb.to_disk(d / "kb")
        mykb_new = KnowledgeBase(Vocab(), entity_vector_length=1)
        mykb_new = InMemoryLookupKB(Vocab(), entity_vector_length=1)
        mykb_new.from_disk(d / "kb")
        assert entity in mykb_new.vocab.strings


def test_candidate_generation(nlp):
    """Test correct candidate generation"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
    doc = nlp("douglas adam Adam shrubbery")

    douglas_ent = doc[0:1]

@@ -481,7 +482,7 @@ def test_el_pipe_configuration(nlp):
    ruler.add_patterns([pattern])

    def create_kb(vocab):
        kb = KnowledgeBase(vocab, entity_vector_length=1)
        kb = InMemoryLookupKB(vocab, entity_vector_length=1)
        kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
        kb.add_entity(entity="Q3", freq=5, entity_vector=[3])
        kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])

@@ -500,10 +501,21 @@ def test_el_pipe_configuration(nlp):
    def get_lowercased_candidates(kb, span):
        return kb.get_alias_candidates(span.text.lower())

    def get_lowercased_candidates_batch(kb, spans):
        return [get_lowercased_candidates(kb, span) for span in spans]

    @registry.misc("spacy.LowercaseCandidateGenerator.v1")
    def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
    def create_candidates() -> Callable[
        [InMemoryLookupKB, "Span"], Iterable[Candidate]
    ]:
        return get_lowercased_candidates

    @registry.misc("spacy.LowercaseCandidateBatchGenerator.v1")
    def create_candidates_batch() -> Callable[
        [InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]]
    ]:
        return get_lowercased_candidates_batch

    # replace the pipe with a new one with a different candidate generator
    entity_linker = nlp.replace_pipe(
        "entity_linker",

@@ -511,6 +523,9 @@ def test_el_pipe_configuration(nlp):
        config={
            "incl_context": False,
            "get_candidates": {"@misc": "spacy.LowercaseCandidateGenerator.v1"},
            "get_candidates_batch": {
                "@misc": "spacy.LowercaseCandidateBatchGenerator.v1"
            },
        },
    )
    entity_linker.set_kb(create_kb)
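The hunk above also wires a batched candidate generator into the entity linker config. A hedged sketch of registering a custom generator, using the same KB API as the test; the registry name below is hypothetical:

from typing import Callable, Iterable

from spacy import registry
from spacy.kb import Candidate, InMemoryLookupKB
from spacy.tokens import Span


@registry.misc("my.LowercaseCandidates.v1")  # hypothetical registry name
def create_candidates() -> Callable[[InMemoryLookupKB, Span], Iterable[Candidate]]:
    def get_candidates(kb: InMemoryLookupKB, span: Span) -> Iterable[Candidate]:
        # Look aliases up case-insensitively instead of verbatim.
        return kb.get_alias_candidates(span.text.lower())

    return get_candidates

# The generator is then selected in the entity_linker config, e.g.:
# config={"get_candidates": {"@misc": "my.LowercaseCandidates.v1"}}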
@@ -532,7 +547,7 @@ def test_nel_nsents(nlp):

def test_vocab_serialization(nlp):
    """Test that string information is retained across storage"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)

    # adding entities
    mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])

@@ -552,7 +567,7 @@ def test_vocab_serialization(nlp):

    with make_tempdir() as d:
        mykb.to_disk(d / "kb")
        kb_new_vocab = KnowledgeBase(Vocab(), entity_vector_length=1)
        kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
        kb_new_vocab.from_disk(d / "kb")

        candidates = kb_new_vocab.get_alias_candidates("adam")

@@ -568,7 +583,7 @@ def test_vocab_serialization(nlp):

def test_append_alias(nlp):
    """Test that we can append additional alias-entity pairs"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)

    # adding entities
    mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])

@@ -599,7 +614,7 @@ def test_append_alias(nlp):
@pytest.mark.filterwarnings("ignore:\\[W036")
def test_append_invalid_alias(nlp):
    """Test that appending an alias throws an error if the prior probabilities exceed 1"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)

    # adding entities
    mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])

@@ -621,7 +636,7 @@ def test_preserving_links_asdoc(nlp):
    vector_length = 1

    def create_kb(vocab):
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
        # adding entities
        mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
        mykb.add_entity(entity="Q2", freq=8, entity_vector=[1])

@@ -701,7 +716,11 @@ TRAIN_DATA = [
    ("Russ Cochran was a member of University of Kentucky's golf team.",
        {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
         "entities": [(0, 12, "PERSON"), (43, 51, "LOC")],
         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})
         "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}),
    # having a blank instance shouldn't break things
    ("The weather is nice today.",
        {"links": {}, "entities": [],
         "sent_starts": [1, -1, 0, 0, 0, 0]})
]
GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
# fmt: on

@@ -723,7 +742,7 @@ def test_overfitting_IO():
    # create artificial KB - assign same prior weight to the two russ cochran's
    # Q2146908 (Russ Cochran): American golfer
    # Q7381115 (Russ Cochran): publisher
    mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
    mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
    mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
    mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
    mykb.add_alias(

@@ -805,7 +824,7 @@ def test_kb_serialization():
    kb_dir = tmp_dir / "kb"
    nlp1 = English()
    assert "Q2146908" not in nlp1.vocab.strings
    mykb = KnowledgeBase(nlp1.vocab, entity_vector_length=vector_length)
    mykb = InMemoryLookupKB(nlp1.vocab, entity_vector_length=vector_length)
    mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
    mykb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
    assert "Q2146908" in nlp1.vocab.strings

@@ -828,7 +847,7 @@ def test_kb_serialization():
def test_kb_pickle():
    # Test that the KB can be pickled
    nlp = English()
    kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    kb_1 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
    kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
    assert not kb_1.contains_alias("Russ Cochran")
    kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])

@@ -842,7 +861,7 @@ def test_kb_pickle():
def test_nel_pickle():
    # Test that a pipeline with an EL component can be pickled
    def create_kb(vocab):
        kb = KnowledgeBase(vocab, entity_vector_length=3)
        kb = InMemoryLookupKB(vocab, entity_vector_length=3)
        kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
        return kb

@@ -864,7 +883,7 @@ def test_nel_pickle():
def test_kb_to_bytes():
    # Test that the KB's to_bytes method works correctly
    nlp = English()
    kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    kb_1 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
    kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
    kb_1.add_entity(entity="Q66", freq=9, entity_vector=[1, 2, 3])
    kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])

@@ -874,7 +893,7 @@ def test_kb_to_bytes():
    )
    assert kb_1.contains_alias("Russ Cochran")
    kb_bytes = kb_1.to_bytes()
    kb_2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    kb_2 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
    assert not kb_2.contains_alias("Russ Cochran")
    kb_2 = kb_2.from_bytes(kb_bytes)
    # check that both KBs are exactly the same

@@ -897,7 +916,7 @@ def test_kb_to_bytes():
def test_nel_to_bytes():
    # Test that a pipeline with an EL component can be converted to bytes
    def create_kb(vocab):
        kb = KnowledgeBase(vocab, entity_vector_length=3)
        kb = InMemoryLookupKB(vocab, entity_vector_length=3)
        kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
        return kb

@@ -987,7 +1006,7 @@ def test_legacy_architectures(name, config):
    train_examples.append(Example.from_dict(doc, annotation))

    def create_kb(vocab):
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
        mykb.add_alias(

@@ -1054,7 +1073,7 @@ def test_no_gold_ents(patterns):

    def create_kb(vocab):
        # create artificial KB
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
        mykb.add_alias("Kirby", ["Q613241"], [0.9])
        # Placeholder

@@ -1104,7 +1123,7 @@ def test_tokenization_mismatch():

    def create_kb(vocab):
        # create placeholder KB
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
        mykb.add_alias("Kirby", ["Q613241"], [0.9])
        return mykb

@@ -1121,6 +1140,12 @@ def test_tokenization_mismatch():
    nlp.evaluate(train_examples)


def test_abstract_kb_instantiation():
    """Test whether instantiation of abstract KB base class fails."""
    with pytest.raises(TypeError):
        KnowledgeBase(None, 3)


# fmt: off
@pytest.mark.parametrize(
    "meet_threshold,config",

@@ -1151,7 +1176,7 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]):

    def create_kb(vocab):
        # create artificial KB
        mykb = KnowledgeBase(vocab, entity_vector_length=3)
        mykb = InMemoryLookupKB(vocab, entity_vector_length=3)
        mykb.add_entity(entity=entity_id, freq=12, entity_vector=[6, -4, 3])
        mykb.add_alias(
            alias="Mahler",

@@ -1176,3 +1201,18 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]):

    assert len(doc.ents) == 1
    assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL


def test_span_maker_forward_with_empty():
    """The forward pass of the span maker may have a doc with no entities."""
    nlp = English()
    doc1 = nlp("a b c")
    ent = doc1[0:1]
    ent.label_ = "X"
    doc1.ents = [ent]
    # no entities
    doc2 = nlp("x y z")

    # just to get a model
    span_maker = build_span_maker()
    span_maker([doc1, doc2], False)
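Throughout this file the concrete class InMemoryLookupKB (importable from spacy.kb or spacy.kb.kb_in_memory) replaces the now-abstract KnowledgeBase base class. A minimal usage sketch with made-up IDs and probabilities, using only the calls that appear in the tests above:

from spacy.kb import InMemoryLookupKB
from spacy.vocab import Vocab

kb = InMemoryLookupKB(Vocab(), entity_vector_length=3)
# Register an entity with its frequency and embedding...
kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
# ...and an alias with prior probabilities per candidate entity.
kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
assert kb.contains_alias("Russ Cochran")
candidates = kb.get_alias_candidates("Russ Cochran")
assert [c.entity_ for c in candidates] == ["Q2146908"]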
@@ -382,6 +382,43 @@ def test_entity_ruler_overlapping_spans(nlp, entity_ruler_factory):
    assert doc.ents[0].label_ == "FOOBAR"


@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
def test_entity_ruler_fuzzy_pipe(nlp, entity_ruler_factory):
    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
    patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
    ruler.add_patterns(patterns)
    doc = nlp("helloo")
    assert len(doc.ents) == 1
    assert doc.ents[0].label_ == "HELLO"


@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
def test_entity_ruler_fuzzy(nlp, entity_ruler_factory):
    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
    patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
    ruler.add_patterns(patterns)
    doc = nlp("helloo")
    assert len(doc.ents) == 1
    assert doc.ents[0].label_ == "HELLO"


@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
def test_entity_ruler_fuzzy_disabled(nlp, entity_ruler_factory):
    @registry.misc("test_fuzzy_compare_disabled")
    def make_test_fuzzy_compare_disabled():
        return lambda x, y, z: False

    ruler = nlp.add_pipe(
        entity_ruler_factory,
        name="entity_ruler",
        config={"matcher_fuzzy_compare": {"@misc": "test_fuzzy_compare_disabled"}},
    )
    patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
    ruler.add_patterns(patterns)
    doc = nlp("helloo")
    assert len(doc.ents) == 0


@pytest.mark.parametrize("n_process", [1, 2])
@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS)
def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory):
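The entity ruler inherits the matcher's FUZZY patterns, and its fuzzy comparison function can be swapped out through the matcher_fuzzy_compare config slot, as the tests above show. A compact sketch:

import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
# FUZZY tolerates small misspellings such as "helloo".
ruler.add_patterns([{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}])
doc = nlp("helloo")
print([(ent.text, ent.label_) for ent in doc.ents])  # [("helloo", "HELLO")]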
@@ -615,20 +615,18 @@ def test_enable_disable_conflict_with_config():

    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        # Expected to fail, as config and arguments conflict.
        with pytest.raises(ValueError):
            spacy.load(
                tmp_dir, enable=["tagger"], config={"nlp": {"disabled": ["senter"]}}
            )
        # Expected to succeed, as config and arguments do not conflict.
        assert spacy.load(
            tmp_dir, enable=["tagger"], config={"nlp": {"disabled": ["senter"]}}
        ).disabled == ["senter", "sentencizer"]
        # Expected to succeed without warning due to the lack of a conflicting config option.
        spacy.load(tmp_dir, enable=["tagger"])
        # Expected to succeed with a warning, as disable=[] should override the config setting.
        with pytest.warns(UserWarning):
        # Expected to fail due to conflict between enable and disabled.
        with pytest.raises(ValueError):
            spacy.load(
                tmp_dir,
                enable=["tagger"],
                disable=[],
                config={"nlp": {"disabled": ["senter"]}},
                enable=["senter"],
                config={"nlp": {"disabled": ["senter", "tagger"]}},
            )
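A short sketch of the enable/disable semantics these assertions pin down (the pipeline path is illustrative):

import spacy

# enable=["tagger"] disables every other component. If the config's
# nlp.disabled list only names components outside the enabled set, the
# two sources do not conflict and loading succeeds.
nlp = spacy.load("my_pipeline", enable=["tagger"])  # hypothetical path
assert nlp.pipe_names == ["tagger"]
# Conflicting instructions, such as enabling a component the config
# disables, raise a ValueError rather than silently preferring one side.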
@@ -372,24 +372,39 @@ def test_overfitting_IO_overlapping():


def test_zero_suggestions():
    # Test with a suggester that returns 0 suggestions
    # Test with a suggester that can return 0 suggestions

    @registry.misc("test_zero_suggester")
    def make_zero_suggester():
        def zero_suggester(docs, *, ops=None):
    @registry.misc("test_mixed_zero_suggester")
    def make_mixed_zero_suggester():
        def mixed_zero_suggester(docs, *, ops=None):
            if ops is None:
                ops = get_current_ops()
            return Ragged(
                ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i")
            )
            spans = []
            lengths = []
            for doc in docs:
                if len(doc) > 0 and len(doc) % 2 == 0:
                    spans.append((0, 1))
                    lengths.append(1)
                else:
                    lengths.append(0)
            spans = ops.asarray2i(spans)
            lengths_array = ops.asarray1i(lengths)
            if len(spans) > 0:
                output = Ragged(ops.xp.vstack(spans), lengths_array)
            else:
                output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
            return output

        return zero_suggester
        return mixed_zero_suggester

    fix_random_seed(0)
    nlp = English()
    spancat = nlp.add_pipe(
        "spancat",
        config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY},
        config={
            "suggester": {"@misc": "test_mixed_zero_suggester"},
            "spans_key": SPAN_KEY,
        },
    )
    train_examples = make_examples(nlp)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

@@ -397,6 +412,16 @@ def test_zero_suggestions():
    assert set(spancat.labels) == {"LOC", "PERSON"}

    nlp.update(train_examples, sgd=optimizer)
    # empty doc
    nlp("")
    # single doc with zero suggestions
    nlp("one")
    # single doc with one suggestion
    nlp("two two")
    # batch with mixed zero/one suggestions
    list(nlp.pipe(["one", "two two", "three three three", "", "four four four four"]))
    # batch with no suggestions
    list(nlp.pipe(["", "one", "three three three"]))


def test_set_candidates():
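For reference, a spancat suggester is any registered function mapping a batch of docs to a thinc Ragged array of (start, end) rows with one length per doc; the test above exercises the zero-suggestion edge cases. A hedged sketch of a trivial first-token suggester (the registry name is made up):

from typing import Iterable, Optional

from spacy import registry
from spacy.tokens import Doc
from thinc.api import Ops, Ragged, get_current_ops


@registry.misc("my.FirstTokenSuggester.v1")  # hypothetical name
def make_first_token_suggester():
    def suggest(docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged:
        if ops is None:
            ops = get_current_ops()
        spans = []
        lengths = []
        for doc in docs:
            if len(doc) > 0:
                spans.append((0, 1))  # suggest the first token as a span
                lengths.append(1)
            else:
                lengths.append(0)  # no suggestions for empty docs
        if spans:
            return Ragged(ops.asarray2i(spans), ops.asarray1i(lengths))
        return Ragged(ops.xp.zeros((0, 0), dtype="i"), ops.asarray1i(lengths))

    return suggest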
@@ -360,6 +360,30 @@ def test_label_types(name):
    nlp.initialize()


@pytest.mark.parametrize(
    "name,get_examples",
    [
        ("textcat", make_get_examples_single_label),
        ("textcat_multilabel", make_get_examples_multi_label),
    ],
)
def test_invalid_label_value(name, get_examples):
    nlp = Language()
    textcat = nlp.add_pipe(name)
    example_getter = get_examples(nlp)

    def invalid_examples():
        # make one example with an invalid score
        examples = example_getter()
        ref = examples[0].reference
        key = list(ref.cats.keys())[0]
        ref.cats[key] = 2.0
        return examples

    with pytest.raises(ValueError):
        nlp.initialize(get_examples=invalid_examples)


@pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"])
def test_no_label(name):
    nlp = Language()

@@ -814,8 +838,8 @@ def test_textcat_loss(multi_label: bool, expected_loss: float):
        textcat = nlp.add_pipe("textcat_multilabel")
    else:
        textcat = nlp.add_pipe("textcat")
    textcat.initialize(lambda: train_examples)
    assert isinstance(textcat, TextCategorizer)
    textcat.initialize(lambda: train_examples)
    scores = textcat.model.ops.asarray(
        [[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 1.0]], dtype="f"  # type: ignore
    )

@@ -823,10 +847,10 @@ def test_textcat_loss(multi_label: bool, expected_loss: float):
    assert loss == expected_loss


def test_textcat_threshold():
def test_textcat_multilabel_threshold():
    # Ensure the scorer can be called with a different threshold
    nlp = English()
    nlp.add_pipe("textcat")
    nlp.add_pipe("textcat_multilabel")

    train_examples = []
    for text, annotations in TRAIN_DATA_SINGLE_LABEL:

@@ -849,7 +873,7 @@ def test_textcat_threshold():
    )
    pos_f = scores["cats_score"]
    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
    assert pos_f > macro_f
    assert pos_f >= macro_f


def test_textcat_multi_threshold():

@@ -871,3 +895,26 @@ def test_textcat_multi_threshold():

    scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0})
    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0


@pytest.mark.parametrize(
    "component_name,scorer",
    [
        ("textcat", "spacy.textcat_scorer.v1"),
        ("textcat_multilabel", "spacy.textcat_multilabel_scorer.v1"),
    ],
)
def test_textcat_legacy_scorers(component_name, scorer):
    """Check that legacy scorers are registered and produce the expected score
    keys."""
    nlp = English()
    nlp.add_pipe(component_name, config={"scorer": {"@scorers": scorer}})

    train_examples = []
    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
    nlp.initialize(get_examples=lambda: train_examples)

    # score the model (it's not actually trained but that doesn't matter)
    scores = nlp.evaluate(train_examples)
    assert 0 <= scores["cats_score"] <= 1
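The legacy-scorer test above pins down the @scorers registry mechanism. A short sketch of selecting a registered scorer when adding a component, using the versioned name from the test:

from spacy.lang.en import English

nlp = English()
# Pin the component to an explicitly versioned, registered scorer
# instead of the current default.
nlp.add_pipe("textcat", config={"scorer": {"@scorers": "spacy.textcat_scorer.v1"}})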
@@ -230,6 +230,97 @@ def test_tok2vec_listener_callback():
    assert get_dX(Y) is not None


def test_tok2vec_listener_overfitting():
    """Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses, annotates=["tok2vec"])
    assert losses["tagger"] < 0.00001

    # test the trained model
    test_text = "I like blue eggs"
    doc = nlp(test_text)
    assert doc[0].tag_ == "N"
    assert doc[1].tag_ == "V"
    assert doc[2].tag_ == "J"
    assert doc[3].tag_ == "N"

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        assert doc2[0].tag_ == "N"
        assert doc2[1].tag_ == "V"
        assert doc2[2].tag_ == "J"
        assert doc2[3].tag_ == "N"


def test_tok2vec_frozen_not_annotating():
    """Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    for i in range(2):
        losses = {}
        with pytest.raises(
            ValueError, match=r"the tok2vec embedding layer is not updated"
        ):
            nlp.update(
                train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"]
            )


def test_tok2vec_frozen_overfitting():
    """Test that a pipeline with a frozen & annotating tok2vec can still overfit"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    for i in range(100):
        losses = {}
        nlp.update(
            train_examples,
            sgd=optimizer,
            losses=losses,
            exclude=["tok2vec"],
            annotates=["tok2vec"],
        )
    assert losses["tagger"] < 0.0001

    # test the trained model
    test_text = "I like blue eggs"
    doc = nlp(test_text)
    assert doc[0].tag_ == "N"
    assert doc[1].tag_ == "V"
    assert doc[2].tag_ == "J"
    assert doc[3].tag_ == "N"

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        assert doc2[0].tag_ == "N"
        assert doc2[1].tag_ == "V"
        assert doc2[2].tag_ == "J"
        assert doc2[3].tag_ == "N"


def test_replace_listeners():
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
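The pattern these tests rely on, freezing a shared tok2vec while its listeners keep consuming fresh embeddings, comes down to one call (a fragment, assuming nlp, train_examples and optimizer are set up as in the tests above):

losses = {}
nlp.update(
    train_examples,
    sgd=optimizer,
    losses=losses,
    exclude=["tok2vec"],    # do not update the tok2vec weights
    annotates=["tok2vec"],  # but still run it so listeners receive embeddings
)

Excluding tok2vec without also annotating it would leave the listeners with stale input, which is exactly the ValueError case asserted above.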
@@ -3,7 +3,7 @@ from unittest import TestCase
import pytest
import srsly
from numpy import zeros
from spacy.kb import KnowledgeBase, Writer
from spacy.kb.kb_in_memory import InMemoryLookupKB, Writer
from spacy.vectors import Vectors
from spacy.language import Language
from spacy.pipeline import TrainablePipe

@@ -71,7 +71,7 @@ def entity_linker():
    nlp = Language()

    def create_kb(vocab):
        kb = KnowledgeBase(vocab, entity_vector_length=1)
        kb = InMemoryLookupKB(vocab, entity_vector_length=1)
        kb.add_entity("test", 0.0, zeros((1, 1), dtype="f"))
        return kb

@@ -120,7 +120,7 @@ def test_writer_with_path_py35():

def test_save_and_load_knowledge_base():
    nlp = Language()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
    with make_tempdir() as d:
        path = d / "kb"
        try:

@@ -129,7 +129,7 @@ def test_save_and_load_knowledge_base():
            pytest.fail(str(e))

        try:
            kb_loaded = KnowledgeBase(nlp.vocab, entity_vector_length=1)
            kb_loaded = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
            kb_loaded.from_disk(path)
        except Exception as e:
            pytest.fail(str(e))

@@ -2,7 +2,7 @@ from typing import Callable

from spacy import util
from spacy.util import ensure_path, registry, load_model_from_config
from spacy.kb import KnowledgeBase
from spacy.kb.kb_in_memory import InMemoryLookupKB
from spacy.vocab import Vocab
from thinc.api import Config


@@ -22,7 +22,7 @@ def test_serialize_kb_disk(en_vocab):
    dir_path.mkdir()
    file_path = dir_path / "kb"
    kb1.to_disk(str(file_path))
    kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3)
    kb2 = InMemoryLookupKB(vocab=en_vocab, entity_vector_length=3)
    kb2.from_disk(str(file_path))

    # final assertions

@@ -30,7 +30,7 @@ def test_serialize_kb_disk(en_vocab):


def _get_dummy_kb(vocab):
    kb = KnowledgeBase(vocab, entity_vector_length=3)
    kb = InMemoryLookupKB(vocab, entity_vector_length=3)
    kb.add_entity(entity="Q53", freq=33, entity_vector=[0, 5, 3])
    kb.add_entity(entity="Q17", freq=2, entity_vector=[7, 1, 0])
    kb.add_entity(entity="Q007", freq=7, entity_vector=[0, 0, 7])

@@ -104,7 +104,7 @@ def test_serialize_subclassed_kb():
    custom_field = 666
    """

    class SubKnowledgeBase(KnowledgeBase):
    class SubInMemoryLookupKB(InMemoryLookupKB):
        def __init__(self, vocab, entity_vector_length, custom_field):
            super().__init__(vocab, entity_vector_length)
            self.custom_field = custom_field

@@ -112,9 +112,9 @@ def test_serialize_subclassed_kb():
    @registry.misc("spacy.CustomKB.v1")
    def custom_kb(
        entity_vector_length: int, custom_field: int
    ) -> Callable[[Vocab], KnowledgeBase]:
    ) -> Callable[[Vocab], InMemoryLookupKB]:
        def custom_kb_factory(vocab):
            kb = SubKnowledgeBase(
            kb = SubInMemoryLookupKB(
                vocab=vocab,
                entity_vector_length=entity_vector_length,
                custom_field=custom_field,

@@ -129,7 +129,7 @@ def test_serialize_subclassed_kb():
    nlp.initialize()

    entity_linker = nlp.get_pipe("entity_linker")
    assert type(entity_linker.kb) == SubKnowledgeBase
    assert type(entity_linker.kb) == SubInMemoryLookupKB
    assert entity_linker.kb.entity_vector_length == 342
    assert entity_linker.kb.custom_field == 666

@@ -139,6 +139,6 @@ def test_serialize_subclassed_kb():
    nlp2 = util.load_model_from_path(tmp_dir)
    entity_linker2 = nlp2.get_pipe("entity_linker")
    # After IO, the KB is the standard one
    assert type(entity_linker2.kb) == KnowledgeBase
    assert type(entity_linker2.kb) == InMemoryLookupKB
    assert entity_linker2.kb.entity_vector_length == 342
    assert not hasattr(entity_linker2.kb, "custom_field")

@@ -404,11 +404,10 @@ def test_serialize_pipeline_disable_enable():
    assert nlp3.component_names == ["ner", "tagger"]
    with make_tempdir() as d:
        nlp3.to_disk(d)
        with pytest.warns(UserWarning):
            nlp4 = spacy.load(d, disable=["ner"])
        assert nlp4.pipe_names == ["tagger"]
        nlp4 = spacy.load(d, disable=["ner"])
        assert nlp4.pipe_names == []
        assert nlp4.component_names == ["ner", "tagger"]
        assert nlp4.disabled == ["ner"]
        assert nlp4.disabled == ["ner", "tagger"]
    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp5 = spacy.load(d, exclude=["tagger"])
@ -1,8 +1,13 @@
|
|||
import os
|
||||
import math
|
||||
from random import sample
|
||||
from typing import Counter
|
||||
from collections import Counter
|
||||
from typing import Tuple, List, Dict, Any
|
||||
import pkg_resources
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import spacy
|
||||
import numpy
|
||||
import pytest
|
||||
import srsly
|
||||
from click import NoSuchOption
|
||||
|
@ -11,10 +16,11 @@ from thinc.api import Config, ConfigValidationError
|
|||
|
||||
from spacy import about
|
||||
from spacy.cli import info
|
||||
from spacy.cli._util import is_subpath_of, load_project_config
|
||||
from spacy.cli._util import is_subpath_of, load_project_config, walk_directory
|
||||
from spacy.cli._util import parse_config_overrides, string_to_list
|
||||
from spacy.cli._util import substitute_project_variables
|
||||
from spacy.cli._util import validate_project_commands
|
||||
from spacy.cli._util import upload_file, download_file
|
||||
from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
|
||||
from spacy.cli.debug_data import _get_labels_from_spancat
|
||||
from spacy.cli.debug_data import _get_distribution, _get_kl_divergence
|
||||
|
@ -25,12 +31,16 @@ from spacy.cli.download import get_compatibility, get_version
|
|||
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
|
||||
from spacy.cli.package import get_third_party_dependencies
|
||||
from spacy.cli.package import _is_permitted_package_name
|
||||
from spacy.cli.project.remote_storage import RemoteStorage
|
||||
from spacy.cli.project.run import _check_requirements
|
||||
from spacy.cli.validate import get_model_pkgs
|
||||
from spacy.cli.apply import apply
|
||||
from spacy.cli.find_threshold import find_threshold
|
||||
from spacy.lang.en import English
|
||||
from spacy.lang.nl import Dutch
|
||||
from spacy.language import Language
|
||||
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
|
||||
from spacy.tokens import Doc
|
||||
from spacy.tokens import Doc, DocBin
|
||||
from spacy.tokens.span import Span
|
||||
from spacy.training import Example, docs_to_json, offsets_to_biluo_tags
|
||||
from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
|
||||
|
@ -116,6 +126,25 @@ def test_issue7055():
|
|||
assert "model" in filled_cfg["components"]["ner"]
|
||||
|
||||
|
||||
@pytest.mark.issue(11235)
|
||||
def test_issue11235():
|
||||
"""
|
||||
Test that the cli handles interpolation in the directory names correctly when loading project config.
|
||||
"""
|
||||
lang_var = "en"
|
||||
variables = {"lang": lang_var}
|
||||
commands = [{"name": "x", "script": ["hello ${vars.lang}"]}]
|
||||
directories = ["cfg", "${vars.lang}_model"]
|
||||
project = {"commands": commands, "vars": variables, "directories": directories}
|
||||
with make_tempdir() as d:
|
||||
srsly.write_yaml(d / "project.yml", project)
|
||||
cfg = load_project_config(d)
|
||||
# Check that the directories are interpolated and created correctly
|
||||
assert os.path.exists(d / "cfg")
|
||||
assert os.path.exists(d / f"{lang_var}_model")
|
||||
assert cfg["commands"][0]["script"][0] == f"hello {lang_var}"
|
||||
|
||||
|
||||
def test_cli_info():
|
||||
nlp = Dutch()
|
||||
nlp.add_pipe("textcat")
|
||||
|
@ -855,3 +884,326 @@ def test_span_length_freq_dist_output_must_be_correct():
|
|||
span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold)
|
||||
assert sum(span_freqs.values()) >= threshold
|
||||
assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
|
||||
|
||||
|
||||
def test_applycli_empty_dir():
|
||||
with make_tempdir() as data_path:
|
||||
output = data_path / "test.spacy"
|
||||
apply(data_path, output, "blank:en", "text", 1, 1)
|
||||
|
||||
|
||||
def test_applycli_docbin():
|
||||
with make_tempdir() as data_path:
|
||||
output = data_path / "testout.spacy"
|
||||
nlp = spacy.blank("en")
|
||||
doc = nlp("testing apply cli.")
|
||||
# test empty DocBin case
|
||||
docbin = DocBin()
|
||||
docbin.to_disk(data_path / "testin.spacy")
|
||||
apply(data_path, output, "blank:en", "text", 1, 1)
|
||||
docbin.add(doc)
|
||||
docbin.to_disk(data_path / "testin.spacy")
|
||||
apply(data_path, output, "blank:en", "text", 1, 1)
|
||||
|
||||
|
||||
def test_applycli_jsonl():
|
||||
with make_tempdir() as data_path:
|
||||
output = data_path / "testout.spacy"
|
||||
data = [{"field": "Testing apply cli.", "key": 234}]
|
||||
data2 = [{"field": "234"}]
|
||||
srsly.write_jsonl(data_path / "test.jsonl", data)
|
||||
apply(data_path, output, "blank:en", "field", 1, 1)
|
||||
srsly.write_jsonl(data_path / "test2.jsonl", data2)
|
||||
apply(data_path, output, "blank:en", "field", 1, 1)
|
||||
|
||||
|
||||
def test_applycli_txt():
|
||||
with make_tempdir() as data_path:
|
||||
output = data_path / "testout.spacy"
|
||||
with open(data_path / "test.foo", "w") as ftest:
|
||||
ftest.write("Testing apply cli.")
|
||||
apply(data_path, output, "blank:en", "text", 1, 1)
|
||||
|
||||
|
||||
def test_applycli_mixed():
    with make_tempdir() as data_path:
        output = data_path / "testout.spacy"
        text = "Testing apply cli"
        nlp = spacy.blank("en")
        doc = nlp(text)
        jsonl_data = [{"text": text}]
        srsly.write_jsonl(data_path / "test.jsonl", jsonl_data)
        docbin = DocBin()
        docbin.add(doc)
        docbin.to_disk(data_path / "testin.spacy")
        with open(data_path / "test.txt", "w") as ftest:
            ftest.write(text)
        apply(data_path, output, "blank:en", "text", 1, 1)
        # Check that all three inputs (JSONL, DocBin, txt) produced a doc with the same text
        result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
        assert len(result) == 3
        for doc in result:
            assert doc.text == text


def test_applycli_user_data():
    Doc.set_extension("ext", default=0)
    val = ("ext", 0)
    with make_tempdir() as data_path:
        output = data_path / "testout.spacy"
        nlp = spacy.blank("en")
        doc = nlp("testing apply cli.")
        doc._.ext = val
        docbin = DocBin(store_user_data=True)
        docbin.add(doc)
        docbin.to_disk(data_path / "testin.spacy")
        apply(data_path, output, "blank:en", "", 1, 1)
        result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
        assert result[0]._.ext == val


def test_local_remote_storage():
    with make_tempdir() as d:
        filename = "a.txt"

        content_hashes = ("aaaa", "cccc", "bbbb")
        for i, content_hash in enumerate(content_hashes):
            # make sure that each subsequent file has a later timestamp
            if i > 0:
                time.sleep(1)
            content = f"{content_hash} content"
            loc_file = d / "root" / filename
            if not loc_file.parent.exists():
                loc_file.parent.mkdir(parents=True)
            with loc_file.open(mode="w") as file_:
                file_.write(content)

            # push first version to remote storage
            remote = RemoteStorage(d / "root", str(d / "remote"))
            remote.push(filename, "aaaa", content_hash)

            # retrieve with full hashes
            loc_file.unlink()
            remote.pull(filename, command_hash="aaaa", content_hash=content_hash)
            with loc_file.open(mode="r") as file_:
                assert file_.read() == content

            # retrieve with command hash
            loc_file.unlink()
            remote.pull(filename, command_hash="aaaa")
            with loc_file.open(mode="r") as file_:
                assert file_.read() == content

            # retrieve with content hash
            loc_file.unlink()
            remote.pull(filename, content_hash=content_hash)
            with loc_file.open(mode="r") as file_:
                assert file_.read() == content

            # retrieve with no hashes
            loc_file.unlink()
            remote.pull(filename)
            with loc_file.open(mode="r") as file_:
                assert file_.read() == content


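# The four pull() variants above exercise what appears to be the lookup order
# for remote artifacts: an exact command-hash + content-hash match first, then
# command hash alone, then content hash alone, and finally the most recent
# version when no hashes are given (inferred from this test).

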
def test_local_remote_storage_pull_missing():
    # pulling from a non-existent remote pulls nothing gracefully
    with make_tempdir() as d:
        filename = "a.txt"
        remote = RemoteStorage(d / "root", str(d / "remote"))
        assert remote.pull(filename, command_hash="aaaa") is None
        assert remote.pull(filename) is None


def test_cli_find_threshold(capsys):
    thresholds = numpy.linspace(0, 1, 10)

    def make_examples(nlp: Language) -> List[Example]:
        docs: List[Example] = []

        for t in [
            (
                "I am angry and confused in the Bank of America.",
                {
                    "cats": {"ANGRY": 1.0, "CONFUSED": 1.0, "HAPPY": 0.0},
                    "spans": {"sc": [(31, 46, "ORG")]},
                },
            ),
            (
                "I am confused but happy in New York.",
                {
                    "cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0},
                    "spans": {"sc": [(27, 35, "GPE")]},
                },
            ),
        ]:
            doc = nlp.make_doc(t[0])
            docs.append(Example.from_dict(doc, t[1]))

        return docs

    def init_nlp(
        components: Tuple[Tuple[str, Dict[str, Any]], ...] = ()
    ) -> Tuple[Language, List[Example]]:
        new_nlp = English()
        new_nlp.add_pipe(  # type: ignore
            factory_name="textcat_multilabel",
            name="tc_multi",
            config={"threshold": 0.9},
        )

        # Append additional components to pipeline.
        for cfn, comp_config in components:
            new_nlp.add_pipe(cfn, config=comp_config)

        new_examples = make_examples(new_nlp)
        new_nlp.initialize(get_examples=lambda: new_examples)
        for i in range(5):
            new_nlp.update(new_examples)

        return new_nlp, new_examples

    with make_tempdir() as docs_dir:
        # Check whether find_threshold() identifies the lowest threshold above 0 as the (first) ideal threshold, as
        # this matches the current model behavior with the examples above. This can break once the model behavior
        # changes and serves mostly as a smoke test.
        nlp, examples = init_nlp()
        DocBin(docs=[example.reference for example in examples]).to_disk(
            docs_dir / "docs.spacy"
        )
        with make_tempdir() as nlp_dir:
            nlp.to_disk(nlp_dir)
            res = find_threshold(
                model=nlp_dir,
                data_path=docs_dir / "docs.spacy",
                pipe_name="tc_multi",
                threshold_key="threshold",
                scores_key="cats_macro_f",
                silent=True,
            )
            assert res[0] != thresholds[0]
            assert thresholds[0] < res[0] < thresholds[9]
            assert res[1] == 1.0
            assert res[2][1.0] == 0.0

        # Test with spancat.
        nlp, _ = init_nlp((("spancat", {}),))
        with make_tempdir() as nlp_dir:
            nlp.to_disk(nlp_dir)
            res = find_threshold(
                model=nlp_dir,
                data_path=docs_dir / "docs.spacy",
                pipe_name="spancat",
                threshold_key="threshold",
                scores_key="spans_sc_f",
                silent=True,
            )
            assert res[0] != thresholds[0]
            assert thresholds[0] < res[0] < thresholds[8]
            assert res[1] >= 0.6
            assert res[2][1.0] == 0.0

        # Having multiple textcat_multilabel components should work, since the name has to be specified.
        nlp, _ = init_nlp((("textcat_multilabel", {}),))
        with make_tempdir() as nlp_dir:
            nlp.to_disk(nlp_dir)
            assert find_threshold(
                model=nlp_dir,
                data_path=docs_dir / "docs.spacy",
                pipe_name="tc_multi",
                threshold_key="threshold",
                scores_key="cats_macro_f",
                silent=True,
            )

        # Specifying the name of a non-existent pipe should fail.
        nlp, _ = init_nlp()
        with make_tempdir() as nlp_dir:
            nlp.to_disk(nlp_dir)
            with pytest.raises(AttributeError):
                find_threshold(
                    model=nlp_dir,
                    data_path=docs_dir / "docs.spacy",
                    pipe_name="_",
                    threshold_key="threshold",
                    scores_key="cats_macro_f",
                    silent=True,
                )


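# As the assertions above suggest, find_threshold() returns a 3-tuple of
# (best threshold, best score at that threshold, {threshold: score} mapping);
# this contract is inferred from the indexing in this test, not stated here.

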
@pytest.mark.parametrize(
    "reqs,output",
    [
        [
            """
            spacy

            # comment

            thinc""",
            (False, False),
        ],
        [
            """# comment
            --some-flag
            spacy""",
            (False, False),
        ],
        [
            """# comment
            --some-flag
            spacy; python_version >= '3.6'""",
            (False, False),
        ],
        [
            """# comment
            spacyunknowndoesnotexist12345""",
            (True, False),
        ],
    ],
)
def test_project_check_requirements(reqs, output):
    # excessive guard against unlikely package name
    try:
        pkg_resources.require("spacyunknowndoesnotexist12345")
    except pkg_resources.DistributionNotFound:
        assert output == _check_requirements([req.strip() for req in reqs.split("\n")])


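# The parametrized outputs suggest _check_requirements() returns a pair of
# flags, the first of which is set when a listed requirement cannot be
# satisfied; the exact meaning of each flag is assumed from these cases.

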
def test_upload_download_local_file():
    with make_tempdir() as d1, make_tempdir() as d2:
        filename = "f.txt"
        content = "content"
        local_file = d1 / filename
        remote_file = d2 / filename
        with local_file.open(mode="w") as file_:
            file_.write(content)
        upload_file(local_file, remote_file)
        local_file.unlink()
        download_file(remote_file, local_file)
        with local_file.open(mode="r") as file_:
            assert file_.read() == content


def test_walk_directory():
    with make_tempdir() as d:
        files = [
            "data1.iob",
            "data2.iob",
            "data3.json",
            "data4.conll",
            "data5.conll",
            "data6.conll",
            "data7.txt",
        ]

        for f in files:
            Path(d / f).touch()

        assert len(walk_directory(d)) == 7
        assert len(walk_directory(d, suffix=None)) == 7
        assert len(walk_directory(d, suffix="json")) == 1
        assert len(walk_directory(d, suffix="iob")) == 2
        assert len(walk_directory(d, suffix="conll")) == 3
        assert len(walk_directory(d, suffix="pdf")) == 0

42
spacy/tests/test_cli_app.py
Normal file
@@ -0,0 +1,42 @@
import os
from pathlib import Path
from typer.testing import CliRunner

from spacy.cli._util import app
from .util import make_tempdir


def test_convert_auto():
    with make_tempdir() as d_in, make_tempdir() as d_out:
        for f in ["data1.iob", "data2.iob", "data3.iob"]:
            Path(d_in / f).touch()

        # ensure that "automatic" suffix detection works
        result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
        assert "Generated output file" in result.stdout
        out_files = os.listdir(d_out)
        assert len(out_files) == 3
        assert "data1.spacy" in out_files
        assert "data2.spacy" in out_files
        assert "data3.spacy" in out_files


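# Invoking the Typer app this way mirrors the shell command
# `python -m spacy convert <input_dir> <output_dir>`; with no explicit
# converter, the format appears to be picked from the shared file extension.

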
def test_convert_auto_conflict():
    with make_tempdir() as d_in, make_tempdir() as d_out:
        for f in ["data1.iob", "data2.iob", "data3.json"]:
            Path(d_in / f).touch()

        # ensure that "automatic" suffix detection warns when there are different file types
        result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
        assert "All input files must be same type" in result.stdout
        out_files = os.listdir(d_out)
        assert len(out_files) == 0


def test_benchmark_accuracy_alias():
    # Verify that the `evaluate` alias works correctly.
    result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
    result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])
    assert result_benchmark.stdout == result_evaluate.stdout.replace(
        "spacy evaluate", "spacy benchmark accuracy"
    )

@@ -203,6 +203,16 @@ def test_displacy_parse_spans_different_spans_key(en_vocab):
    ]


def test_displacy_parse_empty_spans_key(en_vocab):
    """Test that having an unset spans key doesn't raise an error"""
    doc = Doc(en_vocab, words=["Welcome", "to", "the", "Bank", "of", "China"])
    doc.spans["custom"] = [Span(doc, 3, 6, "BANK")]
    with pytest.warns(UserWarning, match="W117"):
        spans = displacy.parse_spans(doc)

    assert isinstance(spans, dict)


def test_displacy_parse_ents(en_vocab):
    """Test that named entities on a Doc are converted into displaCy's format."""
    doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])

@@ -3,6 +3,7 @@ import logging
from unittest import mock
import pytest
from spacy.language import Language
from spacy.scorer import Scorer
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from spacy.training import Example

@@ -126,6 +127,112 @@ def test_evaluate_no_pipe(nlp):
    nlp.evaluate([Example.from_dict(doc, annots)])


def test_evaluate_textcat_multilabel(en_vocab):
    """Test that evaluate works with a multilabel textcat pipe."""
    nlp = Language(en_vocab)
    textcat_multilabel = nlp.add_pipe("textcat_multilabel")
    for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
        textcat_multilabel.add_label(label)
    nlp.initialize()

    annots = {"cats": {"FEATURE": 1.0, "QUESTION": 1.0}}
    doc = nlp.make_doc("hello world")
    example = Example.from_dict(doc, annots)
    scores = nlp.evaluate([example])
    labels = nlp.get_pipe("textcat_multilabel").labels
    for label in labels:
        assert scores["cats_f_per_type"].get(label) is not None
    for key in example.reference.cats.keys():
        if key not in labels:
            assert scores["cats_f_per_type"].get(key) is None


def test_evaluate_multiple_textcat_final(en_vocab):
    """Test that evaluate evaluates the final textcat component in a pipeline
    with more than one textcat or textcat_multilabel."""
    nlp = Language(en_vocab)
    textcat = nlp.add_pipe("textcat")
    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)
    textcat_multilabel = nlp.add_pipe("textcat_multilabel")
    for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
        textcat_multilabel.add_label(label)
    nlp.initialize()

    annots = {
        "cats": {
            "POSITIVE": 1.0,
            "NEGATIVE": 0.0,
            "FEATURE": 1.0,
            "QUESTION": 1.0,
        }
    }
    doc = nlp.make_doc("hello world")
    example = Example.from_dict(doc, annots)
    scores = nlp.evaluate([example])
    # get the labels from the final pipe
    labels = nlp.get_pipe(nlp.pipe_names[-1]).labels
    for label in labels:
        assert scores["cats_f_per_type"].get(label) is not None
    for key in example.reference.cats.keys():
        if key not in labels:
            assert scores["cats_f_per_type"].get(key) is None


def test_evaluate_multiple_textcat_separate(en_vocab):
    """Test that evaluate can evaluate multiple textcat components separately
    with custom scorers."""

    def custom_textcat_score(examples, **kwargs):
        scores = Scorer.score_cats(
            examples,
            "cats",
            multi_label=False,
            **kwargs,
        )
        return {f"custom_{k}": v for k, v in scores.items()}

    @spacy.registry.scorers("test_custom_textcat_scorer")
    def make_custom_textcat_scorer():
        return custom_textcat_score

    nlp = Language(en_vocab)
    textcat = nlp.add_pipe(
        "textcat",
        config={"scorer": {"@scorers": "test_custom_textcat_scorer"}},
    )
    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)
    textcat_multilabel = nlp.add_pipe("textcat_multilabel")
    for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
        textcat_multilabel.add_label(label)
    nlp.initialize()

    annots = {
        "cats": {
            "POSITIVE": 1.0,
            "NEGATIVE": 0.0,
            "FEATURE": 1.0,
            "QUESTION": 1.0,
        }
    }
    doc = nlp.make_doc("hello world")
    example = Example.from_dict(doc, annots)
    scores = nlp.evaluate([example])
    # check custom scores for the textcat pipe
    assert "custom_cats_f_per_type" in scores
    labels = nlp.get_pipe("textcat").labels
    assert set(scores["custom_cats_f_per_type"].keys()) == set(labels)
    # check default scores for the textcat_multilabel pipe
    assert "cats_f_per_type" in scores
    labels = nlp.get_pipe("textcat_multilabel").labels
    assert set(scores["cats_f_per_type"].keys()) == set(labels)


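# Pattern worth noting from the test above: a scorer factory registered with
# @spacy.registry.scorers(...) can be wired into a component via its config
# as {"scorer": {"@scorers": "<registered name>"}}, letting each textcat
# component report under its own score keys.

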
def vector_modification_pipe(doc):
    doc.vector += 1
    return doc

@@ -8,7 +8,7 @@ from spacy import prefer_gpu, require_gpu, require_cpu
from spacy.ml._precomputable_affine import PrecomputableAffine
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
from spacy.util import dot_to_object, SimpleFrozenList, import_file
from spacy.util import to_ternary_int
from spacy.util import to_ternary_int, find_available_port
from thinc.api import Config, Optimizer, ConfigValidationError
from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
from thinc.compat import has_cupy_gpu, has_torch_mps_gpu

@@ -434,3 +434,16 @@ def test_to_ternary_int():
    assert to_ternary_int(-10) == -1
    assert to_ternary_int("string") == -1
    assert to_ternary_int([0, "string"]) == -1


def test_find_available_port():
    host = "0.0.0.0"
    port = 5000
    assert find_available_port(port, host) == port, "Port 5000 isn't free"

    from wsgiref.simple_server import make_server, demo_app

    with make_server(host, port, demo_app) as httpd:
        with pytest.warns(UserWarning, match="already in use"):
            found_port = find_available_port(port, host, auto_select=True)
        assert found_port == port + 1, "Didn't find next port"

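# Behavior exercised above: find_available_port() returns the requested port
# when it is free, and with auto_select=True it warns that the port is in use
# and falls back to the next free port instead of raising.
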
@@ -23,7 +23,7 @@ def get_textcat_bow_kwargs():


def get_textcat_cnn_kwargs():
    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
    return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}


def get_all_params(model):

@@ -65,7 +65,7 @@ def get_tok2vec_kwargs():
    }


def test_tok2vec():
def make_test_tok2vec():
    return build_Tok2Vec_model(**get_tok2vec_kwargs())

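# Likely rationale for the rename (inferred): pytest collects any function
# whose name starts with "test_", so the helper test_tok2vec() would have
# been collected and run as a test on its own; make_test_tok2vec() keeps it
# a plain factory.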