mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-02 11:20:19 +03:00
Merge branch 'v4' into cleanup/remove-backwards-compat
This commit is contained in:
commit
db196aaa09
41
.github/azure-steps.yml
vendored
41
.github/azure-steps.yml
vendored
|
@ -52,12 +52,17 @@ steps:
|
|||
python -W error -c "import spacy"
|
||||
displayName: "Test import"
|
||||
|
||||
- script: |
|
||||
python -m spacy download ca_core_news_sm
|
||||
python -m spacy download ca_core_news_md
|
||||
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
||||
displayName: 'Test download CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
# - script: |
|
||||
# python -m spacy download ca_core_news_sm
|
||||
# python -m spacy download ca_core_news_md
|
||||
# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
||||
# displayName: 'Test download CLI'
|
||||
# condition: eq(variables['python_version'], '3.8')
|
||||
#
|
||||
# - script: |
|
||||
# python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
||||
# displayName: 'Test no warnings on load (#11713)'
|
||||
# condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
|
||||
|
@ -81,17 +86,17 @@ steps:
|
|||
displayName: 'Test train CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
||||
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
||||
displayName: 'Test assemble CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
||||
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
||||
displayName: 'Test assemble CLI vectors warning'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
# - script: |
|
||||
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
||||
# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
||||
# displayName: 'Test assemble CLI'
|
||||
# condition: eq(variables['python_version'], '3.8')
|
||||
#
|
||||
# - script: |
|
||||
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
||||
# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
||||
# displayName: 'Test assemble CLI vectors warning'
|
||||
# condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -m pip install -U -r requirements.txt
|
||||
|
@ -102,7 +107,7 @@ steps:
|
|||
displayName: "Run CPU tests"
|
||||
|
||||
- script: |
|
||||
python -m pip install --pre thinc-apple-ops
|
||||
python -m pip install 'spacy[apple]'
|
||||
python -m pytest --pyargs spacy
|
||||
displayName: "Run CPU tests with thinc-apple-ops"
|
||||
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
|
||||
|
|
2
.github/workflows/lock.yml
vendored
2
.github/workflows/lock.yml
vendored
|
@ -15,7 +15,7 @@ jobs:
|
|||
action:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: dessant/lock-threads@v3
|
||||
- uses: dessant/lock-threads@v4
|
||||
with:
|
||||
process-only: 'issues'
|
||||
issue-inactive-days: '30'
|
||||
|
|
2
.github/workflows/spacy_universe_alert.yml
vendored
2
.github/workflows/spacy_universe_alert.yml
vendored
|
@ -19,6 +19,8 @@ jobs:
|
|||
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.10'
|
||||
- name: Install Bernadette app dependency and send an alert
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
|
10
.gitignore
vendored
10
.gitignore
vendored
|
@ -10,16 +10,6 @@ spacy/tests/package/setup.cfg
|
|||
spacy/tests/package/pyproject.toml
|
||||
spacy/tests/package/requirements.txt
|
||||
|
||||
# Website
|
||||
website/.cache/
|
||||
website/public/
|
||||
website/node_modules
|
||||
website/.npm
|
||||
website/logs
|
||||
*.log
|
||||
npm-debug.log*
|
||||
quickstart-training-generator.js
|
||||
|
||||
# Cython / C extensions
|
||||
cythonize.json
|
||||
spacy/*.html
|
||||
|
|
|
@ -3,9 +3,9 @@ repos:
|
|||
rev: 22.3.0
|
||||
hooks:
|
||||
- id: black
|
||||
language_version: python3.7
|
||||
language_version: python3.8
|
||||
additional_dependencies: ['click==8.0.4']
|
||||
- repo: https://gitlab.com/pycqa/flake8
|
||||
- repo: https://github.com/pycqa/flake8
|
||||
rev: 5.0.4
|
||||
hooks:
|
||||
- id: flake8
|
||||
|
|
|
@ -271,7 +271,7 @@ except: # noqa: E722
|
|||
|
||||
### Python conventions
|
||||
|
||||
All Python code must be written **compatible with Python 3.6+**. More detailed
|
||||
All Python code must be written **compatible with Python 3.8+**. More detailed
|
||||
code conventions can be found in the [developer docs](https://github.com/explosion/spaCy/blob/master/extra/DEVELOPER_DOCS/Code%20Conventions.md).
|
||||
|
||||
#### I/O and handling paths
|
||||
|
|
2
Makefile
2
Makefile
|
@ -5,7 +5,7 @@ override SPACY_EXTRAS = spacy-lookups-data==1.0.2 jieba spacy-pkuseg==0.0.28 sud
|
|||
endif
|
||||
|
||||
ifndef PYVER
|
||||
override PYVER = 3.6
|
||||
override PYVER = 3.8
|
||||
endif
|
||||
|
||||
VENV := ./env$(PYVER)
|
||||
|
|
|
@ -14,9 +14,9 @@ parsing, **named entity recognition**, **text classification** and more,
|
|||
multi-task learning with pretrained **transformers** like BERT, as well as a
|
||||
production-ready [**training system**](https://spacy.io/usage/training) and easy
|
||||
model packaging, deployment and workflow management. spaCy is commercial
|
||||
open-source software, released under the MIT license.
|
||||
open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
|
||||
|
||||
💫 **Version 3.4 out now!**
|
||||
💫 **Version 3.5 out now!**
|
||||
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
|
||||
|
||||
[](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
|
||||
|
@ -46,6 +46,7 @@ open-source software, released under the MIT license.
|
|||
| 🛠 **[Changelog]** | Changes and version history. |
|
||||
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
|
||||
| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
|
||||
| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |
|
||||
|
||||
[spacy 101]: https://spacy.io/usage/spacy-101
|
||||
[new in v3.0]: https://spacy.io/usage/v3
|
||||
|
@ -59,6 +60,7 @@ open-source software, released under the MIT license.
|
|||
[changelog]: https://spacy.io/usage#changelog
|
||||
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
|
||||
|
||||
|
||||
## 💬 Where to ask questions
|
||||
|
||||
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
|
||||
|
@ -103,7 +105,7 @@ For detailed installation instructions, see the
|
|||
|
||||
- **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
|
||||
Studio)
|
||||
- **Python version**: Python 3.6+ (only 64 bit)
|
||||
- **Python version**: Python 3.8+ (only 64 bit)
|
||||
- **Package managers**: [pip] · [conda] (via `conda-forge`)
|
||||
|
||||
[pip]: https://pypi.org/project/spacy/
|
||||
|
|
|
@ -11,25 +11,39 @@ trigger:
|
|||
exclude:
|
||||
- "website/*"
|
||||
- "*.md"
|
||||
- "*.mdx"
|
||||
- ".github/workflows/*"
|
||||
pr:
|
||||
paths:
|
||||
exclude:
|
||||
- "*.md"
|
||||
- "*.mdx"
|
||||
- "website/docs/*"
|
||||
- "website/src/*"
|
||||
- "website/meta/*.tsx"
|
||||
- "website/meta/*.mjs"
|
||||
- "website/meta/languages.json"
|
||||
- "website/meta/site.json"
|
||||
- "website/meta/sidebars.json"
|
||||
- "website/meta/type-annotations.json"
|
||||
- "website/pages/*"
|
||||
- ".github/workflows/*"
|
||||
|
||||
jobs:
|
||||
# Perform basic checks for most important errors (syntax etc.) Uses the config
|
||||
# defined in .flake8 and overwrites the selected codes.
|
||||
# Check formatting and linting. Perform basic checks for most important errors
|
||||
# (syntax etc.) Uses the config defined in setup.cfg and overwrites the
|
||||
# selected codes.
|
||||
- job: "Validate"
|
||||
pool:
|
||||
vmImage: "ubuntu-latest"
|
||||
steps:
|
||||
- task: UsePythonVersion@0
|
||||
inputs:
|
||||
versionSpec: "3.7"
|
||||
versionSpec: "3.8"
|
||||
- script: |
|
||||
pip install black==22.3.0
|
||||
python -m black spacy --check
|
||||
displayName: "black"
|
||||
- script: |
|
||||
pip install flake8==5.0.4
|
||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
||||
|
@ -40,24 +54,6 @@ jobs:
|
|||
strategy:
|
||||
matrix:
|
||||
# We're only running one platform per Python version to speed up builds
|
||||
Python36Linux:
|
||||
imageName: "ubuntu-latest"
|
||||
python.version: "3.6"
|
||||
# Python36Windows:
|
||||
# imageName: "windows-latest"
|
||||
# python.version: "3.6"
|
||||
# Python36Mac:
|
||||
# imageName: "macos-latest"
|
||||
# python.version: "3.6"
|
||||
# Python37Linux:
|
||||
# imageName: "ubuntu-latest"
|
||||
# python.version: "3.7"
|
||||
Python37Windows:
|
||||
imageName: "windows-latest"
|
||||
python.version: "3.7"
|
||||
# Python37Mac:
|
||||
# imageName: "macos-latest"
|
||||
# python.version: "3.7"
|
||||
# Python38Linux:
|
||||
# imageName: "ubuntu-latest"
|
||||
# python.version: "3.8"
|
||||
|
@ -87,13 +83,13 @@ jobs:
|
|||
# python.version: "3.10"
|
||||
Python311Linux:
|
||||
imageName: 'ubuntu-latest'
|
||||
python.version: '3.11.0'
|
||||
python.version: '3.11'
|
||||
Python311Windows:
|
||||
imageName: 'windows-latest'
|
||||
python.version: '3.11.0'
|
||||
python.version: '3.11'
|
||||
Python311Mac:
|
||||
imageName: 'macos-latest'
|
||||
python.version: '3.11.0'
|
||||
python.version: '3.11'
|
||||
maxParallel: 4
|
||||
pool:
|
||||
vmImage: $(imageName)
|
||||
|
|
|
@ -5,4 +5,5 @@ numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
|
|||
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
|
||||
numpy==1.19.3; python_version=='3.9'
|
||||
numpy==1.21.3; python_version=='3.10'
|
||||
numpy; python_version>='3.11'
|
||||
numpy==1.23.2; python_version=='3.11'
|
||||
numpy; python_version>='3.12'
|
||||
|
|
|
@ -5,7 +5,7 @@ requires = [
|
|||
"cymem>=2.0.2,<2.1.0",
|
||||
"preshed>=3.0.2,<3.1.0",
|
||||
"murmurhash>=0.28.0,<1.1.0",
|
||||
"thinc>=8.1.0,<8.2.0",
|
||||
"thinc>=9.0.0.dev2,<9.1.0",
|
||||
"numpy>=1.15.0",
|
||||
]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
|
|
@ -1,16 +1,17 @@
|
|||
# Our libraries
|
||||
spacy-legacy>=3.0.10,<3.1.0
|
||||
spacy-legacy>=4.0.0.dev0,<4.1.0
|
||||
spacy-loggers>=1.0.0,<2.0.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.1.0,<8.2.0
|
||||
thinc>=9.0.0.dev2,<9.1.0
|
||||
ml_datasets>=0.2.0,<0.3.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
wasabi>=0.9.1,<1.1.0
|
||||
wasabi>=0.9.1,<1.2.0
|
||||
srsly>=2.4.3,<3.0.0
|
||||
catalogue>=2.0.6,<2.1.0
|
||||
typer>=0.3.0,<0.5.0
|
||||
pathy>=0.3.5
|
||||
typer>=0.3.0,<0.8.0
|
||||
pathy>=0.10.0
|
||||
smart-open>=5.2.1,<7.0.0
|
||||
# Third party dependencies
|
||||
numpy>=1.15.0
|
||||
requests>=2.13.0,<3.0.0
|
||||
|
@ -21,7 +22,6 @@ langcodes>=3.2.0,<4.0.0
|
|||
# Official Python utilities
|
||||
setuptools
|
||||
packaging>=20.0
|
||||
typing_extensions>=3.7.4.1,<4.2.0; python_version < "3.8"
|
||||
# Development dependencies
|
||||
pre-commit>=2.13.0
|
||||
cython>=0.25,<3.0
|
||||
|
@ -30,8 +30,7 @@ pytest-timeout>=1.3.0,<2.0.0
|
|||
mock>=2.0.0,<3.0.0
|
||||
flake8>=3.8.0,<6.0.0
|
||||
hypothesis>=3.27.0,<7.0.0
|
||||
mypy>=0.980,<0.990; platform_machine != "aarch64" and python_version >= "3.7"
|
||||
types-dataclasses>=0.1.3; python_version < "3.7"
|
||||
mypy>=0.990,<0.1000; platform_machine != "aarch64"
|
||||
types-mock>=0.1.1
|
||||
types-setuptools>=57.0.0
|
||||
types-requests
|
||||
|
|
19
setup.cfg
19
setup.cfg
|
@ -17,11 +17,10 @@ classifiers =
|
|||
Operating System :: Microsoft :: Windows
|
||||
Programming Language :: Cython
|
||||
Programming Language :: Python :: 3
|
||||
Programming Language :: Python :: 3.6
|
||||
Programming Language :: Python :: 3.7
|
||||
Programming Language :: Python :: 3.8
|
||||
Programming Language :: Python :: 3.9
|
||||
Programming Language :: Python :: 3.10
|
||||
Programming Language :: Python :: 3.11
|
||||
Topic :: Scientific/Engineering
|
||||
project_urls =
|
||||
Release notes = https://github.com/explosion/spaCy/releases
|
||||
|
@ -30,21 +29,22 @@ project_urls =
|
|||
[options]
|
||||
zip_safe = false
|
||||
include_package_data = true
|
||||
python_requires = >=3.6
|
||||
python_requires = >=3.8
|
||||
install_requires =
|
||||
# Our libraries
|
||||
spacy-legacy>=3.0.10,<3.1.0
|
||||
spacy-legacy>=4.0.0.dev0,<4.1.0
|
||||
spacy-loggers>=1.0.0,<2.0.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.1.0,<8.2.0
|
||||
wasabi>=0.9.1,<1.1.0
|
||||
thinc>=9.0.0.dev2,<9.1.0
|
||||
wasabi>=0.9.1,<1.2.0
|
||||
srsly>=2.4.3,<3.0.0
|
||||
catalogue>=2.0.6,<2.1.0
|
||||
# Third-party dependencies
|
||||
typer>=0.3.0,<0.5.0
|
||||
pathy>=0.3.5
|
||||
typer>=0.3.0,<0.8.0
|
||||
pathy>=0.10.0
|
||||
smart-open>=5.2.1,<7.0.0
|
||||
tqdm>=4.38.0,<5.0.0
|
||||
numpy>=1.15.0
|
||||
requests>=2.13.0,<3.0.0
|
||||
|
@ -53,7 +53,6 @@ install_requires =
|
|||
# Official Python utilities
|
||||
setuptools
|
||||
packaging>=20.0
|
||||
typing_extensions>=3.7.4,<4.2.0; python_version < "3.8"
|
||||
langcodes>=3.2.0,<4.0.0
|
||||
|
||||
[options.entry_points]
|
||||
|
@ -64,7 +63,7 @@ console_scripts =
|
|||
lookups =
|
||||
spacy_lookups_data>=1.0.3,<1.1.0
|
||||
transformers =
|
||||
spacy_transformers>=1.1.2,<1.2.0
|
||||
spacy_transformers>=1.1.2,<1.3.0
|
||||
ray =
|
||||
spacy_ray>=0.1.0,<1.0.0
|
||||
cuda =
|
||||
|
|
9
setup.py
9
setup.py
|
@ -33,13 +33,10 @@ MOD_NAMES = [
|
|||
"spacy.kb.candidate",
|
||||
"spacy.kb.kb",
|
||||
"spacy.kb.kb_in_memory",
|
||||
"spacy.ml.parser_model",
|
||||
"spacy.ml.tb_framework",
|
||||
"spacy.morphology",
|
||||
"spacy.pipeline.dep_parser",
|
||||
"spacy.pipeline._edit_tree_internals.edit_trees",
|
||||
"spacy.pipeline.morphologizer",
|
||||
"spacy.pipeline.multitask",
|
||||
"spacy.pipeline.ner",
|
||||
"spacy.pipeline.pipe",
|
||||
"spacy.pipeline.trainable_pipe",
|
||||
"spacy.pipeline.sentencizer",
|
||||
|
@ -47,12 +44,15 @@ MOD_NAMES = [
|
|||
"spacy.pipeline.tagger",
|
||||
"spacy.pipeline.transition_parser",
|
||||
"spacy.pipeline._parser_internals.arc_eager",
|
||||
"spacy.pipeline._parser_internals.batch",
|
||||
"spacy.pipeline._parser_internals.ner",
|
||||
"spacy.pipeline._parser_internals.nonproj",
|
||||
"spacy.pipeline._parser_internals.search",
|
||||
"spacy.pipeline._parser_internals._state",
|
||||
"spacy.pipeline._parser_internals.stateclass",
|
||||
"spacy.pipeline._parser_internals.transition_system",
|
||||
"spacy.pipeline._parser_internals._beam_utils",
|
||||
"spacy.pipeline._parser_internals._parser_utils",
|
||||
"spacy.tokenizer",
|
||||
"spacy.training.align",
|
||||
"spacy.training.gold_io",
|
||||
|
@ -68,6 +68,7 @@ MOD_NAMES = [
|
|||
"spacy.matcher.dependencymatcher",
|
||||
"spacy.symbols",
|
||||
"spacy.vectors",
|
||||
"spacy.tests.parser._search",
|
||||
]
|
||||
COMPILE_OPTIONS = {
|
||||
"msvc": ["/Ox", "/EHsc"],
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# fmt: off
|
||||
__title__ = "spacy"
|
||||
__version__ = "3.4.2"
|
||||
__version__ = "4.0.0.dev0"
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
__projects__ = "https://github.com/explosion/projects"
|
||||
|
|
|
@ -4,6 +4,7 @@ from ._util import app, setup_cli # noqa: F401
|
|||
|
||||
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
|
||||
# are registered automatically and won't have to be imported here.
|
||||
from .benchmark_speed import benchmark_speed_cli # noqa: F401
|
||||
from .download import download # noqa: F401
|
||||
from .info import info # noqa: F401
|
||||
from .package import package # noqa: F401
|
||||
|
@ -16,6 +17,7 @@ from .debug_config import debug_config # noqa: F401
|
|||
from .debug_model import debug_model # noqa: F401
|
||||
from .debug_diff import debug_diff # noqa: F401
|
||||
from .evaluate import evaluate # noqa: F401
|
||||
from .apply import apply # noqa: F401
|
||||
from .convert import convert # noqa: F401
|
||||
from .init_pipeline import init_pipeline_cli # noqa: F401
|
||||
from .init_config import init_config, fill_config # noqa: F401
|
||||
|
@ -27,6 +29,7 @@ from .project.dvc import project_update_dvc # noqa: F401
|
|||
from .project.push import project_push # noqa: F401
|
||||
from .project.pull import project_pull # noqa: F401
|
||||
from .project.document import project_document # noqa: F401
|
||||
from .find_threshold import find_threshold # noqa: F401
|
||||
|
||||
|
||||
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Dict, Any, Union, List, Optional, Tuple, Iterable
|
||||
from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, Literal
|
||||
from typing import TYPE_CHECKING, overload
|
||||
import sys
|
||||
import shutil
|
||||
|
@ -16,14 +16,13 @@ from thinc.util import gpu_is_available
|
|||
from configparser import InterpolationError
|
||||
import os
|
||||
|
||||
from ..compat import Literal
|
||||
from ..schemas import ProjectConfigSchema, validate
|
||||
from ..util import import_file, run_command, make_tempdir, registry, logger
|
||||
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
|
||||
from .. import about
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathy import Pathy # noqa: F401
|
||||
from pathy import FluidPath # noqa: F401
|
||||
|
||||
|
||||
SDIST_SUFFIX = ".tar.gz"
|
||||
|
@ -46,6 +45,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
|
|||
commands to check and validate your config files, training and evaluation data,
|
||||
and custom model implementations.
|
||||
"""
|
||||
BENCHMARK_HELP = """Commands for benchmarking pipelines."""
|
||||
INIT_HELP = """Commands for initializing configs and pipeline packages."""
|
||||
|
||||
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
||||
|
@ -54,12 +54,14 @@ Arg = typer.Argument
|
|||
Opt = typer.Option
|
||||
|
||||
app = typer.Typer(name=NAME, help=HELP)
|
||||
benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
|
||||
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
|
||||
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
|
||||
init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
|
||||
|
||||
app.add_typer(project_cli)
|
||||
app.add_typer(debug_cli)
|
||||
app.add_typer(benchmark_cli)
|
||||
app.add_typer(init_cli)
|
||||
|
||||
|
||||
|
@ -158,15 +160,15 @@ def load_project_config(
|
|||
sys.exit(1)
|
||||
validate_project_version(config)
|
||||
validate_project_commands(config)
|
||||
if interpolate:
|
||||
err = f"{PROJECT_FILE} validation error"
|
||||
with show_validation_error(title=err, hint_fill=False):
|
||||
config = substitute_project_variables(config, overrides)
|
||||
# Make sure directories defined in config exist
|
||||
for subdir in config.get("directories", []):
|
||||
dir_path = path / subdir
|
||||
if not dir_path.exists():
|
||||
dir_path.mkdir(parents=True)
|
||||
if interpolate:
|
||||
err = f"{PROJECT_FILE} validation error"
|
||||
with show_validation_error(title=err, hint_fill=False):
|
||||
config = substitute_project_variables(config, overrides)
|
||||
return config
|
||||
|
||||
|
||||
|
@ -331,7 +333,7 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
|
|||
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
|
||||
|
||||
|
||||
def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
|
||||
def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None:
|
||||
"""Upload a file.
|
||||
|
||||
src (Path): The source path.
|
||||
|
@ -339,13 +341,20 @@ def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
|
|||
"""
|
||||
import smart_open
|
||||
|
||||
# Create parent directories for local paths
|
||||
if isinstance(dest, Path):
|
||||
if not dest.parent.exists():
|
||||
dest.parent.mkdir(parents=True)
|
||||
|
||||
dest = str(dest)
|
||||
with smart_open.open(dest, mode="wb") as output_file:
|
||||
with src.open(mode="rb") as input_file:
|
||||
output_file.write(input_file.read())
|
||||
|
||||
|
||||
def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None:
|
||||
def download_file(
|
||||
src: Union[str, "FluidPath"], dest: Path, *, force: bool = False
|
||||
) -> None:
|
||||
"""Download a file using smart_open.
|
||||
|
||||
url (str): The URL of the file.
|
||||
|
@ -358,7 +367,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
|
|||
if dest.exists() and not force:
|
||||
return None
|
||||
src = str(src)
|
||||
with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
|
||||
with smart_open.open(src, mode="rb", compression="disable") as input_file:
|
||||
with dest.open(mode="wb") as output_file:
|
||||
shutil.copyfileobj(input_file, output_file)
|
||||
|
||||
|
@ -368,7 +377,7 @@ def ensure_pathy(path):
|
|||
slow and annoying Google Cloud warning)."""
|
||||
from pathy import Pathy # noqa: F811
|
||||
|
||||
return Pathy(path)
|
||||
return Pathy.fluid(path)
|
||||
|
||||
|
||||
def git_checkout(
|
||||
|
@ -575,6 +584,33 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
|
|||
local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
|
||||
|
||||
|
||||
def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
|
||||
"""Given a directory and a suffix, recursively find all files matching the suffix.
|
||||
Directories or files with names beginning with a . are ignored, but hidden flags on
|
||||
filesystems are not checked.
|
||||
When provided with a suffix `None`, there is no suffix-based filtering."""
|
||||
if not path.is_dir():
|
||||
return [path]
|
||||
paths = [path]
|
||||
locs = []
|
||||
seen = set()
|
||||
for path in paths:
|
||||
if str(path) in seen:
|
||||
continue
|
||||
seen.add(str(path))
|
||||
if path.parts[-1].startswith("."):
|
||||
continue
|
||||
elif path.is_dir():
|
||||
paths.extend(path.iterdir())
|
||||
elif suffix is not None and not path.parts[-1].endswith(suffix):
|
||||
continue
|
||||
else:
|
||||
locs.append(path)
|
||||
# It's good to sort these, in case the ordering messes up cache.
|
||||
locs.sort()
|
||||
return locs
|
||||
|
||||
|
||||
def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
|
||||
"""Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s,
|
||||
as happens with `round(number, ndigits)`"""
|
||||
|
|
143
spacy/cli/apply.py
Normal file
143
spacy/cli/apply.py
Normal file
|
@ -0,0 +1,143 @@
|
|||
import tqdm
|
||||
import srsly
|
||||
|
||||
from itertools import chain
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Iterable, cast, Union
|
||||
|
||||
from wasabi import msg
|
||||
|
||||
from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
|
||||
|
||||
from ..tokens import Doc, DocBin
|
||||
from ..vocab import Vocab
|
||||
from ..util import ensure_path, load_model
|
||||
|
||||
|
||||
path_help = """Location of the documents to predict on.
|
||||
Can be a single file in .spacy format or a .jsonl file.
|
||||
Files with other extensions are treated as single plain text documents.
|
||||
If a directory is provided it is traversed recursively to grab
|
||||
all files to be processed.
|
||||
The files can be a mixture of .spacy, .jsonl and text files.
|
||||
If .jsonl is provided the specified field is going
|
||||
to be grabbed ("text" by default)."""
|
||||
|
||||
out_help = "Path to save the resulting .spacy file"
|
||||
code_help = (
|
||||
"Path to Python file with additional " "code (registered functions) to be imported"
|
||||
)
|
||||
gold_help = "Use gold preprocessing provided in the .spacy files"
|
||||
force_msg = (
|
||||
"The provided output file already exists. "
|
||||
"To force overwriting the output file, set the --force or -F flag."
|
||||
)
|
||||
|
||||
|
||||
DocOrStrStream = Union[Iterable[str], Iterable[Doc]]
|
||||
|
||||
|
||||
def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
|
||||
"""
|
||||
Stream Doc objects from DocBin.
|
||||
"""
|
||||
docbin = DocBin().from_disk(path)
|
||||
for doc in docbin.get_docs(vocab):
|
||||
yield doc
|
||||
|
||||
|
||||
def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
|
||||
"""
|
||||
Stream "text" field from JSONL. If the field "text" is
|
||||
not found it raises error.
|
||||
"""
|
||||
for entry in srsly.read_jsonl(path):
|
||||
if field not in entry:
|
||||
msg.fail(f"{path} does not contain the required '{field}' field.", exits=1)
|
||||
else:
|
||||
yield entry[field]
|
||||
|
||||
|
||||
def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
|
||||
"""
|
||||
Yields strings from text files in paths.
|
||||
"""
|
||||
for path in paths:
|
||||
with open(path, "r") as fin:
|
||||
text = fin.read()
|
||||
yield text
|
||||
|
||||
|
||||
@app.command("apply")
|
||||
def apply_cli(
|
||||
# fmt: off
|
||||
model: str = Arg(..., help="Model name or path"),
|
||||
data_path: Path = Arg(..., help=path_help, exists=True),
|
||||
output_file: Path = Arg(..., help=out_help, dir_okay=False),
|
||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
|
||||
text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"),
|
||||
force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
|
||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
|
||||
batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
|
||||
n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use.")
|
||||
):
|
||||
"""
|
||||
Apply a trained pipeline to documents to get predictions.
|
||||
Expects a loadable spaCy pipeline and path to the data, which
|
||||
can be a directory or a file.
|
||||
The data files can be provided in multiple formats:
|
||||
1. .spacy files
|
||||
2. .jsonl files with a specified "field" to read the text from.
|
||||
3. Files with any other extension are assumed to be containing
|
||||
a single document.
|
||||
DOCS: https://spacy.io/api/cli#apply
|
||||
"""
|
||||
data_path = ensure_path(data_path)
|
||||
output_file = ensure_path(output_file)
|
||||
code_path = ensure_path(code_path)
|
||||
if output_file.exists() and not force_overwrite:
|
||||
msg.fail(force_msg, exits=1)
|
||||
if not data_path.exists():
|
||||
msg.fail(f"Couldn't find data path: {data_path}", exits=1)
|
||||
import_code(code_path)
|
||||
setup_gpu(use_gpu)
|
||||
apply(data_path, output_file, model, text_key, batch_size, n_process)
|
||||
|
||||
|
||||
def apply(
|
||||
data_path: Path,
|
||||
output_file: Path,
|
||||
model: str,
|
||||
json_field: str,
|
||||
batch_size: int,
|
||||
n_process: int,
|
||||
):
|
||||
docbin = DocBin(store_user_data=True)
|
||||
paths = walk_directory(data_path)
|
||||
if len(paths) == 0:
|
||||
docbin.to_disk(output_file)
|
||||
msg.warn(
|
||||
"Did not find data to process,"
|
||||
f" {data_path} seems to be an empty directory."
|
||||
)
|
||||
return
|
||||
nlp = load_model(model)
|
||||
msg.good(f"Loaded model {model}")
|
||||
vocab = nlp.vocab
|
||||
streams: List[DocOrStrStream] = []
|
||||
text_files = []
|
||||
for path in paths:
|
||||
if path.suffix == ".spacy":
|
||||
streams.append(_stream_docbin(path, vocab))
|
||||
elif path.suffix == ".jsonl":
|
||||
streams.append(_stream_jsonl(path, json_field))
|
||||
else:
|
||||
text_files.append(path)
|
||||
if len(text_files) > 0:
|
||||
streams.append(_stream_texts(text_files))
|
||||
datagen = cast(DocOrStrStream, chain(*streams))
|
||||
for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
|
||||
docbin.add(doc)
|
||||
if output_file.suffix == "":
|
||||
output_file = output_file.with_suffix(".spacy")
|
||||
docbin.to_disk(output_file)
|
174
spacy/cli/benchmark_speed.py
Normal file
174
spacy/cli/benchmark_speed.py
Normal file
|
@ -0,0 +1,174 @@
|
|||
from typing import Iterable, List, Optional
|
||||
import random
|
||||
from itertools import islice
|
||||
import numpy
|
||||
from pathlib import Path
|
||||
import time
|
||||
from tqdm import tqdm
|
||||
import typer
|
||||
from wasabi import msg
|
||||
|
||||
from .. import util
|
||||
from ..language import Language
|
||||
from ..tokens import Doc
|
||||
from ..training import Corpus
|
||||
from ._util import Arg, Opt, benchmark_cli, setup_gpu
|
||||
|
||||
|
||||
@benchmark_cli.command(
|
||||
"speed",
|
||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||
)
|
||||
def benchmark_speed_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context,
|
||||
model: str = Arg(..., help="Model name or path"),
|
||||
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
|
||||
batch_size: Optional[int] = Opt(None, "--batch-size", "-b", min=1, help="Override the pipeline batch size"),
|
||||
no_shuffle: bool = Opt(False, "--no-shuffle", help="Do not shuffle benchmark data"),
|
||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||
n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
|
||||
warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
|
||||
data in the binary .spacy format.
|
||||
"""
|
||||
setup_gpu(use_gpu=use_gpu, silent=False)
|
||||
|
||||
nlp = util.load_model(model)
|
||||
batch_size = batch_size if batch_size is not None else nlp.batch_size
|
||||
corpus = Corpus(data_path)
|
||||
docs = [eg.predicted for eg in corpus(nlp)]
|
||||
|
||||
if len(docs) == 0:
|
||||
msg.fail("Cannot benchmark speed using an empty corpus.", exits=1)
|
||||
|
||||
print(f"Warming up for {warmup_epochs} epochs...")
|
||||
warmup(nlp, docs, warmup_epochs, batch_size)
|
||||
|
||||
print()
|
||||
print(f"Benchmarking {n_batches} batches...")
|
||||
wps = benchmark(nlp, docs, n_batches, batch_size, not no_shuffle)
|
||||
|
||||
print()
|
||||
print_outliers(wps)
|
||||
print_mean_with_ci(wps)
|
||||
|
||||
|
||||
# Lowercased, behaves as a context manager function.
|
||||
class time_context:
|
||||
"""Register the running time of a context."""
|
||||
|
||||
def __enter__(self):
|
||||
self.start = time.perf_counter()
|
||||
return self
|
||||
|
||||
def __exit__(self, type, value, traceback):
|
||||
self.elapsed = time.perf_counter() - self.start
|
||||
|
||||
|
||||
class Quartiles:
|
||||
"""Calculate the q1, q2, q3 quartiles and the inter-quartile range (iqr)
|
||||
of a sample."""
|
||||
|
||||
q1: float
|
||||
q2: float
|
||||
q3: float
|
||||
iqr: float
|
||||
|
||||
def __init__(self, sample: numpy.ndarray) -> None:
|
||||
self.q1 = numpy.quantile(sample, 0.25)
|
||||
self.q2 = numpy.quantile(sample, 0.5)
|
||||
self.q3 = numpy.quantile(sample, 0.75)
|
||||
self.iqr = self.q3 - self.q1
|
||||
|
||||
|
||||
def annotate(
|
||||
nlp: Language, docs: List[Doc], batch_size: Optional[int]
|
||||
) -> numpy.ndarray:
|
||||
docs = nlp.pipe(tqdm(docs, unit="doc"), batch_size=batch_size)
|
||||
wps = []
|
||||
while True:
|
||||
with time_context() as elapsed:
|
||||
batch_docs = list(
|
||||
islice(docs, batch_size if batch_size else nlp.batch_size)
|
||||
)
|
||||
if len(batch_docs) == 0:
|
||||
break
|
||||
n_tokens = count_tokens(batch_docs)
|
||||
wps.append(n_tokens / elapsed.elapsed)
|
||||
|
||||
return numpy.array(wps)
|
||||
|
||||
|
||||
def benchmark(
|
||||
nlp: Language,
|
||||
docs: List[Doc],
|
||||
n_batches: int,
|
||||
batch_size: int,
|
||||
shuffle: bool,
|
||||
) -> numpy.ndarray:
|
||||
if shuffle:
|
||||
bench_docs = [
|
||||
nlp.make_doc(random.choice(docs).text)
|
||||
for _ in range(n_batches * batch_size)
|
||||
]
|
||||
else:
|
||||
bench_docs = [
|
||||
nlp.make_doc(docs[i % len(docs)].text)
|
||||
for i in range(n_batches * batch_size)
|
||||
]
|
||||
|
||||
return annotate(nlp, bench_docs, batch_size)
|
||||
|
||||
|
||||
def bootstrap(x, statistic=numpy.mean, iterations=10000) -> numpy.ndarray:
|
||||
"""Apply a statistic to repeated random samples of an array."""
|
||||
return numpy.fromiter(
|
||||
(
|
||||
statistic(numpy.random.choice(x, len(x), replace=True))
|
||||
for _ in range(iterations)
|
||||
),
|
||||
numpy.float64,
|
||||
)
|
||||
|
||||
|
||||
def count_tokens(docs: Iterable[Doc]) -> int:
|
||||
return sum(len(doc) for doc in docs)
|
||||
|
||||
|
||||
def print_mean_with_ci(sample: numpy.ndarray):
|
||||
mean = numpy.mean(sample)
|
||||
bootstrap_means = bootstrap(sample)
|
||||
bootstrap_means.sort()
|
||||
|
||||
# 95% confidence interval
|
||||
low = bootstrap_means[int(len(bootstrap_means) * 0.025)]
|
||||
high = bootstrap_means[int(len(bootstrap_means) * 0.975)]
|
||||
|
||||
print(f"Mean: {mean:.1f} words/s (95% CI: {low-mean:.1f} +{high-mean:.1f})")
|
||||
|
||||
|
||||
def print_outliers(sample: numpy.ndarray):
|
||||
quartiles = Quartiles(sample)
|
||||
|
||||
n_outliers = numpy.sum(
|
||||
(sample < (quartiles.q1 - 1.5 * quartiles.iqr))
|
||||
| (sample > (quartiles.q3 + 1.5 * quartiles.iqr))
|
||||
)
|
||||
n_extreme_outliers = numpy.sum(
|
||||
(sample < (quartiles.q1 - 3.0 * quartiles.iqr))
|
||||
| (sample > (quartiles.q3 + 3.0 * quartiles.iqr))
|
||||
)
|
||||
print(
|
||||
f"Outliers: {(100 * n_outliers) / len(sample):.1f}%, extreme outliers: {(100 * n_extreme_outliers) / len(sample)}%"
|
||||
)
|
||||
|
||||
|
||||
def warmup(
|
||||
nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
|
||||
) -> numpy.ndarray:
|
||||
docs = warmup_epochs * docs
|
||||
return annotate(nlp, docs, batch_size)
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
|
||||
from typing import Callable, Iterable, Mapping, Optional, Any, Union
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from wasabi import Printer
|
||||
|
@ -7,7 +7,7 @@ import re
|
|||
import sys
|
||||
import itertools
|
||||
|
||||
from ._util import app, Arg, Opt
|
||||
from ._util import app, Arg, Opt, walk_directory
|
||||
from ..training import docs_to_json
|
||||
from ..tokens import Doc, DocBin
|
||||
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
|
||||
|
@ -28,6 +28,8 @@ CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = {
|
|||
"json": json_to_docs,
|
||||
}
|
||||
|
||||
AUTO = "auto"
|
||||
|
||||
|
||||
# File types that can be written to stdout
|
||||
FILE_TYPES_STDOUT = ("json",)
|
||||
|
@ -49,7 +51,7 @@ def convert_cli(
|
|||
model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
|
||||
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
|
||||
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
|
||||
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
|
||||
converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
|
||||
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
|
||||
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
|
||||
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
|
||||
|
@ -70,8 +72,8 @@ def convert_cli(
|
|||
output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
|
||||
silent = output_dir == "-"
|
||||
msg = Printer(no_print=silent)
|
||||
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
|
||||
converter = _get_converter(msg, converter, input_path)
|
||||
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
|
||||
convert(
|
||||
input_path,
|
||||
output_dir,
|
||||
|
@ -100,7 +102,7 @@ def convert(
|
|||
model: Optional[str] = None,
|
||||
morphology: bool = False,
|
||||
merge_subtokens: bool = False,
|
||||
converter: str = "auto",
|
||||
converter: str,
|
||||
ner_map: Optional[Path] = None,
|
||||
lang: Optional[str] = None,
|
||||
concatenate: bool = False,
|
||||
|
@ -189,33 +191,6 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
|
|||
return None
|
||||
|
||||
|
||||
def walk_directory(path: Path, converter: str) -> List[Path]:
|
||||
if not path.is_dir():
|
||||
return [path]
|
||||
paths = [path]
|
||||
locs = []
|
||||
seen = set()
|
||||
for path in paths:
|
||||
if str(path) in seen:
|
||||
continue
|
||||
seen.add(str(path))
|
||||
if path.parts[-1].startswith("."):
|
||||
continue
|
||||
elif path.is_dir():
|
||||
paths.extend(path.iterdir())
|
||||
elif converter == "json" and not path.parts[-1].endswith("json"):
|
||||
continue
|
||||
elif converter == "conll" and not path.parts[-1].endswith("conll"):
|
||||
continue
|
||||
elif converter == "iob" and not path.parts[-1].endswith("iob"):
|
||||
continue
|
||||
else:
|
||||
locs.append(path)
|
||||
# It's good to sort these, in case the ordering messes up cache.
|
||||
locs.sort()
|
||||
return locs
|
||||
|
||||
|
||||
def verify_cli_args(
|
||||
msg: Printer,
|
||||
input_path: Path,
|
||||
|
@ -239,18 +214,22 @@ def verify_cli_args(
|
|||
input_locs = walk_directory(input_path, converter)
|
||||
if len(input_locs) == 0:
|
||||
msg.fail("No input files in directory", input_path, exits=1)
|
||||
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
|
||||
if converter == "auto" and len(file_types) >= 2:
|
||||
file_types_str = ",".join(file_types)
|
||||
msg.fail("All input files must be same type", file_types_str, exits=1)
|
||||
if converter != "auto" and converter not in CONVERTERS:
|
||||
if converter not in CONVERTERS:
|
||||
msg.fail(f"Can't find converter for {converter}", exits=1)
|
||||
|
||||
|
||||
def _get_converter(msg, converter, input_path: Path):
|
||||
if input_path.is_dir():
|
||||
input_path = walk_directory(input_path, converter)[0]
|
||||
if converter == "auto":
|
||||
if converter == AUTO:
|
||||
input_locs = walk_directory(input_path, suffix=None)
|
||||
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
|
||||
if len(file_types) >= 2:
|
||||
file_types_str = ",".join(file_types)
|
||||
msg.fail("All input files must be same type", file_types_str, exits=1)
|
||||
input_path = input_locs[0]
|
||||
else:
|
||||
input_path = walk_directory(input_path, suffix=converter)[0]
|
||||
if converter == AUTO:
|
||||
converter = input_path.suffix[1:]
|
||||
if converter == "ner" or converter == "iob":
|
||||
with input_path.open(encoding="utf8") as file_:
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
|
||||
from typing import cast, overload
|
||||
from typing import Literal, cast, overload
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
import sys
|
||||
|
@ -13,13 +13,14 @@ from ._util import import_code, debug_cli, _format_number
|
|||
from ..training import Example, remove_bilu_prefix
|
||||
from ..training.initialize import get_sourced_components
|
||||
from ..schemas import ConfigSchemaTraining
|
||||
from ..pipeline import TrainablePipe
|
||||
from ..pipeline._parser_internals import nonproj
|
||||
from ..pipeline._parser_internals.nonproj import DELIMITER
|
||||
from ..pipeline import Morphologizer, SpanCategorizer
|
||||
from ..pipeline._edit_tree_internals.edit_trees import EditTrees
|
||||
from ..morphology import Morphology
|
||||
from ..language import Language
|
||||
from ..util import registry, resolve_dot_names
|
||||
from ..compat import Literal
|
||||
from ..vectors import Mode as VectorsMode
|
||||
from .. import util
|
||||
|
||||
|
@ -670,6 +671,59 @@ def debug_data(
|
|||
f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
|
||||
)
|
||||
|
||||
if "trainable_lemmatizer" in factory_names:
|
||||
msg.divider("Trainable Lemmatizer")
|
||||
trees_train: Set[str] = gold_train_data["lemmatizer_trees"]
|
||||
trees_dev: Set[str] = gold_dev_data["lemmatizer_trees"]
|
||||
# This is necessary context when someone is attempting to interpret whether the
|
||||
# number of trees exclusively in the dev set is meaningful.
|
||||
msg.info(f"{len(trees_train)} lemmatizer trees generated from training data")
|
||||
msg.info(f"{len(trees_dev)} lemmatizer trees generated from dev data")
|
||||
dev_not_train = trees_dev - trees_train
|
||||
|
||||
if len(dev_not_train) != 0:
|
||||
pct = len(dev_not_train) / len(trees_dev)
|
||||
msg.info(
|
||||
f"{len(dev_not_train)} lemmatizer trees ({pct*100:.1f}% of dev trees)"
|
||||
" were found exclusively in the dev data."
|
||||
)
|
||||
else:
|
||||
# Would we ever expect this case? It seems like it would be pretty rare,
|
||||
# and we might actually want a warning?
|
||||
msg.info("All trees in dev data present in training data.")
|
||||
|
||||
if gold_train_data["n_low_cardinality_lemmas"] > 0:
|
||||
n = gold_train_data["n_low_cardinality_lemmas"]
|
||||
msg.warn(f"{n} training docs with 0 or 1 unique lemmas.")
|
||||
|
||||
if gold_dev_data["n_low_cardinality_lemmas"] > 0:
|
||||
n = gold_dev_data["n_low_cardinality_lemmas"]
|
||||
msg.warn(f"{n} dev docs with 0 or 1 unique lemmas.")
|
||||
|
||||
if gold_train_data["no_lemma_annotations"] > 0:
|
||||
n = gold_train_data["no_lemma_annotations"]
|
||||
msg.warn(f"{n} training docs with no lemma annotations.")
|
||||
else:
|
||||
msg.good("All training docs have lemma annotations.")
|
||||
|
||||
if gold_dev_data["no_lemma_annotations"] > 0:
|
||||
n = gold_dev_data["no_lemma_annotations"]
|
||||
msg.warn(f"{n} dev docs with no lemma annotations.")
|
||||
else:
|
||||
msg.good("All dev docs have lemma annotations.")
|
||||
|
||||
if gold_train_data["partial_lemma_annotations"] > 0:
|
||||
n = gold_train_data["partial_lemma_annotations"]
|
||||
msg.info(f"{n} training docs with partial lemma annotations.")
|
||||
else:
|
||||
msg.good("All training docs have complete lemma annotations.")
|
||||
|
||||
if gold_dev_data["partial_lemma_annotations"] > 0:
|
||||
n = gold_dev_data["partial_lemma_annotations"]
|
||||
msg.info(f"{n} dev docs with partial lemma annotations.")
|
||||
else:
|
||||
msg.good("All dev docs have complete lemma annotations.")
|
||||
|
||||
msg.divider("Summary")
|
||||
good_counts = msg.counts[MESSAGES.GOOD]
|
||||
warn_counts = msg.counts[MESSAGES.WARN]
|
||||
|
@ -731,7 +785,13 @@ def _compile_gold(
|
|||
"n_cats_multilabel": 0,
|
||||
"n_cats_bad_values": 0,
|
||||
"texts": set(),
|
||||
"lemmatizer_trees": set(),
|
||||
"no_lemma_annotations": 0,
|
||||
"partial_lemma_annotations": 0,
|
||||
"n_low_cardinality_lemmas": 0,
|
||||
}
|
||||
if "trainable_lemmatizer" in factory_names:
|
||||
trees = EditTrees(nlp.vocab.strings)
|
||||
for eg in examples:
|
||||
gold = eg.reference
|
||||
doc = eg.predicted
|
||||
|
@ -861,6 +921,25 @@ def _compile_gold(
|
|||
data["n_nonproj"] += 1
|
||||
if nonproj.contains_cycle(aligned_heads):
|
||||
data["n_cycles"] += 1
|
||||
if "trainable_lemmatizer" in factory_names:
|
||||
# from EditTreeLemmatizer._labels_from_data
|
||||
if all(token.lemma == 0 for token in gold):
|
||||
data["no_lemma_annotations"] += 1
|
||||
continue
|
||||
if any(token.lemma == 0 for token in gold):
|
||||
data["partial_lemma_annotations"] += 1
|
||||
lemma_set = set()
|
||||
for token in gold:
|
||||
if token.lemma != 0:
|
||||
lemma_set.add(token.lemma)
|
||||
tree_id = trees.add(token.text, token.lemma_)
|
||||
tree_str = trees.tree_to_str(tree_id)
|
||||
data["lemmatizer_trees"].add(tree_str)
|
||||
# We want to identify cases where lemmas aren't assigned
|
||||
# or are all assigned the same value, as this would indicate
|
||||
# an issue since we're expecting a large set of lemmas
|
||||
if len(lemma_set) < 2 and len(gold) > 1:
|
||||
data["n_low_cardinality_lemmas"] += 1
|
||||
return data
|
||||
|
||||
|
||||
|
@ -934,6 +1013,7 @@ def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]:
|
|||
labels: Set[str] = set()
|
||||
for pipe_name in pipe_names:
|
||||
pipe = nlp.get_pipe(pipe_name)
|
||||
assert isinstance(pipe, TrainablePipe)
|
||||
labels.update(pipe.labels)
|
||||
return labels
|
||||
|
||||
|
|
|
@ -8,7 +8,6 @@ from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
|
|||
from .. import about
|
||||
from ..util import is_package, get_minor_version, run_command
|
||||
from ..util import is_prerelease_version
|
||||
from ..errors import OLD_MODEL_SHORTCUTS
|
||||
|
||||
|
||||
@app.command(
|
||||
|
@ -61,12 +60,6 @@ def download(
|
|||
version = components[-1]
|
||||
else:
|
||||
model_name = model
|
||||
if model in OLD_MODEL_SHORTCUTS:
|
||||
msg.warn(
|
||||
f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please "
|
||||
f"use the full pipeline package name '{OLD_MODEL_SHORTCUTS[model]}' instead."
|
||||
)
|
||||
model_name = OLD_MODEL_SHORTCUTS[model]
|
||||
compatibility = get_compatibility()
|
||||
version = get_version(model_name, compatibility)
|
||||
|
||||
|
|
|
@ -7,12 +7,15 @@ from thinc.api import fix_random_seed
|
|||
|
||||
from ..training import Corpus
|
||||
from ..tokens import Doc
|
||||
from ._util import app, Arg, Opt, setup_gpu, import_code
|
||||
from ._util import app, Arg, Opt, setup_gpu, import_code, benchmark_cli
|
||||
from ..scorer import Scorer
|
||||
from .. import util
|
||||
from .. import displacy
|
||||
|
||||
|
||||
@benchmark_cli.command(
|
||||
"accuracy",
|
||||
)
|
||||
@app.command("evaluate")
|
||||
def evaluate_cli(
|
||||
# fmt: off
|
||||
|
@ -36,7 +39,7 @@ def evaluate_cli(
|
|||
dependency parses in a HTML file, set as output directory as the
|
||||
displacy_path argument.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#evaluate
|
||||
DOCS: https://spacy.io/api/cli#benchmark-accuracy
|
||||
"""
|
||||
import_code(code_path)
|
||||
evaluate(
|
||||
|
|
233
spacy/cli/find_threshold.py
Normal file
233
spacy/cli/find_threshold.py
Normal file
|
@ -0,0 +1,233 @@
|
|||
import functools
|
||||
import operator
|
||||
from pathlib import Path
|
||||
import logging
|
||||
from typing import Optional, Tuple, Any, Dict, List
|
||||
|
||||
import numpy
|
||||
import wasabi.tables
|
||||
|
||||
from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer
|
||||
from ..errors import Errors
|
||||
from ..training import Corpus
|
||||
from ._util import app, Arg, Opt, import_code, setup_gpu
|
||||
from .. import util
|
||||
|
||||
_DEFAULTS = {
|
||||
"n_trials": 11,
|
||||
"use_gpu": -1,
|
||||
"gold_preproc": False,
|
||||
}
|
||||
|
||||
|
||||
@app.command(
|
||||
"find-threshold",
|
||||
context_settings={"allow_extra_args": False, "ignore_unknown_options": True},
|
||||
)
|
||||
def find_threshold_cli(
|
||||
# fmt: off
|
||||
model: str = Arg(..., help="Model name or path"),
|
||||
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
|
||||
pipe_name: str = Arg(..., help="Name of pipe to examine thresholds for"),
|
||||
threshold_key: str = Arg(..., help="Key of threshold attribute in component's configuration"),
|
||||
scores_key: str = Arg(..., help="Metric to optimize"),
|
||||
n_trials: int = Opt(_DEFAULTS["n_trials"], "--n_trials", "-n", help="Number of trials to determine optimal thresholds"),
|
||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||
gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
|
||||
verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Runs prediction trials for a trained model with varying tresholds to maximize
|
||||
the specified metric. The search space for the threshold is traversed linearly
|
||||
from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
|
||||
(the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
|
||||
returns all results).
|
||||
|
||||
This is applicable only for components whose predictions are influenced by
|
||||
thresholds - e.g. `textcat_multilabel` and `spancat`, but not `textcat`. Note
|
||||
that the full path to the corresponding threshold attribute in the config has to
|
||||
be provided.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#find-threshold
|
||||
"""
|
||||
|
||||
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||
import_code(code_path)
|
||||
find_threshold(
|
||||
model=model,
|
||||
data_path=data_path,
|
||||
pipe_name=pipe_name,
|
||||
threshold_key=threshold_key,
|
||||
scores_key=scores_key,
|
||||
n_trials=n_trials,
|
||||
use_gpu=use_gpu,
|
||||
gold_preproc=gold_preproc,
|
||||
silent=False,
|
||||
)
|
||||
|
||||
|
||||
def find_threshold(
|
||||
model: str,
|
||||
data_path: Path,
|
||||
pipe_name: str,
|
||||
threshold_key: str,
|
||||
scores_key: str,
|
||||
*,
|
||||
n_trials: int = _DEFAULTS["n_trials"], # type: ignore
|
||||
use_gpu: int = _DEFAULTS["use_gpu"], # type: ignore
|
||||
gold_preproc: bool = _DEFAULTS["gold_preproc"], # type: ignore
|
||||
silent: bool = True,
|
||||
) -> Tuple[float, float, Dict[float, float]]:
|
||||
"""
|
||||
Runs prediction trials for models with varying tresholds to maximize the specified metric.
|
||||
model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory.
|
||||
data_path (Path): Path to file with DocBin with docs to use for threshold search.
|
||||
pipe_name (str): Name of pipe to examine thresholds for.
|
||||
threshold_key (str): Key of threshold attribute in component's configuration.
|
||||
scores_key (str): Name of score to metric to optimize.
|
||||
n_trials (int): Number of trials to determine optimal thresholds.
|
||||
use_gpu (int): GPU ID or -1 for CPU.
|
||||
gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the
|
||||
tokenization, and may result in sequences of more consistent length. However, it may reduce runtime accuracy due
|
||||
to train/test skew.
|
||||
silent (bool): Whether to print non-error-related output to stdout.
|
||||
RETURNS (Tuple[float, float, Dict[float, float]]): Best found threshold, the corresponding score, scores for all
|
||||
evaluated thresholds.
|
||||
"""
|
||||
|
||||
setup_gpu(use_gpu, silent=silent)
|
||||
data_path = util.ensure_path(data_path)
|
||||
if not data_path.exists():
|
||||
wasabi.msg.fail("Evaluation data not found", data_path, exits=1)
|
||||
nlp = util.load_model(model)
|
||||
|
||||
if pipe_name not in nlp.component_names:
|
||||
raise AttributeError(
|
||||
Errors.E001.format(name=pipe_name, opts=nlp.component_names)
|
||||
)
|
||||
pipe = nlp.get_pipe(pipe_name)
|
||||
if not hasattr(pipe, "scorer"):
|
||||
raise AttributeError(Errors.E1045)
|
||||
|
||||
if type(pipe) == TextCategorizer:
|
||||
wasabi.msg.warn(
|
||||
"The `textcat` component doesn't use a threshold as it's not applicable to the concept of "
|
||||
"exclusive classes. All thresholds will yield the same results."
|
||||
)
|
||||
|
||||
if not silent:
|
||||
wasabi.msg.info(
|
||||
title=f"Optimizing for {scores_key} for component '{pipe_name}' with {n_trials} "
|
||||
f"trials."
|
||||
)
|
||||
|
||||
# Load evaluation corpus.
|
||||
corpus = Corpus(data_path, gold_preproc=gold_preproc)
|
||||
dev_dataset = list(corpus(nlp))
|
||||
config_keys = threshold_key.split(".")
|
||||
|
||||
def set_nested_item(
|
||||
config: Dict[str, Any], keys: List[str], value: float
|
||||
) -> Dict[str, Any]:
|
||||
"""Set item in nested dictionary. Adapted from https://stackoverflow.com/a/54138200.
|
||||
config (Dict[str, Any]): Configuration dictionary.
|
||||
keys (List[Any]): Path to value to set.
|
||||
value (float): Value to set.
|
||||
RETURNS (Dict[str, Any]): Updated dictionary.
|
||||
"""
|
||||
functools.reduce(operator.getitem, keys[:-1], config)[keys[-1]] = value
|
||||
return config
|
||||
|
||||
def filter_config(
|
||||
config: Dict[str, Any], keys: List[str], full_key: str
|
||||
) -> Dict[str, Any]:
|
||||
"""Filters provided config dictionary so that only the specified keys path remains.
|
||||
config (Dict[str, Any]): Configuration dictionary.
|
||||
keys (List[Any]): Path to value to set.
|
||||
full_key (str): Full user-specified key.
|
||||
RETURNS (Dict[str, Any]): Filtered dictionary.
|
||||
"""
|
||||
if keys[0] not in config:
|
||||
wasabi.msg.fail(
|
||||
title=f"Failed to look up `{full_key}` in config: sub-key {[keys[0]]} not found.",
|
||||
text=f"Make sure you specified {[keys[0]]} correctly. The following sub-keys are available instead: "
|
||||
f"{list(config.keys())}",
|
||||
exits=1,
|
||||
)
|
||||
return {
|
||||
keys[0]: filter_config(config[keys[0]], keys[1:], full_key)
|
||||
if len(keys) > 1
|
||||
else config[keys[0]]
|
||||
}
|
||||
|
||||
# Evaluate with varying threshold values.
|
||||
scores: Dict[float, float] = {}
|
||||
config_keys_full = ["components", pipe_name, *config_keys]
|
||||
table_col_widths = (10, 10)
|
||||
thresholds = numpy.linspace(0, 1, n_trials)
|
||||
print(wasabi.tables.row(["Threshold", f"{scores_key}"], widths=table_col_widths))
|
||||
for threshold in thresholds:
|
||||
# Reload pipeline with overrides specifying the new threshold.
|
||||
nlp = util.load_model(
|
||||
model,
|
||||
config=set_nested_item(
|
||||
filter_config(
|
||||
nlp.config, config_keys_full, ".".join(config_keys_full)
|
||||
).copy(),
|
||||
config_keys_full,
|
||||
threshold,
|
||||
),
|
||||
)
|
||||
if hasattr(pipe, "cfg"):
|
||||
setattr(
|
||||
nlp.get_pipe(pipe_name),
|
||||
"cfg",
|
||||
set_nested_item(getattr(pipe, "cfg"), config_keys, threshold),
|
||||
)
|
||||
|
||||
eval_scores = nlp.evaluate(dev_dataset)
|
||||
if scores_key not in eval_scores:
|
||||
wasabi.msg.fail(
|
||||
title=f"Failed to look up score `{scores_key}` in evaluation results.",
|
||||
text=f"Make sure you specified the correct value for `scores_key`. The following scores are "
|
||||
f"available: {list(eval_scores.keys())}",
|
||||
exits=1,
|
||||
)
|
||||
scores[threshold] = eval_scores[scores_key]
|
||||
|
||||
if not isinstance(scores[threshold], (float, int)):
|
||||
wasabi.msg.fail(
|
||||
f"Returned score for key '{scores_key}' is not numeric. Threshold optimization only works for numeric "
|
||||
f"scores.",
|
||||
exits=1,
|
||||
)
|
||||
print(
|
||||
wasabi.row(
|
||||
[round(threshold, 3), round(scores[threshold], 3)],
|
||||
widths=table_col_widths,
|
||||
)
|
||||
)
|
||||
|
||||
best_threshold = max(scores.keys(), key=(lambda key: scores[key]))
|
||||
|
||||
# If all scores are identical, emit warning.
|
||||
if len(set(scores.values())) == 1:
|
||||
wasabi.msg.warn(
|
||||
title="All scores are identical. Verify that all settings are correct.",
|
||||
text=""
|
||||
if (
|
||||
not isinstance(pipe, MultiLabel_TextCategorizer)
|
||||
or scores_key in ("cats_macro_f", "cats_micro_f")
|
||||
)
|
||||
else "Use `cats_macro_f` or `cats_micro_f` when optimizing the threshold for `textcat_multilabel`.",
|
||||
)
|
||||
|
||||
else:
|
||||
if not silent:
|
||||
print(
|
||||
f"\nBest threshold: {round(best_threshold, ndigits=4)} with {scores_key} value of {scores[best_threshold]}."
|
||||
)
|
||||
|
||||
return best_threshold, scores[best_threshold], scores
|
|
@ -189,7 +189,11 @@ def convert_asset_url(url: str) -> str:
|
|||
RETURNS (str): The converted URL.
|
||||
"""
|
||||
# If the asset URL is a regular GitHub URL it's likely a mistake
|
||||
if re.match(r"(http(s?)):\/\/github.com", url) and "releases/download" not in url:
|
||||
if (
|
||||
re.match(r"(http(s?)):\/\/github.com", url)
|
||||
and "releases/download" not in url
|
||||
and "/raw/" not in url
|
||||
):
|
||||
converted = url.replace("github.com", "raw.githubusercontent.com")
|
||||
converted = re.sub(r"/(tree|blob)/", "/", converted)
|
||||
msg.warn(
|
||||
|
|
|
@ -5,14 +5,17 @@ import hashlib
|
|||
import urllib.parse
|
||||
import tarfile
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
|
||||
from .._util import get_hash, get_checksum, download_file, ensure_pathy
|
||||
from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
|
||||
from .._util import get_hash, get_checksum, upload_file, download_file
|
||||
from .._util import ensure_pathy, make_tempdir
|
||||
from ...util import get_minor_version, ENV_VARS, check_bool_env_var
|
||||
from ...git_info import GIT_VERSION
|
||||
from ... import about
|
||||
from ...errors import Errors
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathy import Pathy # noqa: F401
|
||||
from pathy import FluidPath # noqa: F401
|
||||
|
||||
|
||||
class RemoteStorage:
|
||||
|
@ -27,7 +30,7 @@ class RemoteStorage:
|
|||
self.url = ensure_pathy(url)
|
||||
self.compression = compression
|
||||
|
||||
def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
|
||||
def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
|
||||
"""Compress a file or directory within a project and upload it to a remote
|
||||
storage. If an object exists at the full URL, nothing is done.
|
||||
|
||||
|
@ -48,9 +51,7 @@ class RemoteStorage:
|
|||
mode_string = f"w:{self.compression}" if self.compression else "w"
|
||||
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
|
||||
tar_file.add(str(loc), arcname=str(path))
|
||||
with tar_loc.open(mode="rb") as input_file:
|
||||
with url.open(mode="wb") as output_file:
|
||||
output_file.write(input_file.read())
|
||||
upload_file(tar_loc, url)
|
||||
return url
|
||||
|
||||
def pull(
|
||||
|
@ -59,7 +60,7 @@ class RemoteStorage:
|
|||
*,
|
||||
command_hash: Optional[str] = None,
|
||||
content_hash: Optional[str] = None,
|
||||
) -> Optional["Pathy"]:
|
||||
) -> Optional["FluidPath"]:
|
||||
"""Retrieve a file from the remote cache. If the file already exists,
|
||||
nothing is done.
|
||||
|
||||
|
@ -84,7 +85,23 @@ class RemoteStorage:
|
|||
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
|
||||
# This requires that the path is added correctly, relative
|
||||
# to root. This is how we set things up in push()
|
||||
tar_file.extractall(self.root)
|
||||
|
||||
# Disallow paths outside the current directory for the tar
|
||||
# file (CVE-2007-4559, directory traversal vulnerability)
|
||||
def is_within_directory(directory, target):
|
||||
abs_directory = os.path.abspath(directory)
|
||||
abs_target = os.path.abspath(target)
|
||||
prefix = os.path.commonprefix([abs_directory, abs_target])
|
||||
return prefix == abs_directory
|
||||
|
||||
def safe_extract(tar, path):
|
||||
for member in tar.getmembers():
|
||||
member_path = os.path.join(path, member.name)
|
||||
if not is_within_directory(path, member_path):
|
||||
raise ValueError(Errors.E852)
|
||||
tar.extractall(path)
|
||||
|
||||
safe_extract(tar_file, self.root)
|
||||
return url
|
||||
|
||||
def find(
|
||||
|
@ -93,25 +110,37 @@ class RemoteStorage:
|
|||
*,
|
||||
command_hash: Optional[str] = None,
|
||||
content_hash: Optional[str] = None,
|
||||
) -> Optional["Pathy"]:
|
||||
) -> Optional["FluidPath"]:
|
||||
"""Find the best matching version of a file within the storage,
|
||||
or `None` if no match can be found. If both the creation and content hash
|
||||
are specified, only exact matches will be returned. Otherwise, the most
|
||||
recent matching file is preferred.
|
||||
"""
|
||||
name = self.encode_name(str(path))
|
||||
urls = []
|
||||
if command_hash is not None and content_hash is not None:
|
||||
url = self.make_url(path, command_hash, content_hash)
|
||||
url = self.url / name / command_hash / content_hash
|
||||
urls = [url] if url.exists() else []
|
||||
elif command_hash is not None:
|
||||
urls = list((self.url / name / command_hash).iterdir())
|
||||
if (self.url / name / command_hash).exists():
|
||||
urls = list((self.url / name / command_hash).iterdir())
|
||||
else:
|
||||
urls = list((self.url / name).iterdir())
|
||||
if content_hash is not None:
|
||||
urls = [url for url in urls if url.parts[-1] == content_hash]
|
||||
if (self.url / name).exists():
|
||||
for sub_dir in (self.url / name).iterdir():
|
||||
urls.extend(sub_dir.iterdir())
|
||||
if content_hash is not None:
|
||||
urls = [url for url in urls if url.parts[-1] == content_hash]
|
||||
if len(urls) >= 2:
|
||||
try:
|
||||
urls.sort(key=lambda x: x.stat().last_modified) # type: ignore
|
||||
except Exception:
|
||||
msg.warn(
|
||||
"Unable to sort remote files by last modified. The file(s) "
|
||||
"pulled from the cache may not be the most recent."
|
||||
)
|
||||
return urls[-1] if urls else None
|
||||
|
||||
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
|
||||
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
|
||||
"""Construct a URL from a subpath, a creation hash and a content hash."""
|
||||
return self.url / self.encode_name(str(path)) / command_hash / content_hash
|
||||
|
||||
|
|
|
@ -53,6 +53,7 @@ def project_run(
|
|||
force: bool = False,
|
||||
dry: bool = False,
|
||||
capture: bool = False,
|
||||
skip_requirements_check: bool = False,
|
||||
) -> None:
|
||||
"""Run a named script defined in the project.yml. If the script is part
|
||||
of the default pipeline (defined in the "run" section), DVC is used to
|
||||
|
@ -69,6 +70,7 @@ def project_run(
|
|||
sys.exit will be called with the return code. You should use capture=False
|
||||
when you want to turn over execution to the command, and capture=True
|
||||
when you want to run the command more like a function.
|
||||
skip_requirements_check (bool): Whether to skip the requirements check.
|
||||
"""
|
||||
config = load_project_config(project_dir, overrides=overrides)
|
||||
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
||||
|
@ -76,9 +78,10 @@ def project_run(
|
|||
validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
|
||||
|
||||
req_path = project_dir / "requirements.txt"
|
||||
if config.get("check_requirements", True) and os.path.exists(req_path):
|
||||
with req_path.open() as requirements_file:
|
||||
_check_requirements([req.replace("\n", "") for req in requirements_file])
|
||||
if not skip_requirements_check:
|
||||
if config.get("check_requirements", True) and os.path.exists(req_path):
|
||||
with req_path.open() as requirements_file:
|
||||
_check_requirements([req.strip() for req in requirements_file])
|
||||
|
||||
if subcommand in workflows:
|
||||
msg.info(f"Running workflow '{subcommand}'")
|
||||
|
@ -90,6 +93,7 @@ def project_run(
|
|||
force=force,
|
||||
dry=dry,
|
||||
capture=capture,
|
||||
skip_requirements_check=True,
|
||||
)
|
||||
else:
|
||||
cmd = commands[subcommand]
|
||||
|
@ -97,8 +101,8 @@ def project_run(
|
|||
if not (project_dir / dep).exists():
|
||||
err = f"Missing dependency specified by command '{subcommand}': {dep}"
|
||||
err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
|
||||
err_kwargs = {"exits": 1} if not dry else {}
|
||||
msg.fail(err, err_help, **err_kwargs)
|
||||
err_exits = 1 if not dry else None
|
||||
msg.fail(err, err_help, exits=err_exits)
|
||||
check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
|
||||
with working_dir(project_dir) as current_dir:
|
||||
msg.divider(subcommand)
|
||||
|
@ -338,6 +342,12 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
|
|||
failed_pkgs_msgs.append(dnf.report())
|
||||
except pkg_resources.VersionConflict as vc:
|
||||
conflicting_pkgs_msgs.append(vc.report())
|
||||
except Exception:
|
||||
msg.warn(
|
||||
f"Unable to check requirement: {req} "
|
||||
"Checks are currently limited to requirement specifiers "
|
||||
"(PEP 508)"
|
||||
)
|
||||
|
||||
if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs):
|
||||
msg.warn(
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
{# This is a template for training configs used for the quickstart widget in
|
||||
the docs and the init config command. It encodes various best practices and
|
||||
can help generate the best possible configuration, given a user's requirements. #}
|
||||
{%- set use_transformer = hardware != "cpu" -%}
|
||||
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
|
||||
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
||||
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
|
||||
[paths]
|
||||
|
@ -87,12 +87,11 @@ grad_factor = 1.0
|
|||
factory = "parser"
|
||||
|
||||
[components.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v2"
|
||||
@architectures = "spacy.TransitionBasedParser.v3"
|
||||
state_type = "parser"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = false
|
||||
nO = null
|
||||
|
||||
[components.parser.model.tok2vec]
|
||||
|
@ -108,12 +107,11 @@ grad_factor = 1.0
|
|||
factory = "ner"
|
||||
|
||||
[components.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v2"
|
||||
@architectures = "spacy.TransitionBasedParser.v3"
|
||||
state_type = "ner"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
use_upper = false
|
||||
nO = null
|
||||
|
||||
[components.ner.model.tok2vec]
|
||||
|
@ -314,12 +312,11 @@ width = ${components.tok2vec.model.encode.width}
|
|||
factory = "parser"
|
||||
|
||||
[components.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v2"
|
||||
@architectures = "spacy.TransitionBasedParser.v3"
|
||||
state_type = "parser"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = true
|
||||
nO = null
|
||||
|
||||
[components.parser.model.tok2vec]
|
||||
|
@ -332,12 +329,11 @@ width = ${components.tok2vec.model.encode.width}
|
|||
factory = "ner"
|
||||
|
||||
[components.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v2"
|
||||
@architectures = "spacy.TransitionBasedParser.v3"
|
||||
state_type = "ner"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
use_upper = true
|
||||
nO = null
|
||||
|
||||
[components.ner.model.tok2vec]
|
||||
|
|
|
@ -37,6 +37,15 @@ bn:
|
|||
accuracy:
|
||||
name: sagorsarker/bangla-bert-base
|
||||
size_factor: 3
|
||||
ca:
|
||||
word_vectors: null
|
||||
transformer:
|
||||
efficiency:
|
||||
name: projecte-aina/roberta-base-ca-v2
|
||||
size_factor: 3
|
||||
accuracy:
|
||||
name: projecte-aina/roberta-base-ca-v2
|
||||
size_factor: 3
|
||||
da:
|
||||
word_vectors: da_core_news_lg
|
||||
transformer:
|
||||
|
|
|
@ -22,19 +22,6 @@ try:
|
|||
except ImportError:
|
||||
cupy = None
|
||||
|
||||
if sys.version_info[:2] >= (3, 8): # Python 3.8+
|
||||
from typing import Literal, Protocol, runtime_checkable
|
||||
else:
|
||||
from typing_extensions import Literal, Protocol, runtime_checkable # noqa: F401
|
||||
|
||||
# Important note: The importlib_metadata "backport" includes functionality
|
||||
# that's not part of the built-in importlib.metadata. We should treat this
|
||||
# import like the built-in and only use what's available there.
|
||||
try: # Python 3.8+
|
||||
import importlib.metadata as importlib_metadata
|
||||
except ImportError:
|
||||
from catalogue import _importlib_metadata as importlib_metadata # type: ignore[no-redef] # noqa: F401
|
||||
|
||||
from thinc.api import Optimizer # noqa: F401
|
||||
|
||||
pickle = pickle
|
||||
|
|
|
@ -90,6 +90,8 @@ dev_corpus = "corpora.dev"
|
|||
train_corpus = "corpora.train"
|
||||
# Optional callback before nlp object is saved to disk after training
|
||||
before_to_disk = null
|
||||
# Optional callback that is invoked at the start of each training step
|
||||
before_update = null
|
||||
|
||||
[training.logger]
|
||||
@loggers = "spacy.ConsoleLogger.v1"
|
||||
|
|
|
@ -11,6 +11,7 @@ from .render import DependencyRenderer, EntityRenderer, SpanRenderer
|
|||
from ..tokens import Doc, Span
|
||||
from ..errors import Errors, Warnings
|
||||
from ..util import is_in_jupyter
|
||||
from ..util import find_available_port
|
||||
|
||||
|
||||
_html = {}
|
||||
|
@ -36,7 +37,7 @@ def render(
|
|||
jupyter (bool): Override Jupyter auto-detection.
|
||||
options (dict): Visualiser-specific options, e.g. colors.
|
||||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||
RETURNS (str): Rendered HTML markup.
|
||||
RETURNS (str): Rendered SVG or HTML markup.
|
||||
|
||||
DOCS: https://spacy.io/api/top-level#displacy.render
|
||||
USAGE: https://spacy.io/usage/visualizers
|
||||
|
@ -82,6 +83,7 @@ def serve(
|
|||
manual: bool = False,
|
||||
port: int = 5000,
|
||||
host: str = "0.0.0.0",
|
||||
auto_select_port: bool = False,
|
||||
) -> None:
|
||||
"""Serve displaCy visualisation.
|
||||
|
||||
|
@ -93,12 +95,15 @@ def serve(
|
|||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||
port (int): Port to serve visualisation.
|
||||
host (str): Host to serve visualisation.
|
||||
auto_select_port (bool): Automatically select a port if the specified port is in use.
|
||||
|
||||
DOCS: https://spacy.io/api/top-level#displacy.serve
|
||||
USAGE: https://spacy.io/usage/visualizers
|
||||
"""
|
||||
from wsgiref import simple_server
|
||||
|
||||
port = find_available_port(port, host, auto_select_port)
|
||||
|
||||
if is_in_jupyter():
|
||||
warnings.warn(Warnings.W011)
|
||||
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
|
||||
|
@ -228,12 +233,13 @@ def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
|||
"kb_id": span.kb_id_ if span.kb_id_ else "",
|
||||
"kb_url": kb_url_template.format(span.kb_id_) if kb_url_template else "#",
|
||||
}
|
||||
for span in doc.spans[spans_key]
|
||||
for span in doc.spans.get(spans_key, [])
|
||||
]
|
||||
tokens = [token.text for token in doc]
|
||||
|
||||
if not spans:
|
||||
warnings.warn(Warnings.W117.format(spans_key=spans_key))
|
||||
keys = list(doc.spans.keys())
|
||||
warnings.warn(Warnings.W117.format(spans_key=spans_key, keys=keys))
|
||||
title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
|
||||
settings = get_doc_settings(doc)
|
||||
return {
|
||||
|
|
|
@ -94,7 +94,7 @@ class SpanRenderer:
|
|||
parsed (list): Dependency parses to render.
|
||||
page (bool): Render parses wrapped as full HTML page.
|
||||
minify (bool): Minify HTML markup.
|
||||
RETURNS (str): Rendered HTML markup.
|
||||
RETURNS (str): Rendered SVG or HTML markup.
|
||||
"""
|
||||
rendered = []
|
||||
for i, p in enumerate(parsed):
|
||||
|
@ -510,7 +510,7 @@ class EntityRenderer:
|
|||
parsed (list): Dependency parses to render.
|
||||
page (bool): Render parses wrapped as full HTML page.
|
||||
minify (bool): Minify HTML markup.
|
||||
RETURNS (str): Rendered HTML markup.
|
||||
RETURNS (str): Rendered SVG or HTML markup.
|
||||
"""
|
||||
rendered = []
|
||||
for i, p in enumerate(parsed):
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from typing import Literal
|
||||
import warnings
|
||||
from .compat import Literal
|
||||
|
||||
|
||||
class ErrorsWithCodes(type):
|
||||
|
@ -131,13 +131,6 @@ class Warnings(metaclass=ErrorsWithCodes):
|
|||
"and make it independent. For example, `replace_listeners = "
|
||||
"[\"model.tok2vec\"]` See the documentation for details: "
|
||||
"https://spacy.io/usage/training#config-components-listeners")
|
||||
W088 = ("The pipeline component {name} implements a `begin_training` "
|
||||
"method, which won't be called by spaCy. As of v3.0, `begin_training` "
|
||||
"has been renamed to `initialize`, so you likely want to rename the "
|
||||
"component method. See the documentation for details: "
|
||||
"https://spacy.io/api/language#initialize")
|
||||
W089 = ("As of spaCy v3.0, the `nlp.begin_training` method has been renamed "
|
||||
"to `nlp.initialize`.")
|
||||
W090 = ("Could not locate any {format} files in path '{path}'.")
|
||||
W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
|
||||
W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
|
||||
|
@ -199,7 +192,7 @@ class Warnings(metaclass=ErrorsWithCodes):
|
|||
W117 = ("No spans to visualize found in Doc object with spans_key: '{spans_key}'. If this is "
|
||||
"surprising to you, make sure the Doc was processed using a model "
|
||||
"that supports span categorization, and check the `doc.spans[spans_key]` "
|
||||
"property manually if necessary.")
|
||||
"property manually if necessary.\n\nAvailable keys: {keys}")
|
||||
W118 = ("Term '{term}' not found in glossary. It may however be explained in documentation "
|
||||
"for the corpora used to train the language. Please check "
|
||||
"`nlp.meta[\"sources\"]` for any relevant links.")
|
||||
|
@ -212,8 +205,11 @@ class Warnings(metaclass=ErrorsWithCodes):
|
|||
W121 = ("Attempting to trace non-existent method '{method}' in pipe '{pipe}'")
|
||||
W122 = ("Couldn't trace method '{method}' in pipe '{pipe}'. This can happen if the pipe class "
|
||||
"is a Cython extension type.")
|
||||
W123 = ("Argument {arg} with value {arg_value} is used instead of {config_value} as specified in the config. Be "
|
||||
"aware that this might affect other components in your pipeline.")
|
||||
W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
|
||||
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
|
||||
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
|
||||
|
||||
W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
|
||||
|
||||
|
||||
class Errors(metaclass=ErrorsWithCodes):
|
||||
|
@ -250,8 +246,6 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"https://spacy.io/usage/models")
|
||||
E011 = ("Unknown operator: '{op}'. Options: {opts}")
|
||||
E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
|
||||
E016 = ("MultitaskObjective target should be function or one of: dep, "
|
||||
"tag, ent, dep_tag_offset, ent_tag.")
|
||||
E017 = ("Can only add 'str' inputs to StringStore. Got type: {value_type}")
|
||||
E018 = ("Can't retrieve string for hash '{hash_value}'. This usually "
|
||||
"refers to an issue with the `Vocab` or `StringStore`.")
|
||||
|
@ -345,6 +339,11 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"clear the existing vectors and resize the table.")
|
||||
E074 = ("Error interpreting compiled match pattern: patterns are expected "
|
||||
"to end with the attribute {attr}. Got: {bad_attr}.")
|
||||
E079 = ("Error computing states in beam: number of predicted beams "
|
||||
"({pbeams}) does not equal number of gold beams ({gbeams}).")
|
||||
E080 = ("Duplicate state found in beam: {key}.")
|
||||
E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
|
||||
"does not equal number of losses ({losses}).")
|
||||
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
|
||||
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
|
||||
"match.")
|
||||
|
@ -544,6 +543,10 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"during training, make sure to include it in 'annotating components'")
|
||||
|
||||
# New errors added in v3.x
|
||||
E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
|
||||
"but found value of '{val}'.")
|
||||
E852 = ("The tar file pulled from the remote attempted an unsafe path "
|
||||
"traversal.")
|
||||
E853 = ("Unsupported component factory name '{name}'. The character '.' is "
|
||||
"not permitted in factory names.")
|
||||
E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
|
||||
|
@ -723,13 +726,6 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"method in component '{name}'. If you want to use this "
|
||||
"method, make sure it's overwritten on the subclass.")
|
||||
E940 = ("Found NaN values in scores.")
|
||||
E941 = ("Can't find model '{name}'. It looks like you're trying to load a "
|
||||
"model from a shortcut, which is obsolete as of spaCy v3.0. To "
|
||||
"load the model, use its full name instead:\n\n"
|
||||
"nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
|
||||
"models, see the models directory: https://spacy.io/models. If you "
|
||||
"want to create a blank model, use spacy.blank: "
|
||||
"nlp = spacy.blank(\"{name}\")")
|
||||
E942 = ("Executing `after_{name}` callback failed. Expected the function to "
|
||||
"return an initialized nlp object but got: {value}. Maybe "
|
||||
"you forgot to return the modified object in your function?")
|
||||
|
@ -950,20 +946,21 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"sure it's overwritten on the subclass.")
|
||||
E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
|
||||
"knowledge base, use `InMemoryLookupKB`.")
|
||||
E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.")
|
||||
E1048 = ("Got '{unexpected}' as console progress bar type, but expected one of the following: {expected}")
|
||||
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
|
||||
"with `displacy.serve(doc, port=port)`")
|
||||
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
|
||||
"or use `auto_switch_port=True` to pick an available port automatically.")
|
||||
|
||||
# v4 error strings
|
||||
E4000 = ("Expected a Doc as input, but got: '{type}'")
|
||||
E4001 = ("Expected input to be one of the following types: ({expected_types}), "
|
||||
"but got '{received_type}'")
|
||||
|
||||
|
||||
# Deprecated model shortcuts, only used in errors and warnings
|
||||
OLD_MODEL_SHORTCUTS = {
|
||||
"en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm",
|
||||
"pt": "pt_core_news_sm", "fr": "fr_core_news_sm", "it": "it_core_news_sm",
|
||||
"nl": "nl_core_news_sm", "el": "el_core_news_sm", "nb": "nb_core_news_sm",
|
||||
"lt": "lt_core_news_sm", "xx": "xx_ent_wiki_sm"
|
||||
}
|
||||
E4002 = ("Pipe '{name}' requires a teacher pipe for distillation.")
|
||||
E4003 = ("Training examples for distillation must have the exact same tokens in the "
|
||||
"reference and predicted docs.")
|
||||
E4004 = ("Backprop is not supported when is_train is not set.")
|
||||
|
||||
|
||||
# fmt: on
|
||||
|
|
|
@ -25,7 +25,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
"""An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
|
||||
to support entity linking of named entities to real-world concepts.
|
||||
|
||||
DOCS: https://spacy.io/api/kb_in_memory
|
||||
DOCS: https://spacy.io/api/inmemorylookupkb
|
||||
"""
|
||||
|
||||
def __init__(self, Vocab vocab, entity_vector_length):
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaangde aangezien achter achterna
|
||||
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaande aangezien achter achterna
|
||||
afgelopen aldus alhoewel anderzijds
|
||||
|
||||
ben bij bijna bijvoorbeeld behalve beide beiden beneden bent bepaald beter betere betreffende binnen binnenin boven
|
||||
|
|
|
@ -28,34 +28,39 @@ class RussianLemmatizer(Lemmatizer):
|
|||
from pymorphy2 import MorphAnalyzer
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"The Russian lemmatizer mode 'pymorphy2' requires the "
|
||||
"pymorphy2 library. Install it with: pip install pymorphy2"
|
||||
"The lemmatizer mode 'pymorphy2' requires the "
|
||||
"pymorphy2 library and dictionaries. Install them with: "
|
||||
"pip install pymorphy2"
|
||||
"# for Ukrainian dictionaries:"
|
||||
"pip install pymorphy2-dicts-uk"
|
||||
) from None
|
||||
if getattr(self, "_morph", None) is None:
|
||||
self._morph = MorphAnalyzer()
|
||||
elif mode == "pymorphy3":
|
||||
self._morph = MorphAnalyzer(lang="ru")
|
||||
elif mode in {"pymorphy3", "pymorphy3_lookup"}:
|
||||
try:
|
||||
from pymorphy3 import MorphAnalyzer
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"The Russian lemmatizer mode 'pymorphy3' requires the "
|
||||
"pymorphy3 library. Install it with: pip install pymorphy3"
|
||||
"The lemmatizer mode 'pymorphy3' requires the "
|
||||
"pymorphy3 library and dictionaries. Install them with: "
|
||||
"pip install pymorphy3"
|
||||
"# for Ukrainian dictionaries:"
|
||||
"pip install pymorphy3-dicts-uk"
|
||||
) from None
|
||||
if getattr(self, "_morph", None) is None:
|
||||
self._morph = MorphAnalyzer()
|
||||
self._morph = MorphAnalyzer(lang="ru")
|
||||
super().__init__(
|
||||
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
|
||||
def pymorphy2_lemmatize(self, token: Token) -> List[str]:
|
||||
def _pymorphy_lemmatize(self, token: Token) -> List[str]:
|
||||
string = token.text
|
||||
univ_pos = token.pos_
|
||||
morphology = token.morph.to_dict()
|
||||
if univ_pos == "PUNCT":
|
||||
return [PUNCT_RULES.get(string, string)]
|
||||
if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
|
||||
# Skip unchangeable pos
|
||||
return [string.lower()]
|
||||
return self._pymorphy_lookup_lemmatize(token)
|
||||
analyses = self._morph.parse(string)
|
||||
filtered_analyses = []
|
||||
for analysis in analyses:
|
||||
|
@ -63,8 +68,10 @@ class RussianLemmatizer(Lemmatizer):
|
|||
# Skip suggested parse variant for unknown word for pymorphy
|
||||
continue
|
||||
analysis_pos, _ = oc2ud(str(analysis.tag))
|
||||
if analysis_pos == univ_pos or (
|
||||
analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
|
||||
if (
|
||||
analysis_pos == univ_pos
|
||||
or (analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN"))
|
||||
or ((analysis_pos == "PRON") and (univ_pos == "DET"))
|
||||
):
|
||||
filtered_analyses.append(analysis)
|
||||
if not len(filtered_analyses):
|
||||
|
@ -107,15 +114,27 @@ class RussianLemmatizer(Lemmatizer):
|
|||
dict.fromkeys([analysis.normal_form for analysis in filtered_analyses])
|
||||
)
|
||||
|
||||
def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
|
||||
def _pymorphy_lookup_lemmatize(self, token: Token) -> List[str]:
|
||||
string = token.text
|
||||
analyses = self._morph.parse(string)
|
||||
if len(analyses) == 1:
|
||||
return [analyses[0].normal_form]
|
||||
# often multiple forms would derive from the same normal form
|
||||
# thus check _unique_ normal forms
|
||||
normal_forms = set([an.normal_form for an in analyses])
|
||||
if len(normal_forms) == 1:
|
||||
return [next(iter(normal_forms))]
|
||||
return [string]
|
||||
|
||||
def pymorphy2_lemmatize(self, token: Token) -> List[str]:
|
||||
return self._pymorphy_lemmatize(token)
|
||||
|
||||
def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
|
||||
return self._pymorphy_lookup_lemmatize(token)
|
||||
|
||||
def pymorphy3_lemmatize(self, token: Token) -> List[str]:
|
||||
return self.pymorphy2_lemmatize(token)
|
||||
return self._pymorphy_lemmatize(token)
|
||||
|
||||
def pymorphy3_lookup_lemmatize(self, token: Token) -> List[str]:
|
||||
return self._pymorphy_lookup_lemmatize(token)
|
||||
|
||||
|
||||
def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
|
||||
|
|
|
@ -61,6 +61,11 @@ for abbr in [
|
|||
{ORTH: "2к23", NORM: "2023"},
|
||||
{ORTH: "2к24", NORM: "2024"},
|
||||
{ORTH: "2к25", NORM: "2025"},
|
||||
{ORTH: "2к26", NORM: "2026"},
|
||||
{ORTH: "2к27", NORM: "2027"},
|
||||
{ORTH: "2к28", NORM: "2028"},
|
||||
{ORTH: "2к29", NORM: "2029"},
|
||||
{ORTH: "2к30", NORM: "2030"},
|
||||
]:
|
||||
_exc[abbr[ORTH]] = [abbr]
|
||||
|
||||
|
@ -268,8 +273,8 @@ for abbr in [
|
|||
{ORTH: "з-ка", NORM: "заимка"},
|
||||
{ORTH: "п-к", NORM: "починок"},
|
||||
{ORTH: "киш.", NORM: "кишлак"},
|
||||
{ORTH: "п. ст. ", NORM: "поселок станция"},
|
||||
{ORTH: "п. ж/д ст. ", NORM: "поселок при железнодорожной станции"},
|
||||
{ORTH: "п. ст.", NORM: "поселок станция"},
|
||||
{ORTH: "п. ж/д ст.", NORM: "поселок при железнодорожной станции"},
|
||||
{ORTH: "ж/д бл-ст", NORM: "железнодорожный блокпост"},
|
||||
{ORTH: "ж/д б-ка", NORM: "железнодорожная будка"},
|
||||
{ORTH: "ж/д в-ка", NORM: "железнодорожная ветка"},
|
||||
|
@ -280,12 +285,12 @@ for abbr in [
|
|||
{ORTH: "ж/д п.п.", NORM: "железнодорожный путевой пост"},
|
||||
{ORTH: "ж/д о.п.", NORM: "железнодорожный остановочный пункт"},
|
||||
{ORTH: "ж/д рзд.", NORM: "железнодорожный разъезд"},
|
||||
{ORTH: "ж/д ст. ", NORM: "железнодорожная станция"},
|
||||
{ORTH: "ж/д ст.", NORM: "железнодорожная станция"},
|
||||
{ORTH: "м-ко", NORM: "местечко"},
|
||||
{ORTH: "д.", NORM: "деревня"},
|
||||
{ORTH: "с.", NORM: "село"},
|
||||
{ORTH: "сл.", NORM: "слобода"},
|
||||
{ORTH: "ст. ", NORM: "станция"},
|
||||
{ORTH: "ст.", NORM: "станция"},
|
||||
{ORTH: "ст-ца", NORM: "станица"},
|
||||
{ORTH: "у.", NORM: "улус"},
|
||||
{ORTH: "х.", NORM: "хутор"},
|
||||
|
@ -388,8 +393,9 @@ for abbr in [
|
|||
{ORTH: "прим.", NORM: "примечание"},
|
||||
{ORTH: "прим.ред.", NORM: "примечание редакции"},
|
||||
{ORTH: "см. также", NORM: "смотри также"},
|
||||
{ORTH: "кв.м.", NORM: "квадрантный метр"},
|
||||
{ORTH: "м2", NORM: "квадрантный метр"},
|
||||
{ORTH: "см.", NORM: "смотри"},
|
||||
{ORTH: "кв.м.", NORM: "квадратный метр"},
|
||||
{ORTH: "м2", NORM: "квадратный метр"},
|
||||
{ORTH: "б/у", NORM: "бывший в употреблении"},
|
||||
{ORTH: "сокр.", NORM: "сокращение"},
|
||||
{ORTH: "чел.", NORM: "человек"},
|
||||
|
|
|
@ -29,7 +29,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
|
|||
) from None
|
||||
if getattr(self, "_morph", None) is None:
|
||||
self._morph = MorphAnalyzer(lang="uk")
|
||||
elif mode == "pymorphy3":
|
||||
elif mode in {"pymorphy3", "pymorphy3_lookup"}:
|
||||
try:
|
||||
from pymorphy3 import MorphAnalyzer
|
||||
except ImportError:
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Iterator, Optional, Any, Dict, Callable, Iterable
|
||||
from typing import Iterator, Optional, Any, Dict, Callable, Iterable, Literal
|
||||
from typing import Union, Tuple, List, Set, Pattern, Sequence
|
||||
from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload
|
||||
|
||||
|
@ -22,7 +22,7 @@ from . import ty
|
|||
from .tokens.underscore import Underscore
|
||||
from .vocab import Vocab, create_vocab
|
||||
from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
|
||||
from .training import Example, validate_examples
|
||||
from .training import Example, validate_examples, validate_distillation_examples
|
||||
from .training.initialize import init_vocab, init_tok2vec
|
||||
from .scorer import Scorer
|
||||
from .util import registry, SimpleFrozenList, _pipe, raise_error, _DEFAULT_EMPTY_PIPES
|
||||
|
@ -40,11 +40,9 @@ from .git_info import GIT_VERSION
|
|||
from . import util
|
||||
from . import about
|
||||
from .lookups import load_lookups
|
||||
from .compat import Literal
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .pipeline import Pipe # noqa: F401
|
||||
PipeCallable = Callable[[Doc], Doc]
|
||||
|
||||
|
||||
# This is the base config will all settings (training etc.)
|
||||
|
@ -181,7 +179,7 @@ class Language:
|
|||
self.vocab: Vocab = vocab
|
||||
if self.lang is None:
|
||||
self.lang = self.vocab.lang
|
||||
self._components: List[Tuple[str, "Pipe"]] = []
|
||||
self._components: List[Tuple[str, PipeCallable]] = []
|
||||
self._disabled: Set[str] = set()
|
||||
self.max_length = max_length
|
||||
# Create the default tokenizer from the default config
|
||||
|
@ -303,7 +301,7 @@ class Language:
|
|||
return SimpleFrozenList(names)
|
||||
|
||||
@property
|
||||
def components(self) -> List[Tuple[str, "Pipe"]]:
|
||||
def components(self) -> List[Tuple[str, PipeCallable]]:
|
||||
"""Get all (name, component) tuples in the pipeline, including the
|
||||
currently disabled components.
|
||||
"""
|
||||
|
@ -322,12 +320,12 @@ class Language:
|
|||
return SimpleFrozenList(names, error=Errors.E926.format(attr="component_names"))
|
||||
|
||||
@property
|
||||
def pipeline(self) -> List[Tuple[str, "Pipe"]]:
|
||||
def pipeline(self) -> List[Tuple[str, PipeCallable]]:
|
||||
"""The processing pipeline consisting of (name, component) tuples. The
|
||||
components are called on the Doc in order as it passes through the
|
||||
pipeline.
|
||||
|
||||
RETURNS (List[Tuple[str, Pipe]]): The pipeline.
|
||||
RETURNS (List[Tuple[str, Callable[[Doc], Doc]]]): The pipeline.
|
||||
"""
|
||||
pipes = [(n, p) for n, p in self._components if n not in self._disabled]
|
||||
return SimpleFrozenList(pipes, error=Errors.E926.format(attr="pipeline"))
|
||||
|
@ -527,7 +525,7 @@ class Language:
|
|||
assigns: Iterable[str] = SimpleFrozenList(),
|
||||
requires: Iterable[str] = SimpleFrozenList(),
|
||||
retokenizes: bool = False,
|
||||
func: Optional["Pipe"] = None,
|
||||
func: Optional[PipeCallable] = None,
|
||||
) -> Callable[..., Any]:
|
||||
"""Register a new pipeline component. Can be used for stateless function
|
||||
components that don't require a separate factory. Can be used as a
|
||||
|
@ -542,7 +540,7 @@ class Language:
|
|||
e.g. "token.ent_id". Used for pipeline analysis.
|
||||
retokenizes (bool): Whether the component changes the tokenization.
|
||||
Used for pipeline analysis.
|
||||
func (Optional[Callable]): Factory function if not used as a decorator.
|
||||
func (Optional[Callable[[Doc], Doc]): Factory function if not used as a decorator.
|
||||
|
||||
DOCS: https://spacy.io/api/language#component
|
||||
"""
|
||||
|
@ -553,11 +551,11 @@ class Language:
|
|||
raise ValueError(Errors.E853.format(name=name))
|
||||
component_name = name if name is not None else util.get_object_name(func)
|
||||
|
||||
def add_component(component_func: "Pipe") -> Callable:
|
||||
def add_component(component_func: PipeCallable) -> Callable:
|
||||
if isinstance(func, type): # function is a class
|
||||
raise ValueError(Errors.E965.format(name=component_name))
|
||||
|
||||
def factory_func(nlp, name: str) -> "Pipe":
|
||||
def factory_func(nlp, name: str) -> PipeCallable:
|
||||
return component_func
|
||||
|
||||
internal_name = cls.get_factory_name(name)
|
||||
|
@ -607,7 +605,7 @@ class Language:
|
|||
print_pipe_analysis(analysis, keys=keys)
|
||||
return analysis
|
||||
|
||||
def get_pipe(self, name: str) -> "Pipe":
|
||||
def get_pipe(self, name: str) -> PipeCallable:
|
||||
"""Get a pipeline component for a given component name.
|
||||
|
||||
name (str): Name of pipeline component to get.
|
||||
|
@ -628,7 +626,7 @@ class Language:
|
|||
config: Dict[str, Any] = SimpleFrozenDict(),
|
||||
raw_config: Optional[Config] = None,
|
||||
validate: bool = True,
|
||||
) -> "Pipe":
|
||||
) -> PipeCallable:
|
||||
"""Create a pipeline component. Mostly used internally. To create and
|
||||
add a component to the pipeline, you can use nlp.add_pipe.
|
||||
|
||||
|
@ -640,7 +638,7 @@ class Language:
|
|||
raw_config (Optional[Config]): Internals: the non-interpolated config.
|
||||
validate (bool): Whether to validate the component config against the
|
||||
arguments and types expected by the factory.
|
||||
RETURNS (Pipe): The pipeline component.
|
||||
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
||||
|
||||
DOCS: https://spacy.io/api/language#create_pipe
|
||||
"""
|
||||
|
@ -695,24 +693,18 @@ class Language:
|
|||
|
||||
def create_pipe_from_source(
|
||||
self, source_name: str, source: "Language", *, name: str
|
||||
) -> Tuple["Pipe", str]:
|
||||
) -> Tuple[PipeCallable, str]:
|
||||
"""Create a pipeline component by copying it from an existing model.
|
||||
|
||||
source_name (str): Name of the component in the source pipeline.
|
||||
source (Language): The source nlp object to copy from.
|
||||
name (str): Optional alternative name to use in current pipeline.
|
||||
RETURNS (Tuple[Callable, str]): The component and its factory name.
|
||||
RETURNS (Tuple[Callable[[Doc], Doc], str]): The component and its factory name.
|
||||
"""
|
||||
# Check source type
|
||||
if not isinstance(source, Language):
|
||||
raise ValueError(Errors.E945.format(name=source_name, source=type(source)))
|
||||
# Check vectors, with faster checks first
|
||||
if (
|
||||
self.vocab.vectors.shape != source.vocab.vectors.shape
|
||||
or self.vocab.vectors.key2row != source.vocab.vectors.key2row
|
||||
or self.vocab.vectors.to_bytes(exclude=["strings"])
|
||||
!= source.vocab.vectors.to_bytes(exclude=["strings"])
|
||||
):
|
||||
if self.vocab.vectors != source.vocab.vectors:
|
||||
warnings.warn(Warnings.W113.format(name=source_name))
|
||||
if source_name not in source.component_names:
|
||||
raise KeyError(
|
||||
|
@ -746,7 +738,7 @@ class Language:
|
|||
config: Dict[str, Any] = SimpleFrozenDict(),
|
||||
raw_config: Optional[Config] = None,
|
||||
validate: bool = True,
|
||||
) -> "Pipe":
|
||||
) -> PipeCallable:
|
||||
"""Add a component to the processing pipeline. Valid components are
|
||||
callables that take a `Doc` object, modify it and return it. Only one
|
||||
of before/after/first/last can be set. Default behaviour is "last".
|
||||
|
@ -769,7 +761,7 @@ class Language:
|
|||
raw_config (Optional[Config]): Internals: the non-interpolated config.
|
||||
validate (bool): Whether to validate the component config against the
|
||||
arguments and types expected by the factory.
|
||||
RETURNS (Pipe): The pipeline component.
|
||||
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
||||
|
||||
DOCS: https://spacy.io/api/language#add_pipe
|
||||
"""
|
||||
|
@ -790,14 +782,6 @@ class Language:
|
|||
factory_name, source, name=name
|
||||
)
|
||||
else:
|
||||
if not self.has_factory(factory_name):
|
||||
err = Errors.E002.format(
|
||||
name=factory_name,
|
||||
opts=", ".join(self.factory_names),
|
||||
method="add_pipe",
|
||||
lang=util.get_object_name(self),
|
||||
lang_code=self.lang,
|
||||
)
|
||||
pipe_component = self.create_pipe(
|
||||
factory_name,
|
||||
name=name,
|
||||
|
@ -883,7 +867,7 @@ class Language:
|
|||
*,
|
||||
config: Dict[str, Any] = SimpleFrozenDict(),
|
||||
validate: bool = True,
|
||||
) -> "Pipe":
|
||||
) -> PipeCallable:
|
||||
"""Replace a component in the pipeline.
|
||||
|
||||
name (str): Name of the component to replace.
|
||||
|
@ -892,7 +876,7 @@ class Language:
|
|||
component. Will be merged with default config, if available.
|
||||
validate (bool): Whether to validate the component config against the
|
||||
arguments and types expected by the factory.
|
||||
RETURNS (Pipe): The new pipeline component.
|
||||
RETURNS (Callable[[Doc], Doc]): The new pipeline component.
|
||||
|
||||
DOCS: https://spacy.io/api/language#replace_pipe
|
||||
"""
|
||||
|
@ -944,11 +928,11 @@ class Language:
|
|||
init_cfg = self._config["initialize"]["components"].pop(old_name)
|
||||
self._config["initialize"]["components"][new_name] = init_cfg
|
||||
|
||||
def remove_pipe(self, name: str) -> Tuple[str, "Pipe"]:
|
||||
def remove_pipe(self, name: str) -> Tuple[str, PipeCallable]:
|
||||
"""Remove a component from the pipeline.
|
||||
|
||||
name (str): Name of the component to remove.
|
||||
RETURNS (tuple): A `(name, component)` tuple of the removed component.
|
||||
RETURNS (Tuple[str, Callable[[Doc], Doc]]): A `(name, component)` tuple of the removed component.
|
||||
|
||||
DOCS: https://spacy.io/api/language#remove_pipe
|
||||
"""
|
||||
|
@ -1033,6 +1017,102 @@ class Language:
|
|||
raise ValueError(Errors.E005.format(name=name, returned_type=type(doc)))
|
||||
return doc
|
||||
|
||||
def distill(
|
||||
self,
|
||||
teacher: "Language",
|
||||
examples: Iterable[Example],
|
||||
*,
|
||||
drop: float = 0.0,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
losses: Optional[Dict[str, float]] = None,
|
||||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
annotates: Iterable[str] = SimpleFrozenList(),
|
||||
student_to_teacher: Optional[Dict[str, str]] = None,
|
||||
):
|
||||
"""Distill the models in a student pipeline from a teacher pipeline.
|
||||
teacher (Language): Teacher to distill from.
|
||||
examples (Iterable[Example]): Distillation examples. The reference
|
||||
(teacher) and predicted (student) docs must have the same number of
|
||||
tokens and the same orthography.
|
||||
drop (float): The dropout rate.
|
||||
sgd (Optional[Optimizer]): An optimizer.
|
||||
losses (Optional(Dict[str, float])): Dictionary to update with the loss,
|
||||
keyed by component.
|
||||
component_cfg (Optional[Dict[str, Dict[str, Any]]]): Config parameters
|
||||
for specific pipeline components, keyed by component name.
|
||||
exclude (Iterable[str]): Names of components that shouldn't be updated.
|
||||
annotates (Iterable[str]): Names of components that should set
|
||||
annotations on the predicted examples after updating.
|
||||
student_to_teacher (Optional[Dict[str, str]]): Map student pipe name to
|
||||
teacher pipe name, only needed for pipes where the student pipe
|
||||
name does not match the teacher pipe name.
|
||||
RETURNS (Dict[str, float]): The updated losses dictionary
|
||||
|
||||
DOCS: https://spacy.io/api/language#distill
|
||||
"""
|
||||
if student_to_teacher is None:
|
||||
student_to_teacher = {}
|
||||
if losses is None:
|
||||
losses = {}
|
||||
if isinstance(examples, list) and len(examples) == 0:
|
||||
return losses
|
||||
|
||||
validate_distillation_examples(examples, "Language.distill")
|
||||
examples = _copy_examples(examples)
|
||||
|
||||
if sgd is None:
|
||||
if self._optimizer is None:
|
||||
self._optimizer = self.create_optimizer()
|
||||
sgd = self._optimizer
|
||||
|
||||
if component_cfg is None:
|
||||
component_cfg = {}
|
||||
pipe_kwargs = {}
|
||||
for student_name, student_proc in self.pipeline:
|
||||
component_cfg.setdefault(student_name, {})
|
||||
pipe_kwargs[student_name] = deepcopy(component_cfg[student_name])
|
||||
component_cfg[student_name].setdefault("drop", drop)
|
||||
pipe_kwargs[student_name].setdefault("batch_size", self.batch_size)
|
||||
|
||||
teacher_pipes = dict(teacher.pipeline)
|
||||
for student_name, student_proc in self.pipeline:
|
||||
if student_name in annotates:
|
||||
for doc, eg in zip(
|
||||
_pipe(
|
||||
(eg.predicted for eg in examples),
|
||||
proc=student_proc,
|
||||
name=student_name,
|
||||
default_error_handler=self.default_error_handler,
|
||||
kwargs=pipe_kwargs[student_name],
|
||||
),
|
||||
examples,
|
||||
):
|
||||
eg.predicted = doc
|
||||
|
||||
if (
|
||||
student_name not in exclude
|
||||
and isinstance(student_proc, ty.DistillableComponent)
|
||||
and student_proc.is_distillable
|
||||
):
|
||||
# A missing teacher pipe is not an error, some student pipes
|
||||
# do not need a teacher, such as tok2vec layer losses.
|
||||
teacher_name = (
|
||||
student_to_teacher[student_name]
|
||||
if student_name in student_to_teacher
|
||||
else student_name
|
||||
)
|
||||
teacher_pipe = teacher_pipes.get(teacher_name, None)
|
||||
student_proc.distill(
|
||||
teacher_pipe,
|
||||
examples,
|
||||
sgd=sgd,
|
||||
losses=losses,
|
||||
**component_cfg[student_name],
|
||||
)
|
||||
|
||||
return losses
|
||||
|
||||
def disable_pipes(self, *names) -> "DisabledPipes":
|
||||
"""Disable one or more pipeline components. If used as a context
|
||||
manager, the pipeline will be restored to the initial state at the end
|
||||
|
@ -1254,25 +1334,20 @@ class Language:
|
|||
sgd(key, W, dW) # type: ignore[call-arg, misc]
|
||||
return losses
|
||||
|
||||
def begin_training(
|
||||
self,
|
||||
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
|
||||
*,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
) -> Optimizer:
|
||||
warnings.warn(Warnings.W089, DeprecationWarning)
|
||||
return self.initialize(get_examples, sgd=sgd)
|
||||
|
||||
def initialize(
|
||||
self,
|
||||
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
|
||||
*,
|
||||
labels: Optional[Dict[str, Any]] = None,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
) -> Optimizer:
|
||||
"""Initialize the pipe for training, using data examples if available.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
||||
returns gold-standard Example objects.
|
||||
labels (Optional[Dict[str, Any]]): Labels to pass to pipe initialization,
|
||||
using the names of the pipes as keys. Overrides labels that are in
|
||||
the model configuration.
|
||||
sgd (Optional[Optimizer]): An optimizer to use for updates. If not
|
||||
provided, will be created using the .create_optimizer() method.
|
||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||||
|
@ -1317,6 +1392,8 @@ class Language:
|
|||
for name, proc in self.pipeline:
|
||||
if isinstance(proc, ty.InitializableComponent):
|
||||
p_settings = I["components"].get(name, {})
|
||||
if labels is not None and name in labels:
|
||||
p_settings["labels"] = labels[name]
|
||||
p_settings = validate_init_settings(
|
||||
proc.initialize, p_settings, section="components", name=name
|
||||
)
|
||||
|
@ -1363,15 +1440,15 @@ class Language:
|
|||
|
||||
def set_error_handler(
|
||||
self,
|
||||
error_handler: Callable[[str, "Pipe", List[Doc], Exception], NoReturn],
|
||||
error_handler: Callable[[str, PipeCallable, List[Doc], Exception], NoReturn],
|
||||
):
|
||||
"""Set an error handler object for all the components in the pipeline that implement
|
||||
a set_error_handler function.
|
||||
"""Set an error handler object for all the components in the pipeline
|
||||
that implement a set_error_handler function.
|
||||
|
||||
error_handler (Callable[[str, Pipe, List[Doc], Exception], NoReturn]):
|
||||
Function that deals with a failing batch of documents. This callable function should take in
|
||||
the component's name, the component itself, the offending batch of documents, and the exception
|
||||
that was thrown.
|
||||
error_handler (Callable[[str, Callable[[Doc], Doc], List[Doc], Exception], NoReturn]):
|
||||
Function that deals with a failing batch of documents. This callable
|
||||
function should take in the component's name, the component itself,
|
||||
the offending batch of documents, and the exception that was thrown.
|
||||
DOCS: https://spacy.io/api/language#set_error_handler
|
||||
"""
|
||||
self.default_error_handler = error_handler
|
||||
|
@ -1750,6 +1827,7 @@ class Language:
|
|||
# using the nlp.config with all defaults.
|
||||
config = util.copy_config(config)
|
||||
orig_pipeline = config.pop("components", {})
|
||||
orig_distill = config.pop("distill", None)
|
||||
orig_pretraining = config.pop("pretraining", None)
|
||||
config["components"] = {}
|
||||
if auto_fill:
|
||||
|
@ -1758,6 +1836,9 @@ class Language:
|
|||
filled = config
|
||||
filled["components"] = orig_pipeline
|
||||
config["components"] = orig_pipeline
|
||||
if orig_distill is not None:
|
||||
filled["distill"] = orig_distill
|
||||
config["distill"] = orig_distill
|
||||
if orig_pretraining is not None:
|
||||
filled["pretraining"] = orig_pretraining
|
||||
config["pretraining"] = orig_pretraining
|
||||
|
@ -1879,31 +1960,22 @@ class Language:
|
|||
if isinstance(exclude, str):
|
||||
exclude = [exclude]
|
||||
|
||||
def fetch_pipes_status(value: Iterable[str], key: str) -> Iterable[str]:
|
||||
"""Fetch value for `enable` or `disable` w.r.t. the specified config and passed arguments passed to
|
||||
.load(). If both arguments and config specified values for this field, the passed arguments take precedence
|
||||
and a warning is printed.
|
||||
value (Iterable[str]): Passed value for `enable` or `disable`.
|
||||
key (str): Key for field in config (either "enabled" or "disabled").
|
||||
RETURN (Iterable[str]):
|
||||
"""
|
||||
# We assume that no argument was passed if the value is the specified default value.
|
||||
if id(value) == id(_DEFAULT_EMPTY_PIPES):
|
||||
return config["nlp"].get(key, [])
|
||||
else:
|
||||
if len(config["nlp"].get(key, [])):
|
||||
warnings.warn(
|
||||
Warnings.W123.format(
|
||||
arg=key[:-1],
|
||||
arg_value=value,
|
||||
config_value=config["nlp"][key],
|
||||
)
|
||||
# `enable` should not be merged with `enabled` (the opposite is true for `disable`/`disabled`). If the config
|
||||
# specifies values for `enabled` not included in `enable`, emit warning.
|
||||
if id(enable) != id(_DEFAULT_EMPTY_PIPES):
|
||||
enabled = config["nlp"].get("enabled", [])
|
||||
if len(enabled) and not set(enabled).issubset(enable):
|
||||
warnings.warn(
|
||||
Warnings.W123.format(
|
||||
enable=enable,
|
||||
enabled=enabled,
|
||||
)
|
||||
return value
|
||||
)
|
||||
|
||||
# Ensure sets of disabled/enabled pipe names are not contradictory.
|
||||
disabled_pipes = cls._resolve_component_status(
|
||||
fetch_pipes_status(disable, "disabled"),
|
||||
fetch_pipes_status(enable, "enabled"),
|
||||
list({*disable, *config["nlp"].get("disabled", [])}),
|
||||
enable,
|
||||
config["nlp"]["pipeline"],
|
||||
)
|
||||
nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
|
||||
|
@ -2084,10 +2156,12 @@ class Language:
|
|||
if enable:
|
||||
if isinstance(enable, str):
|
||||
enable = [enable]
|
||||
to_disable = [
|
||||
pipe_name for pipe_name in pipe_names if pipe_name not in enable
|
||||
]
|
||||
if disable and disable != to_disable:
|
||||
to_disable = {
|
||||
*[pipe_name for pipe_name in pipe_names if pipe_name not in enable],
|
||||
*disable,
|
||||
}
|
||||
# If any pipe to be enabled is in to_disable, the specification is inconsistent.
|
||||
if len(set(enable) & to_disable):
|
||||
raise ValueError(Errors.E1042.format(enable=enable, disable=disable))
|
||||
|
||||
return tuple(to_disable)
|
||||
|
|
|
@ -5,7 +5,6 @@ from .attrs cimport attr_id_t
|
|||
from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG
|
||||
|
||||
from .structs cimport LexemeC
|
||||
from .strings cimport StringStore
|
||||
from .vocab cimport Vocab
|
||||
|
||||
|
||||
|
|
|
@ -41,7 +41,7 @@ cdef class Lexeme:
|
|||
"""
|
||||
self.vocab = vocab
|
||||
self.orth = orth
|
||||
self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
|
||||
self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
|
||||
if self.c.orth != orth:
|
||||
raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))
|
||||
|
||||
|
|
|
@ -4,6 +4,8 @@ from libc.stdint cimport int64_t
|
|||
|
||||
from typing import Optional
|
||||
|
||||
from ..util import registry
|
||||
|
||||
|
||||
cdef extern from "polyleven.c":
|
||||
int64_t polyleven(PyObject *o1, PyObject *o2, int64_t k)
|
||||
|
@ -13,3 +15,18 @@ cpdef int64_t levenshtein(a: str, b: str, k: Optional[int] = None):
|
|||
if k is None:
|
||||
k = -1
|
||||
return polyleven(<PyObject*>a, <PyObject*>b, k)
|
||||
|
||||
|
||||
cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
|
||||
if fuzzy >= 0:
|
||||
max_edits = fuzzy
|
||||
else:
|
||||
# allow at least two edits (to allow at least one transposition) and up
|
||||
# to 30% of the pattern string length
|
||||
max_edits = max(2, round(0.3 * len(pattern_text)))
|
||||
return levenshtein(input_text, pattern_text, max_edits) <= max_edits
|
||||
|
||||
|
||||
@registry.misc("spacy.levenshtein_compare.v1")
|
||||
def make_levenshtein_compare():
|
||||
return levenshtein_compare
|
||||
|
|
|
@ -77,3 +77,4 @@ cdef class Matcher:
|
|||
cdef public object _extensions
|
||||
cdef public object _extra_predicates
|
||||
cdef public object _seen_attrs
|
||||
cdef public object _fuzzy_compare
|
||||
|
|
|
@ -1,11 +1,15 @@
|
|||
from typing import Any, List, Dict, Tuple, Optional, Callable, Union
|
||||
from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Literal
|
||||
from typing import Iterator, Iterable, overload
|
||||
from ..compat import Literal
|
||||
from ..vocab import Vocab
|
||||
from ..tokens import Doc, Span
|
||||
|
||||
class Matcher:
|
||||
def __init__(self, vocab: Vocab, validate: bool = ...) -> None: ...
|
||||
def __init__(
|
||||
self,
|
||||
vocab: Vocab,
|
||||
validate: bool = ...,
|
||||
fuzzy_compare: Callable[[str, str, int], bool] = ...,
|
||||
) -> None: ...
|
||||
def __reduce__(self) -> Any: ...
|
||||
def __len__(self) -> int: ...
|
||||
def __contains__(self, key: str) -> bool: ...
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# cython: infer_types=True, cython: profile=True
|
||||
# cython: binding=True, infer_types=True, profile=True
|
||||
from typing import List, Iterable
|
||||
|
||||
from libcpp.vector cimport vector
|
||||
|
@ -20,10 +20,12 @@ from ..tokens.token cimport Token
|
|||
from ..tokens.morphanalysis cimport MorphAnalysis
|
||||
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
|
||||
|
||||
from .levenshtein import levenshtein_compare
|
||||
from ..schemas import validate_token_pattern
|
||||
from ..errors import Errors, MatchPatternError, Warnings
|
||||
from ..strings cimport get_string_id
|
||||
from ..attrs import IDS
|
||||
from ..util import registry
|
||||
|
||||
|
||||
DEF PADDING = 5
|
||||
|
@ -36,11 +38,13 @@ cdef class Matcher:
|
|||
USAGE: https://spacy.io/usage/rule-based-matching
|
||||
"""
|
||||
|
||||
def __init__(self, vocab, validate=True):
|
||||
def __init__(self, vocab, validate=True, *, fuzzy_compare=levenshtein_compare):
|
||||
"""Create the Matcher.
|
||||
|
||||
vocab (Vocab): The vocabulary object, which must be shared with the
|
||||
documents the matcher will operate on.
|
||||
validate (bool): Validate all patterns added to this matcher.
|
||||
fuzzy_compare (Callable[[str, str, int], bool]): The comparison method
|
||||
for the FUZZY operators.
|
||||
"""
|
||||
self._extra_predicates = []
|
||||
self._patterns = {}
|
||||
|
@ -51,9 +55,10 @@ cdef class Matcher:
|
|||
self.vocab = vocab
|
||||
self.mem = Pool()
|
||||
self.validate = validate
|
||||
self._fuzzy_compare = fuzzy_compare
|
||||
|
||||
def __reduce__(self):
|
||||
data = (self.vocab, self._patterns, self._callbacks)
|
||||
data = (self.vocab, self._patterns, self._callbacks, self.validate, self._fuzzy_compare)
|
||||
return (unpickle_matcher, data, None, None)
|
||||
|
||||
def __len__(self):
|
||||
|
@ -128,7 +133,7 @@ cdef class Matcher:
|
|||
for pattern in patterns:
|
||||
try:
|
||||
specs = _preprocess_pattern(pattern, self.vocab,
|
||||
self._extensions, self._extra_predicates)
|
||||
self._extensions, self._extra_predicates, self._fuzzy_compare)
|
||||
self.patterns.push_back(init_pattern(self.mem, key, specs))
|
||||
for spec in specs:
|
||||
for attr, _ in spec[1]:
|
||||
|
@ -327,8 +332,8 @@ cdef class Matcher:
|
|||
return key
|
||||
|
||||
|
||||
def unpickle_matcher(vocab, patterns, callbacks):
|
||||
matcher = Matcher(vocab)
|
||||
def unpickle_matcher(vocab, patterns, callbacks, validate, fuzzy_compare):
|
||||
matcher = Matcher(vocab, validate=validate, fuzzy_compare=fuzzy_compare)
|
||||
for key, pattern in patterns.items():
|
||||
callback = callbacks.get(key, None)
|
||||
matcher.add(key, pattern, on_match=callback)
|
||||
|
@ -755,7 +760,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
|
|||
return id_attr.value
|
||||
|
||||
|
||||
def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
|
||||
def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, fuzzy_compare):
|
||||
"""This function interprets the pattern, converting the various bits of
|
||||
syntactic sugar before we compile it into a struct with init_pattern.
|
||||
|
||||
|
@ -782,7 +787,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
|
|||
ops = _get_operators(spec)
|
||||
attr_values = _get_attr_values(spec, string_store)
|
||||
extensions = _get_extensions(spec, string_store, extensions_table)
|
||||
predicates = _get_extra_predicates(spec, extra_predicates, vocab)
|
||||
predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare)
|
||||
for op in ops:
|
||||
tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx))
|
||||
return tokens
|
||||
|
@ -827,16 +832,45 @@ def _get_attr_values(spec, string_store):
|
|||
# These predicate helper classes are used to match the REGEX, IN, >= etc
|
||||
# extensions to the matcher introduced in #3173.
|
||||
|
||||
class _FuzzyPredicate:
|
||||
operators = ("FUZZY", "FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5",
|
||||
"FUZZY6", "FUZZY7", "FUZZY8", "FUZZY9")
|
||||
|
||||
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
|
||||
regex=False, fuzzy=None, fuzzy_compare=None):
|
||||
self.i = i
|
||||
self.attr = attr
|
||||
self.value = value
|
||||
self.predicate = predicate
|
||||
self.is_extension = is_extension
|
||||
if self.predicate not in self.operators:
|
||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||
fuzz = self.predicate[len("FUZZY"):] # number after prefix
|
||||
self.fuzzy = int(fuzz) if fuzz else -1
|
||||
self.fuzzy_compare = fuzzy_compare
|
||||
self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
|
||||
def __call__(self, Token token):
|
||||
if self.is_extension:
|
||||
value = token._.get(self.attr)
|
||||
else:
|
||||
value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
|
||||
if self.value == value:
|
||||
return True
|
||||
return self.fuzzy_compare(value, self.value, self.fuzzy)
|
||||
|
||||
|
||||
class _RegexPredicate:
|
||||
operators = ("REGEX",)
|
||||
|
||||
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
|
||||
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
|
||||
regex=False, fuzzy=None, fuzzy_compare=None):
|
||||
self.i = i
|
||||
self.attr = attr
|
||||
self.value = re.compile(value)
|
||||
self.predicate = predicate
|
||||
self.is_extension = is_extension
|
||||
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
if self.predicate not in self.operators:
|
||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||
|
||||
|
@ -851,18 +885,28 @@ class _RegexPredicate:
|
|||
class _SetPredicate:
|
||||
operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS")
|
||||
|
||||
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
|
||||
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
|
||||
regex=False, fuzzy=None, fuzzy_compare=None):
|
||||
self.i = i
|
||||
self.attr = attr
|
||||
self.vocab = vocab
|
||||
self.regex = regex
|
||||
self.fuzzy = fuzzy
|
||||
self.fuzzy_compare = fuzzy_compare
|
||||
if self.attr == MORPH:
|
||||
# normalize morph strings
|
||||
self.value = set(self.vocab.morphology.add(v) for v in value)
|
||||
else:
|
||||
self.value = set(get_string_id(v) for v in value)
|
||||
if self.regex:
|
||||
self.value = set(re.compile(v) for v in value)
|
||||
elif self.fuzzy is not None:
|
||||
# add to string store
|
||||
self.value = set(self.vocab.strings.add(v) for v in value)
|
||||
else:
|
||||
self.value = set(get_string_id(v) for v in value)
|
||||
self.predicate = predicate
|
||||
self.is_extension = is_extension
|
||||
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
if self.predicate not in self.operators:
|
||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||
|
||||
|
@ -890,9 +934,29 @@ class _SetPredicate:
|
|||
return False
|
||||
|
||||
if self.predicate == "IN":
|
||||
return value in self.value
|
||||
if self.regex:
|
||||
value = self.vocab.strings[value]
|
||||
return any(bool(v.search(value)) for v in self.value)
|
||||
elif self.fuzzy is not None:
|
||||
value = self.vocab.strings[value]
|
||||
return any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy)
|
||||
for v in self.value)
|
||||
elif value in self.value:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
elif self.predicate == "NOT_IN":
|
||||
return value not in self.value
|
||||
if self.regex:
|
||||
value = self.vocab.strings[value]
|
||||
return not any(bool(v.search(value)) for v in self.value)
|
||||
elif self.fuzzy is not None:
|
||||
value = self.vocab.strings[value]
|
||||
return not any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy)
|
||||
for v in self.value)
|
||||
elif value in self.value:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
elif self.predicate == "IS_SUBSET":
|
||||
return value <= self.value
|
||||
elif self.predicate == "IS_SUPERSET":
|
||||
|
@ -907,13 +971,14 @@ class _SetPredicate:
|
|||
class _ComparisonPredicate:
|
||||
operators = ("==", "!=", ">=", "<=", ">", "<")
|
||||
|
||||
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
|
||||
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
|
||||
regex=False, fuzzy=None, fuzzy_compare=None):
|
||||
self.i = i
|
||||
self.attr = attr
|
||||
self.value = value
|
||||
self.predicate = predicate
|
||||
self.is_extension = is_extension
|
||||
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
if self.predicate not in self.operators:
|
||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||
|
||||
|
@ -936,7 +1001,7 @@ class _ComparisonPredicate:
|
|||
return value < self.value
|
||||
|
||||
|
||||
def _get_extra_predicates(spec, extra_predicates, vocab):
|
||||
def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare):
|
||||
predicate_types = {
|
||||
"REGEX": _RegexPredicate,
|
||||
"IN": _SetPredicate,
|
||||
|
@ -950,6 +1015,16 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
|
|||
"<=": _ComparisonPredicate,
|
||||
">": _ComparisonPredicate,
|
||||
"<": _ComparisonPredicate,
|
||||
"FUZZY": _FuzzyPredicate,
|
||||
"FUZZY1": _FuzzyPredicate,
|
||||
"FUZZY2": _FuzzyPredicate,
|
||||
"FUZZY3": _FuzzyPredicate,
|
||||
"FUZZY4": _FuzzyPredicate,
|
||||
"FUZZY5": _FuzzyPredicate,
|
||||
"FUZZY6": _FuzzyPredicate,
|
||||
"FUZZY7": _FuzzyPredicate,
|
||||
"FUZZY8": _FuzzyPredicate,
|
||||
"FUZZY9": _FuzzyPredicate,
|
||||
}
|
||||
seen_predicates = {pred.key: pred.i for pred in extra_predicates}
|
||||
output = []
|
||||
|
@ -967,22 +1042,47 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
|
|||
attr = "ORTH"
|
||||
attr = IDS.get(attr.upper())
|
||||
if isinstance(value, dict):
|
||||
processed = False
|
||||
value_with_upper_keys = {k.upper(): v for k, v in value.items()}
|
||||
for type_, cls in predicate_types.items():
|
||||
if type_ in value_with_upper_keys:
|
||||
predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab)
|
||||
# Don't create a redundant predicates.
|
||||
# This helps with efficiency, as we're caching the results.
|
||||
if predicate.key in seen_predicates:
|
||||
output.append(seen_predicates[predicate.key])
|
||||
else:
|
||||
extra_predicates.append(predicate)
|
||||
output.append(predicate.i)
|
||||
seen_predicates[predicate.key] = predicate.i
|
||||
processed = True
|
||||
if not processed:
|
||||
warnings.warn(Warnings.W035.format(pattern=value))
|
||||
output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
|
||||
extra_predicates, seen_predicates, fuzzy_compare=fuzzy_compare))
|
||||
return output
|
||||
|
||||
|
||||
def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
|
||||
extra_predicates, seen_predicates, regex=False, fuzzy=None, fuzzy_compare=None):
|
||||
output = []
|
||||
for type_, value in value_dict.items():
|
||||
type_ = type_.upper()
|
||||
cls = predicate_types.get(type_)
|
||||
if cls is None:
|
||||
warnings.warn(Warnings.W035.format(pattern=value_dict))
|
||||
# ignore unrecognized predicate type
|
||||
continue
|
||||
elif cls == _RegexPredicate:
|
||||
if isinstance(value, dict):
|
||||
# add predicates inside regex operator
|
||||
output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
|
||||
extra_predicates, seen_predicates,
|
||||
regex=True))
|
||||
continue
|
||||
elif cls == _FuzzyPredicate:
|
||||
if isinstance(value, dict):
|
||||
# add predicates inside fuzzy operator
|
||||
fuzz = type_[len("FUZZY"):] # number after prefix
|
||||
fuzzy_val = int(fuzz) if fuzz else -1
|
||||
output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
|
||||
extra_predicates, seen_predicates,
|
||||
fuzzy=fuzzy_val, fuzzy_compare=fuzzy_compare))
|
||||
continue
|
||||
predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab,
|
||||
regex=regex, fuzzy=fuzzy, fuzzy_compare=fuzzy_compare)
|
||||
# Don't create redundant predicates.
|
||||
# This helps with efficiency, as we're caching the results.
|
||||
if predicate.key in seen_predicates:
|
||||
output.append(seen_predicates[predicate.key])
|
||||
else:
|
||||
extra_predicates.append(predicate)
|
||||
output.append(predicate.i)
|
||||
seen_predicates[predicate.key] = predicate.i
|
||||
return output
|
||||
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from typing import List, Tuple, Union, Optional, Callable, Any, Dict, overload
|
||||
from ..compat import Literal
|
||||
from typing import List, Tuple, Union, Optional, Callable, Any, Dict, Literal
|
||||
from typing import overload
|
||||
from .matcher import Matcher
|
||||
from ..vocab import Vocab
|
||||
from ..tokens import Doc, Span
|
||||
|
|
|
@ -1,164 +0,0 @@
|
|||
from thinc.api import Model, normal_init
|
||||
|
||||
from ..util import registry
|
||||
|
||||
|
||||
@registry.layers("spacy.PrecomputableAffine.v1")
|
||||
def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
|
||||
model = Model(
|
||||
"precomputable_affine",
|
||||
forward,
|
||||
init=init,
|
||||
dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
|
||||
params={"W": None, "b": None, "pad": None},
|
||||
attrs={"dropout_rate": dropout},
|
||||
)
|
||||
return model
|
||||
|
||||
|
||||
def forward(model, X, is_train):
|
||||
nF = model.get_dim("nF")
|
||||
nO = model.get_dim("nO")
|
||||
nP = model.get_dim("nP")
|
||||
nI = model.get_dim("nI")
|
||||
W = model.get_param("W")
|
||||
# Preallocate array for layer output, including padding.
|
||||
Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
|
||||
model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
|
||||
Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
|
||||
|
||||
# Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot
|
||||
# change its shape to (nF, nO, nP) without breaking existing models. So
|
||||
# we'll squeeze the first dimension here.
|
||||
Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)
|
||||
|
||||
def backward(dY_ids):
|
||||
# This backprop is particularly tricky, because we get back a different
|
||||
# thing from what we put out. We put out an array of shape:
|
||||
# (nB, nF, nO, nP), and get back:
|
||||
# (nB, nO, nP) and ids (nB, nF)
|
||||
# The ids tell us the values of nF, so we would have:
|
||||
#
|
||||
# dYf = zeros((nB, nF, nO, nP))
|
||||
# for b in range(nB):
|
||||
# for f in range(nF):
|
||||
# dYf[b, ids[b, f]] += dY[b]
|
||||
#
|
||||
# However, we avoid building that array for efficiency -- and just pass
|
||||
# in the indices.
|
||||
dY, ids = dY_ids
|
||||
assert dY.ndim == 3
|
||||
assert dY.shape[1] == nO, dY.shape
|
||||
assert dY.shape[2] == nP, dY.shape
|
||||
# nB = dY.shape[0]
|
||||
model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
|
||||
Xf = X[ids]
|
||||
Xf = Xf.reshape((Xf.shape[0], nF * nI))
|
||||
|
||||
model.inc_grad("b", dY.sum(axis=0))
|
||||
dY = dY.reshape((dY.shape[0], nO * nP))
|
||||
|
||||
Wopfi = W.transpose((1, 2, 0, 3))
|
||||
Wopfi = Wopfi.reshape((nO * nP, nF * nI))
|
||||
dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
|
||||
|
||||
dWopfi = model.ops.gemm(dY, Xf, trans1=True)
|
||||
dWopfi = dWopfi.reshape((nO, nP, nF, nI))
|
||||
# (o, p, f, i) --> (f, o, p, i)
|
||||
dWopfi = dWopfi.transpose((2, 0, 1, 3))
|
||||
model.inc_grad("W", dWopfi)
|
||||
return dXf.reshape((dXf.shape[0], nF, nI))
|
||||
|
||||
return Yf, backward
|
||||
|
||||
|
||||
def _backprop_precomputable_affine_padding(model, dY, ids):
|
||||
nB = dY.shape[0]
|
||||
nF = model.get_dim("nF")
|
||||
nP = model.get_dim("nP")
|
||||
nO = model.get_dim("nO")
|
||||
# Backprop the "padding", used as a filler for missing values.
|
||||
# Values that are missing are set to -1, and each state vector could
|
||||
# have multiple missing values. The padding has different values for
|
||||
# different missing features. The gradient of the padding vector is:
|
||||
#
|
||||
# for b in range(nB):
|
||||
# for f in range(nF):
|
||||
# if ids[b, f] < 0:
|
||||
# d_pad[f] += dY[b]
|
||||
#
|
||||
# Which can be rewritten as:
|
||||
#
|
||||
# (ids < 0).T @ dY
|
||||
mask = model.ops.asarray(ids < 0, dtype="f")
|
||||
d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
|
||||
return d_pad.reshape((1, nF, nO, nP))
|
||||
|
||||
|
||||
def init(model, X=None, Y=None):
|
||||
"""This is like the 'layer sequential unit variance', but instead
|
||||
of taking the actual inputs, we randomly generate whitened data.
|
||||
|
||||
Why's this all so complicated? We have a huge number of inputs,
|
||||
and the maxout unit makes guessing the dynamics tricky. Instead
|
||||
we set the maxout weights to values that empirically result in
|
||||
whitened outputs given whitened inputs.
|
||||
"""
|
||||
if model.has_param("W") and model.get_param("W").any():
|
||||
return
|
||||
|
||||
nF = model.get_dim("nF")
|
||||
nO = model.get_dim("nO")
|
||||
nP = model.get_dim("nP")
|
||||
nI = model.get_dim("nI")
|
||||
W = model.ops.alloc4f(nF, nO, nP, nI)
|
||||
b = model.ops.alloc2f(nO, nP)
|
||||
pad = model.ops.alloc4f(1, nF, nO, nP)
|
||||
|
||||
ops = model.ops
|
||||
W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
|
||||
pad = normal_init(ops, pad.shape, mean=1.0)
|
||||
model.set_param("W", W)
|
||||
model.set_param("b", b)
|
||||
model.set_param("pad", pad)
|
||||
|
||||
ids = ops.alloc((5000, nF), dtype="f")
|
||||
ids += ops.xp.random.uniform(0, 1000, ids.shape)
|
||||
ids = ops.asarray(ids, dtype="i")
|
||||
tokvecs = ops.alloc((5000, nI), dtype="f")
|
||||
tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
|
||||
tokvecs.shape
|
||||
)
|
||||
|
||||
def predict(ids, tokvecs):
|
||||
# nS ids. nW tokvecs. Exclude the padding array.
|
||||
hiddens = model.predict(tokvecs[:-1]) # (nW, f, o, p)
|
||||
vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
|
||||
# need nS vectors
|
||||
hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
|
||||
model.ops.scatter_add(vectors, ids.flatten(), hiddens)
|
||||
vectors = vectors.reshape((vectors.shape[0], nO, nP))
|
||||
vectors += b
|
||||
vectors = model.ops.asarray(vectors)
|
||||
if nP >= 2:
|
||||
return model.ops.maxout(vectors)[0]
|
||||
else:
|
||||
return vectors * (vectors >= 0)
|
||||
|
||||
tol_var = 0.01
|
||||
tol_mean = 0.01
|
||||
t_max = 10
|
||||
W = model.get_param("W").copy()
|
||||
b = model.get_param("b").copy()
|
||||
for t_i in range(t_max):
|
||||
acts1 = predict(ids, tokvecs)
|
||||
var = model.ops.xp.var(acts1)
|
||||
mean = model.ops.xp.mean(acts1)
|
||||
if abs(var - 1.0) >= tol_var:
|
||||
W /= model.ops.xp.sqrt(var)
|
||||
model.set_param("W", W)
|
||||
elif abs(mean) >= tol_mean:
|
||||
b -= mean
|
||||
model.set_param("b", b)
|
||||
else:
|
||||
break
|
|
@ -23,6 +23,7 @@ DEFAULT_NVTX_ANNOTATABLE_PIPE_METHODS = [
|
|||
"update",
|
||||
"rehearse",
|
||||
"get_loss",
|
||||
"get_teacher_student_loss",
|
||||
"initialize",
|
||||
"begin_update",
|
||||
"finish_update",
|
||||
|
|
|
@ -1,17 +1,19 @@
|
|||
from typing import Optional, List, cast
|
||||
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
|
||||
from typing import Optional, List, Tuple, Any, Literal
|
||||
from thinc.types import Floats2d
|
||||
from thinc.api import Model
|
||||
import warnings
|
||||
|
||||
from ...errors import Errors
|
||||
from ...compat import Literal
|
||||
from ...errors import Errors, Warnings
|
||||
from ...util import registry
|
||||
from .._precomputable_affine import PrecomputableAffine
|
||||
from ..tb_framework import TransitionModel
|
||||
from ...tokens import Doc
|
||||
from ...tokens.doc import Doc
|
||||
|
||||
TransitionSystem = Any # TODO
|
||||
State = Any # TODO
|
||||
|
||||
|
||||
@registry.architectures("spacy.TransitionBasedParser.v2")
|
||||
def build_tb_parser_model(
|
||||
@registry.architectures.register("spacy.TransitionBasedParser.v2")
|
||||
def transition_parser_v2(
|
||||
tok2vec: Model[List[Doc], List[Floats2d]],
|
||||
state_type: Literal["parser", "ner"],
|
||||
extra_state_tokens: bool,
|
||||
|
@ -19,6 +21,46 @@ def build_tb_parser_model(
|
|||
maxout_pieces: int,
|
||||
use_upper: bool,
|
||||
nO: Optional[int] = None,
|
||||
) -> Model:
|
||||
if not use_upper:
|
||||
warnings.warn(Warnings.W400)
|
||||
|
||||
return build_tb_parser_model(
|
||||
tok2vec,
|
||||
state_type,
|
||||
extra_state_tokens,
|
||||
hidden_width,
|
||||
maxout_pieces,
|
||||
nO=nO,
|
||||
)
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.TransitionBasedParser.v3")
|
||||
def transition_parser_v3(
|
||||
tok2vec: Model[List[Doc], List[Floats2d]],
|
||||
state_type: Literal["parser", "ner"],
|
||||
extra_state_tokens: bool,
|
||||
hidden_width: int,
|
||||
maxout_pieces: int,
|
||||
nO: Optional[int] = None,
|
||||
) -> Model:
|
||||
return build_tb_parser_model(
|
||||
tok2vec,
|
||||
state_type,
|
||||
extra_state_tokens,
|
||||
hidden_width,
|
||||
maxout_pieces,
|
||||
nO=nO,
|
||||
)
|
||||
|
||||
|
||||
def build_tb_parser_model(
|
||||
tok2vec: Model[List[Doc], List[Floats2d]],
|
||||
state_type: Literal["parser", "ner"],
|
||||
extra_state_tokens: bool,
|
||||
hidden_width: int,
|
||||
maxout_pieces: int,
|
||||
nO: Optional[int] = None,
|
||||
) -> Model:
|
||||
"""
|
||||
Build a transition-based parser model. Can apply to NER or dependency-parsing.
|
||||
|
@ -51,14 +93,7 @@ def build_tb_parser_model(
|
|||
feature sets (for the NER) or 13 (for the parser).
|
||||
hidden_width (int): The width of the hidden layer.
|
||||
maxout_pieces (int): How many pieces to use in the state prediction layer.
|
||||
Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
|
||||
is replaced with a ReLu non-linearity if use_upper=True, and no
|
||||
non-linearity if use_upper=False.
|
||||
use_upper (bool): Whether to use an additional hidden layer after the state
|
||||
vector in order to predict the action scores. It is recommended to set
|
||||
this to False for large pretrained models such as transformers, and True
|
||||
for smaller networks. The upper layer is computed on CPU, which becomes
|
||||
a bottleneck on larger GPU-based models, where it's also less necessary.
|
||||
Recommended values are 1, 2 or 3.
|
||||
nO (int or None): The number of actions the model will predict between.
|
||||
Usually inferred from data at the beginning of training, or loaded from
|
||||
disk.
|
||||
|
@ -69,106 +104,11 @@ def build_tb_parser_model(
|
|||
nr_feature_tokens = 6 if extra_state_tokens else 3
|
||||
else:
|
||||
raise ValueError(Errors.E917.format(value=state_type))
|
||||
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
||||
tok2vec = chain(
|
||||
tok2vec,
|
||||
list2array(),
|
||||
Linear(hidden_width, t2v_width),
|
||||
return TransitionModel(
|
||||
tok2vec=tok2vec,
|
||||
state_tokens=nr_feature_tokens,
|
||||
hidden_width=hidden_width,
|
||||
maxout_pieces=maxout_pieces,
|
||||
nO=nO,
|
||||
unseen_classes=set(),
|
||||
)
|
||||
tok2vec.set_dim("nO", hidden_width)
|
||||
lower = _define_lower(
|
||||
nO=hidden_width if use_upper else nO,
|
||||
nF=nr_feature_tokens,
|
||||
nI=tok2vec.get_dim("nO"),
|
||||
nP=maxout_pieces,
|
||||
)
|
||||
upper = None
|
||||
if use_upper:
|
||||
with use_ops("cpu"):
|
||||
# Initialize weights at zero, as it's a classification layer.
|
||||
upper = _define_upper(nO=nO, nI=None)
|
||||
return TransitionModel(tok2vec, lower, upper, resize_output)
|
||||
|
||||
|
||||
def _define_upper(nO, nI):
|
||||
return Linear(nO=nO, nI=nI, init_W=zero_init)
|
||||
|
||||
|
||||
def _define_lower(nO, nF, nI, nP):
|
||||
return PrecomputableAffine(nO=nO, nF=nF, nI=nI, nP=nP)
|
||||
|
||||
|
||||
def resize_output(model, new_nO):
|
||||
if model.attrs["has_upper"]:
|
||||
return _resize_upper(model, new_nO)
|
||||
return _resize_lower(model, new_nO)
|
||||
|
||||
|
||||
def _resize_upper(model, new_nO):
|
||||
upper = model.get_ref("upper")
|
||||
if upper.has_dim("nO") is None:
|
||||
upper.set_dim("nO", new_nO)
|
||||
return model
|
||||
elif new_nO == upper.get_dim("nO"):
|
||||
return model
|
||||
|
||||
smaller = upper
|
||||
nI = smaller.maybe_get_dim("nI")
|
||||
with use_ops("cpu"):
|
||||
larger = _define_upper(nO=new_nO, nI=nI)
|
||||
# it could be that the model is not initialized yet, then skip this bit
|
||||
if smaller.has_param("W"):
|
||||
larger_W = larger.ops.alloc2f(new_nO, nI)
|
||||
larger_b = larger.ops.alloc1f(new_nO)
|
||||
smaller_W = smaller.get_param("W")
|
||||
smaller_b = smaller.get_param("b")
|
||||
# Weights are stored in (nr_out, nr_in) format, so we're basically
|
||||
# just adding rows here.
|
||||
if smaller.has_dim("nO"):
|
||||
old_nO = smaller.get_dim("nO")
|
||||
larger_W[:old_nO] = smaller_W
|
||||
larger_b[:old_nO] = smaller_b
|
||||
for i in range(old_nO, new_nO):
|
||||
model.attrs["unseen_classes"].add(i)
|
||||
|
||||
larger.set_param("W", larger_W)
|
||||
larger.set_param("b", larger_b)
|
||||
model._layers[-1] = larger
|
||||
model.set_ref("upper", larger)
|
||||
return model
|
||||
|
||||
|
||||
def _resize_lower(model, new_nO):
|
||||
lower = model.get_ref("lower")
|
||||
if lower.has_dim("nO") is None:
|
||||
lower.set_dim("nO", new_nO)
|
||||
return model
|
||||
|
||||
smaller = lower
|
||||
nI = smaller.maybe_get_dim("nI")
|
||||
nF = smaller.maybe_get_dim("nF")
|
||||
nP = smaller.maybe_get_dim("nP")
|
||||
larger = _define_lower(nO=new_nO, nI=nI, nF=nF, nP=nP)
|
||||
# it could be that the model is not initialized yet, then skip this bit
|
||||
if smaller.has_param("W"):
|
||||
larger_W = larger.ops.alloc4f(nF, new_nO, nP, nI)
|
||||
larger_b = larger.ops.alloc2f(new_nO, nP)
|
||||
larger_pad = larger.ops.alloc4f(1, nF, new_nO, nP)
|
||||
smaller_W = smaller.get_param("W")
|
||||
smaller_b = smaller.get_param("b")
|
||||
smaller_pad = smaller.get_param("pad")
|
||||
# Copy the old weights and padding into the new layer
|
||||
if smaller.has_dim("nO"):
|
||||
old_nO = smaller.get_dim("nO")
|
||||
larger_W[:, 0:old_nO, :, :] = smaller_W
|
||||
larger_pad[:, :, 0:old_nO, :] = smaller_pad
|
||||
larger_b[0:old_nO, :] = smaller_b
|
||||
for i in range(old_nO, new_nO):
|
||||
model.attrs["unseen_classes"].add(i)
|
||||
|
||||
larger.set_param("W", larger_W)
|
||||
larger.set_param("b", larger_b)
|
||||
larger.set_param("pad", larger_pad)
|
||||
model._layers[1] = larger
|
||||
model.set_ref("lower", larger)
|
||||
return model
|
||||
|
|
|
@ -1,49 +0,0 @@
|
|||
from libc.string cimport memset, memcpy
|
||||
from thinc.backends.cblas cimport CBlas
|
||||
from ..typedefs cimport weight_t, hash_t
|
||||
from ..pipeline._parser_internals._state cimport StateC
|
||||
|
||||
|
||||
cdef struct SizesC:
|
||||
int states
|
||||
int classes
|
||||
int hiddens
|
||||
int pieces
|
||||
int feats
|
||||
int embed_width
|
||||
|
||||
|
||||
cdef struct WeightsC:
|
||||
const float* feat_weights
|
||||
const float* feat_bias
|
||||
const float* hidden_bias
|
||||
const float* hidden_weights
|
||||
const float* seen_classes
|
||||
|
||||
|
||||
cdef struct ActivationsC:
|
||||
int* token_ids
|
||||
float* unmaxed
|
||||
float* scores
|
||||
float* hiddens
|
||||
int* is_valid
|
||||
int _curr_size
|
||||
int _max_size
|
||||
|
||||
|
||||
cdef WeightsC get_c_weights(model) except *
|
||||
|
||||
cdef SizesC get_c_sizes(model, int batch_size) except *
|
||||
|
||||
cdef ActivationsC alloc_activations(SizesC n) nogil
|
||||
|
||||
cdef void free_activations(const ActivationsC* A) nogil
|
||||
|
||||
cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
|
||||
const WeightsC* W, SizesC n) nogil
|
||||
|
||||
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
|
||||
|
||||
cdef void cpu_log_loss(float* d_scores,
|
||||
const float* costs, const int* is_valid, const float* scores, int O) nogil
|
||||
|
|
@ -1,490 +0,0 @@
|
|||
# cython: infer_types=True, cdivision=True, boundscheck=False
|
||||
cimport numpy as np
|
||||
from libc.math cimport exp
|
||||
from libc.string cimport memset, memcpy
|
||||
from libc.stdlib cimport calloc, free, realloc
|
||||
from thinc.backends.linalg cimport Vec, VecVec
|
||||
from thinc.backends.cblas cimport saxpy, sgemm
|
||||
|
||||
import numpy
|
||||
import numpy.random
|
||||
from thinc.api import Model, CupyOps, NumpyOps, get_ops
|
||||
|
||||
from .. import util
|
||||
from ..errors import Errors
|
||||
from ..typedefs cimport weight_t, class_t, hash_t
|
||||
from ..pipeline._parser_internals.stateclass cimport StateClass
|
||||
|
||||
|
||||
cdef WeightsC get_c_weights(model) except *:
|
||||
cdef WeightsC output
|
||||
cdef precompute_hiddens state2vec = model.state2vec
|
||||
output.feat_weights = state2vec.get_feat_weights()
|
||||
output.feat_bias = <const float*>state2vec.bias.data
|
||||
cdef np.ndarray vec2scores_W
|
||||
cdef np.ndarray vec2scores_b
|
||||
if model.vec2scores is None:
|
||||
output.hidden_weights = NULL
|
||||
output.hidden_bias = NULL
|
||||
else:
|
||||
vec2scores_W = model.vec2scores.get_param("W")
|
||||
vec2scores_b = model.vec2scores.get_param("b")
|
||||
output.hidden_weights = <const float*>vec2scores_W.data
|
||||
output.hidden_bias = <const float*>vec2scores_b.data
|
||||
cdef np.ndarray class_mask = model._class_mask
|
||||
output.seen_classes = <const float*>class_mask.data
|
||||
return output
|
||||
|
||||
|
||||
cdef SizesC get_c_sizes(model, int batch_size) except *:
|
||||
cdef SizesC output
|
||||
output.states = batch_size
|
||||
if model.vec2scores is None:
|
||||
output.classes = model.state2vec.get_dim("nO")
|
||||
else:
|
||||
output.classes = model.vec2scores.get_dim("nO")
|
||||
output.hiddens = model.state2vec.get_dim("nO")
|
||||
output.pieces = model.state2vec.get_dim("nP")
|
||||
output.feats = model.state2vec.get_dim("nF")
|
||||
output.embed_width = model.tokvecs.shape[1]
|
||||
return output
|
||||
|
||||
|
||||
cdef ActivationsC alloc_activations(SizesC n) nogil:
|
||||
cdef ActivationsC A
|
||||
memset(&A, 0, sizeof(A))
|
||||
resize_activations(&A, n)
|
||||
return A
|
||||
|
||||
|
||||
cdef void free_activations(const ActivationsC* A) nogil:
|
||||
free(A.token_ids)
|
||||
free(A.scores)
|
||||
free(A.unmaxed)
|
||||
free(A.hiddens)
|
||||
free(A.is_valid)
|
||||
|
||||
|
||||
cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
|
||||
if n.states <= A._max_size:
|
||||
A._curr_size = n.states
|
||||
return
|
||||
if A._max_size == 0:
|
||||
A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
|
||||
A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
|
||||
A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
|
||||
A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
|
||||
A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
|
||||
A._max_size = n.states
|
||||
else:
|
||||
A.token_ids = <int*>realloc(A.token_ids,
|
||||
n.states * n.feats * sizeof(A.token_ids[0]))
|
||||
A.scores = <float*>realloc(A.scores,
|
||||
n.states * n.classes * sizeof(A.scores[0]))
|
||||
A.unmaxed = <float*>realloc(A.unmaxed,
|
||||
n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
|
||||
A.hiddens = <float*>realloc(A.hiddens,
|
||||
n.states * n.hiddens * sizeof(A.hiddens[0]))
|
||||
A.is_valid = <int*>realloc(A.is_valid,
|
||||
n.states * n.classes * sizeof(A.is_valid[0]))
|
||||
A._max_size = n.states
|
||||
A._curr_size = n.states
|
||||
|
||||
|
||||
cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
|
||||
const WeightsC* W, SizesC n) nogil:
|
||||
cdef double one = 1.0
|
||||
resize_activations(A, n)
|
||||
for i in range(n.states):
|
||||
states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
|
||||
memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
|
||||
memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
|
||||
sum_state_features(cblas, A.unmaxed,
|
||||
W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
|
||||
for i in range(n.states):
|
||||
VecVec.add_i(&A.unmaxed[i*n.hiddens*n.pieces],
|
||||
W.feat_bias, 1., n.hiddens * n.pieces)
|
||||
for j in range(n.hiddens):
|
||||
index = i * n.hiddens * n.pieces + j * n.pieces
|
||||
which = Vec.arg_max(&A.unmaxed[index], n.pieces)
|
||||
A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
|
||||
memset(A.scores, 0, n.states * n.classes * sizeof(float))
|
||||
if W.hidden_weights == NULL:
|
||||
memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
|
||||
else:
|
||||
# Compute hidden-to-output
|
||||
sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
|
||||
1.0, <const float *>A.hiddens, n.hiddens,
|
||||
<const float *>W.hidden_weights, n.hiddens,
|
||||
0.0, A.scores, n.classes)
|
||||
# Add bias
|
||||
for i in range(n.states):
|
||||
VecVec.add_i(&A.scores[i*n.classes],
|
||||
W.hidden_bias, 1., n.classes)
|
||||
# Set unseen classes to minimum value
|
||||
i = 0
|
||||
min_ = A.scores[0]
|
||||
for i in range(1, n.states * n.classes):
|
||||
if A.scores[i] < min_:
|
||||
min_ = A.scores[i]
|
||||
for i in range(n.states):
|
||||
for j in range(n.classes):
|
||||
if not W.seen_classes[j]:
|
||||
A.scores[i*n.classes+j] = min_
|
||||
|
||||
|
||||
cdef void sum_state_features(CBlas cblas, float* output,
|
||||
const float* cached, const int* token_ids, int B, int F, int O) nogil:
|
||||
cdef int idx, b, f, i
|
||||
cdef const float* feature
|
||||
padding = cached
|
||||
cached += F * O
|
||||
cdef int id_stride = F*O
|
||||
cdef float one = 1.
|
||||
for b in range(B):
|
||||
for f in range(F):
|
||||
if token_ids[f] < 0:
|
||||
feature = &padding[f*O]
|
||||
else:
|
||||
idx = token_ids[f] * id_stride + f*O
|
||||
feature = &cached[idx]
|
||||
saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
|
||||
token_ids += F
|
||||
|
||||
|
||||
cdef void cpu_log_loss(float* d_scores,
|
||||
const float* costs, const int* is_valid, const float* scores,
|
||||
int O) nogil:
|
||||
"""Do multi-label log loss"""
|
||||
cdef double max_, gmax, Z, gZ
|
||||
best = arg_max_if_gold(scores, costs, is_valid, O)
|
||||
guess = Vec.arg_max(scores, O)
|
||||
if best == -1 or guess == -1:
|
||||
# These shouldn't happen, but if they do, we want to make sure we don't
|
||||
# cause an OOB access.
|
||||
return
|
||||
Z = 1e-10
|
||||
gZ = 1e-10
|
||||
max_ = scores[guess]
|
||||
gmax = scores[best]
|
||||
for i in range(O):
|
||||
Z += exp(scores[i] - max_)
|
||||
if costs[i] <= costs[best]:
|
||||
gZ += exp(scores[i] - gmax)
|
||||
for i in range(O):
|
||||
if costs[i] <= costs[best]:
|
||||
d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
|
||||
else:
|
||||
d_scores[i] = exp(scores[i]-max_) / Z
|
||||
|
||||
|
||||
cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
|
||||
const int* is_valid, int n) nogil:
|
||||
# Find minimum cost
|
||||
cdef float cost = 1
|
||||
for i in range(n):
|
||||
if is_valid[i] and costs[i] < cost:
|
||||
cost = costs[i]
|
||||
# Now find best-scoring with that cost
|
||||
cdef int best = -1
|
||||
for i in range(n):
|
||||
if costs[i] <= cost and is_valid[i]:
|
||||
if best == -1 or scores[i] > scores[best]:
|
||||
best = i
|
||||
return best
|
||||
|
||||
|
||||
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
|
||||
cdef int best = -1
|
||||
for i in range(n):
|
||||
if is_valid[i] >= 1:
|
||||
if best == -1 or scores[i] > scores[best]:
|
||||
best = i
|
||||
return best
|
||||
|
||||
|
||||
|
||||
class ParserStepModel(Model):
|
||||
def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
|
||||
dropout=0.1):
|
||||
Model.__init__(self, name="parser_step_model", forward=step_forward)
|
||||
self.attrs["has_upper"] = has_upper
|
||||
self.attrs["dropout_rate"] = dropout
|
||||
self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
|
||||
if layers[1].get_dim("nP") >= 2:
|
||||
activation = "maxout"
|
||||
elif has_upper:
|
||||
activation = None
|
||||
else:
|
||||
activation = "relu"
|
||||
self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
|
||||
activation=activation, train=train)
|
||||
if has_upper:
|
||||
self.vec2scores = layers[-1]
|
||||
else:
|
||||
self.vec2scores = None
|
||||
self.cuda_stream = util.get_cuda_stream(non_blocking=True)
|
||||
self.backprops = []
|
||||
self._class_mask = numpy.zeros((self.nO,), dtype='f')
|
||||
self._class_mask.fill(1)
|
||||
if unseen_classes is not None:
|
||||
for class_ in unseen_classes:
|
||||
self._class_mask[class_] = 0.
|
||||
|
||||
def clear_memory(self):
|
||||
del self.tokvecs
|
||||
del self.bp_tokvecs
|
||||
del self.state2vec
|
||||
del self.backprops
|
||||
del self._class_mask
|
||||
|
||||
@property
|
||||
def nO(self):
|
||||
if self.attrs["has_upper"]:
|
||||
return self.vec2scores.get_dim("nO")
|
||||
else:
|
||||
return self.state2vec.get_dim("nO")
|
||||
|
||||
def class_is_unseen(self, class_):
|
||||
return self._class_mask[class_]
|
||||
|
||||
def mark_class_unseen(self, class_):
|
||||
self._class_mask[class_] = 0
|
||||
|
||||
def mark_class_seen(self, class_):
|
||||
self._class_mask[class_] = 1
|
||||
|
||||
def get_token_ids(self, states):
|
||||
cdef StateClass state
|
||||
states = [state for state in states if not state.is_final()]
|
||||
cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
|
||||
dtype='i', order='C')
|
||||
ids.fill(-1)
|
||||
c_ids = <int*>ids.data
|
||||
for state in states:
|
||||
state.c.set_context_tokens(c_ids, ids.shape[1])
|
||||
c_ids += ids.shape[1]
|
||||
return ids
|
||||
|
||||
def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
|
||||
if isinstance(self.state2vec.ops, CupyOps) \
|
||||
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
|
||||
# Move token_ids and d_vector to GPU, asynchronously
|
||||
self.backprops.append((
|
||||
util.get_async(self.cuda_stream, token_ids),
|
||||
util.get_async(self.cuda_stream, d_vector),
|
||||
get_d_tokvecs
|
||||
))
|
||||
else:
|
||||
self.backprops.append((token_ids, d_vector, get_d_tokvecs))
|
||||
|
||||
|
||||
def finish_steps(self, golds):
|
||||
# Add a padding vector to the d_tokvecs gradient, so that missing
|
||||
# values don't affect the real gradient.
|
||||
d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
|
||||
# Tells CUDA to block, so our async copies complete.
|
||||
if self.cuda_stream is not None:
|
||||
self.cuda_stream.synchronize()
|
||||
for ids, d_vector, bp_vector in self.backprops:
|
||||
d_state_features = bp_vector((d_vector, ids))
|
||||
ids = ids.flatten()
|
||||
d_state_features = d_state_features.reshape(
|
||||
(ids.size, d_state_features.shape[2]))
|
||||
self.ops.scatter_add(d_tokvecs, ids,
|
||||
d_state_features)
|
||||
# Padded -- see update()
|
||||
self.bp_tokvecs(d_tokvecs[:-1])
|
||||
return d_tokvecs
|
||||
|
||||
NUMPY_OPS = NumpyOps()
|
||||
|
||||
def step_forward(model: ParserStepModel, states, is_train):
|
||||
token_ids = model.get_token_ids(states)
|
||||
vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
|
||||
mask = None
|
||||
if model.attrs["has_upper"]:
|
||||
dropout_rate = model.attrs["dropout_rate"]
|
||||
if is_train and dropout_rate > 0:
|
||||
mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
|
||||
vector *= mask
|
||||
scores, get_d_vector = model.vec2scores(vector, is_train)
|
||||
else:
|
||||
scores = NumpyOps().asarray(vector)
|
||||
get_d_vector = lambda d_scores: d_scores
|
||||
# If the class is unseen, make sure its score is minimum
|
||||
scores[:, model._class_mask == 0] = numpy.nanmin(scores)
|
||||
|
||||
def backprop_parser_step(d_scores):
|
||||
# Zero vectors for unseen classes
|
||||
d_scores *= model._class_mask
|
||||
d_vector = get_d_vector(d_scores)
|
||||
if mask is not None:
|
||||
d_vector *= mask
|
||||
model.backprop_step(token_ids, d_vector, get_d_tokvecs)
|
||||
return None
|
||||
return scores, backprop_parser_step
|
||||
|
||||
|
||||
cdef class precompute_hiddens:
|
||||
"""Allow a model to be "primed" by pre-computing input features in bulk.
|
||||
|
||||
This is used for the parser, where we want to take a batch of documents,
|
||||
and compute vectors for each (token, position) pair. These vectors can then
|
||||
be reused, especially for beam-search.
|
||||
|
||||
Let's say we're using 12 features for each state, e.g. word at start of
|
||||
buffer, three words on stack, their children, etc. In the normal arc-eager
|
||||
system, a document of length N is processed in 2*N states. This means we'll
|
||||
create 2*N*12 feature vectors --- but if we pre-compute, we only need
|
||||
N*12 vector computations. The saving for beam-search is much better:
|
||||
if we have a beam of k, we'll normally make 2*N*12*K computations --
|
||||
so we can save the factor k. This also gives a nice CPU/GPU division:
|
||||
we can do all our hard maths up front, packed into large multiplications,
|
||||
and do the hard-to-program parsing on the CPU.
|
||||
"""
|
||||
cdef readonly int nF, nO, nP
|
||||
cdef bint _is_synchronized
|
||||
cdef public object ops
|
||||
cdef public object numpy_ops
|
||||
cdef public object _cpu_ops
|
||||
cdef np.ndarray _features
|
||||
cdef np.ndarray _cached
|
||||
cdef np.ndarray bias
|
||||
cdef object _cuda_stream
|
||||
cdef object _bp_hiddens
|
||||
cdef object activation
|
||||
|
||||
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
|
||||
activation="maxout", train=False):
|
||||
gpu_cached, bp_features = lower_model(tokvecs, train)
|
||||
cdef np.ndarray cached
|
||||
if not isinstance(gpu_cached, numpy.ndarray):
|
||||
# Note the passing of cuda_stream here: it lets
|
||||
# cupy make the copy asynchronously.
|
||||
# We then have to block before first use.
|
||||
cached = gpu_cached.get(stream=cuda_stream)
|
||||
else:
|
||||
cached = gpu_cached
|
||||
if not isinstance(lower_model.get_param("b"), numpy.ndarray):
|
||||
self.bias = lower_model.get_param("b").get(stream=cuda_stream)
|
||||
else:
|
||||
self.bias = lower_model.get_param("b")
|
||||
self.nF = cached.shape[1]
|
||||
if lower_model.has_dim("nP"):
|
||||
self.nP = lower_model.get_dim("nP")
|
||||
else:
|
||||
self.nP = 1
|
||||
self.nO = cached.shape[2]
|
||||
self.ops = lower_model.ops
|
||||
self.numpy_ops = NumpyOps()
|
||||
self._cpu_ops = get_ops("cpu") if isinstance(self.ops, CupyOps) else self.ops
|
||||
assert activation in (None, "relu", "maxout")
|
||||
self.activation = activation
|
||||
self._is_synchronized = False
|
||||
self._cuda_stream = cuda_stream
|
||||
self._cached = cached
|
||||
self._bp_hiddens = bp_features
|
||||
|
||||
cdef const float* get_feat_weights(self) except NULL:
|
||||
if not self._is_synchronized and self._cuda_stream is not None:
|
||||
self._cuda_stream.synchronize()
|
||||
self._is_synchronized = True
|
||||
return <float*>self._cached.data
|
||||
|
||||
def has_dim(self, name):
|
||||
if name == "nF":
|
||||
return self.nF if self.nF is not None else True
|
||||
elif name == "nP":
|
||||
return self.nP if self.nP is not None else True
|
||||
elif name == "nO":
|
||||
return self.nO if self.nO is not None else True
|
||||
else:
|
||||
return False
|
||||
|
||||
def get_dim(self, name):
|
||||
if name == "nF":
|
||||
return self.nF
|
||||
elif name == "nP":
|
||||
return self.nP
|
||||
elif name == "nO":
|
||||
return self.nO
|
||||
else:
|
||||
raise ValueError(Errors.E1033.format(name=name))
|
||||
|
||||
def set_dim(self, name, value):
|
||||
if name == "nF":
|
||||
self.nF = value
|
||||
elif name == "nP":
|
||||
self.nP = value
|
||||
elif name == "nO":
|
||||
self.nO = value
|
||||
else:
|
||||
raise ValueError(Errors.E1033.format(name=name))
|
||||
|
||||
def __call__(self, X, bint is_train):
|
||||
if is_train:
|
||||
return self.begin_update(X)
|
||||
else:
|
||||
return self.predict(X), lambda X: X
|
||||
|
||||
def predict(self, X):
|
||||
return self.begin_update(X)[0]
|
||||
|
||||
def begin_update(self, token_ids):
|
||||
cdef np.ndarray state_vector = numpy.zeros(
|
||||
(token_ids.shape[0], self.nO, self.nP), dtype='f')
|
||||
# This is tricky, but (assuming GPU available);
|
||||
# - Input to forward on CPU
|
||||
# - Output from forward on CPU
|
||||
# - Input to backward on GPU!
|
||||
# - Output from backward on GPU
|
||||
bp_hiddens = self._bp_hiddens
|
||||
|
||||
cdef CBlas cblas = self._cpu_ops.cblas()
|
||||
|
||||
feat_weights = self.get_feat_weights()
|
||||
cdef int[:, ::1] ids = token_ids
|
||||
sum_state_features(cblas, <float*>state_vector.data,
|
||||
feat_weights, &ids[0,0],
|
||||
token_ids.shape[0], self.nF, self.nO*self.nP)
|
||||
state_vector += self.bias
|
||||
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
|
||||
|
||||
def backward(d_state_vector_ids):
|
||||
d_state_vector, token_ids = d_state_vector_ids
|
||||
d_state_vector = bp_nonlinearity(d_state_vector)
|
||||
d_tokens = bp_hiddens((d_state_vector, token_ids))
|
||||
return d_tokens
|
||||
return state_vector, backward
|
||||
|
||||
def _nonlinearity(self, state_vector):
|
||||
if self.activation == "maxout":
|
||||
return self._maxout_nonlinearity(state_vector)
|
||||
else:
|
||||
return self._relu_nonlinearity(state_vector)
|
||||
|
||||
def _maxout_nonlinearity(self, state_vector):
|
||||
state_vector, mask = self.numpy_ops.maxout(state_vector)
|
||||
# We're outputting to CPU, but we need this variable on GPU for the
|
||||
# backward pass.
|
||||
mask = self.ops.asarray(mask)
|
||||
|
||||
def backprop_maxout(d_best):
|
||||
return self.ops.backprop_maxout(d_best, mask, self.nP)
|
||||
|
||||
return state_vector, backprop_maxout
|
||||
|
||||
def _relu_nonlinearity(self, state_vector):
|
||||
state_vector = state_vector.reshape((state_vector.shape[0], -1))
|
||||
mask = state_vector >= 0.
|
||||
state_vector *= mask
|
||||
# We're outputting to CPU, but we need this variable on GPU for the
|
||||
# backward pass.
|
||||
mask = self.ops.asarray(mask)
|
||||
|
||||
def backprop_relu(d_best):
|
||||
d_best *= mask
|
||||
return d_best.reshape((d_best.shape + (1,)))
|
||||
|
||||
return state_vector, backprop_relu
|
28
spacy/ml/tb_framework.pxd
Normal file
28
spacy/ml/tb_framework.pxd
Normal file
|
@ -0,0 +1,28 @@
|
|||
from libc.stdint cimport int8_t
|
||||
|
||||
|
||||
cdef struct SizesC:
|
||||
int states
|
||||
int classes
|
||||
int hiddens
|
||||
int pieces
|
||||
int feats
|
||||
int embed_width
|
||||
int tokens
|
||||
|
||||
|
||||
cdef struct WeightsC:
|
||||
const float* feat_weights
|
||||
const float* feat_bias
|
||||
const float* hidden_bias
|
||||
const float* hidden_weights
|
||||
const int8_t* seen_mask
|
||||
|
||||
|
||||
cdef struct ActivationsC:
|
||||
int* token_ids
|
||||
float* unmaxed
|
||||
float* hiddens
|
||||
int* is_valid
|
||||
int _curr_size
|
||||
int _max_size
|
|
@ -1,50 +0,0 @@
|
|||
from thinc.api import Model, noop
|
||||
from .parser_model import ParserStepModel
|
||||
from ..util import registry
|
||||
|
||||
|
||||
@registry.layers("spacy.TransitionModel.v1")
|
||||
def TransitionModel(
|
||||
tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
|
||||
):
|
||||
"""Set up a stepwise transition-based model"""
|
||||
if upper is None:
|
||||
has_upper = False
|
||||
upper = noop()
|
||||
else:
|
||||
has_upper = True
|
||||
# don't define nO for this object, because we can't dynamically change it
|
||||
return Model(
|
||||
name="parser_model",
|
||||
forward=forward,
|
||||
dims={"nI": tok2vec.maybe_get_dim("nI")},
|
||||
layers=[tok2vec, lower, upper],
|
||||
refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
|
||||
init=init,
|
||||
attrs={
|
||||
"has_upper": has_upper,
|
||||
"unseen_classes": set(unseen_classes),
|
||||
"resize_output": resize_output,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def forward(model, X, is_train):
|
||||
step_model = ParserStepModel(
|
||||
X,
|
||||
model.layers,
|
||||
unseen_classes=model.attrs["unseen_classes"],
|
||||
train=is_train,
|
||||
has_upper=model.attrs["has_upper"],
|
||||
)
|
||||
|
||||
return step_model, step_model.finish_steps
|
||||
|
||||
|
||||
def init(model, X=None, Y=None):
|
||||
model.get_ref("tok2vec").initialize(X=X)
|
||||
lower = model.get_ref("lower")
|
||||
lower.initialize()
|
||||
if model.attrs["has_upper"]:
|
||||
statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
|
||||
model.get_ref("upper").initialize(X=statevecs)
|
621
spacy/ml/tb_framework.pyx
Normal file
621
spacy/ml/tb_framework.pyx
Normal file
|
@ -0,0 +1,621 @@
|
|||
# cython: infer_types=True, cdivision=True, boundscheck=False
|
||||
from typing import List, Tuple, Any, Optional, TypeVar, cast
|
||||
from libc.string cimport memset, memcpy
|
||||
from libc.stdlib cimport calloc, free, realloc
|
||||
from libcpp.vector cimport vector
|
||||
import numpy
|
||||
cimport numpy as np
|
||||
from thinc.api import Model, normal_init, chain, list2array, Linear
|
||||
from thinc.api import uniform_init, glorot_uniform_init, zero_init
|
||||
from thinc.api import NumpyOps
|
||||
from thinc.backends.cblas cimport CBlas, saxpy, sgemm
|
||||
from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d
|
||||
from thinc.types import Ints1d, Ints2d
|
||||
|
||||
from ..errors import Errors
|
||||
from ..pipeline._parser_internals import _beam_utils
|
||||
from ..pipeline._parser_internals.batch import GreedyBatch
|
||||
from ..pipeline._parser_internals._parser_utils cimport arg_max
|
||||
from ..pipeline._parser_internals.transition_system cimport c_transition_batch, c_apply_actions
|
||||
from ..pipeline._parser_internals.transition_system cimport TransitionSystem
|
||||
from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
|
||||
from ..tokens.doc import Doc
|
||||
from ..util import registry
|
||||
|
||||
|
||||
State = Any # TODO
|
||||
|
||||
|
||||
@registry.layers("spacy.TransitionModel.v2")
|
||||
def TransitionModel(
|
||||
*,
|
||||
tok2vec: Model[List[Doc], List[Floats2d]],
|
||||
beam_width: int = 1,
|
||||
beam_density: float = 0.0,
|
||||
state_tokens: int,
|
||||
hidden_width: int,
|
||||
maxout_pieces: int,
|
||||
nO: Optional[int] = None,
|
||||
unseen_classes=set(),
|
||||
) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]:
|
||||
"""Set up a transition-based parsing model, using a maxout hidden
|
||||
layer and a linear output layer.
|
||||
"""
|
||||
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
||||
tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width)) # type: ignore
|
||||
tok2vec_projected.set_dim("nO", hidden_width)
|
||||
|
||||
# FIXME: we use `output` as a container for the output layer's
|
||||
# weights and biases. Thinc optimizers cannot handle resizing
|
||||
# of parameters. So, when the parser model is resized, we
|
||||
# construct a new `output` layer, which has a different key in
|
||||
# the optimizer. Once the optimizer supports parameter resizing,
|
||||
# we can replace the `output` layer by `output_W` and `output_b`
|
||||
# parameters in this model.
|
||||
output = Linear(nO=None, nI=hidden_width, init_W=zero_init)
|
||||
|
||||
return Model(
|
||||
name="parser_model",
|
||||
forward=forward,
|
||||
init=init,
|
||||
layers=[tok2vec_projected, output],
|
||||
refs={
|
||||
"tok2vec": tok2vec_projected,
|
||||
"output": output,
|
||||
},
|
||||
params={
|
||||
"hidden_W": None, # Floats2d W for the hidden layer
|
||||
"hidden_b": None, # Floats1d bias for the hidden layer
|
||||
"hidden_pad": None, # Floats1d padding for the hidden layer
|
||||
},
|
||||
dims={
|
||||
"nO": None, # Output size
|
||||
"nP": maxout_pieces,
|
||||
"nH": hidden_width,
|
||||
"nI": tok2vec_projected.maybe_get_dim("nO"),
|
||||
"nF": state_tokens,
|
||||
},
|
||||
attrs={
|
||||
"beam_width": beam_width,
|
||||
"beam_density": beam_density,
|
||||
"unseen_classes": set(unseen_classes),
|
||||
"resize_output": resize_output,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def resize_output(model: Model, new_nO: int) -> Model:
|
||||
old_nO = model.maybe_get_dim("nO")
|
||||
output = model.get_ref("output")
|
||||
if old_nO is None:
|
||||
model.set_dim("nO", new_nO)
|
||||
output.set_dim("nO", new_nO)
|
||||
output.initialize()
|
||||
return model
|
||||
elif new_nO <= old_nO:
|
||||
return model
|
||||
elif output.has_param("W"):
|
||||
nH = model.get_dim("nH")
|
||||
new_output = Linear(nO=new_nO, nI=nH, init_W=zero_init)
|
||||
new_output.initialize()
|
||||
new_W = new_output.get_param("W")
|
||||
new_b = new_output.get_param("b")
|
||||
old_W = output.get_param("W")
|
||||
old_b = output.get_param("b")
|
||||
new_W[:old_nO] = old_W # type: ignore
|
||||
new_b[:old_nO] = old_b # type: ignore
|
||||
for i in range(old_nO, new_nO):
|
||||
model.attrs["unseen_classes"].add(i)
|
||||
model.layers[-1] = new_output
|
||||
model.set_ref("output", new_output)
|
||||
# TODO: Avoid this private intrusion
|
||||
model._dims["nO"] = new_nO
|
||||
return model
|
||||
|
||||
|
||||
def init(
|
||||
model,
|
||||
X: Optional[Tuple[List[Doc], TransitionSystem]] = None,
|
||||
Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
|
||||
):
|
||||
if X is not None:
|
||||
docs, moves = X
|
||||
model.get_ref("tok2vec").initialize(X=docs)
|
||||
else:
|
||||
model.get_ref("tok2vec").initialize()
|
||||
inferred_nO = _infer_nO(Y)
|
||||
if inferred_nO is not None:
|
||||
current_nO = model.maybe_get_dim("nO")
|
||||
if current_nO is None or current_nO != inferred_nO:
|
||||
model.attrs["resize_output"](model, inferred_nO)
|
||||
nO = model.get_dim("nO")
|
||||
nP = model.get_dim("nP")
|
||||
nH = model.get_dim("nH")
|
||||
nI = model.get_dim("nI")
|
||||
nF = model.get_dim("nF")
|
||||
ops = model.ops
|
||||
|
||||
Wl = ops.alloc2f(nH * nP, nF * nI)
|
||||
bl = ops.alloc1f(nH * nP)
|
||||
padl = ops.alloc1f(nI)
|
||||
# Wl = zero_init(ops, Wl.shape)
|
||||
Wl = glorot_uniform_init(ops, Wl.shape)
|
||||
padl = uniform_init(ops, padl.shape) # type: ignore
|
||||
# TODO: Experiment with whether better to initialize output_W
|
||||
model.set_param("hidden_W", Wl)
|
||||
model.set_param("hidden_b", bl)
|
||||
model.set_param("hidden_pad", padl)
|
||||
# model = _lsuv_init(model)
|
||||
return model
|
||||
|
||||
|
||||
class TransitionModelInputs:
|
||||
"""
|
||||
Input to transition model.
|
||||
"""
|
||||
|
||||
# dataclass annotation is not yet supported in Cython 0.29.x,
|
||||
# so, we'll do something close to it.
|
||||
|
||||
actions: Optional[List[Ints1d]]
|
||||
docs: List[Doc]
|
||||
max_moves: int
|
||||
moves: TransitionSystem
|
||||
states: Optional[List[State]]
|
||||
|
||||
__slots__ = [
|
||||
"actions",
|
||||
"docs",
|
||||
"max_moves",
|
||||
"moves",
|
||||
"states",
|
||||
]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
docs: List[Doc],
|
||||
moves: TransitionSystem,
|
||||
actions: Optional[List[Ints1d]]=None,
|
||||
max_moves: int=0,
|
||||
states: Optional[List[State]]=None):
|
||||
"""
|
||||
actions (Optional[List[Ints1d]]): actions to apply for each Doc.
|
||||
docs (List[Doc]): Docs to predict transition sequences for.
|
||||
max_moves: (int): the maximum number of moves to apply, values less
|
||||
than 1 will apply moves to states until they are final states.
|
||||
moves (TransitionSystem): the transition system to use when predicting
|
||||
the transition sequences.
|
||||
states (Optional[List[States]]): the initial states to predict the
|
||||
transition sequences for. When absent, the initial states are
|
||||
initialized from the provided Docs.
|
||||
"""
|
||||
self.actions = actions
|
||||
self.docs = docs
|
||||
self.moves = moves
|
||||
self.max_moves = max_moves
|
||||
self.states = states
|
||||
|
||||
|
||||
def forward(model, inputs: TransitionModelInputs, is_train: bool):
|
||||
docs = inputs.docs
|
||||
moves = inputs.moves
|
||||
actions = inputs.actions
|
||||
|
||||
beam_width = model.attrs["beam_width"]
|
||||
hidden_pad = model.get_param("hidden_pad")
|
||||
tok2vec = model.get_ref("tok2vec")
|
||||
|
||||
states = moves.init_batch(docs) if inputs.states is None else inputs.states
|
||||
tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
|
||||
tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
|
||||
feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
|
||||
seen_mask = _get_seen_mask(model)
|
||||
|
||||
if not is_train and beam_width == 1 and isinstance(model.ops, NumpyOps):
|
||||
# Note: max_moves is only used during training, so we don't need to
|
||||
# pass it to the greedy inference path.
|
||||
return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
|
||||
else:
|
||||
return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
|
||||
feats, backprop_feats, seen_mask, is_train, actions=actions,
|
||||
max_moves=inputs.max_moves)
|
||||
|
||||
|
||||
def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
|
||||
np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None):
|
||||
cdef vector[StateC*] c_states
|
||||
cdef StateClass state
|
||||
for state in states:
|
||||
if not state.is_final():
|
||||
c_states.push_back(state.c)
|
||||
weights = _get_c_weights(model, <float*>feats.data, seen_mask)
|
||||
# Precomputed features have rows for each token, plus one for padding.
|
||||
cdef int n_tokens = feats.shape[0] - 1
|
||||
sizes = _get_c_sizes(model, c_states.size(), n_tokens)
|
||||
cdef CBlas cblas = model.ops.cblas()
|
||||
scores = _parse_batch(cblas, moves, &c_states[0], weights, sizes, actions=actions)
|
||||
|
||||
def backprop(dY):
|
||||
raise ValueError(Errors.E4004)
|
||||
|
||||
return (states, scores), backprop
|
||||
|
||||
cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
|
||||
WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
|
||||
cdef int i, j
|
||||
cdef vector[StateC *] unfinished
|
||||
cdef ActivationsC activations = _alloc_activations(sizes)
|
||||
cdef np.ndarray step_scores
|
||||
cdef np.ndarray step_actions
|
||||
|
||||
scores = []
|
||||
while sizes.states >= 1:
|
||||
step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
|
||||
step_actions = actions[0] if actions is not None else None
|
||||
with nogil:
|
||||
_predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
|
||||
if actions is None:
|
||||
# Validate actions, argmax, take action.
|
||||
c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
|
||||
sizes.states)
|
||||
else:
|
||||
c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
|
||||
for i in range(sizes.states):
|
||||
if not states[i].is_final():
|
||||
unfinished.push_back(states[i])
|
||||
for i in range(unfinished.size()):
|
||||
states[i] = unfinished[i]
|
||||
sizes.states = unfinished.size()
|
||||
scores.append(step_scores)
|
||||
unfinished.clear()
|
||||
actions = actions[1:] if actions is not None else None
|
||||
_free_activations(&activations)
|
||||
|
||||
return scores
|
||||
|
||||
|
||||
def _forward_fallback(
|
||||
model: Model,
|
||||
moves: TransitionSystem,
|
||||
states: List[StateClass],
|
||||
tokvecs, backprop_tok2vec,
|
||||
feats,
|
||||
backprop_feats,
|
||||
seen_mask,
|
||||
is_train: bool,
|
||||
actions: Optional[List[Ints1d]]=None,
|
||||
max_moves: int=0):
|
||||
nF = model.get_dim("nF")
|
||||
output = model.get_ref("output")
|
||||
hidden_b = model.get_param("hidden_b")
|
||||
nH = model.get_dim("nH")
|
||||
nP = model.get_dim("nP")
|
||||
|
||||
beam_width = model.attrs["beam_width"]
|
||||
beam_density = model.attrs["beam_density"]
|
||||
|
||||
ops = model.ops
|
||||
|
||||
all_ids = []
|
||||
all_which = []
|
||||
all_statevecs = []
|
||||
all_scores = []
|
||||
if beam_width == 1:
|
||||
batch = GreedyBatch(moves, states, None)
|
||||
else:
|
||||
batch = _beam_utils.BeamBatch(
|
||||
moves, states, None, width=beam_width, density=beam_density
|
||||
)
|
||||
arange = ops.xp.arange(nF)
|
||||
n_moves = 0
|
||||
while not batch.is_done:
|
||||
ids = numpy.zeros((len(batch.get_unfinished_states()), nF), dtype="i")
|
||||
for i, state in enumerate(batch.get_unfinished_states()):
|
||||
state.set_context_tokens(ids, i, nF)
|
||||
# Sum the state features, add the bias and apply the activation (maxout)
|
||||
# to create the state vectors.
|
||||
preacts2f = feats[ids, arange].sum(axis=1) # type: ignore
|
||||
preacts2f += hidden_b
|
||||
preacts = ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
|
||||
assert preacts.shape[0] == len(batch.get_unfinished_states()), preacts.shape
|
||||
statevecs, which = ops.maxout(preacts)
|
||||
# We don't use output's backprop, since we want to backprop for
|
||||
# all states at once, rather than a single state.
|
||||
scores = output.predict(statevecs)
|
||||
scores[:, seen_mask] = ops.xp.nanmin(scores)
|
||||
# Transition the states, filtering out any that are finished.
|
||||
cpu_scores = ops.to_numpy(scores)
|
||||
if actions is None:
|
||||
batch.advance(cpu_scores)
|
||||
else:
|
||||
batch.advance_with_actions(actions[0])
|
||||
actions = actions[1:]
|
||||
all_scores.append(scores)
|
||||
if is_train:
|
||||
# Remember intermediate results for the backprop.
|
||||
all_ids.append(ids)
|
||||
all_statevecs.append(statevecs)
|
||||
all_which.append(which)
|
||||
if n_moves >= max_moves >= 1:
|
||||
break
|
||||
n_moves += 1
|
||||
|
||||
def backprop_parser(d_states_d_scores):
|
||||
ids = ops.xp.vstack(all_ids)
|
||||
which = ops.xp.vstack(all_which)
|
||||
statevecs = ops.xp.vstack(all_statevecs)
|
||||
_, d_scores = d_states_d_scores
|
||||
if model.attrs.get("unseen_classes"):
|
||||
# If we have a negative gradient (i.e. the probability should
|
||||
# increase) on any classes we filtered out as unseen, mark
|
||||
# them as seen.
|
||||
for clas in set(model.attrs["unseen_classes"]):
|
||||
if (d_scores[:, clas] < 0).any():
|
||||
model.attrs["unseen_classes"].remove(clas)
|
||||
d_scores *= seen_mask == False
|
||||
# Calculate the gradients for the parameters of the output layer.
|
||||
# The weight gemm is (nS, nO) @ (nS, nH).T
|
||||
output.inc_grad("b", d_scores.sum(axis=0))
|
||||
output.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True))
|
||||
# Now calculate d_statevecs, by backproping through the output linear layer.
|
||||
# This gemm is (nS, nO) @ (nO, nH)
|
||||
output_W = output.get_param("W")
|
||||
d_statevecs = ops.gemm(d_scores, output_W)
|
||||
# Backprop through the maxout activation
|
||||
d_preacts = ops.backprop_maxout(d_statevecs, which, nP)
|
||||
d_preacts2f = ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
|
||||
model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
|
||||
# We don't need to backprop the summation, because we pass back the IDs instead
|
||||
d_state_features = backprop_feats((d_preacts2f, ids))
|
||||
d_tokvecs = ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
|
||||
ops.scatter_add(d_tokvecs, ids, d_state_features)
|
||||
model.inc_grad("hidden_pad", d_tokvecs[-1])
|
||||
return (backprop_tok2vec(d_tokvecs[:-1]), None)
|
||||
|
||||
return (list(batch), all_scores), backprop_parser
|
||||
|
||||
|
||||
def _get_seen_mask(model: Model) -> numpy.array[bool, 1]:
|
||||
mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool")
|
||||
for class_ in model.attrs.get("unseen_classes", set()):
|
||||
mask[class_] = True
|
||||
return mask
|
||||
|
||||
|
||||
def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
|
||||
W: Floats2d = model.get_param("hidden_W")
|
||||
nF = model.get_dim("nF")
|
||||
nH = model.get_dim("nH")
|
||||
nP = model.get_dim("nP")
|
||||
nI = model.get_dim("nI")
|
||||
# The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH *nP, nI)
|
||||
W3f = model.ops.reshape3f(W, nH * nP, nF, nI)
|
||||
W3f = W3f.transpose((1, 0, 2))
|
||||
W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI)
|
||||
assert X.shape == (X.shape[0], nI), X.shape
|
||||
Yf_ = model.ops.gemm(X, W2f, trans2=True)
|
||||
Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP)
|
||||
|
||||
def backward(dY_ids: Tuple[Floats3d, Ints2d]):
|
||||
# This backprop is particularly tricky, because we get back a different
|
||||
# thing from what we put out. We put out an array of shape:
|
||||
# (nB, nF, nH, nP), and get back:
|
||||
# (nB, nH, nP) and ids (nB, nF)
|
||||
# The ids tell us the values of nF, so we would have:
|
||||
#
|
||||
# dYf = zeros((nB, nF, nH, nP))
|
||||
# for b in range(nB):
|
||||
# for f in range(nF):
|
||||
# dYf[b, ids[b, f]] += dY[b]
|
||||
#
|
||||
# However, we avoid building that array for efficiency -- and just pass
|
||||
# in the indices.
|
||||
dY, ids = dY_ids
|
||||
dXf = model.ops.gemm(dY, W)
|
||||
Xf = X[ids].reshape((ids.shape[0], -1))
|
||||
dW = model.ops.gemm(dY, Xf, trans1=True)
|
||||
model.inc_grad("hidden_W", dW)
|
||||
return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)
|
||||
|
||||
return Yf, backward
|
||||
|
||||
|
||||
def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
|
||||
if Y is None:
|
||||
return None
|
||||
_, scores = Y
|
||||
if len(scores) == 0:
|
||||
return None
|
||||
assert scores[0].shape[0] >= 1
|
||||
assert len(scores[0].shape) == 2
|
||||
return scores[0].shape[1]
|
||||
|
||||
|
||||
def _lsuv_init(model: Model):
|
||||
"""This is like the 'layer sequential unit variance', but instead
|
||||
of taking the actual inputs, we randomly generate whitened data.
|
||||
|
||||
Why's this all so complicated? We have a huge number of inputs,
|
||||
and the maxout unit makes guessing the dynamics tricky. Instead
|
||||
we set the maxout weights to values that empirically result in
|
||||
whitened outputs given whitened inputs.
|
||||
"""
|
||||
W = model.maybe_get_param("hidden_W")
|
||||
if W is not None and W.any():
|
||||
return
|
||||
|
||||
nF = model.get_dim("nF")
|
||||
nH = model.get_dim("nH")
|
||||
nP = model.get_dim("nP")
|
||||
nI = model.get_dim("nI")
|
||||
W = model.ops.alloc4f(nF, nH, nP, nI)
|
||||
b = model.ops.alloc2f(nH, nP)
|
||||
pad = model.ops.alloc4f(1, nF, nH, nP)
|
||||
|
||||
ops = model.ops
|
||||
W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
|
||||
pad = normal_init(ops, pad.shape, mean=1.0)
|
||||
model.set_param("W", W)
|
||||
model.set_param("b", b)
|
||||
model.set_param("pad", pad)
|
||||
|
||||
ids = ops.alloc_f((5000, nF), dtype="f")
|
||||
ids += ops.xp.random.uniform(0, 1000, ids.shape)
|
||||
ids = ops.asarray(ids, dtype="i")
|
||||
tokvecs = ops.alloc_f((5000, nI), dtype="f")
|
||||
tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
|
||||
tokvecs.shape
|
||||
)
|
||||
|
||||
def predict(ids, tokvecs):
|
||||
# nS ids. nW tokvecs. Exclude the padding array.
|
||||
hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False)
|
||||
vectors = model.ops.alloc2f(ids.shape[0], nH * nP)
|
||||
# need nS vectors
|
||||
hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP))
|
||||
model.ops.scatter_add(vectors, ids.flatten(), hiddens)
|
||||
vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP)
|
||||
vectors3f += b
|
||||
return model.ops.maxout(vectors3f)[0]
|
||||
|
||||
tol_var = 0.01
|
||||
tol_mean = 0.01
|
||||
t_max = 10
|
||||
W = cast(Floats4d, model.get_param("hidden_W").copy())
|
||||
b = cast(Floats2d, model.get_param("hidden_b").copy())
|
||||
for t_i in range(t_max):
|
||||
acts1 = predict(ids, tokvecs)
|
||||
var = model.ops.xp.var(acts1)
|
||||
mean = model.ops.xp.mean(acts1)
|
||||
if abs(var - 1.0) >= tol_var:
|
||||
W /= model.ops.xp.sqrt(var)
|
||||
model.set_param("hidden_W", W)
|
||||
elif abs(mean) >= tol_mean:
|
||||
b -= mean
|
||||
model.set_param("hidden_b", b)
|
||||
else:
|
||||
break
|
||||
return model
|
||||
|
||||
|
||||
cdef WeightsC _get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *:
|
||||
output = model.get_ref("output")
|
||||
cdef np.ndarray hidden_b = model.get_param("hidden_b")
|
||||
cdef np.ndarray output_W = output.get_param("W")
|
||||
cdef np.ndarray output_b = output.get_param("b")
|
||||
|
||||
cdef WeightsC weights
|
||||
weights.feat_weights = feats
|
||||
weights.feat_bias = <const float*>hidden_b.data
|
||||
weights.hidden_weights = <const float *> output_W.data
|
||||
weights.hidden_bias = <const float *> output_b.data
|
||||
weights.seen_mask = <const int8_t*> seen_mask.data
|
||||
|
||||
return weights
|
||||
|
||||
|
||||
cdef SizesC _get_c_sizes(model, int batch_size, int tokens) except *:
|
||||
cdef SizesC sizes
|
||||
sizes.states = batch_size
|
||||
sizes.classes = model.get_dim("nO")
|
||||
sizes.hiddens = model.get_dim("nH")
|
||||
sizes.pieces = model.get_dim("nP")
|
||||
sizes.feats = model.get_dim("nF")
|
||||
sizes.embed_width = model.get_dim("nI")
|
||||
sizes.tokens = tokens
|
||||
return sizes
|
||||
|
||||
|
||||
cdef ActivationsC _alloc_activations(SizesC n) nogil:
|
||||
cdef ActivationsC A
|
||||
memset(&A, 0, sizeof(A))
|
||||
_resize_activations(&A, n)
|
||||
return A
|
||||
|
||||
|
||||
cdef void _free_activations(const ActivationsC* A) nogil:
|
||||
free(A.token_ids)
|
||||
free(A.unmaxed)
|
||||
free(A.hiddens)
|
||||
free(A.is_valid)
|
||||
|
||||
|
||||
cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
|
||||
if n.states <= A._max_size:
|
||||
A._curr_size = n.states
|
||||
return
|
||||
if A._max_size == 0:
|
||||
A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
|
||||
A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
|
||||
A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
|
||||
A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
|
||||
A._max_size = n.states
|
||||
else:
|
||||
A.token_ids = <int*>realloc(A.token_ids,
|
||||
n.states * n.feats * sizeof(A.token_ids[0]))
|
||||
A.unmaxed = <float*>realloc(A.unmaxed,
|
||||
n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
|
||||
A.hiddens = <float*>realloc(A.hiddens,
|
||||
n.states * n.hiddens * sizeof(A.hiddens[0]))
|
||||
A.is_valid = <int*>realloc(A.is_valid,
|
||||
n.states * n.classes * sizeof(A.is_valid[0]))
|
||||
A._max_size = n.states
|
||||
A._curr_size = n.states
|
||||
|
||||
|
||||
cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil:
|
||||
_resize_activations(A, n)
|
||||
for i in range(n.states):
|
||||
states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
|
||||
memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
|
||||
_sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n)
|
||||
for i in range(n.states):
|
||||
saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
|
||||
for j in range(n.hiddens):
|
||||
index = i * n.hiddens * n.pieces + j * n.pieces
|
||||
which = arg_max(&A.unmaxed[index], n.pieces)
|
||||
A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
|
||||
if W.hidden_weights == NULL:
|
||||
memcpy(scores, A.hiddens, n.states * n.classes * sizeof(float))
|
||||
else:
|
||||
# Compute hidden-to-output
|
||||
sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
|
||||
1.0, <const float *>A.hiddens, n.hiddens,
|
||||
<const float *>W.hidden_weights, n.hiddens,
|
||||
0.0, scores, n.classes)
|
||||
# Add bias
|
||||
for i in range(n.states):
|
||||
saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
|
||||
# Set unseen classes to minimum value
|
||||
i = 0
|
||||
min_ = scores[0]
|
||||
for i in range(1, n.states * n.classes):
|
||||
if scores[i] < min_:
|
||||
min_ = scores[i]
|
||||
for i in range(n.states):
|
||||
for j in range(n.classes):
|
||||
if W.seen_mask[j]:
|
||||
scores[i*n.classes+j] = min_
|
||||
|
||||
|
||||
cdef void _sum_state_features(CBlas cblas, float* output,
|
||||
const float* cached, const int* token_ids, SizesC n) nogil:
|
||||
cdef int idx, b, f, i
|
||||
cdef const float* feature
|
||||
cdef int B = n.states
|
||||
cdef int O = n.hiddens * n.pieces
|
||||
cdef int F = n.feats
|
||||
cdef int T = n.tokens
|
||||
padding = cached + (T * F * O)
|
||||
cdef int id_stride = F*O
|
||||
cdef float one = 1.
|
||||
for b in range(B):
|
||||
for f in range(F):
|
||||
if token_ids[f] < 0:
|
||||
feature = &padding[f*O]
|
||||
else:
|
||||
idx = token_ids[f] * id_stride + f*O
|
||||
feature = &cached[idx]
|
||||
saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
|
||||
token_ids += F
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
from ...typedefs cimport class_t, hash_t
|
||||
|
||||
# These are passed as callbacks to thinc.search.Beam
|
||||
# These are passed as callbacks to .search.Beam
|
||||
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1
|
||||
|
||||
cdef int check_final_state(void* _state, void* extra_args) except -1
|
||||
|
|
|
@ -3,17 +3,17 @@
|
|||
cimport numpy as np
|
||||
import numpy
|
||||
from cpython.ref cimport PyObject, Py_XDECREF
|
||||
from thinc.extra.search cimport Beam
|
||||
from thinc.extra.search import MaxViolation
|
||||
from thinc.extra.search cimport MaxViolation
|
||||
|
||||
from ...typedefs cimport hash_t, class_t
|
||||
from .transition_system cimport TransitionSystem, Transition
|
||||
from ...errors import Errors
|
||||
from .batch cimport Batch
|
||||
from .search cimport Beam, MaxViolation
|
||||
from .search import MaxViolation
|
||||
from .stateclass cimport StateC, StateClass
|
||||
|
||||
|
||||
# These are passed as callbacks to thinc.search.Beam
|
||||
# These are passed as callbacks to .search.Beam
|
||||
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
|
||||
dest = <StateC*>_dest
|
||||
src = <StateC*>_src
|
||||
|
@ -27,7 +27,7 @@ cdef int check_final_state(void* _state, void* extra_args) except -1:
|
|||
return state.is_final()
|
||||
|
||||
|
||||
cdef class BeamBatch(object):
|
||||
cdef class BeamBatch(Batch):
|
||||
cdef public TransitionSystem moves
|
||||
cdef public object states
|
||||
cdef public object docs
|
||||
|
|
2
spacy/pipeline/_parser_internals/_parser_utils.pxd
Normal file
2
spacy/pipeline/_parser_internals/_parser_utils.pxd
Normal file
|
@ -0,0 +1,2 @@
|
|||
cdef int arg_max(const float* scores, const int n_classes) nogil
|
||||
cdef int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil
|
22
spacy/pipeline/_parser_internals/_parser_utils.pyx
Normal file
22
spacy/pipeline/_parser_internals/_parser_utils.pyx
Normal file
|
@ -0,0 +1,22 @@
|
|||
# cython: infer_types=True
|
||||
|
||||
cdef inline int arg_max(const float* scores, const int n_classes) nogil:
|
||||
if n_classes == 2:
|
||||
return 0 if scores[0] > scores[1] else 1
|
||||
cdef int i
|
||||
cdef int best = 0
|
||||
cdef float mode = scores[0]
|
||||
for i in range(1, n_classes):
|
||||
if scores[i] > mode:
|
||||
mode = scores[i]
|
||||
best = i
|
||||
return best
|
||||
|
||||
|
||||
cdef inline int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil:
|
||||
cdef int best = -1
|
||||
for i in range(n):
|
||||
if is_valid[i] >= 1:
|
||||
if best == -1 or scores[i] > scores[best]:
|
||||
best = i
|
||||
return best
|
|
@ -6,7 +6,6 @@ cimport libcpp
|
|||
from libcpp.unordered_map cimport unordered_map
|
||||
from libcpp.vector cimport vector
|
||||
from libcpp.set cimport set
|
||||
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
||||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
from ...vocab cimport EMPTY_LEXEME
|
||||
|
@ -26,7 +25,7 @@ cdef struct ArcC:
|
|||
|
||||
|
||||
cdef cppclass StateC:
|
||||
int* _heads
|
||||
vector[int] _heads
|
||||
const TokenC* _sent
|
||||
vector[int] _stack
|
||||
vector[int] _rebuffer
|
||||
|
@ -34,31 +33,34 @@ cdef cppclass StateC:
|
|||
unordered_map[int, vector[ArcC]] _left_arcs
|
||||
unordered_map[int, vector[ArcC]] _right_arcs
|
||||
vector[libcpp.bool] _unshiftable
|
||||
vector[int] history
|
||||
set[int] _sent_starts
|
||||
TokenC _empty_token
|
||||
int length
|
||||
int offset
|
||||
int _b_i
|
||||
|
||||
__init__(const TokenC* sent, int length) nogil:
|
||||
__init__(const TokenC* sent, int length) nogil except +:
|
||||
this._heads.resize(length, -1)
|
||||
this._unshiftable.resize(length, False)
|
||||
|
||||
# Reserve memory ahead of time to minimize allocations during parsing.
|
||||
# The initial capacity set here ideally reflects the expected average-case/majority usage.
|
||||
cdef int init_capacity = 32
|
||||
this._stack.reserve(init_capacity)
|
||||
this._rebuffer.reserve(init_capacity)
|
||||
this._ents.reserve(init_capacity)
|
||||
this._left_arcs.reserve(init_capacity)
|
||||
this._right_arcs.reserve(init_capacity)
|
||||
this.history.reserve(init_capacity)
|
||||
|
||||
this._sent = sent
|
||||
this._heads = <int*>calloc(length, sizeof(int))
|
||||
if not (this._sent and this._heads):
|
||||
with gil:
|
||||
PyErr_SetFromErrno(MemoryError)
|
||||
PyErr_CheckSignals()
|
||||
this.offset = 0
|
||||
this.length = length
|
||||
this._b_i = 0
|
||||
for i in range(length):
|
||||
this._heads[i] = -1
|
||||
this._unshiftable.push_back(0)
|
||||
memset(&this._empty_token, 0, sizeof(TokenC))
|
||||
this._empty_token.lex = &EMPTY_LEXEME
|
||||
|
||||
__dealloc__():
|
||||
free(this._heads)
|
||||
|
||||
void set_context_tokens(int* ids, int n) nogil:
|
||||
cdef int i, j
|
||||
if n == 1:
|
||||
|
@ -131,19 +133,20 @@ cdef cppclass StateC:
|
|||
ids[i] = -1
|
||||
|
||||
int S(int i) nogil const:
|
||||
if i >= this._stack.size():
|
||||
cdef int stack_size = this._stack.size()
|
||||
if i >= stack_size or i < 0:
|
||||
return -1
|
||||
elif i < 0:
|
||||
return -1
|
||||
return this._stack.at(this._stack.size() - (i+1))
|
||||
else:
|
||||
return this._stack[stack_size - (i+1)]
|
||||
|
||||
int B(int i) nogil const:
|
||||
cdef int buf_size = this._rebuffer.size()
|
||||
if i < 0:
|
||||
return -1
|
||||
elif i < this._rebuffer.size():
|
||||
return this._rebuffer.at(this._rebuffer.size() - (i+1))
|
||||
elif i < buf_size:
|
||||
return this._rebuffer[buf_size - (i+1)]
|
||||
else:
|
||||
b_i = this._b_i + (i - this._rebuffer.size())
|
||||
b_i = this._b_i + (i - buf_size)
|
||||
if b_i >= this.length:
|
||||
return -1
|
||||
else:
|
||||
|
@ -242,7 +245,7 @@ cdef cppclass StateC:
|
|||
return 0
|
||||
elif this._sent[word].sent_start == 1:
|
||||
return 1
|
||||
elif this._sent_starts.count(word) >= 1:
|
||||
elif this._sent_starts.const_find(word) != this._sent_starts.const_end():
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
|
@ -327,7 +330,7 @@ cdef cppclass StateC:
|
|||
if item >= this._unshiftable.size():
|
||||
return 0
|
||||
else:
|
||||
return this._unshiftable.at(item)
|
||||
return this._unshiftable[item]
|
||||
|
||||
void set_reshiftable(int item) nogil:
|
||||
if item < this._unshiftable.size():
|
||||
|
@ -347,6 +350,9 @@ cdef cppclass StateC:
|
|||
this._heads[child] = head
|
||||
|
||||
void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
|
||||
cdef vector[ArcC]* arcs
|
||||
cdef ArcC* arc
|
||||
|
||||
arcs_it = heads_arcs.find(h_i)
|
||||
if arcs_it == heads_arcs.end():
|
||||
return
|
||||
|
@ -355,12 +361,12 @@ cdef cppclass StateC:
|
|||
if arcs.size() == 0:
|
||||
return
|
||||
|
||||
arc = arcs.back()
|
||||
arc = &arcs.back()
|
||||
if arc.head == h_i and arc.child == c_i:
|
||||
arcs.pop_back()
|
||||
else:
|
||||
for i in range(arcs.size()-1):
|
||||
arc = arcs.at(i)
|
||||
arc = &deref(arcs)[i]
|
||||
if arc.head == h_i and arc.child == c_i:
|
||||
arc.head = -1
|
||||
arc.child = -1
|
||||
|
@ -400,10 +406,11 @@ cdef cppclass StateC:
|
|||
this._rebuffer = src._rebuffer
|
||||
this._sent_starts = src._sent_starts
|
||||
this._unshiftable = src._unshiftable
|
||||
memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
|
||||
this._heads = src._heads
|
||||
this._ents = src._ents
|
||||
this._left_arcs = src._left_arcs
|
||||
this._right_arcs = src._right_arcs
|
||||
this._b_i = src._b_i
|
||||
this.offset = src.offset
|
||||
this._empty_token = src._empty_token
|
||||
this.history = src.history
|
||||
|
|
|
@ -15,7 +15,7 @@ from ...training.example cimport Example
|
|||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC, ArcC
|
||||
from ...errors import Errors
|
||||
from thinc.extra.search cimport Beam
|
||||
from .search cimport Beam
|
||||
|
||||
cdef weight_t MIN_SCORE = -90000
|
||||
cdef attr_t SUBTOK_LABEL = hash_string('subtok')
|
||||
|
@ -773,6 +773,8 @@ cdef class ArcEager(TransitionSystem):
|
|||
return list(arcs)
|
||||
|
||||
def has_gold(self, Example eg, start=0, end=None):
|
||||
if end is not None and end < 0:
|
||||
end = None
|
||||
for word in eg.y[start:end]:
|
||||
if word.dep != 0:
|
||||
return True
|
||||
|
@ -858,6 +860,7 @@ cdef class ArcEager(TransitionSystem):
|
|||
state.print_state()
|
||||
)))
|
||||
action.do(state.c, action.label)
|
||||
state.c.history.push_back(i)
|
||||
break
|
||||
else:
|
||||
failed = False
|
||||
|
|
2
spacy/pipeline/_parser_internals/batch.pxd
Normal file
2
spacy/pipeline/_parser_internals/batch.pxd
Normal file
|
@ -0,0 +1,2 @@
|
|||
cdef class Batch:
|
||||
pass
|
52
spacy/pipeline/_parser_internals/batch.pyx
Normal file
52
spacy/pipeline/_parser_internals/batch.pyx
Normal file
|
@ -0,0 +1,52 @@
|
|||
from typing import Any
|
||||
|
||||
TransitionSystem = Any # TODO
|
||||
|
||||
cdef class Batch:
|
||||
def advance(self, scores):
|
||||
raise NotImplementedError
|
||||
|
||||
def get_states(self):
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def is_done(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def get_unfinished_states(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def __getitem__(self, i):
|
||||
raise NotImplementedError
|
||||
|
||||
def __len__(self):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class GreedyBatch(Batch):
|
||||
def __init__(self, moves: TransitionSystem, states, golds):
|
||||
self._moves = moves
|
||||
self._states = states
|
||||
self._next_states = [s for s in states if not s.is_final()]
|
||||
|
||||
def advance(self, scores):
|
||||
self._next_states = self._moves.transition_states(self._next_states, scores)
|
||||
|
||||
def advance_with_actions(self, actions):
|
||||
self._next_states = self._moves.apply_actions(self._next_states, actions)
|
||||
|
||||
def get_states(self):
|
||||
return self._states
|
||||
|
||||
@property
|
||||
def is_done(self):
|
||||
return all(s.is_final() for s in self._states)
|
||||
|
||||
def get_unfinished_states(self):
|
||||
return [st for st in self._states if not st.is_final()]
|
||||
|
||||
def __getitem__(self, i):
|
||||
return self._states[i]
|
||||
|
||||
def __len__(self):
|
||||
return len(self._states)
|
|
@ -6,7 +6,6 @@ from libcpp.vector cimport vector
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from collections import Counter
|
||||
from thinc.extra.search cimport Beam
|
||||
|
||||
from ...tokens.doc cimport Doc
|
||||
from ...tokens.span import Span
|
||||
|
@ -17,6 +16,7 @@ from ...attrs cimport IS_SPACE
|
|||
from ...structs cimport TokenC, SpanC
|
||||
from ...training import split_bilu_label
|
||||
from ...training.example cimport Example
|
||||
from .search cimport Beam
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC
|
||||
from .transition_system cimport Transition, do_func_t
|
||||
|
@ -306,6 +306,8 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
for span in eg.y.spans.get(neg_key, []):
|
||||
if span.start >= start and span.end <= end:
|
||||
return True
|
||||
if end is not None and end < 0:
|
||||
end = None
|
||||
for word in eg.y[start:end]:
|
||||
if word.ent_iob != 0:
|
||||
return True
|
||||
|
|
89
spacy/pipeline/_parser_internals/search.pxd
Normal file
89
spacy/pipeline/_parser_internals/search.pxd
Normal file
|
@ -0,0 +1,89 @@
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from libc.stdint cimport uint32_t
|
||||
from libc.stdint cimport uint64_t
|
||||
from libcpp.pair cimport pair
|
||||
from libcpp.queue cimport priority_queue
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from ...typedefs cimport class_t, weight_t, hash_t
|
||||
|
||||
ctypedef pair[weight_t, size_t] Entry
|
||||
ctypedef priority_queue[Entry] Queue
|
||||
|
||||
|
||||
ctypedef int (*trans_func_t)(void* dest, void* src, class_t clas, void* x) except -1
|
||||
|
||||
ctypedef void* (*init_func_t)(Pool mem, int n, void* extra_args) except NULL
|
||||
|
||||
ctypedef int (*del_func_t)(Pool mem, void* state, void* extra_args) except -1
|
||||
|
||||
ctypedef int (*finish_func_t)(void* state, void* extra_args) except -1
|
||||
|
||||
ctypedef hash_t (*hash_func_t)(void* state, void* x) except 0
|
||||
|
||||
|
||||
cdef struct _State:
|
||||
void* content
|
||||
class_t* hist
|
||||
weight_t score
|
||||
weight_t loss
|
||||
int i
|
||||
int t
|
||||
bint is_done
|
||||
|
||||
|
||||
cdef class Beam:
|
||||
cdef Pool mem
|
||||
cdef class_t nr_class
|
||||
cdef class_t width
|
||||
cdef class_t size
|
||||
cdef public weight_t min_density
|
||||
cdef int t
|
||||
cdef readonly bint is_done
|
||||
cdef list histories
|
||||
cdef list _parent_histories
|
||||
cdef weight_t** scores
|
||||
cdef int** is_valid
|
||||
cdef weight_t** costs
|
||||
cdef _State* _parents
|
||||
cdef _State* _states
|
||||
cdef del_func_t del_func
|
||||
|
||||
cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1
|
||||
|
||||
cdef inline void* at(self, int i) nogil:
|
||||
return self._states[i].content
|
||||
|
||||
cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1
|
||||
cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
|
||||
void* extra_args) except -1
|
||||
cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1
|
||||
|
||||
|
||||
cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil:
|
||||
self.scores[i][j] = score
|
||||
self.is_valid[i][j] = is_valid
|
||||
self.costs[i][j] = cost
|
||||
|
||||
cdef int set_row(self, int i, const weight_t* scores, const int* is_valid,
|
||||
const weight_t* costs) except -1
|
||||
cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1
|
||||
|
||||
|
||||
cdef class MaxViolation:
|
||||
cdef Pool mem
|
||||
cdef weight_t cost
|
||||
cdef weight_t delta
|
||||
cdef readonly weight_t p_score
|
||||
cdef readonly weight_t g_score
|
||||
cdef readonly double Z
|
||||
cdef readonly double gZ
|
||||
cdef class_t n
|
||||
cdef readonly list p_hist
|
||||
cdef readonly list g_hist
|
||||
cdef readonly list p_probs
|
||||
cdef readonly list g_probs
|
||||
|
||||
cpdef int check(self, Beam pred, Beam gold) except -1
|
||||
cpdef int check_crf(self, Beam pred, Beam gold) except -1
|
306
spacy/pipeline/_parser_internals/search.pyx
Normal file
306
spacy/pipeline/_parser_internals/search.pyx
Normal file
|
@ -0,0 +1,306 @@
|
|||
# cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True
|
||||
cimport cython
|
||||
from libc.string cimport memset, memcpy
|
||||
from libc.math cimport log, exp
|
||||
import math
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
|
||||
|
||||
cdef class Beam:
|
||||
def __init__(self, class_t nr_class, class_t width, weight_t min_density=0.0):
|
||||
assert nr_class != 0
|
||||
assert width != 0
|
||||
self.nr_class = nr_class
|
||||
self.width = width
|
||||
self.min_density = min_density
|
||||
self.size = 1
|
||||
self.t = 0
|
||||
self.mem = Pool()
|
||||
self.del_func = NULL
|
||||
self._parents = <_State*>self.mem.alloc(self.width, sizeof(_State))
|
||||
self._states = <_State*>self.mem.alloc(self.width, sizeof(_State))
|
||||
cdef int i
|
||||
self.histories = [[] for i in range(self.width)]
|
||||
self._parent_histories = [[] for i in range(self.width)]
|
||||
|
||||
self.scores = <weight_t**>self.mem.alloc(self.width, sizeof(weight_t*))
|
||||
self.is_valid = <int**>self.mem.alloc(self.width, sizeof(weight_t*))
|
||||
self.costs = <weight_t**>self.mem.alloc(self.width, sizeof(weight_t*))
|
||||
for i in range(self.width):
|
||||
self.scores[i] = <weight_t*>self.mem.alloc(self.nr_class, sizeof(weight_t))
|
||||
self.is_valid[i] = <int*>self.mem.alloc(self.nr_class, sizeof(int))
|
||||
self.costs[i] = <weight_t*>self.mem.alloc(self.nr_class, sizeof(weight_t))
|
||||
|
||||
def __len__(self):
|
||||
return self.size
|
||||
|
||||
property score:
|
||||
def __get__(self):
|
||||
return self._states[0].score
|
||||
|
||||
property min_score:
|
||||
def __get__(self):
|
||||
return self._states[self.size-1].score
|
||||
|
||||
property loss:
|
||||
def __get__(self):
|
||||
return self._states[0].loss
|
||||
|
||||
property probs:
|
||||
def __get__(self):
|
||||
return _softmax([self._states[i].score for i in range(self.size)])
|
||||
|
||||
property scores:
|
||||
def __get__(self):
|
||||
return [self._states[i].score for i in range(self.size)]
|
||||
|
||||
property histories:
|
||||
def __get__(self):
|
||||
return self.histories
|
||||
|
||||
cdef int set_row(self, int i, const weight_t* scores, const int* is_valid,
|
||||
const weight_t* costs) except -1:
|
||||
cdef int j
|
||||
for j in range(self.nr_class):
|
||||
self.scores[i][j] = scores[j]
|
||||
self.is_valid[i][j] = is_valid[j]
|
||||
self.costs[i][j] = costs[j]
|
||||
|
||||
cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1:
|
||||
cdef int i, j
|
||||
for i in range(self.width):
|
||||
memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class)
|
||||
memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class)
|
||||
memcpy(self.costs[i], costs[i], sizeof(int) * self.nr_class)
|
||||
|
||||
cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1:
|
||||
for i in range(self.width):
|
||||
self._states[i].content = init_func(self.mem, n, extra_args)
|
||||
self._parents[i].content = init_func(self.mem, n, extra_args)
|
||||
self.del_func = del_func
|
||||
|
||||
def __dealloc__(self):
|
||||
if self.del_func == NULL:
|
||||
return
|
||||
|
||||
for i in range(self.width):
|
||||
self.del_func(self.mem, self._states[i].content, NULL)
|
||||
self.del_func(self.mem, self._parents[i].content, NULL)
|
||||
|
||||
@cython.cdivision(True)
|
||||
cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func,
|
||||
void* extra_args) except -1:
|
||||
cdef weight_t** scores = self.scores
|
||||
cdef int** is_valid = self.is_valid
|
||||
cdef weight_t** costs = self.costs
|
||||
|
||||
cdef Queue* q = new Queue()
|
||||
self._fill(q, scores, is_valid)
|
||||
# For a beam of width k, we only ever need 2k state objects. How?
|
||||
# Each transition takes a parent and a class and produces a new state.
|
||||
# So, we don't need the whole history --- just the parent. So at
|
||||
# each step, we take a parent, and apply one or more extensions to
|
||||
# it.
|
||||
self._parents, self._states = self._states, self._parents
|
||||
self._parent_histories, self.histories = self.histories, self._parent_histories
|
||||
cdef weight_t score
|
||||
cdef int p_i
|
||||
cdef int i = 0
|
||||
cdef class_t clas
|
||||
cdef _State* parent
|
||||
cdef _State* state
|
||||
cdef hash_t key
|
||||
cdef PreshMap seen_states = PreshMap(self.width)
|
||||
cdef uint64_t is_seen
|
||||
cdef uint64_t one = 1
|
||||
while i < self.width and not q.empty():
|
||||
data = q.top()
|
||||
p_i = data.second / self.nr_class
|
||||
clas = data.second % self.nr_class
|
||||
score = data.first
|
||||
q.pop()
|
||||
parent = &self._parents[p_i]
|
||||
# Indicates terminal state reached; i.e. state is done
|
||||
if parent.is_done:
|
||||
# Now parent will not be changed, so we don't have to copy.
|
||||
# Once finished, should also be unbranching.
|
||||
self._states[i], parent[0] = parent[0], self._states[i]
|
||||
parent.i = self._states[i].i
|
||||
parent.t = self._states[i].t
|
||||
parent.is_done = self._states[i].t
|
||||
self._states[i].score = score
|
||||
self.histories[i] = list(self._parent_histories[p_i])
|
||||
i += 1
|
||||
else:
|
||||
state = &self._states[i]
|
||||
# The supplied transition function should adjust the destination
|
||||
# state to be the result of applying the class to the source state
|
||||
transition_func(state.content, parent.content, clas, extra_args)
|
||||
key = hash_func(state.content, extra_args) if hash_func is not NULL else 0
|
||||
is_seen = <uint64_t>seen_states.get(key)
|
||||
if key == 0 or key == 1 or not is_seen:
|
||||
if key != 0 and key != 1:
|
||||
seen_states.set(key, <void*>one)
|
||||
state.score = score
|
||||
state.loss = parent.loss + costs[p_i][clas]
|
||||
self.histories[i] = list(self._parent_histories[p_i])
|
||||
self.histories[i].append(clas)
|
||||
i += 1
|
||||
del q
|
||||
self.size = i
|
||||
assert self.size >= 1
|
||||
for i in range(self.width):
|
||||
memset(self.scores[i], 0, sizeof(weight_t) * self.nr_class)
|
||||
memset(self.costs[i], 0, sizeof(weight_t) * self.nr_class)
|
||||
memset(self.is_valid[i], 0, sizeof(int) * self.nr_class)
|
||||
self.t += 1
|
||||
|
||||
cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1:
|
||||
cdef int i
|
||||
for i in range(self.size):
|
||||
if not self._states[i].is_done:
|
||||
self._states[i].is_done = finish_func(self._states[i].content, extra_args)
|
||||
for i in range(self.size):
|
||||
if not self._states[i].is_done:
|
||||
self.is_done = False
|
||||
break
|
||||
else:
|
||||
self.is_done = True
|
||||
|
||||
@cython.cdivision(True)
|
||||
cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1:
|
||||
"""Populate the queue from a k * n matrix of scores, where k is the
|
||||
beam-width, and n is the number of classes.
|
||||
"""
|
||||
cdef Entry entry
|
||||
cdef weight_t score
|
||||
cdef _State* s
|
||||
cdef int i, j, move_id
|
||||
assert self.size >= 1
|
||||
cdef vector[Entry] entries
|
||||
for i in range(self.size):
|
||||
s = &self._states[i]
|
||||
move_id = i * self.nr_class
|
||||
if s.is_done:
|
||||
# Update score by path average, following TACL '13 paper.
|
||||
if self.histories[i]:
|
||||
entry.first = s.score + (s.score / self.t)
|
||||
else:
|
||||
entry.first = s.score
|
||||
entry.second = move_id
|
||||
entries.push_back(entry)
|
||||
else:
|
||||
for j in range(self.nr_class):
|
||||
if is_valid[i][j]:
|
||||
entry.first = s.score + scores[i][j]
|
||||
entry.second = move_id + j
|
||||
entries.push_back(entry)
|
||||
cdef double max_, Z, cutoff
|
||||
if self.min_density == 0.0:
|
||||
for i in range(entries.size()):
|
||||
q.push(entries[i])
|
||||
elif not entries.empty():
|
||||
max_ = entries[0].first
|
||||
Z = 0.
|
||||
cutoff = 0.
|
||||
# Softmax into probabilities, so we can prune
|
||||
for i in range(entries.size()):
|
||||
if entries[i].first > max_:
|
||||
max_ = entries[i].first
|
||||
for i in range(entries.size()):
|
||||
Z += exp(entries[i].first-max_)
|
||||
cutoff = (1. / Z) * self.min_density
|
||||
for i in range(entries.size()):
|
||||
prob = exp(entries[i].first-max_) / Z
|
||||
if prob >= cutoff:
|
||||
q.push(entries[i])
|
||||
|
||||
|
||||
cdef class MaxViolation:
|
||||
def __init__(self):
|
||||
self.p_score = 0.0
|
||||
self.g_score = 0.0
|
||||
self.Z = 0.0
|
||||
self.gZ = 0.0
|
||||
self.delta = -1
|
||||
self.cost = 0
|
||||
self.p_hist = []
|
||||
self.g_hist = []
|
||||
self.p_probs = []
|
||||
self.g_probs = []
|
||||
|
||||
cpdef int check(self, Beam pred, Beam gold) except -1:
|
||||
cdef _State* p = &pred._states[0]
|
||||
cdef _State* g = &gold._states[0]
|
||||
cdef weight_t d = p.score - g.score
|
||||
if p.loss >= 1 and (self.cost == 0 or d > self.delta):
|
||||
self.cost = p.loss
|
||||
self.delta = d
|
||||
self.p_hist = list(pred.histories[0])
|
||||
self.g_hist = list(gold.histories[0])
|
||||
self.p_score = p.score
|
||||
self.g_score = g.score
|
||||
self.Z = 1e-10
|
||||
self.gZ = 1e-10
|
||||
for i in range(pred.size):
|
||||
if pred._states[i].loss > 0:
|
||||
self.Z += exp(pred._states[i].score)
|
||||
for i in range(gold.size):
|
||||
if gold._states[i].loss == 0:
|
||||
prob = exp(gold._states[i].score)
|
||||
self.Z += prob
|
||||
self.gZ += prob
|
||||
|
||||
cpdef int check_crf(self, Beam pred, Beam gold) except -1:
|
||||
d = pred.score - gold.score
|
||||
seen_golds = set([tuple(gold.histories[i]) for i in range(gold.size)])
|
||||
if pred.loss > 0 and (self.cost == 0 or d > self.delta):
|
||||
p_hist = []
|
||||
p_scores = []
|
||||
g_hist = []
|
||||
g_scores = []
|
||||
for i in range(pred.size):
|
||||
if pred._states[i].loss > 0:
|
||||
p_scores.append(pred._states[i].score)
|
||||
p_hist.append(list(pred.histories[i]))
|
||||
# This can happen from non-monotonic actions
|
||||
# If we find a better gold analysis this way, be sure to keep it.
|
||||
elif pred._states[i].loss <= 0 \
|
||||
and tuple(pred.histories[i]) not in seen_golds:
|
||||
g_scores.append(pred._states[i].score)
|
||||
g_hist.append(list(pred.histories[i]))
|
||||
for i in range(gold.size):
|
||||
if gold._states[i].loss == 0:
|
||||
g_scores.append(gold._states[i].score)
|
||||
g_hist.append(list(gold.histories[i]))
|
||||
|
||||
all_probs = _softmax(p_scores + g_scores)
|
||||
p_probs = all_probs[:len(p_scores)]
|
||||
g_probs_all = all_probs[len(p_scores):]
|
||||
g_probs = _softmax(g_scores)
|
||||
|
||||
self.cost = pred.loss
|
||||
self.delta = d
|
||||
self.p_hist = p_hist
|
||||
self.g_hist = g_hist
|
||||
# TODO: These variables are misnamed! These are the gradients of the loss.
|
||||
self.p_probs = p_probs
|
||||
# Intuition here:
|
||||
# The gradient of the loss is:
|
||||
# P(model) - P(truth)
|
||||
# Normally, P(truth) is 1 for the gold
|
||||
# But, if we want to do the "partial credit" scheme, we want
|
||||
# to create a distribution over the gold, proportional to the scores
|
||||
# awarded.
|
||||
self.g_probs = [x-y for x, y in zip(g_probs_all, g_probs)]
|
||||
|
||||
|
||||
def _softmax(nums):
|
||||
if not nums:
|
||||
return []
|
||||
max_ = max(nums)
|
||||
nums = [(exp(n-max_) if n is not None else None) for n in nums]
|
||||
Z = sum(n for n in nums if n is not None)
|
||||
return [(n/Z if n is not None else None) for n in nums]
|
|
@ -20,6 +20,10 @@ cdef class StateClass:
|
|||
if self._borrowed != 1:
|
||||
del self.c
|
||||
|
||||
@property
|
||||
def history(self):
|
||||
return list(self.c.history)
|
||||
|
||||
@property
|
||||
def stack(self):
|
||||
return [self.S(i) for i in range(self.c.stack_depth())]
|
||||
|
@ -176,3 +180,6 @@ cdef class StateClass:
|
|||
|
||||
def clone(self, StateClass src):
|
||||
self.c.clone(src.c)
|
||||
|
||||
def set_context_tokens(self, int[:, :] output, int row, int n_feats):
|
||||
self.c.set_context_tokens(&output[row, 0], n_feats)
|
||||
|
|
|
@ -53,3 +53,10 @@ cdef class TransitionSystem:
|
|||
|
||||
cdef int set_costs(self, int* is_valid, weight_t* costs,
|
||||
const StateC* state, gold) except -1
|
||||
|
||||
|
||||
cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
|
||||
int batch_size) nogil
|
||||
|
||||
cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
|
||||
int nr_class, int batch_size) nogil
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
# cython: infer_types=True
|
||||
from __future__ import print_function
|
||||
from cymem.cymem cimport Pool
|
||||
from libc.stdlib cimport calloc, free
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from collections import Counter
|
||||
import srsly
|
||||
|
@ -10,6 +12,7 @@ from ...typedefs cimport weight_t, attr_t
|
|||
from ...tokens.doc cimport Doc
|
||||
from ...structs cimport TokenC
|
||||
from .stateclass cimport StateClass
|
||||
from ._parser_utils cimport arg_max_if_valid
|
||||
|
||||
from ...errors import Errors
|
||||
from ... import util
|
||||
|
@ -73,7 +76,18 @@ cdef class TransitionSystem:
|
|||
offset += len(doc)
|
||||
return states
|
||||
|
||||
def follow_history(self, doc, history):
|
||||
cdef int clas
|
||||
cdef StateClass state = StateClass(doc)
|
||||
for clas in history:
|
||||
action = self.c[clas]
|
||||
action.do(state.c, action.label)
|
||||
state.c.history.push_back(clas)
|
||||
return state
|
||||
|
||||
def get_oracle_sequence(self, Example example, _debug=False):
|
||||
if not self.has_gold(example):
|
||||
return []
|
||||
states, golds, _ = self.init_gold_batch([example])
|
||||
if not states:
|
||||
return []
|
||||
|
@ -85,6 +99,8 @@ cdef class TransitionSystem:
|
|||
return self.get_oracle_sequence_from_state(state, gold)
|
||||
|
||||
def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None):
|
||||
if state.is_final():
|
||||
return []
|
||||
cdef Pool mem = Pool()
|
||||
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
|
||||
assert self.n_moves > 0
|
||||
|
@ -110,6 +126,7 @@ cdef class TransitionSystem:
|
|||
"S0 head?", str(state.has_head(state.S(0))),
|
||||
)))
|
||||
action.do(state.c, action.label)
|
||||
state.c.history.push_back(i)
|
||||
break
|
||||
else:
|
||||
if _debug:
|
||||
|
@ -137,6 +154,28 @@ cdef class TransitionSystem:
|
|||
raise ValueError(Errors.E170.format(name=name))
|
||||
action = self.lookup_transition(name)
|
||||
action.do(state.c, action.label)
|
||||
state.c.history.push_back(action.clas)
|
||||
|
||||
def apply_actions(self, states, const int[::1] actions):
|
||||
assert len(states) == actions.shape[0]
|
||||
cdef StateClass state
|
||||
cdef vector[StateC*] c_states
|
||||
c_states.resize(len(states))
|
||||
cdef int i
|
||||
for (i, state) in enumerate(states):
|
||||
c_states[i] = state.c
|
||||
c_apply_actions(self, &c_states[0], &actions[0], actions.shape[0])
|
||||
return [state for state in states if not state.c.is_final()]
|
||||
|
||||
def transition_states(self, states, float[:, ::1] scores):
|
||||
assert len(states) == scores.shape[0]
|
||||
cdef StateClass state
|
||||
cdef float* c_scores = &scores[0, 0]
|
||||
cdef vector[StateC*] c_states
|
||||
for state in states:
|
||||
c_states.push_back(state.c)
|
||||
c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0])
|
||||
return [state for state in states if not state.c.is_final()]
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *:
|
||||
raise NotImplementedError
|
||||
|
@ -250,3 +289,35 @@ cdef class TransitionSystem:
|
|||
self.cfg.update(msg['cfg'])
|
||||
self.initialize_actions(labels)
|
||||
return self
|
||||
|
||||
|
||||
cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
|
||||
int batch_size) nogil:
|
||||
cdef int i
|
||||
cdef Transition action
|
||||
cdef StateC* state
|
||||
for i in range(batch_size):
|
||||
state = states[i]
|
||||
action = moves.c[actions[i]]
|
||||
action.do(state, action.label)
|
||||
state.history.push_back(action.clas)
|
||||
|
||||
|
||||
cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
|
||||
int nr_class, int batch_size) nogil:
|
||||
is_valid = <int*>calloc(moves.n_moves, sizeof(int))
|
||||
cdef int i, guess
|
||||
cdef Transition action
|
||||
for i in range(batch_size):
|
||||
moves.set_valid(is_valid, states[i])
|
||||
guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
|
||||
if guess == -1:
|
||||
# This shouldn't happen, but it's hard to raise an error here,
|
||||
# and we don't want to infinite loop. So, force to end state.
|
||||
states[i].force_final()
|
||||
else:
|
||||
action = moves.c[guess]
|
||||
action.do(states[i], action.label)
|
||||
states[i].history.push_back(guess)
|
||||
free(is_valid)
|
||||
|
||||
|
|
|
@ -4,8 +4,8 @@ from typing import Optional, Iterable, Callable
|
|||
from thinc.api import Model, Config
|
||||
|
||||
from ._parser_internals.transition_system import TransitionSystem
|
||||
from .transition_parser cimport Parser
|
||||
from ._parser_internals.arc_eager cimport ArcEager
|
||||
from .transition_parser import Parser
|
||||
from ._parser_internals.arc_eager import ArcEager
|
||||
|
||||
from .functions import merge_subtokens
|
||||
from ..language import Language
|
||||
|
@ -18,12 +18,11 @@ from ..util import registry
|
|||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.TransitionBasedParser.v2"
|
||||
@architectures = "spacy.TransitionBasedParser.v3"
|
||||
state_type = "parser"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
use_upper = true
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v2"
|
||||
|
@ -123,6 +122,7 @@ def make_parser(
|
|||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"beam_parser",
|
||||
assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
|
||||
|
@ -228,6 +228,7 @@ def parser_score(examples, **kwargs):
|
|||
|
||||
DOCS: https://spacy.io/api/dependencyparser#score
|
||||
"""
|
||||
|
||||
def has_sents(doc):
|
||||
return doc.has_annotation("SENT_START")
|
||||
|
||||
|
@ -235,8 +236,11 @@ def parser_score(examples, **kwargs):
|
|||
dep = getattr(token, attr)
|
||||
dep = token.vocab.strings.as_string(dep).lower()
|
||||
return dep
|
||||
|
||||
results = {}
|
||||
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
|
||||
results.update(
|
||||
Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
|
||||
)
|
||||
kwargs.setdefault("getter", dep_getter)
|
||||
kwargs.setdefault("ignore_labels", ("p", "punct"))
|
||||
results.update(Scorer.score_deps(examples, "dep", **kwargs))
|
||||
|
@ -249,11 +253,12 @@ def make_parser_scorer():
|
|||
return parser_score
|
||||
|
||||
|
||||
cdef class DependencyParser(Parser):
|
||||
class DependencyParser(Parser):
|
||||
"""Pipeline component for dependency parsing.
|
||||
|
||||
DOCS: https://spacy.io/api/dependencyparser
|
||||
"""
|
||||
|
||||
TransitionSystem = ArcEager
|
||||
|
||||
def __init__(
|
||||
|
@ -273,8 +278,7 @@ cdef class DependencyParser(Parser):
|
|||
incorrect_spans_key=None,
|
||||
scorer=parser_score,
|
||||
):
|
||||
"""Create a DependencyParser.
|
||||
"""
|
||||
"""Create a DependencyParser."""
|
||||
super().__init__(
|
||||
vocab,
|
||||
model,
|
|
@ -5,8 +5,9 @@ from itertools import islice
|
|||
import numpy as np
|
||||
|
||||
import srsly
|
||||
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
|
||||
from thinc.api import Config, Model, SequenceCategoricalCrossentropy, NumpyOps
|
||||
from thinc.types import ArrayXd, Floats2d, Ints1d
|
||||
from thinc.legacy import LegacySequenceCategoricalCrossentropy
|
||||
|
||||
from ._edit_tree_internals.edit_trees import EditTrees
|
||||
from ._edit_tree_internals.schemas import validate_edit_tree
|
||||
|
@ -21,6 +22,8 @@ from .. import util
|
|||
|
||||
|
||||
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
|
||||
# The cutoff value of *top_k* above which an alternative method is used to process guesses.
|
||||
TOP_K_GUARDRAIL = 20
|
||||
|
||||
|
||||
default_model_config = """
|
||||
|
@ -124,12 +127,15 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
self.cfg: Dict[str, Any] = {"labels": []}
|
||||
self.scorer = scorer
|
||||
self.save_activations = save_activations
|
||||
self.numpy_ops = NumpyOps()
|
||||
|
||||
def get_loss(
|
||||
self, examples: Iterable[Example], scores: List[Floats2d]
|
||||
) -> Tuple[float, List[Floats2d]]:
|
||||
validate_examples(examples, "EditTreeLemmatizer.get_loss")
|
||||
loss_func = SequenceCategoricalCrossentropy(normalize=False, missing_value=-1)
|
||||
loss_func = LegacySequenceCategoricalCrossentropy(
|
||||
normalize=False, missing_value=-1
|
||||
)
|
||||
|
||||
truths = []
|
||||
for eg in examples:
|
||||
|
@ -137,7 +143,7 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
for (predicted, gold_lemma) in zip(
|
||||
eg.predicted, eg.get_aligned("LEMMA", as_string=True)
|
||||
):
|
||||
if gold_lemma is None:
|
||||
if gold_lemma is None or gold_lemma == "":
|
||||
label = -1
|
||||
else:
|
||||
tree_id = self.trees.add(predicted.text, gold_lemma)
|
||||
|
@ -152,7 +158,38 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
|
||||
return float(loss), d_scores
|
||||
|
||||
def get_teacher_student_loss(
|
||||
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
|
||||
) -> Tuple[float, List[Floats2d]]:
|
||||
"""Calculate the loss and its gradient for a batch of student
|
||||
scores, relative to teacher scores.
|
||||
|
||||
teacher_scores: Scores representing the teacher model's predictions.
|
||||
student_scores: Scores representing the student model's predictions.
|
||||
|
||||
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||
|
||||
DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
|
||||
"""
|
||||
loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
|
||||
d_scores, loss = loss_func(student_scores, teacher_scores)
|
||||
if self.model.ops.xp.isnan(loss):
|
||||
raise ValueError(Errors.E910.format(name=self.name))
|
||||
return float(loss), d_scores
|
||||
|
||||
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
|
||||
if self.top_k == 1:
|
||||
scores2guesses = self._scores2guesses_top_k_equals_1
|
||||
elif self.top_k <= TOP_K_GUARDRAIL:
|
||||
scores2guesses = self._scores2guesses_top_k_greater_1
|
||||
else:
|
||||
scores2guesses = self._scores2guesses_top_k_guardrail
|
||||
# The behaviour of *_scores2guesses_top_k_greater_1()* is efficient for values
|
||||
# of *top_k>1* that are likely to be useful when the edit tree lemmatizer is used
|
||||
# for its principal purpose of lemmatizing tokens. However, the code could also
|
||||
# be used for other purposes, and with very large values of *top_k* the method
|
||||
# becomes inefficient. In such cases, *_scores2guesses_top_k_guardrail()* is used
|
||||
# instead.
|
||||
n_docs = len(list(docs))
|
||||
if not any(len(doc) for doc in docs):
|
||||
# Handle cases where there are no tokens in any docs.
|
||||
|
@ -167,20 +204,52 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
return {"probabilities": scores, "tree_ids": guesses}
|
||||
scores = self.model.predict(docs)
|
||||
assert len(scores) == n_docs
|
||||
guesses = self._scores2guesses(docs, scores)
|
||||
guesses = scores2guesses(docs, scores)
|
||||
assert len(guesses) == n_docs
|
||||
return {"probabilities": scores, "tree_ids": guesses}
|
||||
|
||||
def _scores2guesses(self, docs, scores):
|
||||
def _scores2guesses_top_k_equals_1(self, docs, scores):
|
||||
guesses = []
|
||||
for doc, doc_scores in zip(docs, scores):
|
||||
if self.top_k == 1:
|
||||
doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1)
|
||||
else:
|
||||
doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
|
||||
doc_guesses = doc_scores.argmax(axis=1)
|
||||
doc_guesses = self.numpy_ops.asarray(doc_guesses)
|
||||
|
||||
if not isinstance(doc_guesses, np.ndarray):
|
||||
doc_guesses = doc_guesses.get()
|
||||
doc_compat_guesses = []
|
||||
for i, token in enumerate(doc):
|
||||
tree_id = self.cfg["labels"][doc_guesses[i]]
|
||||
if self.trees.apply(tree_id, token.text) is not None:
|
||||
doc_compat_guesses.append(tree_id)
|
||||
else:
|
||||
doc_compat_guesses.append(-1)
|
||||
guesses.append(np.array(doc_compat_guesses))
|
||||
|
||||
return guesses
|
||||
|
||||
def _scores2guesses_top_k_greater_1(self, docs, scores):
|
||||
guesses = []
|
||||
top_k = min(self.top_k, len(self.labels))
|
||||
for doc, doc_scores in zip(docs, scores):
|
||||
doc_scores = self.numpy_ops.asarray(doc_scores)
|
||||
doc_compat_guesses = []
|
||||
for i, token in enumerate(doc):
|
||||
for _ in range(top_k):
|
||||
candidate = int(doc_scores[i].argmax())
|
||||
candidate_tree_id = self.cfg["labels"][candidate]
|
||||
if self.trees.apply(candidate_tree_id, token.text) is not None:
|
||||
doc_compat_guesses.append(candidate_tree_id)
|
||||
break
|
||||
doc_scores[i, candidate] = np.finfo(np.float32).min
|
||||
else:
|
||||
doc_compat_guesses.append(-1)
|
||||
guesses.append(np.array(doc_compat_guesses))
|
||||
|
||||
return guesses
|
||||
|
||||
def _scores2guesses_top_k_guardrail(self, docs, scores):
|
||||
guesses = []
|
||||
for doc, doc_scores in zip(docs, scores):
|
||||
doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
|
||||
doc_guesses = self.numpy_ops.asarray(doc_guesses)
|
||||
|
||||
doc_compat_guesses = []
|
||||
for token, candidates in zip(doc, doc_guesses):
|
||||
|
@ -347,9 +416,9 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
|
||||
tree = dict(tree)
|
||||
if "orig" in tree:
|
||||
tree["orig"] = self.vocab.strings[tree["orig"]]
|
||||
tree["orig"] = self.vocab.strings.add(tree["orig"])
|
||||
if "orig" in tree:
|
||||
tree["subst"] = self.vocab.strings[tree["subst"]]
|
||||
tree["subst"] = self.vocab.strings.add(tree["subst"])
|
||||
|
||||
trees.append(tree)
|
||||
|
||||
|
|
|
@ -451,7 +451,11 @@ class EntityLinker(TrainablePipe):
|
|||
docs_ents: List[Ragged] = []
|
||||
docs_scores: List[Ragged] = []
|
||||
if not docs:
|
||||
return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
|
||||
return {
|
||||
KNOWLEDGE_BASE_IDS: final_kb_ids,
|
||||
"ents": docs_ents,
|
||||
"scores": docs_scores,
|
||||
}
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
for doc in docs:
|
||||
|
@ -583,7 +587,11 @@ class EntityLinker(TrainablePipe):
|
|||
method="predict", msg="result variables not of equal length"
|
||||
)
|
||||
raise RuntimeError(err)
|
||||
return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
|
||||
return {
|
||||
KNOWLEDGE_BASE_IDS: final_kb_ids,
|
||||
"ents": docs_ents,
|
||||
"scores": docs_scores,
|
||||
}
|
||||
|
||||
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
|
||||
"""Modify a batch of documents, using pre-computed scores.
|
||||
|
|
0
spacy/pipeline/entityruler.py
Normal file
0
spacy/pipeline/entityruler.py
Normal file
|
@ -1,7 +1,8 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from typing import Callable, Dict, Iterable, List, Optional, Union
|
||||
import srsly
|
||||
from thinc.api import SequenceCategoricalCrossentropy, Model, Config
|
||||
from thinc.api import Model, Config
|
||||
from thinc.legacy import LegacySequenceCategoricalCrossentropy
|
||||
from thinc.types import Floats2d, Ints1d
|
||||
from itertools import islice
|
||||
|
||||
|
@ -288,7 +289,7 @@ class Morphologizer(Tagger):
|
|||
DOCS: https://spacy.io/api/morphologizer#get_loss
|
||||
"""
|
||||
validate_examples(examples, "Morphologizer.get_loss")
|
||||
loss_func = SequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
|
||||
loss_func = LegacySequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
|
||||
truths = []
|
||||
for eg in examples:
|
||||
eg_truths = []
|
||||
|
|
|
@ -1,221 +0,0 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from typing import Optional
|
||||
import numpy
|
||||
from thinc.api import CosineDistance, to_categorical, Model, Config
|
||||
from thinc.api import set_dropout_rate
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
|
||||
from .trainable_pipe import TrainablePipe
|
||||
from .tagger import Tagger
|
||||
from ..training import validate_examples
|
||||
from ..language import Language
|
||||
from ._parser_internals import nonproj
|
||||
from ..attrs import POS, ID
|
||||
from ..errors import Errors
|
||||
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.MultiTask.v1"
|
||||
maxout_pieces = 3
|
||||
token_vector_width = 96
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v2"
|
||||
pretrained_vectors = null
|
||||
width = 96
|
||||
depth = 4
|
||||
embed_size = 2000
|
||||
window_size = 1
|
||||
maxout_pieces = 2
|
||||
subword_features = true
|
||||
"""
|
||||
DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"]
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"nn_labeller",
|
||||
default_config={"labels": None, "target": "dep_tag_offset", "model": DEFAULT_MT_MODEL}
|
||||
)
|
||||
def make_nn_labeller(nlp: Language, name: str, model: Model, labels: Optional[dict], target: str):
|
||||
return MultitaskObjective(nlp.vocab, model, name)
|
||||
|
||||
|
||||
class MultitaskObjective(Tagger):
|
||||
"""Experimental: Assist training of a parser or tagger, by training a
|
||||
side-objective.
|
||||
"""
|
||||
|
||||
def __init__(self, vocab, model, name="nn_labeller", *, target):
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
self.name = name
|
||||
if target == "dep":
|
||||
self.make_label = self.make_dep
|
||||
elif target == "tag":
|
||||
self.make_label = self.make_tag
|
||||
elif target == "ent":
|
||||
self.make_label = self.make_ent
|
||||
elif target == "dep_tag_offset":
|
||||
self.make_label = self.make_dep_tag_offset
|
||||
elif target == "ent_tag":
|
||||
self.make_label = self.make_ent_tag
|
||||
elif target == "sent_start":
|
||||
self.make_label = self.make_sent_start
|
||||
elif hasattr(target, "__call__"):
|
||||
self.make_label = target
|
||||
else:
|
||||
raise ValueError(Errors.E016)
|
||||
cfg = {"labels": {}, "target": target}
|
||||
self.cfg = dict(cfg)
|
||||
|
||||
@property
|
||||
def labels(self):
|
||||
return self.cfg.setdefault("labels", {})
|
||||
|
||||
@labels.setter
|
||||
def labels(self, value):
|
||||
self.cfg["labels"] = value
|
||||
|
||||
def set_annotations(self, docs, dep_ids):
|
||||
pass
|
||||
|
||||
def initialize(self, get_examples, nlp=None, labels=None):
|
||||
if not hasattr(get_examples, "__call__"):
|
||||
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
|
||||
raise ValueError(err)
|
||||
if labels is not None:
|
||||
self.labels = labels
|
||||
else:
|
||||
for example in get_examples():
|
||||
for token in example.y:
|
||||
label = self.make_label(token)
|
||||
if label is not None and label not in self.labels:
|
||||
self.labels[label] = len(self.labels)
|
||||
self.model.initialize() # TODO: fix initialization by defining X and Y
|
||||
|
||||
def predict(self, docs):
|
||||
tokvecs = self.model.get_ref("tok2vec")(docs)
|
||||
scores = self.model.get_ref("softmax")(tokvecs)
|
||||
return tokvecs, scores
|
||||
|
||||
def get_loss(self, examples, scores):
|
||||
cdef int idx = 0
|
||||
correct = numpy.zeros((scores.shape[0],), dtype="i")
|
||||
guesses = scores.argmax(axis=1)
|
||||
docs = [eg.predicted for eg in examples]
|
||||
for i, eg in enumerate(examples):
|
||||
# Handles alignment for tokenization differences
|
||||
doc_annots = eg.get_aligned() # TODO
|
||||
for j in range(len(eg.predicted)):
|
||||
tok_annots = {key: values[j] for key, values in tok_annots.items()}
|
||||
label = self.make_label(j, tok_annots)
|
||||
if label is None or label not in self.labels:
|
||||
correct[idx] = guesses[idx]
|
||||
else:
|
||||
correct[idx] = self.labels[label]
|
||||
idx += 1
|
||||
correct = self.model.ops.xp.array(correct, dtype="i")
|
||||
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
|
||||
loss = (d_scores**2).sum()
|
||||
return float(loss), d_scores
|
||||
|
||||
@staticmethod
|
||||
def make_dep(token):
|
||||
return token.dep_
|
||||
|
||||
@staticmethod
|
||||
def make_tag(token):
|
||||
return token.tag_
|
||||
|
||||
@staticmethod
|
||||
def make_ent(token):
|
||||
if token.ent_iob_ == "O":
|
||||
return "O"
|
||||
else:
|
||||
return token.ent_iob_ + "-" + token.ent_type_
|
||||
|
||||
@staticmethod
|
||||
def make_dep_tag_offset(token):
|
||||
dep = token.dep_
|
||||
tag = token.tag_
|
||||
offset = token.head.i - token.i
|
||||
offset = min(offset, 2)
|
||||
offset = max(offset, -2)
|
||||
return f"{dep}-{tag}:{offset}"
|
||||
|
||||
@staticmethod
|
||||
def make_ent_tag(token):
|
||||
if token.ent_iob_ == "O":
|
||||
ent = "O"
|
||||
else:
|
||||
ent = token.ent_iob_ + "-" + token.ent_type_
|
||||
tag = token.tag_
|
||||
return f"{tag}-{ent}"
|
||||
|
||||
@staticmethod
|
||||
def make_sent_start(token):
|
||||
"""A multi-task objective for representing sentence boundaries,
|
||||
using BILU scheme. (O is impossible)
|
||||
"""
|
||||
if token.is_sent_start and token.is_sent_end:
|
||||
return "U-SENT"
|
||||
elif token.is_sent_start:
|
||||
return "B-SENT"
|
||||
else:
|
||||
return "I-SENT"
|
||||
|
||||
|
||||
class ClozeMultitask(TrainablePipe):
|
||||
def __init__(self, vocab, model, **cfg):
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
self.cfg = cfg
|
||||
self.distance = CosineDistance(ignore_zeros=True, normalize=False) # TODO: in config
|
||||
|
||||
def set_annotations(self, docs, dep_ids):
|
||||
pass
|
||||
|
||||
def initialize(self, get_examples, nlp=None):
|
||||
self.model.initialize() # TODO: fix initialization by defining X and Y
|
||||
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
|
||||
self.model.output_layer.initialize(X)
|
||||
|
||||
def predict(self, docs):
|
||||
tokvecs = self.model.get_ref("tok2vec")(docs)
|
||||
vectors = self.model.get_ref("output_layer")(tokvecs)
|
||||
return tokvecs, vectors
|
||||
|
||||
def get_loss(self, examples, vectors, prediction):
|
||||
validate_examples(examples, "ClozeMultitask.get_loss")
|
||||
# The simplest way to implement this would be to vstack the
|
||||
# token.vector values, but that's a bit inefficient, especially on GPU.
|
||||
# Instead we fetch the index into the vectors table for each of our tokens,
|
||||
# and look them up all at once. This prevents data copying.
|
||||
ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples])
|
||||
target = vectors[ids]
|
||||
gradient = self.distance.get_grad(prediction, target)
|
||||
loss = self.distance.get_loss(prediction, target)
|
||||
return float(loss), gradient
|
||||
|
||||
def update(self, examples, *, drop=0., sgd=None, losses=None):
|
||||
pass
|
||||
|
||||
def rehearse(self, examples, drop=0., sgd=None, losses=None):
|
||||
if losses is not None and self.name not in losses:
|
||||
losses[self.name] = 0.
|
||||
set_dropout_rate(self.model, drop)
|
||||
validate_examples(examples, "ClozeMultitask.rehearse")
|
||||
docs = [eg.predicted for eg in examples]
|
||||
predictions, bp_predictions = self.model.begin_update()
|
||||
loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
|
||||
bp_predictions(d_predictions)
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
if losses is not None:
|
||||
losses[self.name] += loss
|
||||
return losses
|
||||
|
||||
def add_label(self, label):
|
||||
raise NotImplementedError
|
|
@ -4,22 +4,22 @@ from typing import Optional, Iterable, Callable
|
|||
from thinc.api import Model, Config
|
||||
|
||||
from ._parser_internals.transition_system import TransitionSystem
|
||||
from .transition_parser cimport Parser
|
||||
from ._parser_internals.ner cimport BiluoPushDown
|
||||
from .transition_parser import Parser
|
||||
from ._parser_internals.ner import BiluoPushDown
|
||||
from ..language import Language
|
||||
from ..scorer import get_ner_prf, PRFScore
|
||||
from ..training import validate_examples
|
||||
from ..util import registry
|
||||
from ..training import remove_bilu_prefix
|
||||
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.TransitionBasedParser.v2"
|
||||
@architectures = "spacy.TransitionBasedParser.v3"
|
||||
state_type = "ner"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
use_upper = true
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v2"
|
||||
|
@ -44,8 +44,12 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"incorrect_spans_key": None,
|
||||
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
|
||||
|
||||
default_score_weights={
|
||||
"ents_f": 1.0,
|
||||
"ents_p": 0.0,
|
||||
"ents_r": 0.0,
|
||||
"ents_per_type": None,
|
||||
},
|
||||
)
|
||||
def make_ner(
|
||||
nlp: Language,
|
||||
|
@ -98,6 +102,7 @@ def make_ner(
|
|||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"beam_ner",
|
||||
assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
|
||||
|
@ -111,7 +116,12 @@ def make_ner(
|
|||
"incorrect_spans_key": None,
|
||||
"scorer": None,
|
||||
},
|
||||
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
|
||||
default_score_weights={
|
||||
"ents_f": 1.0,
|
||||
"ents_p": 0.0,
|
||||
"ents_r": 0.0,
|
||||
"ents_per_type": None,
|
||||
},
|
||||
)
|
||||
def make_beam_ner(
|
||||
nlp: Language,
|
||||
|
@ -185,11 +195,12 @@ def make_ner_scorer():
|
|||
return ner_score
|
||||
|
||||
|
||||
cdef class EntityRecognizer(Parser):
|
||||
class EntityRecognizer(Parser):
|
||||
"""Pipeline component for named entity recognition.
|
||||
|
||||
DOCS: https://spacy.io/api/entityrecognizer
|
||||
"""
|
||||
|
||||
TransitionSystem = BiluoPushDown
|
||||
|
||||
def __init__(
|
||||
|
@ -207,15 +218,14 @@ cdef class EntityRecognizer(Parser):
|
|||
incorrect_spans_key=None,
|
||||
scorer=ner_score,
|
||||
):
|
||||
"""Create an EntityRecognizer.
|
||||
"""
|
||||
"""Create an EntityRecognizer."""
|
||||
super().__init__(
|
||||
vocab,
|
||||
model,
|
||||
name,
|
||||
moves,
|
||||
update_with_oracle_cut_size=update_with_oracle_cut_size,
|
||||
min_action_freq=1, # not relevant for NER
|
||||
min_action_freq=1, # not relevant for NER
|
||||
learn_tokens=False, # not relevant for NER
|
||||
beam_width=beam_width,
|
||||
beam_density=beam_density,
|
||||
|
@ -242,8 +252,11 @@ cdef class EntityRecognizer(Parser):
|
|||
def labels(self):
|
||||
# Get the labels from the model by looking at the available moves, e.g.
|
||||
# B-PERSON, I-PERSON, L-PERSON, U-PERSON
|
||||
labels = set(remove_bilu_prefix(move) for move in self.move_names
|
||||
if move[0] in ("B", "I", "L", "U"))
|
||||
labels = set(
|
||||
remove_bilu_prefix(move)
|
||||
for move in self.move_names
|
||||
if move[0] in ("B", "I", "L", "U")
|
||||
)
|
||||
return tuple(sorted(labels))
|
||||
|
||||
def scored_ents(self, beams):
|
|
@ -19,13 +19,6 @@ cdef class Pipe:
|
|||
DOCS: https://spacy.io/api/pipe
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def __init_subclass__(cls, **kwargs):
|
||||
"""Raise a warning if an inheriting class implements 'begin_training'
|
||||
(from v2) instead of the new 'initialize' method (from v3)"""
|
||||
if hasattr(cls, "begin_training"):
|
||||
warnings.warn(Warnings.W088.format(name=cls.__name__))
|
||||
|
||||
def __call__(self, Doc doc) -> Doc:
|
||||
"""Apply the pipe to one document. The document is modified in place,
|
||||
and returned. This usually happens under the hood when the nlp object
|
||||
|
@ -94,6 +87,10 @@ cdef class Pipe:
|
|||
return self.scorer(examples, **scorer_kwargs)
|
||||
return {}
|
||||
|
||||
@property
|
||||
def is_distillable(self) -> bool:
|
||||
return False
|
||||
|
||||
@property
|
||||
def is_trainable(self) -> bool:
|
||||
return False
|
||||
|
|
|
@ -3,7 +3,9 @@ from typing import Dict, Iterable, Optional, Callable, List, Union
|
|||
from itertools import islice
|
||||
|
||||
import srsly
|
||||
from thinc.api import Model, SequenceCategoricalCrossentropy, Config
|
||||
from thinc.api import Model, Config
|
||||
from thinc.legacy import LegacySequenceCategoricalCrossentropy
|
||||
|
||||
from thinc.types import Floats2d, Ints1d
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
|
@ -160,7 +162,7 @@ class SentenceRecognizer(Tagger):
|
|||
"""
|
||||
validate_examples(examples, "SentenceRecognizer.get_loss")
|
||||
labels = self.labels
|
||||
loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
|
||||
loss_func = LegacySequenceCategoricalCrossentropy(names=labels, normalize=False)
|
||||
truths = []
|
||||
for eg in examples:
|
||||
eg_truth = []
|
||||
|
|
|
@ -13,6 +13,7 @@ from ..util import ensure_path, SimpleFrozenList, registry
|
|||
from ..tokens import Doc, Span
|
||||
from ..scorer import Scorer, get_ner_prf
|
||||
from ..matcher import Matcher, PhraseMatcher
|
||||
from ..matcher.levenshtein import levenshtein_compare
|
||||
from .. import util
|
||||
|
||||
PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
|
||||
|
@ -28,6 +29,7 @@ DEFAULT_SPANS_KEY = "ruler"
|
|||
"overwrite_ents": False,
|
||||
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
|
||||
"ent_id_sep": "__unused__",
|
||||
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
|
||||
},
|
||||
default_score_weights={
|
||||
"ents_f": 1.0,
|
||||
|
@ -40,6 +42,7 @@ def make_entity_ruler(
|
|||
nlp: Language,
|
||||
name: str,
|
||||
phrase_matcher_attr: Optional[Union[int, str]],
|
||||
matcher_fuzzy_compare: Callable,
|
||||
validate: bool,
|
||||
overwrite_ents: bool,
|
||||
scorer: Optional[Callable],
|
||||
|
@ -57,6 +60,7 @@ def make_entity_ruler(
|
|||
annotate_ents=True,
|
||||
ents_filter=ents_filter,
|
||||
phrase_matcher_attr=phrase_matcher_attr,
|
||||
matcher_fuzzy_compare=matcher_fuzzy_compare,
|
||||
validate=validate,
|
||||
overwrite=False,
|
||||
scorer=scorer,
|
||||
|
@ -81,6 +85,7 @@ def make_entity_ruler_scorer():
|
|||
"annotate_ents": False,
|
||||
"ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"},
|
||||
"phrase_matcher_attr": None,
|
||||
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
|
||||
"validate": False,
|
||||
"overwrite": True,
|
||||
"scorer": {
|
||||
|
@ -103,6 +108,7 @@ def make_span_ruler(
|
|||
annotate_ents: bool,
|
||||
ents_filter: Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]],
|
||||
phrase_matcher_attr: Optional[Union[int, str]],
|
||||
matcher_fuzzy_compare: Callable,
|
||||
validate: bool,
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
|
@ -115,6 +121,7 @@ def make_span_ruler(
|
|||
annotate_ents=annotate_ents,
|
||||
ents_filter=ents_filter,
|
||||
phrase_matcher_attr=phrase_matcher_attr,
|
||||
matcher_fuzzy_compare=matcher_fuzzy_compare,
|
||||
validate=validate,
|
||||
overwrite=overwrite,
|
||||
scorer=scorer,
|
||||
|
@ -179,7 +186,7 @@ def prioritize_existing_ents_filter(
|
|||
|
||||
|
||||
@registry.misc("spacy.prioritize_existing_ents_filter.v1")
|
||||
def make_preverse_existing_ents_filter():
|
||||
def make_preserve_existing_ents_filter():
|
||||
return prioritize_existing_ents_filter
|
||||
|
||||
|
||||
|
@ -225,6 +232,7 @@ class SpanRuler(Pipe):
|
|||
[Iterable[Span], Iterable[Span]], Iterable[Span]
|
||||
] = util.filter_chain_spans,
|
||||
phrase_matcher_attr: Optional[Union[int, str]] = None,
|
||||
matcher_fuzzy_compare: Callable = levenshtein_compare,
|
||||
validate: bool = False,
|
||||
overwrite: bool = False,
|
||||
scorer: Optional[Callable] = partial(
|
||||
|
@ -255,6 +263,9 @@ class SpanRuler(Pipe):
|
|||
phrase_matcher_attr (Optional[Union[int, str]]): Token attribute to
|
||||
match on, passed to the internal PhraseMatcher as `attr`. Defaults
|
||||
to `None`.
|
||||
matcher_fuzzy_compare (Callable): The fuzzy comparison method for the
|
||||
internal Matcher. Defaults to
|
||||
spacy.matcher.levenshtein.levenshtein_compare.
|
||||
validate (bool): Whether patterns should be validated, passed to
|
||||
Matcher and PhraseMatcher as `validate`.
|
||||
overwrite (bool): Whether to remove any existing spans under this spans
|
||||
|
@ -275,6 +286,7 @@ class SpanRuler(Pipe):
|
|||
self.spans_filter = spans_filter
|
||||
self.ents_filter = ents_filter
|
||||
self.scorer = scorer
|
||||
self.matcher_fuzzy_compare = matcher_fuzzy_compare
|
||||
self._match_label_id_map: Dict[int, Dict[str, str]] = {}
|
||||
self.clear()
|
||||
|
||||
|
@ -460,7 +472,11 @@ class SpanRuler(Pipe):
|
|||
DOCS: https://spacy.io/api/spanruler#clear
|
||||
"""
|
||||
self._patterns: List[PatternType] = []
|
||||
self.matcher: Matcher = Matcher(self.nlp.vocab, validate=self.validate)
|
||||
self.matcher: Matcher = Matcher(
|
||||
self.nlp.vocab,
|
||||
validate=self.validate,
|
||||
fuzzy_compare=self.matcher_fuzzy_compare,
|
||||
)
|
||||
self.phrase_matcher: PhraseMatcher = PhraseMatcher(
|
||||
self.nlp.vocab,
|
||||
attr=self.phrase_matcher_attr,
|
||||
|
|
|
@ -1,12 +1,11 @@
|
|||
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
|
||||
from typing import Union
|
||||
from typing import Union, Protocol, runtime_checkable
|
||||
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
|
||||
from thinc.api import Optimizer
|
||||
from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
|
||||
from thinc.types import Ragged, Ints2d, Floats2d
|
||||
|
||||
import numpy
|
||||
|
||||
from ..compat import Protocol, runtime_checkable
|
||||
from ..scorer import Scorer
|
||||
from ..language import Language
|
||||
from .trainable_pipe import TrainablePipe
|
||||
|
@ -282,7 +281,10 @@ class SpanCategorizer(TrainablePipe):
|
|||
DOCS: https://spacy.io/api/spancategorizer#predict
|
||||
"""
|
||||
indices = self.suggester(docs, ops=self.model.ops)
|
||||
scores = self.model.predict((docs, indices)) # type: ignore
|
||||
if indices.lengths.sum() == 0:
|
||||
scores = self.model.ops.alloc2f(0, 0)
|
||||
else:
|
||||
scores = self.model.predict((docs, indices)) # type: ignore
|
||||
return {"indices": indices, "scores": scores}
|
||||
|
||||
def set_candidates(
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from typing import Callable, Dict, Iterable, List, Optional, Union
|
||||
from typing import Tuple
|
||||
import numpy
|
||||
import srsly
|
||||
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
|
||||
from thinc.api import Model, set_dropout_rate, Config
|
||||
from thinc.legacy import LegacySequenceCategoricalCrossentropy
|
||||
from thinc.types import Floats2d, Ints1d
|
||||
import warnings
|
||||
from itertools import islice
|
||||
|
@ -242,7 +244,6 @@ class Tagger(TrainablePipe):
|
|||
|
||||
DOCS: https://spacy.io/api/tagger#rehearse
|
||||
"""
|
||||
loss_func = SequenceCategoricalCrossentropy()
|
||||
if losses is None:
|
||||
losses = {}
|
||||
losses.setdefault(self.name, 0.0)
|
||||
|
@ -256,12 +257,32 @@ class Tagger(TrainablePipe):
|
|||
set_dropout_rate(self.model, drop)
|
||||
tag_scores, bp_tag_scores = self.model.begin_update(docs)
|
||||
tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs)
|
||||
grads, loss = loss_func(tag_scores, tutor_tag_scores)
|
||||
loss, grads = self.get_teacher_student_loss(tutor_tag_scores, tag_scores)
|
||||
bp_tag_scores(grads)
|
||||
self.finish_update(sgd)
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
losses[self.name] += loss
|
||||
return losses
|
||||
|
||||
def get_teacher_student_loss(
|
||||
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
|
||||
) -> Tuple[float, List[Floats2d]]:
|
||||
"""Calculate the loss and its gradient for a batch of student
|
||||
scores, relative to teacher scores.
|
||||
|
||||
teacher_scores: Scores representing the teacher model's predictions.
|
||||
student_scores: Scores representing the student model's predictions.
|
||||
|
||||
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
|
||||
"""
|
||||
loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
|
||||
d_scores, loss = loss_func(student_scores, teacher_scores)
|
||||
if self.model.ops.xp.isnan(loss):
|
||||
raise ValueError(Errors.E910.format(name=self.name))
|
||||
return float(loss), d_scores
|
||||
|
||||
def get_loss(self, examples, scores):
|
||||
"""Find the loss and gradient of loss for the batch of documents and
|
||||
their predicted scores.
|
||||
|
@ -273,7 +294,7 @@ class Tagger(TrainablePipe):
|
|||
DOCS: https://spacy.io/api/tagger#get_loss
|
||||
"""
|
||||
validate_examples(examples, "Tagger.get_loss")
|
||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"])
|
||||
loss_func = LegacySequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"])
|
||||
# Convert empty tag "" to missing value None so that both misaligned
|
||||
# tokens and tokens with missing annotation have the default missing
|
||||
# value None.
|
||||
|
|
|
@ -77,7 +77,7 @@ subword_features = true
|
|||
default_config={
|
||||
"threshold": 0.0,
|
||||
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
|
||||
"scorer": {"@scorers": "spacy.textcat_scorer.v1"},
|
||||
"scorer": {"@scorers": "spacy.textcat_scorer.v2"},
|
||||
"save_activations": False,
|
||||
},
|
||||
default_score_weights={
|
||||
|
@ -91,7 +91,6 @@ subword_features = true
|
|||
"cats_macro_f": None,
|
||||
"cats_macro_auc": None,
|
||||
"cats_f_per_type": None,
|
||||
"cats_macro_auc_per_type": None,
|
||||
},
|
||||
)
|
||||
def make_textcat(
|
||||
|
@ -131,7 +130,7 @@ def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
|||
)
|
||||
|
||||
|
||||
@registry.scorers("spacy.textcat_scorer.v1")
|
||||
@registry.scorers("spacy.textcat_scorer.v2")
|
||||
def make_textcat_scorer():
|
||||
return textcat_score
|
||||
|
||||
|
@ -169,7 +168,11 @@ class TextCategorizer(TrainablePipe):
|
|||
self.model = model
|
||||
self.name = name
|
||||
self._rehearsal_model = None
|
||||
cfg: Dict[str, Any] = {"labels": [], "threshold": threshold, "positive_label": None}
|
||||
cfg: Dict[str, Any] = {
|
||||
"labels": [],
|
||||
"threshold": threshold,
|
||||
"positive_label": None,
|
||||
}
|
||||
self.cfg = dict(cfg)
|
||||
self.scorer = scorer
|
||||
self.save_activations = save_activations
|
||||
|
@ -416,5 +419,9 @@ class TextCategorizer(TrainablePipe):
|
|||
def _validate_categories(self, examples: Iterable[Example]):
|
||||
"""Check whether the provided examples all have single-label cats annotations."""
|
||||
for ex in examples:
|
||||
if list(ex.reference.cats.values()).count(1.0) > 1:
|
||||
vals = list(ex.reference.cats.values())
|
||||
if vals.count(1.0) > 1:
|
||||
raise ValueError(Errors.E895.format(value=ex.reference.cats))
|
||||
for val in vals:
|
||||
if not (val == 1.0 or val == 0.0):
|
||||
raise ValueError(Errors.E851.format(val=val))
|
||||
|
|
|
@ -74,7 +74,7 @@ subword_features = true
|
|||
default_config={
|
||||
"threshold": 0.5,
|
||||
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
|
||||
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
|
||||
"scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
|
||||
"save_activations": False,
|
||||
},
|
||||
default_score_weights={
|
||||
|
@ -88,7 +88,6 @@ subword_features = true
|
|||
"cats_macro_f": None,
|
||||
"cats_macro_auc": None,
|
||||
"cats_f_per_type": None,
|
||||
"cats_macro_auc_per_type": None,
|
||||
},
|
||||
)
|
||||
def make_multilabel_textcat(
|
||||
|
@ -128,7 +127,7 @@ def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str,
|
|||
)
|
||||
|
||||
|
||||
@registry.scorers("spacy.textcat_multilabel_scorer.v1")
|
||||
@registry.scorers("spacy.textcat_multilabel_scorer.v2")
|
||||
def make_textcat_multilabel_scorer():
|
||||
return textcat_multilabel_score
|
||||
|
||||
|
@ -156,11 +155,8 @@ class MultiLabel_TextCategorizer(TextCategorizer):
|
|||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
threshold (float): Cutoff to consider a prediction "positive".
|
||||
<<<<<<< HEAD
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
=======
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
>>>>>>> upstream/master
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
|
||||
DOCS: https://spacy.io/api/textcategorizer#init
|
||||
"""
|
||||
|
@ -205,6 +201,8 @@ class MultiLabel_TextCategorizer(TextCategorizer):
|
|||
for label in labels:
|
||||
self.add_label(label)
|
||||
subbatch = list(islice(get_examples(), 10))
|
||||
self._validate_categories(subbatch)
|
||||
|
||||
doc_sample = [eg.reference for eg in subbatch]
|
||||
label_sample, _ = self._examples_to_truth(subbatch)
|
||||
self._require_labels()
|
||||
|
@ -215,4 +213,8 @@ class MultiLabel_TextCategorizer(TextCategorizer):
|
|||
def _validate_categories(self, examples: Iterable[Example]):
|
||||
"""This component allows any type of single- or multi-label annotations.
|
||||
This method overwrites the more strict one from 'textcat'."""
|
||||
pass
|
||||
# check that annotation values are valid
|
||||
for ex in examples:
|
||||
for val in ex.reference.cats.values():
|
||||
if not (val == 1.0 or val == 0.0):
|
||||
raise ValueError(Errors.E851.format(val=val))
|
||||
|
|
|
@ -6,7 +6,7 @@ import warnings
|
|||
|
||||
from ..tokens.doc cimport Doc
|
||||
|
||||
from ..training import validate_examples
|
||||
from ..training import validate_examples, validate_distillation_examples
|
||||
from ..errors import Errors, Warnings
|
||||
from .pipe import Pipe, deserialize_config
|
||||
from .. import util
|
||||
|
@ -56,6 +56,53 @@ cdef class TrainablePipe(Pipe):
|
|||
except Exception as e:
|
||||
error_handler(self.name, self, [doc], e)
|
||||
|
||||
|
||||
def distill(self,
|
||||
teacher_pipe: Optional["TrainablePipe"],
|
||||
examples: Iterable["Example"],
|
||||
*,
|
||||
drop: float=0.0,
|
||||
sgd: Optional[Optimizer]=None,
|
||||
losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
|
||||
"""Train a pipe (the student) on the predictions of another pipe
|
||||
(the teacher). The student is typically trained on the probability
|
||||
distribution of the teacher, but details may differ per pipe.
|
||||
|
||||
teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
|
||||
from.
|
||||
examples (Iterable[Example]): Distillation examples. The reference
|
||||
(teacher) and predicted (student) docs must have the same number of
|
||||
tokens and the same orthography.
|
||||
drop (float): dropout rate.
|
||||
sgd (Optional[Optimizer]): An optimizer. Will be created via
|
||||
create_optimizer if not set.
|
||||
losses (Optional[Dict[str, float]]): Optional record of loss during
|
||||
distillation.
|
||||
RETURNS: The updated losses dictionary.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe#distill
|
||||
"""
|
||||
# By default we require a teacher pipe, but there are downstream
|
||||
# implementations that don't require a pipe.
|
||||
if teacher_pipe is None:
|
||||
raise ValueError(Errors.E4002.format(name=self.name))
|
||||
if losses is None:
|
||||
losses = {}
|
||||
losses.setdefault(self.name, 0.0)
|
||||
validate_distillation_examples(examples, "TrainablePipe.distill")
|
||||
set_dropout_rate(self.model, drop)
|
||||
for node in teacher_pipe.model.walk():
|
||||
if node.name == "softmax":
|
||||
node.attrs["softmax_normalize"] = True
|
||||
teacher_scores = teacher_pipe.model.predict([eg.reference for eg in examples])
|
||||
student_scores, bp_student_scores = self.model.begin_update([eg.predicted for eg in examples])
|
||||
loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
|
||||
bp_student_scores(d_scores)
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
losses[self.name] += loss
|
||||
return losses
|
||||
|
||||
def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
|
||||
"""Apply the pipe to a stream of documents. This usually happens under
|
||||
the hood when the nlp object is called on a text and all components are
|
||||
|
@ -169,6 +216,19 @@ cdef class TrainablePipe(Pipe):
|
|||
"""
|
||||
raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_loss", name=self.name))
|
||||
|
||||
def get_teacher_student_loss(self, teacher_scores, student_scores):
|
||||
"""Calculate the loss and its gradient for a batch of student
|
||||
scores, relative to teacher scores.
|
||||
|
||||
teacher_scores: Scores representing the teacher model's predictions.
|
||||
student_scores: Scores representing the student model's predictions.
|
||||
|
||||
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||
|
||||
DOCS: https://spacy.io/api/pipe#get_teacher_student_loss
|
||||
"""
|
||||
raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_teacher_student_loss", name=self.name))
|
||||
|
||||
def create_optimizer(self) -> Optimizer:
|
||||
"""Create an optimizer for the pipeline component.
|
||||
|
||||
|
@ -205,6 +265,14 @@ cdef class TrainablePipe(Pipe):
|
|||
"""
|
||||
raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name))
|
||||
|
||||
@property
|
||||
def is_distillable(self) -> bool:
|
||||
# Normally a pipe overrides `get_teacher_student_loss` to implement
|
||||
# distillation. In more exceptional cases, a pipe can provide its
|
||||
# own `distill` implementation. If neither of these methods is
|
||||
# overridden, the pipe does not implement distillation.
|
||||
return not (self.__class__.distill is TrainablePipe.distill and self.__class__.get_teacher_student_loss is TrainablePipe.get_teacher_student_loss)
|
||||
|
||||
@property
|
||||
def is_trainable(self) -> bool:
|
||||
return True
|
||||
|
|
|
@ -1,21 +0,0 @@
|
|||
from cymem.cymem cimport Pool
|
||||
from thinc.backends.cblas cimport CBlas
|
||||
|
||||
from ..vocab cimport Vocab
|
||||
from .trainable_pipe cimport TrainablePipe
|
||||
from ._parser_internals.transition_system cimport Transition, TransitionSystem
|
||||
from ._parser_internals._state cimport StateC
|
||||
from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
|
||||
|
||||
|
||||
cdef class Parser(TrainablePipe):
|
||||
cdef public object _rehearsal_model
|
||||
cdef readonly TransitionSystem moves
|
||||
cdef public object _multitasks
|
||||
cdef object _cpu_ops
|
||||
|
||||
cdef void _parseC(self, CBlas cblas, StateC** states,
|
||||
WeightsC weights, SizesC sizes) nogil
|
||||
|
||||
cdef void c_transition_batch(self, StateC** states, const float* scores,
|
||||
int nr_class, int batch_size) nogil
|
|
@ -1,5 +1,6 @@
|
|||
# cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
|
||||
from __future__ import print_function
|
||||
from typing import Dict, Iterable, List, Optional, Tuple
|
||||
from cymem.cymem cimport Pool
|
||||
cimport numpy as np
|
||||
from itertools import islice
|
||||
|
@ -7,25 +8,30 @@ from libcpp.vector cimport vector
|
|||
from libc.string cimport memset, memcpy
|
||||
from libc.stdlib cimport calloc, free
|
||||
import random
|
||||
import contextlib
|
||||
|
||||
import srsly
|
||||
from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps
|
||||
from thinc.extra.search cimport Beam
|
||||
from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
|
||||
from thinc.api import chain, softmax_activation, use_ops, get_array_module
|
||||
from thinc.legacy import LegacySequenceCategoricalCrossentropy
|
||||
from thinc.types import Floats2d, Ints1d
|
||||
import numpy.random
|
||||
import numpy
|
||||
import warnings
|
||||
|
||||
from ._parser_internals.stateclass cimport StateClass
|
||||
from ..ml.parser_model cimport alloc_activations, free_activations
|
||||
from ..ml.parser_model cimport predict_states, arg_max_if_valid
|
||||
from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
|
||||
from ..ml.parser_model cimport get_c_weights, get_c_sizes
|
||||
from ..ml.tb_framework import TransitionModelInputs
|
||||
from ._parser_internals.stateclass cimport StateC, StateClass
|
||||
from ._parser_internals.search cimport Beam
|
||||
from ..tokens.doc cimport Doc
|
||||
from .trainable_pipe import TrainablePipe
|
||||
from .trainable_pipe cimport TrainablePipe
|
||||
from ._parser_internals cimport _beam_utils
|
||||
from ._parser_internals import _beam_utils
|
||||
from ..vocab cimport Vocab
|
||||
from ._parser_internals.transition_system cimport Transition, TransitionSystem
|
||||
from ..typedefs cimport weight_t
|
||||
|
||||
from ..training import validate_examples, validate_get_examples
|
||||
from ..training import validate_distillation_examples
|
||||
from ..errors import Errors, Warnings
|
||||
from .. import util
|
||||
|
||||
|
@ -33,7 +39,7 @@ from .. import util
|
|||
NUMPY_OPS = NumpyOps()
|
||||
|
||||
|
||||
cdef class Parser(TrainablePipe):
|
||||
class Parser(TrainablePipe):
|
||||
"""
|
||||
Base class of the DependencyParser and EntityRecognizer.
|
||||
"""
|
||||
|
@ -133,8 +139,9 @@ cdef class Parser(TrainablePipe):
|
|||
@property
|
||||
def move_names(self):
|
||||
names = []
|
||||
cdef TransitionSystem moves = self.moves
|
||||
for i in range(self.moves.n_moves):
|
||||
name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
|
||||
name = self.moves.move_name(moves.c[i].move, moves.c[i].label)
|
||||
# Explicitly removing the internal "U-" token used for blocking entities
|
||||
if name != "U-":
|
||||
names.append(name)
|
||||
|
@ -203,6 +210,118 @@ cdef class Parser(TrainablePipe):
|
|||
# Defined in subclasses, to avoid circular import
|
||||
raise NotImplementedError
|
||||
|
||||
def distill(self,
|
||||
teacher_pipe: Optional[TrainablePipe],
|
||||
examples: Iterable["Example"],
|
||||
*,
|
||||
drop: float=0.0,
|
||||
sgd: Optional[Optimizer]=None,
|
||||
losses: Optional[Dict[str, float]]=None):
|
||||
"""Train a pipe (the student) on the predictions of another pipe
|
||||
(the teacher). The student is trained on the transition probabilities
|
||||
of the teacher.
|
||||
|
||||
teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
|
||||
from.
|
||||
examples (Iterable[Example]): Distillation examples. The reference
|
||||
(teacher) and predicted (student) docs must have the same number of
|
||||
tokens and the same orthography.
|
||||
drop (float): dropout rate.
|
||||
sgd (Optional[Optimizer]): An optimizer. Will be created via
|
||||
create_optimizer if not set.
|
||||
losses (Optional[Dict[str, float]]): Optional record of loss during
|
||||
distillation.
|
||||
RETURNS: The updated losses dictionary.
|
||||
|
||||
DOCS: https://spacy.io/api/dependencyparser#distill
|
||||
"""
|
||||
if teacher_pipe is None:
|
||||
raise ValueError(Errors.E4002.format(name=self.name))
|
||||
if losses is None:
|
||||
losses = {}
|
||||
losses.setdefault(self.name, 0.0)
|
||||
|
||||
validate_distillation_examples(examples, "TransitionParser.distill")
|
||||
|
||||
set_dropout_rate(self.model, drop)
|
||||
|
||||
student_docs = [eg.predicted for eg in examples]
|
||||
|
||||
max_moves = self.cfg["update_with_oracle_cut_size"]
|
||||
if max_moves >= 1:
|
||||
# Chop sequences into lengths of this many words, to make the
|
||||
# batch uniform length. Since we do not have a gold standard
|
||||
# sequence, we use the teacher's predictions as the gold
|
||||
# standard.
|
||||
max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
|
||||
states = self._init_batch(teacher_pipe, student_docs, max_moves)
|
||||
else:
|
||||
states = self.moves.init_batch(student_docs)
|
||||
|
||||
# We distill as follows: 1. we first let the student predict transition
|
||||
# sequences (and the corresponding transition probabilities); (2) we
|
||||
# let the teacher follow the student's predicted transition sequences
|
||||
# to obtain the teacher's transition probabilities; (3) we compute the
|
||||
# gradients of the student's transition distributions relative to the
|
||||
# teacher's distributions.
|
||||
|
||||
student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
|
||||
max_moves=max_moves)
|
||||
(student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
|
||||
actions = states2actions(student_states)
|
||||
teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
|
||||
moves=self.moves, actions=actions)
|
||||
(_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
|
||||
|
||||
loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
|
||||
backprop_scores((student_states, d_scores))
|
||||
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
|
||||
losses[self.name] += loss
|
||||
|
||||
return losses
|
||||
|
||||
|
||||
def get_teacher_student_loss(
|
||||
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
|
||||
normalize: bool=False,
|
||||
) -> Tuple[float, List[Floats2d]]:
|
||||
"""Calculate the loss and its gradient for a batch of student
|
||||
scores, relative to teacher scores.
|
||||
|
||||
teacher_scores: Scores representing the teacher model's predictions.
|
||||
student_scores: Scores representing the student model's predictions.
|
||||
|
||||
RETURNS (Tuple[float, float]): The loss and the gradient.
|
||||
|
||||
DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
|
||||
"""
|
||||
|
||||
# We can't easily hook up a softmax layer in the parsing model, since
|
||||
# the get_loss does additional masking. So, we could apply softmax
|
||||
# manually here and use Thinc's cross-entropy loss. But it's a bit
|
||||
# suboptimal, since we can have a lot of states that would result in
|
||||
# many kernel launches. Futhermore the parsing model's backprop expects
|
||||
# a XP array, so we'd have to concat the softmaxes anyway. So, like
|
||||
# the get_loss implementation, we'll compute the loss and gradients
|
||||
# ourselves.
|
||||
|
||||
teacher_scores = self.model.ops.softmax(self.model.ops.xp.vstack(teacher_scores),
|
||||
axis=-1, inplace=True)
|
||||
student_scores = self.model.ops.softmax(self.model.ops.xp.vstack(student_scores),
|
||||
axis=-1, inplace=True)
|
||||
|
||||
assert teacher_scores.shape == student_scores.shape
|
||||
|
||||
d_scores = student_scores - teacher_scores
|
||||
if normalize:
|
||||
d_scores /= d_scores.shape[0]
|
||||
loss = (d_scores**2).sum() / d_scores.size
|
||||
|
||||
return float(loss), d_scores
|
||||
|
||||
def init_multitask_objectives(self, get_examples, pipeline, **cfg):
|
||||
"""Setup models for secondary objectives, to benefit from multi-task
|
||||
learning. This method is intended to be overridden by subclasses.
|
||||
|
@ -223,9 +342,6 @@ cdef class Parser(TrainablePipe):
|
|||
|
||||
stream: The sequence of documents to process.
|
||||
batch_size (int): Number of documents to accumulate into a working set.
|
||||
error_handler (Callable[[str, List[Doc], Exception], Any]): Function that
|
||||
deals with a failing batch of documents. The default function just reraises
|
||||
the exception.
|
||||
|
||||
YIELDS (Doc): Documents, in order.
|
||||
"""
|
||||
|
@ -247,78 +363,29 @@ cdef class Parser(TrainablePipe):
|
|||
def predict(self, docs):
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
self._ensure_labels_are_added(docs)
|
||||
if not any(len(doc) for doc in docs):
|
||||
result = self.moves.init_batch(docs)
|
||||
return result
|
||||
if self.cfg["beam_width"] == 1:
|
||||
return self.greedy_parse(docs, drop=0.0)
|
||||
else:
|
||||
return self.beam_parse(
|
||||
docs,
|
||||
drop=0.0,
|
||||
beam_width=self.cfg["beam_width"],
|
||||
beam_density=self.cfg["beam_density"]
|
||||
)
|
||||
with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
|
||||
inputs = TransitionModelInputs(docs=docs, moves=self.moves)
|
||||
states_or_beams, _ = self.model.predict(inputs)
|
||||
return states_or_beams
|
||||
|
||||
def greedy_parse(self, docs, drop=0.):
|
||||
cdef vector[StateC*] states
|
||||
cdef StateClass state
|
||||
cdef CBlas cblas = self._cpu_ops.cblas()
|
||||
self._resize()
|
||||
self._ensure_labels_are_added(docs)
|
||||
set_dropout_rate(self.model, drop)
|
||||
batch = self.moves.init_batch(docs)
|
||||
model = self.model.predict(docs)
|
||||
weights = get_c_weights(model)
|
||||
for state in batch:
|
||||
if not state.is_final():
|
||||
states.push_back(state.c)
|
||||
sizes = get_c_sizes(model, states.size())
|
||||
with nogil:
|
||||
self._parseC(cblas, &states[0], weights, sizes)
|
||||
model.clear_memory()
|
||||
del model
|
||||
return batch
|
||||
with _change_attrs(self.model, beam_width=1):
|
||||
inputs = TransitionModelInputs(docs=docs, moves=self.moves)
|
||||
states, _ = self.model.predict(inputs)
|
||||
return states
|
||||
|
||||
def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
|
||||
cdef Beam beam
|
||||
cdef Doc doc
|
||||
self._ensure_labels_are_added(docs)
|
||||
batch = _beam_utils.BeamBatch(
|
||||
self.moves,
|
||||
self.moves.init_batch(docs),
|
||||
None,
|
||||
beam_width,
|
||||
density=beam_density
|
||||
)
|
||||
model = self.model.predict(docs)
|
||||
while not batch.is_done:
|
||||
states = batch.get_unfinished_states()
|
||||
if not states:
|
||||
break
|
||||
scores = model.predict(states)
|
||||
batch.advance(scores)
|
||||
model.clear_memory()
|
||||
del model
|
||||
return list(batch)
|
||||
|
||||
cdef void _parseC(self, CBlas cblas, StateC** states,
|
||||
WeightsC weights, SizesC sizes) nogil:
|
||||
cdef int i, j
|
||||
cdef vector[StateC*] unfinished
|
||||
cdef ActivationsC activations = alloc_activations(sizes)
|
||||
while sizes.states >= 1:
|
||||
predict_states(cblas, &activations, states, &weights, sizes)
|
||||
# Validate actions, argmax, take action.
|
||||
self.c_transition_batch(states,
|
||||
activations.scores, sizes.classes, sizes.states)
|
||||
for i in range(sizes.states):
|
||||
if not states[i].is_final():
|
||||
unfinished.push_back(states[i])
|
||||
for i in range(unfinished.size()):
|
||||
states[i] = unfinished[i]
|
||||
sizes.states = unfinished.size()
|
||||
unfinished.clear()
|
||||
free_activations(&activations)
|
||||
with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
|
||||
inputs = TransitionModelInputs(docs=docs, moves=self.moves)
|
||||
beams, _ = self.model.predict(inputs)
|
||||
return beams
|
||||
|
||||
def set_annotations(self, docs, states_or_beams):
|
||||
cdef StateClass state
|
||||
|
@ -330,35 +397,6 @@ cdef class Parser(TrainablePipe):
|
|||
for hook in self.postprocesses:
|
||||
hook(doc)
|
||||
|
||||
def transition_states(self, states, float[:, ::1] scores):
|
||||
cdef StateClass state
|
||||
cdef float* c_scores = &scores[0, 0]
|
||||
cdef vector[StateC*] c_states
|
||||
for state in states:
|
||||
c_states.push_back(state.c)
|
||||
self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
|
||||
return [state for state in states if not state.c.is_final()]
|
||||
|
||||
cdef void c_transition_batch(self, StateC** states, const float* scores,
|
||||
int nr_class, int batch_size) nogil:
|
||||
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
|
||||
with gil:
|
||||
assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
|
||||
is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
|
||||
cdef int i, guess
|
||||
cdef Transition action
|
||||
for i in range(batch_size):
|
||||
self.moves.set_valid(is_valid, states[i])
|
||||
guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
|
||||
if guess == -1:
|
||||
# This shouldn't happen, but it's hard to raise an error here,
|
||||
# and we don't want to infinite loop. So, force to end state.
|
||||
states[i].force_final()
|
||||
else:
|
||||
action = self.moves.c[guess]
|
||||
action.do(states[i], action.label)
|
||||
free(is_valid)
|
||||
|
||||
def update(self, examples, *, drop=0., sgd=None, losses=None):
|
||||
cdef StateClass state
|
||||
if losses is None:
|
||||
|
@ -370,67 +408,99 @@ cdef class Parser(TrainablePipe):
|
|||
)
|
||||
for multitask in self._multitasks:
|
||||
multitask.update(examples, drop=drop, sgd=sgd)
|
||||
# We need to take care to act on the whole batch, because we might be
|
||||
# getting vectors via a listener.
|
||||
n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
|
||||
if n_examples == 0:
|
||||
return losses
|
||||
set_dropout_rate(self.model, drop)
|
||||
# The probability we use beam update, instead of falling back to
|
||||
# a greedy update
|
||||
beam_update_prob = self.cfg["beam_update_prob"]
|
||||
if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob:
|
||||
return self.update_beam(
|
||||
examples,
|
||||
beam_width=self.cfg["beam_width"],
|
||||
sgd=sgd,
|
||||
losses=losses,
|
||||
beam_density=self.cfg["beam_density"]
|
||||
)
|
||||
docs = [eg.x for eg in examples if len(eg.x)]
|
||||
|
||||
max_moves = self.cfg["update_with_oracle_cut_size"]
|
||||
if max_moves >= 1:
|
||||
# Chop sequences into lengths of this many words, to make the
|
||||
# batch uniform length.
|
||||
max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
|
||||
states, golds, _ = self._init_gold_batch(
|
||||
max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
|
||||
init_states, gold_states, _ = self._init_gold_batch(
|
||||
examples,
|
||||
max_length=max_moves
|
||||
)
|
||||
else:
|
||||
states, golds, _ = self.moves.init_gold_batch(examples)
|
||||
if not states:
|
||||
init_states, gold_states, _ = self.moves.init_gold_batch(examples)
|
||||
|
||||
inputs = TransitionModelInputs(docs=docs, moves=self.moves,
|
||||
max_moves=max_moves, states=[state.copy() for state in init_states])
|
||||
(pred_states, scores), backprop_scores = self.model.begin_update(inputs)
|
||||
if sum(s.shape[0] for s in scores) == 0:
|
||||
return losses
|
||||
model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
|
||||
|
||||
all_states = list(states)
|
||||
states_golds = list(zip(states, golds))
|
||||
n_moves = 0
|
||||
while states_golds:
|
||||
states, golds = zip(*states_golds)
|
||||
scores, backprop = model.begin_update(states)
|
||||
d_scores = self.get_batch_loss(states, golds, scores, losses)
|
||||
# Note that the gradient isn't normalized by the batch size
|
||||
# here, because our "samples" are really the states...But we
|
||||
# can't normalize by the number of states either, as then we'd
|
||||
# be getting smaller gradients for states in long sequences.
|
||||
backprop(d_scores)
|
||||
# Follow the predicted action
|
||||
self.transition_states(states, scores)
|
||||
states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
|
||||
if max_moves >= 1 and n_moves >= max_moves:
|
||||
break
|
||||
n_moves += 1
|
||||
|
||||
backprop_tok2vec(golds)
|
||||
d_scores = self.get_loss((gold_states, init_states, pred_states, scores),
|
||||
examples, max_moves)
|
||||
backprop_scores((pred_states, d_scores))
|
||||
if sgd not in (None, False):
|
||||
self.finish_update(sgd)
|
||||
losses[self.name] += float((d_scores**2).sum())
|
||||
# Ugh, this is annoying. If we're working on GPU, we want to free the
|
||||
# memory ASAP. It seems that Python doesn't necessarily get around to
|
||||
# removing these in time if we don't explicitly delete? It's confusing.
|
||||
del backprop
|
||||
del backprop_tok2vec
|
||||
model.clear_memory()
|
||||
del model
|
||||
del backprop_scores
|
||||
return losses
|
||||
|
||||
def get_loss(self, states_scores, examples, max_moves):
|
||||
gold_states, init_states, pred_states, scores = states_scores
|
||||
scores = self.model.ops.xp.vstack(scores)
|
||||
costs = self._get_costs_from_histories(
|
||||
examples,
|
||||
gold_states,
|
||||
init_states,
|
||||
[list(state.history) for state in pred_states],
|
||||
max_moves
|
||||
)
|
||||
xp = get_array_module(scores)
|
||||
best_costs = costs.min(axis=1, keepdims=True)
|
||||
gscores = scores.copy()
|
||||
min_score = scores.min() - 1000
|
||||
assert costs.shape == scores.shape, (costs.shape, scores.shape)
|
||||
gscores[costs > best_costs] = min_score
|
||||
max_ = scores.max(axis=1, keepdims=True)
|
||||
gmax = gscores.max(axis=1, keepdims=True)
|
||||
exp_scores = xp.exp(scores - max_)
|
||||
exp_gscores = xp.exp(gscores - gmax)
|
||||
Z = exp_scores.sum(axis=1, keepdims=True)
|
||||
gZ = exp_gscores.sum(axis=1, keepdims=True)
|
||||
d_scores = exp_scores / Z
|
||||
d_scores -= (costs <= best_costs) * (exp_gscores / gZ)
|
||||
return d_scores
|
||||
|
||||
def _get_costs_from_histories(self, examples, gold_states, init_states, histories, max_moves):
|
||||
cdef TransitionSystem moves = self.moves
|
||||
cdef StateClass state
|
||||
cdef int clas
|
||||
cdef int nF = self.model.get_dim("nF")
|
||||
cdef int nO = moves.n_moves
|
||||
cdef int nS = sum([len(history) for history in histories])
|
||||
cdef Pool mem = Pool()
|
||||
cdef np.ndarray costs_i
|
||||
is_valid = <int*>mem.alloc(nO, sizeof(int))
|
||||
batch = list(zip(init_states, histories, gold_states))
|
||||
n_moves = 0
|
||||
output = []
|
||||
while batch:
|
||||
costs = numpy.zeros((len(batch), nO), dtype="f")
|
||||
for i, (state, history, gold) in enumerate(batch):
|
||||
costs_i = costs[i]
|
||||
clas = history.pop(0)
|
||||
moves.set_costs(is_valid, <weight_t*>costs_i.data, state.c, gold)
|
||||
action = moves.c[clas]
|
||||
action.do(state.c, action.label)
|
||||
state.c.history.push_back(clas)
|
||||
output.append(costs)
|
||||
batch = [(s, h, g) for s, h, g in batch if len(h) != 0]
|
||||
if n_moves >= max_moves >= 1:
|
||||
break
|
||||
n_moves += 1
|
||||
|
||||
return self.model.ops.xp.vstack(output)
|
||||
|
||||
def rehearse(self, examples, sgd=None, losses=None, **cfg):
|
||||
"""Perform a "rehearsal" update, to prevent catastrophic forgetting."""
|
||||
if losses is None:
|
||||
|
@ -440,10 +510,9 @@ cdef class Parser(TrainablePipe):
|
|||
multitask.rehearse(examples, losses=losses, sgd=sgd)
|
||||
if self._rehearsal_model is None:
|
||||
return None
|
||||
losses.setdefault(self.name, 0.)
|
||||
losses.setdefault(self.name, 0.0)
|
||||
validate_examples(examples, "Parser.rehearse")
|
||||
docs = [eg.predicted for eg in examples]
|
||||
states = self.moves.init_batch(docs)
|
||||
# This is pretty dirty, but the NER can resize itself in init_batch,
|
||||
# if labels are missing. We therefore have to check whether we need to
|
||||
# expand our model output.
|
||||
|
@ -451,85 +520,33 @@ cdef class Parser(TrainablePipe):
|
|||
# Prepare the stepwise model, and get the callback for finishing the batch
|
||||
set_dropout_rate(self._rehearsal_model, 0.0)
|
||||
set_dropout_rate(self.model, 0.0)
|
||||
tutor, _ = self._rehearsal_model.begin_update(docs)
|
||||
model, backprop_tok2vec = self.model.begin_update(docs)
|
||||
n_scores = 0.
|
||||
loss = 0.
|
||||
while states:
|
||||
targets, _ = tutor.begin_update(states)
|
||||
guesses, backprop = model.begin_update(states)
|
||||
d_scores = (guesses - targets) / targets.shape[0]
|
||||
# If all weights for an output are 0 in the original model, don't
|
||||
# supervise that output. This allows us to add classes.
|
||||
loss += (d_scores**2).sum()
|
||||
backprop(d_scores)
|
||||
# Follow the predicted action
|
||||
self.transition_states(states, guesses)
|
||||
states = [state for state in states if not state.is_final()]
|
||||
n_scores += d_scores.size
|
||||
# Do the backprop
|
||||
backprop_tok2vec(docs)
|
||||
student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
|
||||
(student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
|
||||
actions = states2actions(student_states)
|
||||
teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
|
||||
_, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
|
||||
|
||||
loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores, normalize=True)
|
||||
|
||||
teacher_scores = self.model.ops.xp.vstack(teacher_scores)
|
||||
student_scores = self.model.ops.xp.vstack(student_scores)
|
||||
assert teacher_scores.shape == student_scores.shape
|
||||
|
||||
d_scores = (student_scores - teacher_scores) / teacher_scores.shape[0]
|
||||
# If all weights for an output are 0 in the original model, don't
|
||||
# supervise that output. This allows us to add classes.
|
||||
loss = (d_scores**2).sum() / d_scores.size
|
||||
backprop_scores((student_states, d_scores))
|
||||
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
losses[self.name] += loss / n_scores
|
||||
del backprop
|
||||
del backprop_tok2vec
|
||||
model.clear_memory()
|
||||
tutor.clear_memory()
|
||||
del model
|
||||
del tutor
|
||||
losses[self.name] += loss
|
||||
|
||||
return losses
|
||||
|
||||
def update_beam(self, examples, *, beam_width,
|
||||
drop=0., sgd=None, losses=None, beam_density=0.0):
|
||||
states, golds, _ = self.moves.init_gold_batch(examples)
|
||||
if not states:
|
||||
return losses
|
||||
# Prepare the stepwise model, and get the callback for finishing the batch
|
||||
model, backprop_tok2vec = self.model.begin_update(
|
||||
[eg.predicted for eg in examples])
|
||||
loss = _beam_utils.update_beam(
|
||||
self.moves,
|
||||
states,
|
||||
golds,
|
||||
model,
|
||||
beam_width,
|
||||
beam_density=beam_density,
|
||||
)
|
||||
losses[self.name] += loss
|
||||
backprop_tok2vec(golds)
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
|
||||
def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
|
||||
cdef StateClass state
|
||||
cdef Pool mem = Pool()
|
||||
cdef int i
|
||||
|
||||
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
|
||||
assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
|
||||
|
||||
is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
|
||||
costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
|
||||
cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
|
||||
dtype='f', order='C')
|
||||
c_d_scores = <float*>d_scores.data
|
||||
unseen_classes = self.model.attrs["unseen_classes"]
|
||||
for i, (state, gold) in enumerate(zip(states, golds)):
|
||||
memset(is_valid, 0, self.moves.n_moves * sizeof(int))
|
||||
memset(costs, 0, self.moves.n_moves * sizeof(float))
|
||||
self.moves.set_costs(is_valid, costs, state.c, gold)
|
||||
for j in range(self.moves.n_moves):
|
||||
if costs[j] <= 0.0 and j in unseen_classes:
|
||||
unseen_classes.remove(j)
|
||||
cpu_log_loss(c_d_scores,
|
||||
costs, is_valid, &scores[i, 0], d_scores.shape[1])
|
||||
c_d_scores += d_scores.shape[1]
|
||||
# Note that we don't normalize this. See comment in update() for why.
|
||||
if losses is not None:
|
||||
losses.setdefault(self.name, 0.)
|
||||
losses[self.name] += (d_scores**2).sum()
|
||||
return d_scores
|
||||
raise NotImplementedError
|
||||
|
||||
def set_output(self, nO):
|
||||
self.model.attrs["resize_output"](self.model, nO)
|
||||
|
@ -568,7 +585,7 @@ cdef class Parser(TrainablePipe):
|
|||
for example in islice(get_examples(), 10):
|
||||
doc_sample.append(example.predicted)
|
||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||
self.model.initialize(doc_sample)
|
||||
self.model.initialize((doc_sample, self.moves))
|
||||
if nlp is not None:
|
||||
self.init_multitask_objectives(get_examples, nlp.pipeline)
|
||||
|
||||
|
@ -625,28 +642,63 @@ cdef class Parser(TrainablePipe):
|
|||
raise ValueError(Errors.E149) from None
|
||||
return self
|
||||
|
||||
def _init_gold_batch(self, examples, max_length):
|
||||
"""Make a square batch, of length equal to the shortest transition
|
||||
def _init_batch(self, teacher_step_model, docs, max_length):
|
||||
"""Make a square batch of length equal to the shortest transition
|
||||
sequence or a cap. A long
|
||||
doc will get multiple states. Let's say we have a doc of length 2*N,
|
||||
where N is the shortest doc. We'll make two states, one representing
|
||||
long_doc[:N], and another representing long_doc[N:]."""
|
||||
long_doc[:N], and another representing long_doc[N:]. In contrast to
|
||||
_init_gold_batch, this version uses a teacher model to generate the
|
||||
cut sequences."""
|
||||
cdef:
|
||||
StateClass start_state
|
||||
StateClass state
|
||||
Transition action
|
||||
all_states = self.moves.init_batch([eg.predicted for eg in examples])
|
||||
all_states = self.moves.init_batch(docs)
|
||||
states = []
|
||||
to_cut = []
|
||||
for state, doc in zip(all_states, docs):
|
||||
if not state.is_final():
|
||||
if len(doc) < max_length:
|
||||
states.append(state)
|
||||
else:
|
||||
to_cut.append(state)
|
||||
while to_cut:
|
||||
states.extend(state.copy() for state in to_cut)
|
||||
# Move states forward max_length actions.
|
||||
length = 0
|
||||
while to_cut and length < max_length:
|
||||
teacher_scores = teacher_step_model.predict(to_cut)
|
||||
self.transition_states(to_cut, teacher_scores)
|
||||
# States that are completed do not need further cutting.
|
||||
to_cut = [state for state in to_cut if not state.is_final()]
|
||||
length += 1
|
||||
return states
|
||||
|
||||
|
||||
def _init_gold_batch(self, examples, max_length):
|
||||
"""Make a square batch, of length equal to the shortest transition
|
||||
sequence or a cap. A long doc will get multiple states. Let's say we
|
||||
have a doc of length 2*N, where N is the shortest doc. We'll make
|
||||
two states, one representing long_doc[:N], and another representing
|
||||
long_doc[N:]."""
|
||||
cdef:
|
||||
StateClass start_state
|
||||
StateClass state
|
||||
Transition action
|
||||
TransitionSystem moves = self.moves
|
||||
all_states = moves.init_batch([eg.predicted for eg in examples])
|
||||
states = []
|
||||
golds = []
|
||||
to_cut = []
|
||||
for state, eg in zip(all_states, examples):
|
||||
if self.moves.has_gold(eg) and not state.is_final():
|
||||
gold = self.moves.init_gold(state, eg)
|
||||
if moves.has_gold(eg) and not state.is_final():
|
||||
gold = moves.init_gold(state, eg)
|
||||
if len(eg.x) < max_length:
|
||||
states.append(state)
|
||||
golds.append(gold)
|
||||
else:
|
||||
oracle_actions = self.moves.get_oracle_sequence_from_state(
|
||||
oracle_actions = moves.get_oracle_sequence_from_state(
|
||||
state.copy(), gold)
|
||||
to_cut.append((eg, state, gold, oracle_actions))
|
||||
if not to_cut:
|
||||
|
@ -656,13 +708,52 @@ cdef class Parser(TrainablePipe):
|
|||
for i in range(0, len(oracle_actions), max_length):
|
||||
start_state = state.copy()
|
||||
for clas in oracle_actions[i:i+max_length]:
|
||||
action = self.moves.c[clas]
|
||||
action = moves.c[clas]
|
||||
action.do(state.c, action.label)
|
||||
if state.is_final():
|
||||
break
|
||||
if self.moves.has_gold(eg, start_state.B(0), state.B(0)):
|
||||
if moves.has_gold(eg, start_state.B(0), state.B(0)):
|
||||
states.append(start_state)
|
||||
golds.append(gold)
|
||||
if state.is_final():
|
||||
break
|
||||
return states, golds, max_length
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def _change_attrs(model, **kwargs):
|
||||
"""Temporarily modify a thinc model's attributes."""
|
||||
unset = object()
|
||||
old_attrs = {}
|
||||
for key, value in kwargs.items():
|
||||
old_attrs[key] = model.attrs.get(key, unset)
|
||||
model.attrs[key] = value
|
||||
yield model
|
||||
for key, value in old_attrs.items():
|
||||
if value is unset:
|
||||
model.attrs.pop(key)
|
||||
else:
|
||||
model.attrs[key] = value
|
||||
|
||||
|
||||
def states2actions(states: List[StateClass]) -> List[Ints1d]:
|
||||
cdef int step
|
||||
cdef StateClass state
|
||||
cdef StateC* c_state
|
||||
actions = []
|
||||
while True:
|
||||
step = len(actions)
|
||||
|
||||
step_actions = []
|
||||
for state in states:
|
||||
c_state = state.c
|
||||
if step < c_state.history.size():
|
||||
step_actions.append(c_state.history[step])
|
||||
|
||||
# We are done if we have exhausted all histories.
|
||||
if len(step_actions) == 0:
|
||||
break
|
||||
|
||||
actions.append(numpy.array(step_actions, dtype="i"))
|
||||
|
||||
return actions
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
|
||||
from typing import Iterable, TypeVar, TYPE_CHECKING
|
||||
from .compat import Literal
|
||||
from typing import Iterable, TypeVar, Literal, TYPE_CHECKING
|
||||
from enum import Enum
|
||||
from pydantic import BaseModel, Field, ValidationError, validator, create_model
|
||||
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, ConstrainedStr
|
||||
|
@ -156,12 +155,40 @@ def validate_token_pattern(obj: list) -> List[str]:
|
|||
|
||||
|
||||
class TokenPatternString(BaseModel):
|
||||
REGEX: Optional[StrictStr] = Field(None, alias="regex")
|
||||
REGEX: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="regex")
|
||||
IN: Optional[List[StrictStr]] = Field(None, alias="in")
|
||||
NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
|
||||
IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
|
||||
IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
|
||||
INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
|
||||
FUZZY: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy")
|
||||
FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
|
||||
None, alias="fuzzy1"
|
||||
)
|
||||
FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
|
||||
None, alias="fuzzy2"
|
||||
)
|
||||
FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
|
||||
None, alias="fuzzy3"
|
||||
)
|
||||
FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
|
||||
None, alias="fuzzy4"
|
||||
)
|
||||
FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
|
||||
None, alias="fuzzy5"
|
||||
)
|
||||
FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
|
||||
None, alias="fuzzy6"
|
||||
)
|
||||
FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
|
||||
None, alias="fuzzy7"
|
||||
)
|
||||
FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
|
||||
None, alias="fuzzy8"
|
||||
)
|
||||
FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
|
||||
None, alias="fuzzy9"
|
||||
)
|
||||
|
||||
class Config:
|
||||
extra = "forbid"
|
||||
|
@ -329,6 +356,7 @@ class ConfigSchemaTraining(BaseModel):
|
|||
frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
|
||||
annotating_components: List[str] = Field(..., title="Pipeline components that should set annotations during training")
|
||||
before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk")
|
||||
before_update: Optional[Callable[["Language", Dict[str, Any]], None]] = Field(..., title="Optional callback that is invoked at the start of each training step")
|
||||
# fmt: on
|
||||
|
||||
class Config:
|
||||
|
|
|
@ -174,7 +174,7 @@ class Scorer:
|
|||
prf_score.score_set(pred_spans, gold_spans)
|
||||
if len(acc_score) > 0:
|
||||
return {
|
||||
"token_acc": acc_score.fscore,
|
||||
"token_acc": acc_score.precision,
|
||||
"token_p": prf_score.precision,
|
||||
"token_r": prf_score.recall,
|
||||
"token_f": prf_score.fscore,
|
||||
|
@ -476,14 +476,12 @@ class Scorer:
|
|||
f_per_type = {label: PRFScore() for label in labels}
|
||||
auc_per_type = {label: ROCAUCScore() for label in labels}
|
||||
labels = set(labels)
|
||||
if labels:
|
||||
for eg in examples:
|
||||
labels.update(eg.predicted.cats.keys())
|
||||
labels.update(eg.reference.cats.keys())
|
||||
for example in examples:
|
||||
# Through this loop, None in the gold_cats indicates missing label.
|
||||
pred_cats = getter(example.predicted, attr)
|
||||
pred_cats = {k: v for k, v in pred_cats.items() if k in labels}
|
||||
gold_cats = getter(example.reference, attr)
|
||||
gold_cats = {k: v for k, v in gold_cats.items() if k in labels}
|
||||
|
||||
for label in labels:
|
||||
pred_score = pred_cats.get(label, 0.0)
|
||||
|
|
|
@ -1,6 +1,10 @@
|
|||
import pytest
|
||||
from spacy.util import get_lang_class
|
||||
import functools
|
||||
from hypothesis import settings
|
||||
import inspect
|
||||
import importlib
|
||||
import sys
|
||||
|
||||
# Functionally disable deadline settings for tests
|
||||
# to prevent spurious test failures in CI builds.
|
||||
|
@ -47,6 +51,33 @@ def pytest_runtest_setup(item):
|
|||
pytest.skip("not referencing any issues")
|
||||
|
||||
|
||||
# Decorator for Cython-built tests
|
||||
# https://shwina.github.io/cython-testing/
|
||||
def cytest(func):
|
||||
"""
|
||||
Wraps `func` in a plain Python function.
|
||||
"""
|
||||
|
||||
@functools.wraps(func)
|
||||
def wrapped(*args, **kwargs):
|
||||
bound = inspect.signature(func).bind(*args, **kwargs)
|
||||
return func(*bound.args, **bound.kwargs)
|
||||
|
||||
return wrapped
|
||||
|
||||
|
||||
def register_cython_tests(cython_mod_name: str, test_mod_name: str):
|
||||
"""
|
||||
Registers all callables with name `test_*` in Cython module `cython_mod_name`
|
||||
as attributes in module `test_mod_name`, making them discoverable by pytest.
|
||||
"""
|
||||
cython_mod = importlib.import_module(cython_mod_name)
|
||||
for name in dir(cython_mod):
|
||||
item = getattr(cython_mod, name)
|
||||
if callable(item) and name.startswith("test_"):
|
||||
setattr(sys.modules[test_mod_name], name, item)
|
||||
|
||||
|
||||
# Fixtures for language tokenizers (languages sorted alphabetically)
|
||||
|
||||
|
||||
|
@ -351,17 +382,17 @@ def ru_tokenizer():
|
|||
return get_lang_class("ru")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@pytest.fixture(scope="session")
|
||||
def ru_lemmatizer():
|
||||
pytest.importorskip("pymorphy3")
|
||||
return get_lang_class("ru")().add_pipe("lemmatizer")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@pytest.fixture(scope="session")
|
||||
def ru_lookup_lemmatizer():
|
||||
pytest.importorskip("pymorphy2")
|
||||
pytest.importorskip("pymorphy3")
|
||||
return get_lang_class("ru")().add_pipe(
|
||||
"lemmatizer", config={"mode": "pymorphy2_lookup"}
|
||||
"lemmatizer", config={"mode": "pymorphy3_lookup"}
|
||||
)
|
||||
|
||||
|
||||
|
@ -437,19 +468,19 @@ def uk_tokenizer():
|
|||
return get_lang_class("uk")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@pytest.fixture(scope="session")
|
||||
def uk_lemmatizer():
|
||||
pytest.importorskip("pymorphy3")
|
||||
pytest.importorskip("pymorphy3_dicts_uk")
|
||||
return get_lang_class("uk")().add_pipe("lemmatizer")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@pytest.fixture(scope="session")
|
||||
def uk_lookup_lemmatizer():
|
||||
pytest.importorskip("pymorphy2")
|
||||
pytest.importorskip("pymorphy2_dicts_uk")
|
||||
pytest.importorskip("pymorphy3")
|
||||
pytest.importorskip("pymorphy3_dicts_uk")
|
||||
return get_lang_class("uk")().add_pipe(
|
||||
"lemmatizer", config={"mode": "pymorphy2_lookup"}
|
||||
"lemmatizer", config={"mode": "pymorphy3_lookup"}
|
||||
)
|
||||
|
||||
|
||||
|
|
|
@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
|
|||
|
||||
# head before start
|
||||
arr = doc.to_array(["HEAD"])
|
||||
arr[0] = -1
|
||||
arr[0] = numpy.int32(-1).astype(numpy.uint64)
|
||||
doc_from_array = Doc(en_vocab, words=words)
|
||||
with pytest.raises(ValueError):
|
||||
doc_from_array.from_array(["HEAD"], arr)
|
||||
|
||||
# head after end
|
||||
arr = doc.to_array(["HEAD"])
|
||||
arr[0] = 5
|
||||
arr[0] = numpy.int32(5).astype(numpy.uint64)
|
||||
doc_from_array = Doc(en_vocab, words=words)
|
||||
with pytest.raises(ValueError):
|
||||
doc_from_array.from_array(["HEAD"], arr)
|
||||
|
|
|
@ -370,3 +370,12 @@ def test_json_to_doc_validation_error(doc):
|
|||
doc_json.pop("tokens")
|
||||
with pytest.raises(ValueError):
|
||||
Doc(doc.vocab).from_json(doc_json, validate=True)
|
||||
|
||||
|
||||
def test_to_json_underscore_doc_getters(doc):
|
||||
def get_text_length(doc):
|
||||
return len(doc.text)
|
||||
|
||||
Doc.set_extension("text_length", getter=get_text_length)
|
||||
doc_json = doc.to_json(underscore=["text_length"])
|
||||
assert doc_json["_"]["text_length"] == get_text_length(doc)
|
||||
|
|
|
@ -1,7 +1,10 @@
|
|||
from typing import List
|
||||
|
||||
import pytest
|
||||
from random import Random
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.tokens import Span, SpanGroup
|
||||
from spacy.tokens import Span, SpanGroup, Doc
|
||||
from spacy.util import filter_spans
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -242,3 +245,13 @@ def test_span_group_extend(doc):
|
|||
def test_span_group_dealloc(span_group):
|
||||
with pytest.raises(AttributeError):
|
||||
print(span_group.doc)
|
||||
|
||||
|
||||
@pytest.mark.issue(11975)
|
||||
def test_span_group_typing(doc: Doc):
|
||||
"""Tests whether typing of `SpanGroup` as `Iterable[Span]`-like object is accepted by mypy."""
|
||||
span_group: SpanGroup = doc.spans["SPANS"]
|
||||
spans: List[Span] = list(span_group)
|
||||
for i, span in enumerate(span_group):
|
||||
assert span == span_group[i] == spans[i]
|
||||
filter_spans(span_group)
|
||||
|
|
|
@ -3,6 +3,10 @@ from mock import Mock
|
|||
from spacy.tokens import Doc, Span, Token
|
||||
from spacy.tokens.underscore import Underscore
|
||||
|
||||
# Helper functions
|
||||
def _get_tuple(s: Span):
|
||||
return "._.", "span_extension", s.start_char, s.end_char, s.label, s.kb_id, s.id
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def clean_underscore():
|
||||
|
@ -171,3 +175,118 @@ def test_underscore_docstring(en_vocab):
|
|||
doc = Doc(en_vocab, words=["hello", "world"])
|
||||
assert test_method.__doc__ == "I am a docstring"
|
||||
assert doc._.test_docstrings.__doc__.rsplit(". ")[-1] == "I am a docstring"
|
||||
|
||||
|
||||
def test_underscore_for_unique_span(en_tokenizer):
|
||||
"""Test that spans with the same boundaries but with different labels are uniquely identified (see #9706)."""
|
||||
Doc.set_extension(name="doc_extension", default=None)
|
||||
Span.set_extension(name="span_extension", default=None)
|
||||
Token.set_extension(name="token_extension", default=None)
|
||||
|
||||
# Initialize doc
|
||||
text = "Hello, world!"
|
||||
doc = en_tokenizer(text)
|
||||
span_1 = Span(doc, 0, 2, "SPAN_1")
|
||||
span_2 = Span(doc, 0, 2, "SPAN_2")
|
||||
|
||||
# Set custom extensions
|
||||
doc._.doc_extension = "doc extension"
|
||||
doc[0]._.token_extension = "token extension"
|
||||
span_1._.span_extension = "span_1 extension"
|
||||
span_2._.span_extension = "span_2 extension"
|
||||
|
||||
# Assert extensions
|
||||
assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
|
||||
assert doc.user_data[_get_tuple(span_2)] == "span_2 extension"
|
||||
|
||||
# Change label of span and assert extensions
|
||||
span_1.label_ = "NEW_LABEL"
|
||||
assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
|
||||
assert doc.user_data[_get_tuple(span_2)] == "span_2 extension"
|
||||
|
||||
# Change KB_ID and assert extensions
|
||||
span_1.kb_id_ = "KB_ID"
|
||||
assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
|
||||
assert doc.user_data[_get_tuple(span_2)] == "span_2 extension"
|
||||
|
||||
# Change extensions and assert
|
||||
span_2._.span_extension = "updated span_2 extension"
|
||||
assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
|
||||
assert doc.user_data[_get_tuple(span_2)] == "updated span_2 extension"
|
||||
|
||||
# Change span ID and assert extensions
|
||||
span_2.id = 2
|
||||
assert doc.user_data[_get_tuple(span_1)] == "span_1 extension"
|
||||
assert doc.user_data[_get_tuple(span_2)] == "updated span_2 extension"
|
||||
|
||||
# Assert extensions with original key
|
||||
assert doc.user_data[("._.", "doc_extension", None, None)] == "doc extension"
|
||||
assert doc.user_data[("._.", "token_extension", 0, None)] == "token extension"
|
||||
|
||||
|
||||
def test_underscore_for_unique_span_from_docs(en_tokenizer):
|
||||
"""Test that spans in the user_data keep the same data structure when using Doc.from_docs"""
|
||||
Span.set_extension(name="span_extension", default=None)
|
||||
Token.set_extension(name="token_extension", default=None)
|
||||
|
||||
# Initialize doc
|
||||
text_1 = "Hello, world!"
|
||||
doc_1 = en_tokenizer(text_1)
|
||||
span_1a = Span(doc_1, 0, 2, "SPAN_1a")
|
||||
span_1b = Span(doc_1, 0, 2, "SPAN_1b")
|
||||
|
||||
text_2 = "This is a test."
|
||||
doc_2 = en_tokenizer(text_2)
|
||||
span_2a = Span(doc_2, 0, 3, "SPAN_2a")
|
||||
|
||||
# Set custom extensions
|
||||
doc_1[0]._.token_extension = "token_1"
|
||||
doc_2[1]._.token_extension = "token_2"
|
||||
span_1a._.span_extension = "span_1a extension"
|
||||
span_1b._.span_extension = "span_1b extension"
|
||||
span_2a._.span_extension = "span_2a extension"
|
||||
|
||||
doc = Doc.from_docs([doc_1, doc_2])
|
||||
# Assert extensions
|
||||
assert doc_1.user_data[_get_tuple(span_1a)] == "span_1a extension"
|
||||
assert doc_1.user_data[_get_tuple(span_1b)] == "span_1b extension"
|
||||
assert doc_2.user_data[_get_tuple(span_2a)] == "span_2a extension"
|
||||
|
||||
# Check extensions on merged doc
|
||||
assert doc.user_data[_get_tuple(span_1a)] == "span_1a extension"
|
||||
assert doc.user_data[_get_tuple(span_1b)] == "span_1b extension"
|
||||
assert (
|
||||
doc.user_data[
|
||||
(
|
||||
"._.",
|
||||
"span_extension",
|
||||
span_2a.start_char + len(doc_1.text) + 1,
|
||||
span_2a.end_char + len(doc_1.text) + 1,
|
||||
span_2a.label,
|
||||
span_2a.kb_id,
|
||||
span_2a.id,
|
||||
)
|
||||
]
|
||||
== "span_2a extension"
|
||||
)
|
||||
|
||||
|
||||
def test_underscore_for_unique_span_as_span(en_tokenizer):
|
||||
"""Test that spans in the user_data keep the same data structure when using Span.as_doc"""
|
||||
Span.set_extension(name="span_extension", default=None)
|
||||
|
||||
# Initialize doc
|
||||
text = "Hello, world!"
|
||||
doc = en_tokenizer(text)
|
||||
span_1 = Span(doc, 0, 2, "SPAN_1")
|
||||
span_2 = Span(doc, 0, 2, "SPAN_2")
|
||||
|
||||
# Set custom extensions
|
||||
span_1._.span_extension = "span_1 extension"
|
||||
span_2._.span_extension = "span_2 extension"
|
||||
|
||||
span_doc = span_1.as_doc(copy_user_data=True)
|
||||
|
||||
# Assert extensions
|
||||
assert span_doc.user_data[_get_tuple(span_1)] == "span_1 extension"
|
||||
assert span_doc.user_data[_get_tuple(span_2)] == "span_2 extension"
|
||||
|
|
|
@ -81,6 +81,7 @@ def test_ru_lemmatizer_punct(ru_lemmatizer):
|
|||
|
||||
|
||||
def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
|
||||
assert ru_lookup_lemmatizer.mode == "pymorphy3_lookup"
|
||||
words = ["мама", "мыла", "раму"]
|
||||
pos = ["NOUN", "VERB", "NOUN"]
|
||||
morphs = [
|
||||
|
@ -92,3 +93,17 @@ def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
|
|||
doc = ru_lookup_lemmatizer(doc)
|
||||
lemmas = [token.lemma_ for token in doc]
|
||||
assert lemmas == ["мама", "мыла", "раму"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"word,lemma",
|
||||
(
|
||||
("бременем", "бремя"),
|
||||
("будешь", "быть"),
|
||||
("какая-то", "какой-то"),
|
||||
),
|
||||
)
|
||||
def test_ru_lookup_lemmatizer(ru_lookup_lemmatizer, word, lemma):
|
||||
assert ru_lookup_lemmatizer.mode == "pymorphy3_lookup"
|
||||
doc = Doc(ru_lookup_lemmatizer.vocab, words=[word])
|
||||
assert ru_lookup_lemmatizer(doc)[0].lemma_ == lemma
|
||||
|
|
|
@ -8,12 +8,20 @@ pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
|
|||
def test_uk_lemmatizer(uk_lemmatizer):
|
||||
"""Check that the default uk lemmatizer runs."""
|
||||
doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
|
||||
assert uk_lemmatizer.mode == "pymorphy3"
|
||||
uk_lemmatizer(doc)
|
||||
assert [token.lemma for token in doc]
|
||||
|
||||
|
||||
def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer):
|
||||
"""Check that the lookup uk lemmatizer runs."""
|
||||
doc = Doc(uk_lookup_lemmatizer.vocab, words=["a", "b", "c"])
|
||||
uk_lookup_lemmatizer(doc)
|
||||
assert [token.lemma for token in doc]
|
||||
@pytest.mark.parametrize(
|
||||
"word,lemma",
|
||||
(
|
||||
("якийсь", "якийсь"),
|
||||
("розповідають", "розповідати"),
|
||||
("розповіси", "розповісти"),
|
||||
),
|
||||
)
|
||||
def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer, word, lemma):
|
||||
assert uk_lookup_lemmatizer.mode == "pymorphy3_lookup"
|
||||
doc = Doc(uk_lookup_lemmatizer.vocab, words=[word])
|
||||
assert uk_lookup_lemmatizer(doc)[0].lemma_ == lemma
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import pytest
|
||||
from spacy.matcher import levenshtein
|
||||
from spacy.matcher.levenshtein import levenshtein_compare
|
||||
|
||||
|
||||
# empty string plus 10 random ASCII, 10 random unicode, and 2 random long tests
|
||||
|
@ -42,3 +43,31 @@ from spacy.matcher import levenshtein
|
|||
)
|
||||
def test_levenshtein(dist, a, b):
|
||||
assert levenshtein(a, b) == dist
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"a,b,fuzzy,expected",
|
||||
[
|
||||
("a", "a", 1, True),
|
||||
("a", "a", 0, True),
|
||||
("a", "a", -1, True),
|
||||
("a", "ab", 1, True),
|
||||
("a", "ab", 0, False),
|
||||
("a", "ab", -1, True),
|
||||
("ab", "ac", 1, True),
|
||||
("ab", "ac", -1, True),
|
||||
("abc", "cde", 4, True),
|
||||
("abc", "cde", -1, False),
|
||||
("abcdef", "cdefgh", 4, True),
|
||||
("abcdef", "cdefgh", 3, False),
|
||||
("abcdef", "cdefgh", -1, False), # default (2 for length 6)
|
||||
("abcdefgh", "cdefghijk", 5, True),
|
||||
("abcdefgh", "cdefghijk", 4, False),
|
||||
("abcdefgh", "cdefghijk", -1, False), # default (2)
|
||||
("abcdefgh", "cdefghijkl", 6, True),
|
||||
("abcdefgh", "cdefghijkl", 5, False),
|
||||
("abcdefgh", "cdefghijkl", -1, False), # default (2)
|
||||
],
|
||||
)
|
||||
def test_levenshtein_compare(a, b, fuzzy, expected):
|
||||
assert levenshtein_compare(a, b, fuzzy) == expected
|
||||
|
|
|
@ -115,6 +115,155 @@ def test_matcher_match_multi(matcher):
|
|||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"rules,match_locs",
|
||||
[
|
||||
(
|
||||
{
|
||||
"GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
|
||||
},
|
||||
[(2, 4)],
|
||||
),
|
||||
(
|
||||
{
|
||||
"Java": [[{"LOWER": {"FUZZY": "java"}}]],
|
||||
},
|
||||
[(5, 6)],
|
||||
),
|
||||
(
|
||||
{
|
||||
"JS": [[{"ORTH": {"FUZZY": "JavaScript"}}]],
|
||||
"GoogleNow": [[{"ORTH": {"FUZZY": "Google"}}, {"ORTH": "Now"}]],
|
||||
"Java": [[{"LOWER": {"FUZZY": "java"}}]],
|
||||
},
|
||||
[(2, 4), (5, 6), (8, 9)],
|
||||
),
|
||||
# only the second pattern matches (check that predicate keys used for
|
||||
# caching don't collide)
|
||||
(
|
||||
{
|
||||
"A": [[{"ORTH": {"FUZZY": "Javascripts"}}]],
|
||||
"B": [[{"ORTH": {"FUZZY5": "Javascripts"}}]],
|
||||
},
|
||||
[(8, 9)],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_matcher_match_fuzzy(en_vocab, rules, match_locs):
|
||||
words = ["They", "like", "Goggle", "Now", "and", "Jav", "but", "not", "JvvaScrpt"]
|
||||
doc = Doc(en_vocab, words=words)
|
||||
|
||||
matcher = Matcher(en_vocab)
|
||||
for key, patterns in rules.items():
|
||||
matcher.add(key, patterns)
|
||||
assert match_locs == [(start, end) for m_id, start, end in matcher(doc)]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("set_op", ["IN", "NOT_IN"])
|
||||
def test_matcher_match_fuzzy_set_op_longest(en_vocab, set_op):
|
||||
rules = {
|
||||
"GoogleNow": [[{"ORTH": {"FUZZY": {set_op: ["Google", "Now"]}}, "OP": "+"}]]
|
||||
}
|
||||
matcher = Matcher(en_vocab)
|
||||
for key, patterns in rules.items():
|
||||
matcher.add(key, patterns, greedy="LONGEST")
|
||||
|
||||
words = ["They", "like", "Goggle", "Noo"]
|
||||
doc = Doc(en_vocab, words=words)
|
||||
assert len(matcher(doc)) == 1
|
||||
|
||||
|
||||
def test_matcher_match_fuzzy_set_multiple(en_vocab):
|
||||
rules = {
|
||||
"GoogleNow": [
|
||||
[
|
||||
{
|
||||
"ORTH": {"FUZZY": {"IN": ["Google", "Now"]}, "NOT_IN": ["Goggle"]},
|
||||
"OP": "+",
|
||||
}
|
||||
]
|
||||
]
|
||||
}
|
||||
matcher = Matcher(en_vocab)
|
||||
for key, patterns in rules.items():
|
||||
matcher.add(key, patterns, greedy="LONGEST")
|
||||
|
||||
words = ["They", "like", "Goggle", "Noo"]
|
||||
doc = Doc(matcher.vocab, words=words)
|
||||
assert matcher(doc) == [
|
||||
(doc.vocab.strings["GoogleNow"], 3, 4),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("fuzzyn", range(1, 10))
|
||||
def test_matcher_match_fuzzyn_all_insertions(en_vocab, fuzzyn):
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add("GoogleNow", [[{"ORTH": {f"FUZZY{fuzzyn}": "GoogleNow"}}]])
|
||||
# words with increasing edit distance
|
||||
words = ["GoogleNow" + "a" * i for i in range(0, 10)]
|
||||
doc = Doc(en_vocab, words)
|
||||
assert len(matcher(doc)) == fuzzyn + 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("fuzzyn", range(1, 6))
|
||||
def test_matcher_match_fuzzyn_various_edits(en_vocab, fuzzyn):
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add("GoogleNow", [[{"ORTH": {f"FUZZY{fuzzyn}": "GoogleNow"}}]])
|
||||
# words with increasing edit distance of different edit types
|
||||
words = [
|
||||
"GoogleNow",
|
||||
"GoogleNuw",
|
||||
"GoogleNuew",
|
||||
"GoogleNoweee",
|
||||
"GiggleNuw3",
|
||||
"gouggle5New",
|
||||
]
|
||||
doc = Doc(en_vocab, words)
|
||||
assert len(matcher(doc)) == fuzzyn + 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("greedy", ["FIRST", "LONGEST"])
|
||||
@pytest.mark.parametrize("set_op", ["IN", "NOT_IN"])
|
||||
def test_matcher_match_fuzzyn_set_op_longest(en_vocab, greedy, set_op):
|
||||
rules = {
|
||||
"GoogleNow": [[{"ORTH": {"FUZZY2": {set_op: ["Google", "Now"]}}, "OP": "+"}]]
|
||||
}
|
||||
matcher = Matcher(en_vocab)
|
||||
for key, patterns in rules.items():
|
||||
matcher.add(key, patterns, greedy=greedy)
|
||||
|
||||
words = ["They", "like", "Goggle", "Noo"]
|
||||
doc = Doc(matcher.vocab, words=words)
|
||||
spans = matcher(doc, as_spans=True)
|
||||
assert len(spans) == 1
|
||||
if set_op == "IN":
|
||||
assert spans[0].text == "Goggle Noo"
|
||||
else:
|
||||
assert spans[0].text == "They like"
|
||||
|
||||
|
||||
def test_matcher_match_fuzzyn_set_multiple(en_vocab):
|
||||
rules = {
|
||||
"GoogleNow": [
|
||||
[
|
||||
{
|
||||
"ORTH": {"FUZZY1": {"IN": ["Google", "Now"]}, "NOT_IN": ["Goggle"]},
|
||||
"OP": "+",
|
||||
}
|
||||
]
|
||||
]
|
||||
}
|
||||
matcher = Matcher(en_vocab)
|
||||
for key, patterns in rules.items():
|
||||
matcher.add(key, patterns, greedy="LONGEST")
|
||||
|
||||
words = ["They", "like", "Goggle", "Noo"]
|
||||
doc = Doc(matcher.vocab, words=words)
|
||||
assert matcher(doc) == [
|
||||
(doc.vocab.strings["GoogleNow"], 3, 4),
|
||||
]
|
||||
|
||||
|
||||
def test_matcher_empty_dict(en_vocab):
|
||||
"""Test matcher allows empty token specs, meaning match on any token."""
|
||||
matcher = Matcher(en_vocab)
|
||||
|
@ -434,6 +583,30 @@ def test_matcher_regex(en_vocab):
|
|||
assert len(matches) == 0
|
||||
|
||||
|
||||
def test_matcher_regex_set_in(en_vocab):
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"ORTH": {"REGEX": {"IN": [r"(?:a)", r"(?:an)"]}}}]
|
||||
matcher.add("A_OR_AN", [pattern])
|
||||
doc = Doc(en_vocab, words=["an", "a", "hi"])
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == 2
|
||||
doc = Doc(en_vocab, words=["bye"])
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == 0
|
||||
|
||||
|
||||
def test_matcher_regex_set_not_in(en_vocab):
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"ORTH": {"REGEX": {"NOT_IN": [r"(?:a)", r"(?:an)"]}}}]
|
||||
matcher.add("A_OR_AN", [pattern])
|
||||
doc = Doc(en_vocab, words=["an", "a", "hi"])
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == 1
|
||||
doc = Doc(en_vocab, words=["bye"])
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == 1
|
||||
|
||||
|
||||
def test_matcher_regex_shape(en_vocab):
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"SHAPE": {"REGEX": r"^[^x]+$"}}]
|
||||
|
|
119
spacy/tests/parser/_search.pyx
Normal file
119
spacy/tests/parser/_search.pyx
Normal file
|
@ -0,0 +1,119 @@
|
|||
# cython: infer_types=True, binding=True
|
||||
from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation
|
||||
from spacy.typedefs cimport class_t, weight_t
|
||||
from cymem.cymem cimport Pool
|
||||
|
||||
from ..conftest import cytest
|
||||
import pytest
|
||||
|
||||
cdef struct TestState:
|
||||
int length
|
||||
int x
|
||||
Py_UNICODE* string
|
||||
|
||||
|
||||
cdef int transition(void* dest, void* src, class_t clas, void* extra_args) except -1:
|
||||
dest_state = <TestState*>dest
|
||||
src_state = <TestState*>src
|
||||
dest_state.length = src_state.length
|
||||
dest_state.x = src_state.x
|
||||
dest_state.x += clas
|
||||
if extra_args != NULL:
|
||||
dest_state.string = <Py_UNICODE*>extra_args
|
||||
else:
|
||||
dest_state.string = src_state.string
|
||||
|
||||
|
||||
cdef void* initialize(Pool mem, int n, void* extra_args) except NULL:
|
||||
state = <TestState*>mem.alloc(1, sizeof(TestState))
|
||||
state.length = n
|
||||
state.x = 1
|
||||
if extra_args == NULL:
|
||||
state.string = u'default'
|
||||
else:
|
||||
state.string = <Py_UNICODE*>extra_args
|
||||
return state
|
||||
|
||||
|
||||
cdef int destroy(Pool mem, void* state, void* extra_args) except -1:
|
||||
state = <TestState*>state
|
||||
mem.free(state)
|
||||
|
||||
@cytest
|
||||
@pytest.mark.parametrize("nr_class,beam_width",
|
||||
[
|
||||
(2, 3),
|
||||
(3, 6),
|
||||
(4, 20),
|
||||
]
|
||||
)
|
||||
def test_init(nr_class, beam_width):
|
||||
b = Beam(nr_class, beam_width)
|
||||
assert b.size == 1
|
||||
assert b.width == beam_width
|
||||
assert b.nr_class == nr_class
|
||||
|
||||
@cytest
|
||||
def test_init_violn():
|
||||
MaxViolation()
|
||||
|
||||
@cytest
|
||||
@pytest.mark.parametrize("nr_class,beam_width,length",
|
||||
[
|
||||
(2, 3, 3),
|
||||
(3, 6, 15),
|
||||
(4, 20, 32),
|
||||
]
|
||||
)
|
||||
def test_initialize(nr_class, beam_width, length):
|
||||
b = Beam(nr_class, beam_width)
|
||||
b.initialize(initialize, destroy, length, NULL)
|
||||
for i in range(b.width):
|
||||
s = <TestState*>b.at(i)
|
||||
assert s.length == length, s.length
|
||||
assert s.string == 'default'
|
||||
|
||||
|
||||
@cytest
|
||||
@pytest.mark.parametrize("nr_class,beam_width,length,extra",
|
||||
[
|
||||
(2, 3, 4, None),
|
||||
(3, 6, 15, u"test beam 1"),
|
||||
]
|
||||
)
|
||||
def test_initialize_extra(nr_class, beam_width, length, extra):
|
||||
b = Beam(nr_class, beam_width)
|
||||
if extra is None:
|
||||
b.initialize(initialize, destroy, length, NULL)
|
||||
else:
|
||||
b.initialize(initialize, destroy, length, <void*><Py_UNICODE*>extra)
|
||||
for i in range(b.width):
|
||||
s = <TestState*>b.at(i)
|
||||
assert s.length == length
|
||||
|
||||
|
||||
@cytest
|
||||
@pytest.mark.parametrize("nr_class,beam_width,length",
|
||||
[
|
||||
(3, 6, 15),
|
||||
(4, 20, 32),
|
||||
]
|
||||
)
|
||||
def test_transition(nr_class, beam_width, length):
|
||||
b = Beam(nr_class, beam_width)
|
||||
b.initialize(initialize, destroy, length, NULL)
|
||||
b.set_cell(0, 2, 30, True, 0)
|
||||
b.set_cell(0, 1, 42, False, 0)
|
||||
b.advance(transition, NULL, NULL)
|
||||
assert b.size == 1, b.size
|
||||
assert b.score == 30, b.score
|
||||
s = <TestState*>b.at(0)
|
||||
assert s.x == 3
|
||||
assert b._states[0].score == 30, b._states[0].score
|
||||
b.set_cell(0, 1, 10, True, 0)
|
||||
b.set_cell(0, 2, 20, True, 0)
|
||||
b.advance(transition, NULL, NULL)
|
||||
assert b._states[0].score == 50, b._states[0].score
|
||||
assert b._states[1].score == 40
|
||||
s = <TestState*>b.at(0)
|
||||
assert s.x == 5
|
|
@ -13,6 +13,7 @@ from spacy.pipeline._parser_internals.ner import BiluoPushDown
|
|||
from spacy.training import Example, iob_to_biluo, split_bilu_label
|
||||
from spacy.tokens import Doc, Span
|
||||
from spacy.vocab import Vocab
|
||||
from thinc.api import fix_random_seed
|
||||
import logging
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
@ -412,7 +413,7 @@ def test_train_empty():
|
|||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
ner = nlp.add_pipe("ner", last=True)
|
||||
ner.add_label("PERSON")
|
||||
nlp.initialize()
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
for itn in range(2):
|
||||
losses = {}
|
||||
batches = util.minibatch(train_examples, size=8)
|
||||
|
@ -539,11 +540,11 @@ def test_block_ner():
|
|||
assert [token.ent_type_ for token in doc] == expected_types
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_upper", [True, False])
|
||||
def test_overfitting_IO(use_upper):
|
||||
def test_overfitting_IO():
|
||||
fix_random_seed(1)
|
||||
# Simple test to try and quickly overfit the NER component
|
||||
nlp = English()
|
||||
ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
|
||||
ner = nlp.add_pipe("ner", config={"model": {}})
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
|
@ -575,7 +576,6 @@ def test_overfitting_IO(use_upper):
|
|||
assert ents2[0].label_ == "LOC"
|
||||
# Ensure that the predictions are still the same, even after adding a new label
|
||||
ner2 = nlp2.get_pipe("ner")
|
||||
assert ner2.model.attrs["has_upper"] == use_upper
|
||||
ner2.add_label("RANDOM_NEW_LABEL")
|
||||
doc3 = nlp2(test_text)
|
||||
ents3 = doc3.ents
|
||||
|
@ -617,6 +617,52 @@ def test_overfitting_IO(use_upper):
|
|||
assert ents[1].kb_id == 0
|
||||
|
||||
|
||||
def test_is_distillable():
|
||||
nlp = English()
|
||||
ner = nlp.add_pipe("ner")
|
||||
assert ner.is_distillable
|
||||
|
||||
|
||||
def test_distill():
|
||||
teacher = English()
|
||||
teacher_ner = teacher.add_pipe("ner")
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(teacher.make_doc(text), annotations))
|
||||
for ent in annotations.get("entities"):
|
||||
teacher_ner.add_label(ent[2])
|
||||
|
||||
optimizer = teacher.initialize(get_examples=lambda: train_examples)
|
||||
|
||||
for i in range(50):
|
||||
losses = {}
|
||||
teacher.update(train_examples, sgd=optimizer, losses=losses)
|
||||
assert losses["ner"] < 0.00001
|
||||
|
||||
student = English()
|
||||
student_ner = student.add_pipe("ner")
|
||||
student_ner.initialize(
|
||||
get_examples=lambda: train_examples, labels=teacher_ner.label_data
|
||||
)
|
||||
|
||||
distill_examples = [
|
||||
Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
|
||||
]
|
||||
|
||||
for i in range(100):
|
||||
losses = {}
|
||||
student_ner.distill(teacher_ner, distill_examples, sgd=optimizer, losses=losses)
|
||||
assert losses["ner"] < 0.0001
|
||||
|
||||
# test the trained model
|
||||
test_text = "I like London."
|
||||
doc = student(test_text)
|
||||
ents = doc.ents
|
||||
assert len(ents) == 1
|
||||
assert ents[0].text == "London"
|
||||
assert ents[0].label_ == "LOC"
|
||||
|
||||
|
||||
def test_beam_ner_scores():
|
||||
# Test that we can get confidence values out of the beam_ner pipe
|
||||
beam_width = 16
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
import itertools
|
||||
import pytest
|
||||
import numpy
|
||||
from numpy.testing import assert_equal
|
||||
from thinc.api import Adam
|
||||
|
||||
from spacy import registry, util
|
||||
from spacy.attrs import DEP, NORM
|
||||
from spacy.lang.en import English
|
||||
from spacy.tokens import Doc
|
||||
from spacy.training import Example
|
||||
from spacy.tokens import Doc
|
||||
from spacy.vocab import Vocab
|
||||
from spacy import util, registry
|
||||
from thinc.api import fix_random_seed
|
||||
|
||||
from ...pipeline import DependencyParser
|
||||
from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL
|
||||
|
@ -59,6 +63,8 @@ PARTIAL_DATA = [
|
|||
),
|
||||
]
|
||||
|
||||
PARSERS = ["parser"] # TODO: Test beam_parser when ready
|
||||
|
||||
eps = 0.1
|
||||
|
||||
|
||||
|
@ -171,6 +177,57 @@ def test_parser_parse_one_word_sentence(en_vocab, en_parser, words):
|
|||
assert doc[0].dep != 0
|
||||
|
||||
|
||||
def test_parser_apply_actions(en_vocab, en_parser):
|
||||
words = ["I", "ate", "pizza"]
|
||||
words2 = ["Eat", "more", "pizza", "!"]
|
||||
doc1 = Doc(en_vocab, words=words)
|
||||
doc2 = Doc(en_vocab, words=words2)
|
||||
docs = [doc1, doc2]
|
||||
|
||||
moves = en_parser.moves
|
||||
moves.add_action(0, "")
|
||||
moves.add_action(1, "")
|
||||
moves.add_action(2, "nsubj")
|
||||
moves.add_action(3, "obj")
|
||||
moves.add_action(2, "amod")
|
||||
|
||||
actions = [
|
||||
numpy.array([0, 0], dtype="i"),
|
||||
numpy.array([2, 0], dtype="i"),
|
||||
numpy.array([0, 4], dtype="i"),
|
||||
numpy.array([3, 3], dtype="i"),
|
||||
numpy.array([1, 1], dtype="i"),
|
||||
numpy.array([1, 1], dtype="i"),
|
||||
numpy.array([0], dtype="i"),
|
||||
numpy.array([1], dtype="i"),
|
||||
]
|
||||
|
||||
states = moves.init_batch(docs)
|
||||
active_states = states
|
||||
|
||||
for step_actions in actions:
|
||||
active_states = moves.apply_actions(active_states, step_actions)
|
||||
|
||||
assert len(active_states) == 0
|
||||
|
||||
for (state, doc) in zip(states, docs):
|
||||
moves.set_annotations(state, doc)
|
||||
|
||||
assert docs[0][0].head.i == 1
|
||||
assert docs[0][0].dep_ == "nsubj"
|
||||
assert docs[0][1].head.i == 1
|
||||
assert docs[0][1].dep_ == "ROOT"
|
||||
assert docs[0][2].head.i == 1
|
||||
assert docs[0][2].dep_ == "obj"
|
||||
|
||||
assert docs[1][0].head.i == 0
|
||||
assert docs[1][0].dep_ == "ROOT"
|
||||
assert docs[1][1].head.i == 2
|
||||
assert docs[1][1].dep_ == "amod"
|
||||
assert docs[1][2].head.i == 0
|
||||
assert docs[1][2].dep_ == "obj"
|
||||
|
||||
|
||||
@pytest.mark.skip(
|
||||
reason="The step_through API was removed (but should be brought back)"
|
||||
)
|
||||
|
@ -319,7 +376,7 @@ def test_parser_constructor(en_vocab):
|
|||
DependencyParser(en_vocab, model)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
|
||||
@pytest.mark.parametrize("pipe_name", PARSERS)
|
||||
def test_incomplete_data(pipe_name):
|
||||
# Test that the parser works with incomplete information
|
||||
nlp = English()
|
||||
|
@ -345,11 +402,15 @@ def test_incomplete_data(pipe_name):
|
|||
assert doc[2].head.i == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
|
||||
def test_overfitting_IO(pipe_name):
|
||||
@pytest.mark.parametrize(
|
||||
"pipe_name,max_moves", itertools.product(PARSERS, [0, 1, 5, 100])
|
||||
)
|
||||
def test_overfitting_IO(pipe_name, max_moves):
|
||||
fix_random_seed(0)
|
||||
# Simple test to try and quickly overfit the dependency parser (normal or beam)
|
||||
nlp = English()
|
||||
parser = nlp.add_pipe(pipe_name)
|
||||
parser.cfg["update_with_oracle_cut_size"] = max_moves
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
|
@ -396,16 +457,67 @@ def test_overfitting_IO(pipe_name):
|
|||
assert_equal(batch_deps_1, no_batch_deps)
|
||||
|
||||
|
||||
def test_is_distillable():
|
||||
nlp = English()
|
||||
parser = nlp.add_pipe("parser")
|
||||
assert parser.is_distillable
|
||||
|
||||
|
||||
def test_distill():
|
||||
teacher = English()
|
||||
teacher_parser = teacher.add_pipe("parser")
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(teacher.make_doc(text), annotations))
|
||||
for dep in annotations.get("deps", []):
|
||||
teacher_parser.add_label(dep)
|
||||
|
||||
optimizer = teacher.initialize(get_examples=lambda: train_examples)
|
||||
|
||||
for i in range(200):
|
||||
losses = {}
|
||||
teacher.update(train_examples, sgd=optimizer, losses=losses)
|
||||
assert losses["parser"] < 0.0001
|
||||
|
||||
student = English()
|
||||
student_parser = student.add_pipe("parser")
|
||||
student_parser.initialize(
|
||||
get_examples=lambda: train_examples, labels=teacher_parser.label_data
|
||||
)
|
||||
|
||||
distill_examples = [
|
||||
Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
|
||||
]
|
||||
|
||||
for i in range(200):
|
||||
losses = {}
|
||||
student_parser.distill(
|
||||
teacher_parser, distill_examples, sgd=optimizer, losses=losses
|
||||
)
|
||||
assert losses["parser"] < 0.0001
|
||||
|
||||
test_text = "I like securities."
|
||||
doc = student(test_text)
|
||||
assert doc[0].dep_ == "nsubj"
|
||||
assert doc[2].dep_ == "dobj"
|
||||
assert doc[3].dep_ == "punct"
|
||||
assert doc[0].head.i == 1
|
||||
assert doc[2].head.i == 1
|
||||
assert doc[3].head.i == 1
|
||||
|
||||
|
||||
# fmt: off
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
|
||||
@pytest.mark.parametrize(
|
||||
"parser_config",
|
||||
[
|
||||
# TransitionBasedParser V1
|
||||
({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
|
||||
# TransitionBasedParser V2
|
||||
# TODO: re-enable after we have a spacy-legacy release for v4. See
|
||||
# https://github.com/explosion/spacy-legacy/pull/36
|
||||
#({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
|
||||
({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
|
||||
({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": False}),
|
||||
({"@architectures": "spacy.TransitionBasedParser.v3", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2}),
|
||||
],
|
||||
)
|
||||
# fmt: on
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user