Compare commits

..

No commits in common. "master" and "v3.7.4" have entirely different histories.

191 changed files with 2558 additions and 9598 deletions

View File

@ -1,99 +0,0 @@
# Builds wheels and a source distribution for every pushed release or
# prerelease tag, then attaches all of them to a draft GitHub release.
name: Build

on:
  push:
    tags:
      # GitHub filter patterns are glob-like, not regular expressions:
      # '+' matches one or more of the preceding character and
      # '**' matches zero or more of any character.
      - 'release-v[0-9]+.[0-9]+.[0-9]+**'
      - 'prerelease-v[0-9]+.[0-9]+.[0-9]+**'

jobs:
  build_wheels:
    name: Build wheels on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        # macos-13 is an intel runner, macos-14 is apple silicon
        os: [ubuntu-latest, windows-latest, macos-13, macos-14, ubuntu-24.04-arm]
    steps:
      - uses: actions/checkout@v4
      # Building aarch64 (arm) wheels through QEMU emulation was too slow, so
      # the step below is kept disabled. NOTE(review): the matrix above lists
      # an ubuntu-24.04-arm runner, which presumably replaces it - confirm
      # before deleting this block.
      # - name: Set up QEMU
      #   if: runner.os == 'Linux'
      #   uses: docker/setup-qemu-action@v3
      #   with:
      #     platforms: all
      - name: Build wheels
        uses: pypa/cibuildwheel@v2.21.3
        env:
          CIBW_ARCHS_LINUX: auto
        with:
          package-dir: .
          output-dir: wheelhouse
          config-file: "{package}/pyproject.toml"
      - uses: actions/upload-artifact@v4
        with:
          # The job index keeps artifact names unique across matrix entries.
          name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
          path: ./wheelhouse/*.whl

  build_sdist:
    name: Build source distribution
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Build sdist
        run: pipx run build --sdist
      - uses: actions/upload-artifact@v4
        with:
          name: cibw-sdist
          path: dist/*.tar.gz

  create_release:
    needs: [build_wheels, build_sdist]
    runs-on: ubuntu-latest
    permissions:
      contents: write
      checks: write
      actions: read
      issues: read
      packages: write
      pull-requests: read
      repository-projects: read
      statuses: read
    steps:
      - name: Get the tag name and determine if it's a prerelease
        id: get_tag_info
        run: |
          # Strip the ref prefix, then the release-/prerelease- marker, so
          # TAG_NAME is the bare "v<version>" used as the release name.
          FULL_TAG=${GITHUB_REF#refs/tags/}
          if [[ $FULL_TAG == release-* ]]; then
            TAG_NAME=${FULL_TAG#release-}
            IS_PRERELEASE=false
          elif [[ $FULL_TAG == prerelease-* ]]; then
            TAG_NAME=${FULL_TAG#prerelease-}
            IS_PRERELEASE=true
          else
            echo "Tag does not match expected patterns" >&2
            exit 1
          fi
          # Export for later steps. FULL_TAG must carry the unstripped tag;
          # it was previously written with the value of TAG_NAME.
          echo "FULL_TAG=$FULL_TAG" >> "$GITHUB_ENV"
          echo "TAG_NAME=$TAG_NAME" >> "$GITHUB_ENV"
          echo "IS_PRERELEASE=$IS_PRERELEASE" >> "$GITHUB_ENV"
      - uses: actions/download-artifact@v4
        with:
          # unpacks all CIBW artifacts into dist/
          pattern: cibw-*
          path: dist
          merge-multiple: true
      - name: Create Draft Release
        id: create_release
        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          name: ${{ env.TAG_NAME }}
          # Created as a draft so the release is only published manually.
          draft: true
          prerelease: ${{ env.IS_PRERELEASE }}
          files: "./dist/*"

View File

@ -15,7 +15,7 @@ jobs:
env: env:
GITHUB_CONTEXT: ${{ toJson(github) }} GITHUB_CONTEXT: ${{ toJson(github) }}
run: echo "$GITHUB_CONTEXT" run: echo "$GITHUB_CONTEXT"
- uses: actions/checkout@v4 - uses: actions/checkout@v3
- uses: actions/setup-python@v4 - uses: actions/setup-python@v4
- name: Install and run explosion-bot - name: Install and run explosion-bot
run: | run: |

View File

@ -9,7 +9,7 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
branch: [master, v4] branch: [master, main]
if: github.repository_owner == 'explosion' if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:

View File

@ -16,7 +16,7 @@ jobs:
if: github.repository_owner == 'explosion' if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: dessant/lock-threads@v5 - uses: dessant/lock-threads@v4
with: with:
process-only: 'issues' process-only: 'issues'
issue-inactive-days: '30' issue-inactive-days: '30'

View File

@ -1,29 +0,0 @@
# Publishes the assets of a GitHub release to PyPI.
#
# This workflow fires when a release is *published*, whereas the cibuildwheel
# action triggers on creation of a release. The expected flow is: create a
# draft release, let the wheels upload to it, then hit 'publish' - which runs
# this workflow and uploads to PyPI.
on:
  release:
    types: [published]

jobs:
  upload_pypi:
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/spacy
    permissions:
      # NOTE(review): id-token write access is presumably what the PyPI
      # publish action uses for trusted publishing - confirm.
      id-token: write
      contents: read
    if: github.event_name == 'release' && github.event.action == 'published'
    # Alternative: upload to PyPI on every tag starting with 'v'
    # (remove the `on: release` trigger above to use this).
    # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
    steps:
      # Fetch every asset attached to the published release into dist/.
      - uses: robinraju/release-downloader@v1
        with:
          tag: ${{ github.event.release.tag_name }}
          fileName: '*'
          out-file-path: 'dist'
      - uses: pypa/gh-action-pypi-publish@release/v1

View File

@ -9,12 +9,12 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
branch: [master, v4] branch: [master, main]
if: github.repository_owner == 'explosion' if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Checkout - name: Checkout
uses: actions/checkout@v4 uses: actions/checkout@v3
with: with:
ref: ${{ matrix.branch }} ref: ${{ matrix.branch }}
- name: Get commits from past 24 hours - name: Get commits from past 24 hours

View File

@ -18,7 +18,7 @@ jobs:
run: | run: |
echo "$GITHUB_CONTEXT" echo "$GITHUB_CONTEXT"
- uses: actions/checkout@v4 - uses: actions/checkout@v3
- uses: actions/setup-python@v4 - uses: actions/setup-python@v4
with: with:
python-version: '3.10' python-version: '3.10'

View File

@ -2,8 +2,6 @@ name: tests
on: on:
push: push:
tags-ignore:
- '**'
branches-ignore: branches-ignore:
- "spacy.io" - "spacy.io"
- "nightly.spacy.io" - "nightly.spacy.io"
@ -12,6 +10,7 @@ on:
- "*.md" - "*.md"
- "*.mdx" - "*.mdx"
- "website/**" - "website/**"
- ".github/workflows/**"
pull_request: pull_request:
types: [opened, synchronize, reopened, edited] types: [opened, synchronize, reopened, edited]
paths-ignore: paths-ignore:
@ -26,12 +25,13 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Check out repo - name: Check out repo
uses: actions/checkout@v4 uses: actions/checkout@v3
- name: Configure Python version - name: Configure Python version
uses: actions/setup-python@v4 uses: actions/setup-python@v4
with: with:
python-version: "3.10" python-version: "3.7"
architecture: x64
- name: black - name: black
run: | run: |
@ -45,12 +45,11 @@ jobs:
run: | run: |
python -m pip install flake8==5.0.4 python -m pip install flake8==5.0.4
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
# Unfortunately cython-lint isn't working after the shift to Cython 3. - name: cython-lint
#- name: cython-lint run: |
# run: | python -m pip install cython-lint -c requirements.txt
# python -m pip install cython-lint -c requirements.txt # E501: line too long, W291: trailing whitespace, E266: too many leading '#' for block comment
# # E501: line too long, W291: trailing whitespace, E266: too many leading '#' for block comment cython-lint spacy --ignore E501,W291,E266
# cython-lint spacy --ignore E501,W291,E266
tests: tests:
name: Test name: Test
@ -59,18 +58,30 @@ jobs:
fail-fast: true fail-fast: true
matrix: matrix:
os: [ubuntu-latest, windows-latest, macos-latest] os: [ubuntu-latest, windows-latest, macos-latest]
python_version: ["3.9", "3.12", "3.13"] python_version: ["3.12"]
include:
- os: windows-latest
python_version: "3.7"
- os: macos-latest
python_version: "3.8"
- os: ubuntu-latest
python_version: "3.9"
- os: windows-latest
python_version: "3.10"
- os: macos-latest
python_version: "3.11"
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
steps: steps:
- name: Check out repo - name: Check out repo
uses: actions/checkout@v4 uses: actions/checkout@v3
- name: Configure Python version - name: Configure Python version
uses: actions/setup-python@v4 uses: actions/setup-python@v4
with: with:
python-version: ${{ matrix.python_version }} python-version: ${{ matrix.python_version }}
architecture: x64
- name: Install dependencies - name: Install dependencies
run: | run: |
@ -148,9 +159,7 @@ jobs:
- name: "Test assemble CLI" - name: "Test assemble CLI"
run: | run: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
python -m spacy assemble ner_source_sm.cfg output_dir PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
env:
PYTHONWARNINGS: "error,ignore::DeprecationWarning"
if: matrix.python_version == '3.9' if: matrix.python_version == '3.9'
- name: "Test assemble CLI vectors warning" - name: "Test assemble CLI vectors warning"

View File

@ -20,12 +20,13 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Check out repo - name: Check out repo
uses: actions/checkout@v4 uses: actions/checkout@v3
- name: Configure Python version - name: Configure Python version
uses: actions/setup-python@v4 uses: actions/setup-python@v4
with: with:
python-version: "3.7" python-version: "3.7"
architecture: x64
- name: Validate website/meta/universe.json - name: Validate website/meta/universe.json
run: | run: |

View File

@ -35,7 +35,7 @@ so that more people can benefit from it.
When opening an issue, use a **descriptive title** and include your When opening an issue, use a **descriptive title** and include your
**environment** (operating system, Python version, spaCy version). Our **environment** (operating system, Python version, spaCy version). Our
[issue templates](https://github.com/explosion/spaCy/issues/new/choose) help you [issue template](https://github.com/explosion/spaCy/issues/new) helps you
remember the most important details to include. If you've discovered a bug, you remember the most important details to include. If you've discovered a bug, you
can also submit a [regression test](#fixing-bugs) straight away. When you're can also submit a [regression test](#fixing-bugs) straight away. When you're
opening an issue to report the bug, simply refer to your pull request in the opening an issue to report the bug, simply refer to your pull request in the
@ -449,8 +449,8 @@ and plugins in spaCy v3.0, and we can't wait to see what you build with it!
[`spacy`](https://github.com/topics/spacy?o=desc&s=stars) and [`spacy`](https://github.com/topics/spacy?o=desc&s=stars) and
[`spacy-extensions`](https://github.com/topics/spacy-extension?o=desc&s=stars) [`spacy-extensions`](https://github.com/topics/spacy-extension?o=desc&s=stars)
to make it easier to find. Those are also the topics we're linking to from the to make it easier to find. Those are also the topics we're linking to from the
spaCy website. If you're sharing your project on X, feel free to tag spaCy website. If you're sharing your project on Twitter, feel free to tag
[@spacy_io](https://x.com/spacy_io) so we can check it out. [@spacy_io](https://twitter.com/spacy_io) so we can check it out.
- Once your extension is published, you can open a - Once your extension is published, you can open a
[PR](https://github.com/explosion/spaCy/pulls) to suggest it for the [PR](https://github.com/explosion/spaCy/pulls) to suggest it for the

View File

@ -1,6 +1,6 @@
The MIT License (MIT) The MIT License (MIT)
Copyright (C) 2016-2024 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal Copyright (C) 2016-2023 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal

View File

@ -4,6 +4,5 @@ include README.md
include pyproject.toml include pyproject.toml
include spacy/py.typed include spacy/py.typed
recursive-include spacy/cli *.yml recursive-include spacy/cli *.yml
recursive-include spacy/tests *.json
recursive-include licenses * recursive-include licenses *
recursive-exclude spacy *.cpp recursive-exclude spacy *.cpp

View File

@ -16,7 +16,7 @@ model packaging, deployment and workflow management. spaCy is commercial
open-source software, released under the open-source software, released under the
[MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE). [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
💫 **Version 3.8 out now!** 💫 **Version 3.7 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases) [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[![tests](https://github.com/explosion/spaCy/actions/workflows/tests.yml/badge.svg)](https://github.com/explosion/spaCy/actions/workflows/tests.yml) [![tests](https://github.com/explosion/spaCy/actions/workflows/tests.yml/badge.svg)](https://github.com/explosion/spaCy/actions/workflows/tests.yml)
@ -28,6 +28,7 @@ open-source software, released under the
<br /> <br />
[![PyPi downloads](https://static.pepy.tech/personalized-badge/spacy?period=total&units=international_system&left_color=grey&right_color=orange&left_text=pip%20downloads)](https://pypi.org/project/spacy/) [![PyPi downloads](https://static.pepy.tech/personalized-badge/spacy?period=total&units=international_system&left_color=grey&right_color=orange&left_text=pip%20downloads)](https://pypi.org/project/spacy/)
[![Conda downloads](https://img.shields.io/conda/dn/conda-forge/spacy?label=conda%20downloads)](https://anaconda.org/conda-forge/spacy) [![Conda downloads](https://img.shields.io/conda/dn/conda-forge/spacy?label=conda%20downloads)](https://anaconda.org/conda-forge/spacy)
[![spaCy on Twitter](https://img.shields.io/twitter/follow/spacy_io.svg?style=social&label=Follow)](https://twitter.com/spacy_io)
## 📖 Documentation ## 📖 Documentation
@ -46,7 +47,6 @@ open-source software, released under the
| 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. | | 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
| 📰 **[Blog]** | Read about current spaCy and Prodigy development, releases, talks and more from Explosion. | | 📰 **[Blog]** | Read about current spaCy and Prodigy development, releases, talks and more from Explosion. |
| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. | | 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
| 🔴 **[Live Stream]** | Join Matt as he works on spaCy and chat about NLP, live every week. |
| 🛠 **[Changelog]** | Changes and version history. | | 🛠 **[Changelog]** | Changes and version history. |
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. | | 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
| 👕 **[Swag]** | Support us and our work with unique, custom-designed swag! | | 👕 **[Swag]** | Support us and our work with unique, custom-designed swag! |
@ -62,7 +62,6 @@ open-source software, released under the
[universe]: https://spacy.io/universe [universe]: https://spacy.io/universe
[spacy vs code extension]: https://github.com/explosion/spacy-vscode [spacy vs code extension]: https://github.com/explosion/spacy-vscode
[videos]: https://www.youtube.com/c/ExplosionAI [videos]: https://www.youtube.com/c/ExplosionAI
[live stream]: https://www.youtube.com/playlist?list=PLBmcuObd5An5_iAxNYLJa_xWmNzsYce8c
[online course]: https://course.spacy.io [online course]: https://course.spacy.io
[blog]: https://explosion.ai [blog]: https://explosion.ai
[project templates]: https://github.com/explosion/projects [project templates]: https://github.com/explosion/projects
@ -80,14 +79,13 @@ more people can benefit from it.
| Type | Platforms | | Type | Platforms |
| ------------------------------- | --------------------------------------- | | ------------------------------- | --------------------------------------- |
| 🚨 **Bug Reports** | [GitHub Issue Tracker] | | 🚨 **Bug Reports** | [GitHub Issue Tracker] |
| 🎁 **Feature Requests & Ideas** | [GitHub Discussions] · [Live Stream] | | 🎁 **Feature Requests & Ideas** | [GitHub Discussions] |
| 👩‍💻 **Usage Questions** | [GitHub Discussions] · [Stack Overflow] | | 👩‍💻 **Usage Questions** | [GitHub Discussions] · [Stack Overflow] |
| 🗯 **General Discussion** | [GitHub Discussions] · [Live Stream] | | 🗯 **General Discussion** | [GitHub Discussions] |
[github issue tracker]: https://github.com/explosion/spaCy/issues [github issue tracker]: https://github.com/explosion/spaCy/issues
[github discussions]: https://github.com/explosion/spaCy/discussions [github discussions]: https://github.com/explosion/spaCy/discussions
[stack overflow]: https://stackoverflow.com/questions/tagged/spacy [stack overflow]: https://stackoverflow.com/questions/tagged/spacy
[live stream]: https://www.youtube.com/playlist?list=PLBmcuObd5An5_iAxNYLJa_xWmNzsYce8c
## Features ## Features
@ -117,7 +115,7 @@ For detailed installation instructions, see the
- **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
Studio) Studio)
- **Python version**: Python >=3.7, <3.13 (only 64 bit) - **Python version**: Python 3.7+ (only 64 bit)
- **Package managers**: [pip] · [conda] (via `conda-forge`) - **Package managers**: [pip] · [conda] (via `conda-forge`)
[pip]: https://pypi.org/project/spacy/ [pip]: https://pypi.org/project/spacy/

View File

@ -1,20 +0,0 @@
#!/usr/bin/env bash
# Tags the current commit as release-v<version> and pushes the tag to origin.
# <version> is read from the `__version__ = ` line in spacy/about.py.

set -e

# Insist repository is clean: a non-zero exit here aborts the script.
git diff-index --quiet HEAD

# Take the matching line, drop the assignment prefix, then strip every
# single and double quote so only the bare version string remains.
version=$(grep "__version__ = " spacy/about.py)
version=${version/__version__ = /}
version=${version//\'/}
version=${version//\"/}

# Refuse to create a bare "release-v" tag if nothing was extracted.
if [[ -z "$version" ]]; then
    echo "Could not determine version from spacy/about.py" >&2
    exit 1
fi

tag="release-v${version}"
echo "Pushing ${tag}"

# Delete any existing local and remote tag of the same name first. Both
# deletions are best-effort, since the tag may not exist yet.
git tag -d "$tag" || true
git push origin ":${tag}" || true

git tag "$tag"
git push origin "$tag"

View File

@ -1,2 +1,6 @@
# build version constraints for use with wheelwright # build version constraints for use with wheelwright
numpy>=2.0.0,<3.0.0 numpy==1.15.0; python_version=='3.7' and platform_machine!='aarch64'
numpy==1.19.2; python_version=='3.7' and platform_machine=='aarch64'
numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
numpy>=1.25.0; python_version>='3.9'

View File

@ -1,67 +1,15 @@
[build-system] [build-system]
requires = [ requires = [
"setuptools", "setuptools",
"cython>=3.0,<4.0", "cython>=0.25,<3.0",
"cymem>=2.0.2,<2.1.0", "cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0", "preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0", "murmurhash>=0.28.0,<1.1.0",
"thinc>=8.3.4,<8.4.0", "thinc>=8.2.2,<8.3.0",
"numpy>=2.0.0,<3.0.0" "numpy>=1.15.0; python_version < '3.9'",
"numpy>=1.25.0; python_version >= '3.9'",
] ]
build-backend = "setuptools.build_meta" build-backend = "setuptools.build_meta"
[tool.cibuildwheel]
build = "*"
skip = "pp* cp36* cp37* cp38* *-win32 *i686*"
test-skip = ""
free-threaded-support = false
archs = ["native"]
build-frontend = "default"
config-settings = {}
dependency-versions = "pinned"
environment = { PIP_CONSTRAINT = "build-constraints.txt" }
environment-pass = []
build-verbosity = 0
before-all = "curl https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal --default-toolchain stable"
before-build = "pip install -r requirements.txt && python setup.py clean"
repair-wheel-command = ""
test-command = ""
before-test = ""
test-requires = []
test-extras = []
container-engine = "docker"
manylinux-x86_64-image = "manylinux2014"
manylinux-i686-image = "manylinux2014"
manylinux-aarch64-image = "manylinux2014"
manylinux-ppc64le-image = "manylinux2014"
manylinux-s390x-image = "manylinux2014"
manylinux-pypy_x86_64-image = "manylinux2014"
manylinux-pypy_i686-image = "manylinux2014"
manylinux-pypy_aarch64-image = "manylinux2014"
musllinux-x86_64-image = "musllinux_1_2"
musllinux-i686-image = "musllinux_1_2"
musllinux-aarch64-image = "musllinux_1_2"
musllinux-ppc64le-image = "musllinux_1_2"
musllinux-s390x-image = "musllinux_1_2"
[tool.cibuildwheel.linux]
repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}"
[tool.cibuildwheel.macos]
repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}"
[tool.cibuildwheel.windows]
[tool.cibuildwheel.pyodide]
[tool.isort] [tool.isort]
profile = "black" profile = "black"

View File

@ -3,26 +3,30 @@ spacy-legacy>=3.0.11,<3.1.0
spacy-loggers>=1.0.0,<2.0.0 spacy-loggers>=1.0.0,<2.0.0
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
thinc>=8.3.4,<8.4.0 thinc>=8.2.2,<8.3.0
ml_datasets>=0.2.0,<0.3.0 ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.2.0 wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0 srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0 catalogue>=2.0.6,<2.1.0
typer-slim>=0.3.0,<1.0.0 typer>=0.3.0,<0.10.0
weasel>=0.1.0,<0.5.0 smart-open>=5.2.1,<7.0.0
weasel>=0.1.0,<0.4.0
# Third party dependencies # Third party dependencies
numpy>=2.0.0,<3.0.0 numpy>=1.15.0; python_version < "3.9"
numpy>=1.19.0; python_version >= "3.9"
requests>=2.13.0,<3.0.0 requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0 tqdm>=4.38.0,<5.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0 pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
jinja2 jinja2
langcodes>=3.2.0,<4.0.0
# Official Python utilities # Official Python utilities
setuptools setuptools
packaging>=20.0 packaging>=20.0
typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
# Development dependencies # Development dependencies
pre-commit>=2.13.0 pre-commit>=2.13.0
cython>=3.0,<4.0 cython>=0.25,<3.0
pytest>=5.2.0,!=7.1.0 pytest>=5.2.0,!=7.1.0
pytest-timeout>=1.3.0,<2.0.0 pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0 mock>=2.0.0,<3.0.0

View File

@ -17,11 +17,11 @@ classifiers =
Operating System :: Microsoft :: Windows Operating System :: Microsoft :: Windows
Programming Language :: Cython Programming Language :: Cython
Programming Language :: Python :: 3 Programming Language :: Python :: 3
Programming Language :: Python :: 3.7
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.10
Programming Language :: Python :: 3.11 Programming Language :: Python :: 3.11
Programming Language :: Python :: 3.12
Programming Language :: Python :: 3.13
Topic :: Scientific/Engineering Topic :: Scientific/Engineering
project_urls = project_urls =
Release notes = https://github.com/explosion/spaCy/releases Release notes = https://github.com/explosion/spaCy/releases
@ -30,18 +30,18 @@ project_urls =
[options] [options]
zip_safe = false zip_safe = false
include_package_data = true include_package_data = true
python_requires = >=3.9,<3.14 python_requires = >=3.7
# NOTE: This section is superseded by pyproject.toml and will be removed in # NOTE: This section is superseded by pyproject.toml and will be removed in
# spaCy v4 # spaCy v4
setup_requires = setup_requires =
cython>=3.0,<4.0 cython>=0.25,<3.0
numpy>=2.0.0,<3.0.0; python_version < "3.9" numpy>=1.15.0; python_version < "3.9"
numpy>=2.0.0,<3.0.0; python_version >= "3.9" numpy>=1.19.0; python_version >= "3.9"
# We also need our Cython packages here to compile against # We also need our Cython packages here to compile against
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
thinc>=8.3.4,<8.4.0 thinc>=8.2.2,<8.3.0
install_requires = install_requires =
# Our libraries # Our libraries
spacy-legacy>=3.0.11,<3.1.0 spacy-legacy>=3.0.11,<3.1.0
@ -49,13 +49,14 @@ install_requires =
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0 preshed>=3.0.2,<3.1.0
thinc>=8.3.4,<8.4.0 thinc>=8.2.2,<8.3.0
wasabi>=0.9.1,<1.2.0 wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0 srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0 catalogue>=2.0.6,<2.1.0
weasel>=0.1.0,<0.5.0 weasel>=0.1.0,<0.4.0
# Third-party dependencies # Third-party dependencies
typer-slim>=0.3.0,<1.0.0 typer>=0.3.0,<0.10.0
smart-open>=5.2.1,<7.0.0
tqdm>=4.38.0,<5.0.0 tqdm>=4.38.0,<5.0.0
numpy>=1.15.0; python_version < "3.9" numpy>=1.15.0; python_version < "3.9"
numpy>=1.19.0; python_version >= "3.9" numpy>=1.19.0; python_version >= "3.9"
@ -65,6 +66,8 @@ install_requires =
# Official Python utilities # Official Python utilities
setuptools setuptools
packaging>=20.0 packaging>=20.0
typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
langcodes>=3.2.0,<4.0.0
[options.entry_points] [options.entry_points]
console_scripts = console_scripts =
@ -114,7 +117,7 @@ cuda12x =
cuda-autodetect = cuda-autodetect =
cupy-wheel>=11.0.0,<13.0.0 cupy-wheel>=11.0.0,<13.0.0
apple = apple =
thinc-apple-ops>=1.0.0,<2.0.0 thinc-apple-ops>=0.1.0.dev0,<1.0.0
# Language tokenizers with external dependencies # Language tokenizers with external dependencies
ja = ja =
sudachipy>=0.5.2,!=0.6.1 sudachipy>=0.5.2,!=0.6.1

View File

@ -17,7 +17,6 @@ from .cli.info import info # noqa: F401
from .errors import Errors from .errors import Errors
from .glossary import explain # noqa: F401 from .glossary import explain # noqa: F401
from .language import Language from .language import Language
from .registrations import REGISTRY_POPULATED, populate_registry
from .util import logger, registry # noqa: F401 from .util import logger, registry # noqa: F401
from .vocab import Vocab from .vocab import Vocab

View File

@ -1,5 +1,5 @@
# fmt: off # fmt: off
__title__ = "spacy" __title__ = "spacy"
__version__ = "3.8.7" __version__ = "3.7.4"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download" __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -1,7 +1,5 @@
from wasabi import msg from wasabi import msg
# Needed for testing
from . import download as download_module # noqa: F401
from ._util import app, setup_cli # noqa: F401 from ._util import app, setup_cli # noqa: F401
from .apply import apply # noqa: F401 from .apply import apply # noqa: F401
from .assemble import assemble_cli # noqa: F401 from .assemble import assemble_cli # noqa: F401

View File

@ -170,7 +170,7 @@ def debug_model(
msg.divider(f"STEP 3 - prediction") msg.divider(f"STEP 3 - prediction")
msg.info(str(prediction)) msg.info(str(prediction))
msg.good(f"Successfully ended analysis - model looks good.") msg.good(f"Succesfully ended analysis - model looks good.")
def _sentences(): def _sentences():

View File

@ -1,6 +1,5 @@
import sys import sys
from typing import Optional, Sequence from typing import Optional, Sequence
from urllib.parse import urljoin
import requests import requests
import typer import typer
@ -64,13 +63,6 @@ def download(
) )
pip_args = pip_args + ("--no-deps",) pip_args = pip_args + ("--no-deps",)
if direct: if direct:
# Reject model names with '/', in order to prevent shenanigans.
if "/" in model:
msg.fail(
title="Model download rejected",
text=f"Cannot download model '{model}'. Models are expected to be file names, not URLs or fragments",
exits=True,
)
components = model.split("-") components = model.split("-")
model_name = "".join(components[:-1]) model_name = "".join(components[:-1])
version = components[-1] version = components[-1]
@ -161,16 +153,7 @@ def get_latest_version(model: str) -> str:
def download_model( def download_model(
filename: str, user_pip_args: Optional[Sequence[str]] = None filename: str, user_pip_args: Optional[Sequence[str]] = None
) -> None: ) -> None:
# Construct the download URL carefully. We need to make sure we don't download_url = about.__download_url__ + "/" + filename
# allow relative paths or other shenanigans to trick us into download
# from outside our own repo.
base_url = about.__download_url__
# urljoin requires that the path ends with /, or the last path part will be dropped
if not base_url.endswith("/"):
base_url = about.__download_url__ + "/"
download_url = urljoin(base_url, filename)
if not download_url.startswith(about.__download_url__):
raise ValueError(f"Download from {filename} rejected. Was it a relative path?")
pip_args = list(user_pip_args) if user_pip_args is not None else [] pip_args = list(user_pip_args) if user_pip_args is not None else []
cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url] cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
run_command(cmd) run_command(cmd)

View File

@ -39,7 +39,7 @@ def find_threshold_cli(
# fmt: on # fmt: on
): ):
""" """
Runs prediction trials for a trained model with varying thresholds to maximize Runs prediction trials for a trained model with varying tresholds to maximize
the specified metric. The search space for the threshold is traversed linearly the specified metric. The search space for the threshold is traversed linearly
from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout` from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
(the corresponding API call to `spacy.cli.find_threshold.find_threshold()` (the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
@ -81,7 +81,7 @@ def find_threshold(
silent: bool = True, silent: bool = True,
) -> Tuple[float, float, Dict[float, float]]: ) -> Tuple[float, float, Dict[float, float]]:
""" """
Runs prediction trials for models with varying thresholds to maximize the specified metric. Runs prediction trials for models with varying tresholds to maximize the specified metric.
model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory. model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory.
data_path (Path): Path to file with DocBin with docs to use for threshold search. data_path (Path): Path to file with DocBin with docs to use for threshold search.
pipe_name (str): Name of pipe to examine thresholds for. pipe_name (str): Name of pipe to examine thresholds for.

View File

@ -30,7 +30,6 @@ def package_cli(
version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"), version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."), build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."),
force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"), force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"),
require_parent: bool = Opt(True, "--require-parent/--no-require-parent", "-R", "-R", help="Include the parent package (e.g. spacy) in the requirements"),
# fmt: on # fmt: on
): ):
""" """
@ -61,7 +60,6 @@ def package_cli(
create_sdist=create_sdist, create_sdist=create_sdist,
create_wheel=create_wheel, create_wheel=create_wheel,
force=force, force=force,
require_parent=require_parent,
silent=False, silent=False,
) )
@ -76,7 +74,6 @@ def package(
create_meta: bool = False, create_meta: bool = False,
create_sdist: bool = True, create_sdist: bool = True,
create_wheel: bool = False, create_wheel: bool = False,
require_parent: bool = False,
force: bool = False, force: bool = False,
silent: bool = True, silent: bool = True,
) -> None: ) -> None:
@ -116,7 +113,7 @@ def package(
if not meta_path.exists() or not meta_path.is_file(): if not meta_path.exists() or not meta_path.is_file():
msg.fail("Can't load pipeline meta.json", meta_path, exits=1) msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
meta = srsly.read_json(meta_path) meta = srsly.read_json(meta_path)
meta = get_meta(input_dir, meta, require_parent=require_parent) meta = get_meta(input_dir, meta)
if meta["requirements"]: if meta["requirements"]:
msg.good( msg.good(
f"Including {len(meta['requirements'])} package requirement(s) from " f"Including {len(meta['requirements'])} package requirement(s) from "
@ -189,7 +186,6 @@ def package(
imports.append(code_path.stem) imports.append(code_path.stem)
shutil.copy(str(code_path), str(package_path)) shutil.copy(str(code_path), str(package_path))
create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2)) create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
create_file(main_path / "setup.py", TEMPLATE_SETUP) create_file(main_path / "setup.py", TEMPLATE_SETUP)
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST) create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
init_py = TEMPLATE_INIT.format( init_py = TEMPLATE_INIT.format(
@ -306,8 +302,6 @@ def get_third_party_dependencies(
modules.add(func_info["module"].split(".")[0]) # type: ignore[union-attr] modules.add(func_info["module"].split(".")[0]) # type: ignore[union-attr]
dependencies = [] dependencies = []
for module_name in modules: for module_name in modules:
if module_name == about.__title__:
continue
if module_name in distributions: if module_name in distributions:
dist = distributions.get(module_name) dist = distributions.get(module_name)
if dist: if dist:
@ -338,9 +332,7 @@ def create_file(file_path: Path, contents: str) -> None:
def get_meta( def get_meta(
model_path: Union[str, Path], model_path: Union[str, Path], existing_meta: Dict[str, Any]
existing_meta: Dict[str, Any],
require_parent: bool = False,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
meta: Dict[str, Any] = { meta: Dict[str, Any] = {
"lang": "en", "lang": "en",
@ -369,8 +361,6 @@ def get_meta(
existing_reqs = [util.split_requirement(req)[0] for req in meta["requirements"]] existing_reqs = [util.split_requirement(req)[0] for req in meta["requirements"]]
reqs = get_third_party_dependencies(nlp.config, exclude=existing_reqs) reqs = get_third_party_dependencies(nlp.config, exclude=existing_reqs)
meta["requirements"].extend(reqs) meta["requirements"].extend(reqs)
if require_parent and about.__title__ not in meta["requirements"]:
meta["requirements"].append(about.__title__ + meta["spacy_version"])
return meta return meta
@ -545,11 +535,8 @@ def list_files(data_dir):
def list_requirements(meta): def list_requirements(meta):
# Up to version 3.7, we included the parent package parent_package = meta.get('parent_package', 'spacy')
# in requirements by default. This behaviour is removed requirements = [parent_package + meta['spacy_version']]
# in 3.8, with a setting to include the parent package in
# the requirements list in the meta if desired.
requirements = []
if 'setup_requires' in meta: if 'setup_requires' in meta:
requirements += meta['setup_requires'] requirements += meta['setup_requires']
if 'requirements' in meta: if 'requirements' in meta:

View File

@ -1,16 +0,0 @@
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
class TibetanDefaults(BaseDefaults):
    """Language defaults for Tibetan: lexical attribute getters and stop words."""

    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS
class Tibetan(Language):
    """Language subclass for Tibetan, using the code "bo" and TibetanDefaults."""

    lang = "bo"
    Defaults = TibetanDefaults
__all__ = ["Tibetan"]

View File

@ -1,16 +0,0 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.bo.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"དོན་དུ་རྒྱ་མཚོ་བླ་མ་ཞེས་བྱ་ཞིང༌།",
"ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག",
"སོག་པོ་ཨལ་ཐན་རྒྱལ་པོས་རྒྱལ་དབང་བསོད་ནམས་རྒྱ་མཚོར་ཆེ་བསྟོད་ཀྱི་མཚན་གསོལ་བ་ཞིག་ཡིན་ཞིང༌།",
"རྗེས་སུ་རྒྱལ་བ་དགེ་འདུན་གྲུབ་དང༌། དགེ་འདུན་རྒྱ་མཚོ་སོ་སོར་ཡང་ཏཱ་ལའི་བླ་མའི་སྐུ་ཕྲེང་དང་པོ་དང༌།",
"གཉིས་པའི་མཚན་དེ་གསོལ་ཞིང༌།༸རྒྱལ་དབང་སྐུ་ཕྲེང་ལྔ་པས་དགའ་ལྡན་ཕོ་བྲང་གི་སྲིད་དབང་བཙུགས་པ་ནས་ཏཱ་ལའི་བླ་མ་ནི་བོད་ཀྱི་ཆོས་སྲིད་གཉིས་ཀྱི་དབུ་ཁྲིད་དུ་གྱུར་ཞིང་།",
"ད་ལྟའི་བར་ཏཱ་ལའི་བླ་མ་སྐུ་ཕྲེང་བཅུ་བཞི་བྱོན་ཡོད།",
]

View File

@ -1,65 +0,0 @@
from ...attrs import LIKE_NUM
# reference 1: https://en.wikipedia.org/wiki/Tibetan_numerals
# Number words checked verbatim against the token text in like_num().
# Entries must not carry surrounding whitespace, otherwise they can never
# match a token.
_num_words = [
    "ཀླད་ཀོར་",
    "གཅིག་",
    "གཉིས་",
    "གསུམ་",
    "བཞི་",
    "ལྔ་",
    "དྲུག་",
    "བདུན་",
    "བརྒྱད་",
    "དགུ་",
    "བཅུ་",
    "བཅུ་གཅིག་",
    "བཅུ་གཉིས་",
    "བཅུ་གསུམ་",
    "བཅུ་བཞི་",
    "བཅུ་ལྔ་",
    "བཅུ་དྲུག་",
    "བཅུ་བདུན་",
    # NOTE(review): this entry differs in spelling from the "eight" entry
    # above and lacks the trailing tsheg used by its neighbours - confirm.
    "བཅུ་པརྒྱད",
    "བཅུ་དགུ་",
    "ཉི་ཤུ་",
    "སུམ་ཅུ",
    "བཞི་བཅུ",
    "ལྔ་བཅུ",
    "དྲུག་ཅུ",
    "བདུན་ཅུ",
    "བརྒྱད་ཅུ",
    "དགུ་བཅུ",
    "བརྒྱ་",
    "སྟོང་",
    "ཁྲི་",
    "ས་ཡ་",
    # Leading space removed: with it this entry could never equal a token.
    "བྱེ་བ་",
    "དུང་ཕྱུར་",
    "ཐེར་འབུམ་",
    "ཐེར་འབུམ་ཆེན་པོ་",
    "ཁྲག་ཁྲིག་",
    "ཁྲག་ཁྲིག་ཆེན་པོ་",
]


def like_num(text):
    """
    Check if text resembles a number.

    Returns True when, after dropping one leading sign character
    ("+", "-", "±", "~") and all "," / "." separators, the text is made of
    digits only, is a digit/digit fraction, or is listed in _num_words.
    Returns False otherwise.
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # Thousands and decimal separators are ignored for the digit test.
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text in _num_words:
        return True
    return False
LEX_ATTRS = {LIKE_NUM: like_num}

View File

@ -1,198 +0,0 @@
# Source: https://zenodo.org/records/10148636
STOP_WORDS = set(
"""
གས
མས
འད
པས
གཞན
དང
གས
བཅས
ངས
ལས
ཙམ
ཡང
མཐའདག
འད
རང
ངམ
དག
འང
ལགས
ཚང
ཐམསཅད
དམ
འམ
བས
ལགས
གས
མས
བམ
ནམ
ནམ
ངམ
འགའ
ཤས
གམ
ལགས
ཅང
འགའ
སམ
འང
ལས
འཕ
བར
དང
འག
སམ
ཟད
འམ
མམ
དམ
དག
ལམ
ནང
ཙམ
རམ
ཨང
གས
ལགས
པས
རབ
རམ
བས
གཞན
འབའ
གམ
བམ
ཙམ
མམ
ཏམ
ཏམ
ཤས
""".split()
)

View File

@ -1,18 +0,0 @@
from typing import Optional
from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class ScottishDefaults(BaseDefaults):
    """Language defaults for the "gd" language: tokenizer exceptions and stop words."""

    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    stop_words = STOP_WORDS
class Scottish(Language):
    """Language subclass using the code "gd" and ScottishDefaults."""

    lang = "gd"
    Defaults = ScottishDefaults
__all__ = ["Scottish"]

View File

@ -1,388 +0,0 @@
# Stop words, one entry per line. Entries may contain spaces (multi-word
# expressions), so the text is split on newlines rather than on whitespace.
# Some lines carry a trailing "# ..." annotation giving the full form of a
# contraction; the annotation is documentation only and is stripped so that
# the stop word itself (e.g. "'d", not "'d # iad") ends up in the set.
STOP_WORDS = {
    entry
    for entry in (
        line.split("#", 1)[0].strip()
        for line in """
'ad
'ar
'd # iad
'g # ag
'ga
'gam
'gan
'gar
'gur
'm # am
'n # an
'n seo
'na
'nad
'nam
'nan
'nar
'nuair
'nur
's
'sa
'san
'sann
'se
'sna
a
a'
a'd # agad
a'm # agam
a-chèile
a-seo
a-sin
a-siud
a chionn
a chionn 's
a chèile
a chéile
a dh'
a h-uile
a seo
ac' # aca
aca
aca-san
acasan
ach
ag
agad
agad-sa
agads'
agadsa
agaibh
agaibhse
againn
againne
agam
agam-sa
agams'
agamsa
agus
aice
aice-se
aicese
aig
aig' # aige
aige
aige-san
aigesan
air
air-san
air neo
airsan
am
an
an seo
an sin
an siud
an uair
ann
ann a
ann a'
ann a shin
ann am
ann an
annad
annam
annam-s'
annamsa
anns
anns an
annta
aon
ar
as
asad
asda
asta
b'
bho
bhon
bhuaidhe # bhuaithe
bhuainn
bhuaipe
bhuaithe
bhuapa
bhur
brì
bu
c'à
car son
carson
cha
chan
chionn
choir
chon
chun
chèile
chéile
chòir
cia mheud
ciamar
co-dhiubh
cuide
cuin
cuin'
cuine
'
càil
càit
càit'
càite
mheud
d'
da
de
dh'
dha
dhaibh
dhaibh-san
dhaibhsan
dhan
dhasan
dhe
dhen
dheth
dhi
dhiom
dhiot
dhith
dhiubh
dhomh
dhomh-s'
dhomhsa
dhu'sa # dhut-sa
dhuibh
dhuibhse
dhuinn
dhuinne
dhuit
dhut
dhutsa
dhut-sa
dhà
dhà-san
dhàsan
dhòmhsa
diubh
do
docha
don
mar
mar
dòch'
dòcha
e
eadar
eatarra
eatorra
eile
esan
fa
far
feud
fhad
fheudar
fhearr
fhein
fheudar
fheàrr
fhèin
fhéin
fhìn
fo
fodha
fodhainn
foipe
fon
fèin
ga
gach
gam
gan
ge brith
ged
gu
gu
gu ruige
gun
gur
gus
i
iad
iadsan
innte
is
ise
le
leam
leam-sa
leamsa
leat
leat-sa
leatha
leatsa
leibh
leis
leis-san
leoth'
leotha
leotha-san
linn
m'
m'a
ma
mac
man
mar
mas
mathaid
mi
mis'
mise
mo
mu
mu 'n
mun
mur
mura
mus
na
na b'
na bu
na iad
nach
nad
nam
nan
nar
nas
neo
no
nuair
o
o'n
oir
oirbh
oirbh-se
oirnn
oirnne
oirre
on
orm
orm-sa
ormsa
orra
orra-san
orrasan
ort
os
r'
ri
ribh
rinn
ris
rithe
rithe-se
rium
rium-sa
riums'
riumsa
riut
riuth'
riutha
riuthasan
ro
ro'n
roimh
roimhe
romhainn
romham
romhpa
ron
ruibh
ruinn
ruinne
sa
san
sann
se
seach
seo
seothach
shin
sibh
sibh-se
sibhse
sin
sineach
sinn
sinne
siod
siodach
siud
siudach
sna # ann an
t'
tarsaing
tarsainn
tarsuinn
thar
thoigh
thro
thu
thuc'
thuca
thugad
thugaibh
thugainn
thugam
thugamsa
thuice
thuige
thus'
thusa
timcheall
toigh
toil
tro
tro' # troimh
troimh
troimhe
tron
tu
tusa
uair
ud
ugaibh
ugam-s'
ugam-sa
uice
uige
uige-san
umad
unnta # ann an
ur
urrainn
à
às
àsan
á
ás
è
ì
ò
ó
""".split("\n")
    )
    # The text starts and ends with a newline, which would otherwise add "".
    if entry
}

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
The list of Croatian lemmas was extracted from the reldi-tagger repository (https://github.com/clarinsi/reldi-tagger). The list of Croatian lemmas was extracted from the reldi-tagger repository (https://github.com/clarinsi/reldi-tagger).
Reldi-tagger is licensed under the Apache 2.0 licence. Reldi-tagger is licesned under the Apache 2.0 licence.
@InProceedings{ljubesic16-new, @InProceedings{ljubesic16-new,
author = {Nikola Ljubešić and Filip Klubička and Željko Agić and Ivo-Pavao Jazbec}, author = {Nikola Ljubešić and Filip Klubička and Željko Agić and Ivo-Pavao Jazbec},
@ -12,4 +12,4 @@ Reldi-tagger is licensed under the Apache 2.0 licence.
publisher = {European Language Resources Association (ELRA)}, publisher = {European Language Resources Association (ELRA)},
address = {Paris, France}, address = {Paris, France},
isbn = {978-2-9517408-9-1} isbn = {978-2-9517408-9-1}
} }

View File

@ -1,52 +0,0 @@
from typing import Callable, Optional
from thinc.api import Model
from ...language import BaseDefaults, Language
from .lemmatizer import HaitianCreoleLemmatizer
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
class HaitianCreoleDefaults(BaseDefaults):
    """Language defaults for Haitian Creole.

    Wires together the tokenizer rules (exceptions, prefixes, infixes,
    suffixes), lexical attribute getters, syntax iterators, stop words and
    tag map imported from the sibling modules of this package.
    """

    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    lex_attr_getters = LEX_ATTRS
    syntax_iterators = SYNTAX_ITERATORS
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
class HaitianCreole(Language):
    """Language subclass for Haitian Creole, using the code "ht"."""

    lang = "ht"
    Defaults = HaitianCreoleDefaults
@HaitianCreole.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={
        "model": None,
        "mode": "rule",
        "overwrite": False,
        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
    },
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    overwrite: bool,
    scorer: Optional[Callable],
):
    """Factory for the "lemmatizer" component of the HaitianCreole language.

    Builds a HaitianCreoleLemmatizer on the pipeline's vocab, forwarding the
    model, component name, mode, overwrite flag and scorer unchanged.
    """
    settings = {"mode": mode, "overwrite": overwrite, "scorer": scorer}
    lemmatizer = HaitianCreoleLemmatizer(nlp.vocab, model, name, **settings)
    return lemmatizer
__all__ = ["HaitianCreole"]

View File

@ -1,18 +0,0 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.ht.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple ap panse achte yon demaraj nan Wayòm Ini pou $1 milya dola",
"Machin otonòm fè responsablite asirans lan ale sou men fabrikan yo",
"San Francisco ap konsidere entèdi robo ki livre sou twotwa yo",
"Lond se yon gwo vil nan Wayòm Ini",
"Kote ou ye?",
"Kilès ki prezidan Lafrans?",
"Ki kapital Etazini?",
"Kile Barack Obama te fèt?",
]

View File

@ -1,51 +0,0 @@
from typing import List, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
from ...lookups import Lookups
class HaitianCreoleLemmatizer(Lemmatizer):
    """
    Minimal Haitian Creole lemmatizer.

    The only rule implemented here is a fallback: the lemma of a token is its
    lowercased text. Results are cached per (orth, pos) pair.
    """

    def is_base_form(self, token: Token) -> bool:
        """Return True if the token looks like an uninflected (base) form.

        A noun, verb, adjective or adverb with no morphological features is
        treated as base, as is a singular noun, an infinitive verb or a
        positive-degree adjective. Everything else is not.
        """
        morph = token.morph.to_dict()
        upos = token.pos_.lower()
        # Consider unmarked forms to be base
        if upos in {"noun", "verb", "adj", "adv"} and not morph:
            return True
        if upos == "noun" and morph.get("Number") == "Sing":
            return True
        if upos == "verb" and morph.get("VerbForm") == "Inf":
            return True
        if upos == "adj" and morph.get("Degree") == "Pos":
            return True
        return False

    def rule_lemmatize(self, token: Token) -> List[str]:
        """Return the candidate lemmas for ``token`` (a one-element list).

        The result is memoised in ``self.cache`` under (token.orth, token.pos),
        so the cache is consulted before any string work is done.
        """
        cache_key = (token.orth, token.pos)
        if cache_key in self.cache:
            return self.cache[cache_key]
        # fallback rule: just return lowercased form
        forms = [token.text.lower()]
        self.cache[cache_key] = forms
        return forms

    @classmethod
    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
        """Return (required, optional) lookup table names for ``mode``.

        For "rule" mode four tables are required and none are optional; any
        other mode defers to the parent class.
        """
        # NOTE(review): rule_lemmatize above does not read any of these
        # tables - confirm they really need to be required in "rule" mode.
        if mode == "rule":
            required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
            return (required, [])
        return super().get_lookups_config(mode)

View File

@ -1,78 +0,0 @@
from ...attrs import LIKE_NUM, NORM
# Cardinal numbers in Creole
_num_words = set(
"""
zewo youn en de twa kat senk sis sèt uit nèf dis
onz douz trèz katoz kenz sèz disèt dizwit diznèf
vent trant karant sinkant swasant swasann-dis
san mil milyon milya
""".split()
)
# Ordinal numbers in Creole (some are French-influenced, some simplified)
_ordinal_words = set(
"""
premye dezyèm twazyèm katryèm senkyèm sizyèm sètvyèm uitvyèm nèvyèm dizyèm
onzèm douzyèm trèzyèm katozyèm kenzèm sèzyèm disetyèm dizwityèm diznèvyèm
ventyèm trantyèm karantyèm sinkantyèm swasantyèm
swasann-disyèm santyèm milyèm milyonnyèm milyadyèm
""".split()
)
# Maps contracted pronoun / particle forms to their full norm.
# Each key appears exactly once: duplicate keys in a dict literal are
# silently overwritten by the last occurrence, which hides editing mistakes.
NORM_MAP = {
    # Contractions written with a leading apostrophe
    "'m": "mwen",
    "'w": "ou",
    "'l": "li",
    "'n": "nou",
    "'y": "yo",
    # Bare lowercase contractions
    "m": "mwen",
    "w": "ou",
    "l": "li",
    "n": "nou",
    "y": "yo",
    "t": "te",
    "k": "ki",
    "p": "pa",
    # Bare capitalized contractions
    "M": "Mwen",
    "N": "Nou",
    "L": "Li",
    "Y": "Yo",
    "W": "Ou",
    "T": "Te",
    "K": "Ki",
    "P": "Pa",
}
def like_num(text):
    """Check whether ``text`` resembles a number.

    The text is stripped and lowercased, one leading sign character is
    dropped and "," / "." separators are removed. The result is True for
    digit-only strings, digit/digit fractions, entries of _num_words or
    _ordinal_words, and digits followed by "yèm".
    """
    candidate = text.strip().lower()
    if candidate[:1] in ("+", "-", "±", "~"):
        candidate = candidate[1:]
    candidate = candidate.replace(",", "").replace(".", "")

    if candidate.isdigit():
        return True

    if candidate.count("/") == 1:
        numerator, denominator = candidate.split("/")
        if numerator.isdigit() and denominator.isdigit():
            return True

    if candidate in _num_words or candidate in _ordinal_words:
        return True

    # Handle things like "3yèm", "10yèm", "25yèm", etc.
    return candidate.endswith("yèm") and candidate[:-3].isdigit()
def norm_custom(text):
    """Return the NORM_MAP entry for ``text``, or its lowercase form if absent."""
    if text in NORM_MAP:
        return NORM_MAP[text]
    return text.lower()
LEX_ATTRS = {
LIKE_NUM: like_num,
NORM: norm_custom,
}

View File

@ -1,43 +0,0 @@
from ..char_classes import (
ALPHA,
ALPHA_LOWER,
ALPHA_UPPER,
CONCAT_QUOTES,
HYPHENS,
LIST_PUNCT,
LIST_QUOTES,
LIST_ELLIPSES,
LIST_ICONS,
merge_chars,
)
ELISION = "'".replace(" ", "")
_prefixes_elision = "m n l y t k w"
_prefixes_elision += " " + _prefixes_elision.upper()
TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
r"(?:({pe})[{el}])(?=[{a}])".format(
a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
)
]
# Suffix patterns: shared punctuation, quote and ellipsis lists followed by
# the rules below. Every pattern containing a "{a}" / "{h}" placeholder must
# be passed through .format(); otherwise the braces end up as literal
# characters inside the regex character class.
TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
    r"(?<=[0-9])%",  # numbers like 10%
    r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
    r"(?<=[{a}])[']".format(a=ALPHA),  # apostrophes after letters
    r"(?<=[{a}])['][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
    r"(?<=[{a}0-9])\)".format(a=ALPHA),  # right parenthesis after letter/number
    r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA),  # period after letter if space or end of string
    r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
]
TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
]

View File

@ -1,50 +0,0 @@
STOP_WORDS = set(
"""
a ak an ankò ant apre ap atò avan avanlè
byen byenke
chak
de depi deja deja
e en epi èske
fòk
gen genyen
ki kisa kilès kote koukou konsa konbyen konn konnen kounye kouman
la l laa le li lye
m m' mwen
nan nap nou n'
ou oumenm
pa paske pami pandan pito pou pral preske pwiske
se selman si sou sòt
ta tap tankou te toujou tou tan tout toutotan twòp tèl
w w' wi wè
y y' yo yon yonn
non o oh eh
sa san si swa si
men mèsi oswa osinon
"""
.split()
)
# Add common contractions, with and without apostrophe variants
# Each contraction is added once per replacement character below.
contractions = ["m'", "n'", "w'", "y'", "l'", "t'", "k'"]
# NOTE(review): the last two replacements are both the empty string, so the
# third pass repeats the second. Possibly other apostrophe characters were
# intended here - confirm against the upstream file.
for apostrophe in ["'", "", ""]:
    for word in contractions:
        STOP_WORDS.add(word.replace("'", apostrophe))

View File

@ -1,74 +0,0 @@
from typing import Iterator, Tuple, Union
from ...errors import Errors
from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """
    Detect base noun phrases from a dependency parse for Haitian Creole.
    Works on both Doc and Span objects.

    Yields (start, end, label) tuples, where start and end are token indices
    (end exclusive) and label is the string-store ID of "NP".

    Raises ValueError (Errors.E029) if the underlying doc has no "DEP"
    annotation.
    """
    # Core nominal dependencies common in Haitian Creole
    labels = [
        "nsubj",
        "obj",
        "obl",
        "nmod",
        "appos",
        "ROOT",
    ]
    # Modifiers to optionally include in chunk (to the right)
    post_modifiers = ["compound", "flat", "flat:name", "fixed"]
    doc = doclike.doc
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    # Labels are interned once so the loop below compares integer IDs.
    np_deps = {doc.vocab.strings.add(label) for label in labels}
    np_mods = {doc.vocab.strings.add(mod) for mod in post_modifiers}
    conj_label = doc.vocab.strings.add("conj")
    np_label = doc.vocab.strings.add("NP")
    adp_pos = doc.vocab.strings.add("ADP")
    cc_pos = doc.vocab.strings.add("CCONJ")
    # Index of the last token of the most recently yielded chunk.
    prev_end = -1
    for i, word in enumerate(doclike):
        # Only nouns, proper nouns and pronouns can head a chunk.
        if word.pos not in (NOUN, PROPN, PRON):
            continue
        # Skip words whose subtree starts inside the previous chunk.
        if word.left_edge.i <= prev_end:
            continue
        if word.dep in np_deps:
            right_end = word
            # expand to include known modifiers to the right
            for child in word.rights:
                if child.dep in np_mods:
                    right_end = child.right_edge
                elif child.pos == NOUN:
                    right_end = child.right_edge
            left_index = word.left_edge.i
            # Skip prepositions at the start
            if word.left_edge.pos == adp_pos:
                left_index += 1
            prev_end = right_end.i
            yield left_index, right_end.i + 1, np_label
        elif word.dep == conj_label:
            # Walk up a chain of conjuncts (heads to the left) to find the
            # word the coordination hangs off.
            head = word.head
            while head.dep == conj_label and head.head.i < head.i:
                head = head.head
            if head.dep in np_deps:
                left_index = word.left_edge.i
                # Skip a coordinating conjunction at the start
                if word.left_edge.pos == cc_pos:
                    left_index += 1
                prev_end = word.i
                yield left_index, word.i + 1, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}

View File

@ -1,21 +0,0 @@
from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
TAG_MAP = {
"NOUN": {"pos": NOUN},
"VERB": {"pos": VERB},
"AUX": {"pos": AUX},
"ADJ": {"pos": ADJ},
"ADV": {"pos": ADV},
"PRON": {"pos": PRON},
"DET": {"pos": DET},
"ADP": {"pos": ADP},
"SCONJ": {"pos": SCONJ},
"CCONJ": {"pos": CCONJ},
"PART": {"pos": PART},
"INTJ": {"pos": INTJ},
"NUM": {"pos": NUM},
"PROPN": {"pos": PROPN},
"PUNCT": {"pos": PUNCT},
"SYM": {"pos": SYM},
"X": {"pos": X},
}

View File

@ -1,121 +0,0 @@
from spacy.symbols import ORTH, NORM
def make_variants(base, first_norm, second_orth, second_norm):
    """Build tokenizer exceptions for an apostrophe contraction.

    Returns a dict with two keys, ``base`` and ``base.capitalize()``. Each
    maps to a two-token split: the part of ``base`` before the first
    apostrophe plus "'" (normalised to ``first_norm``, capitalized in the
    capitalized variant), followed by ``second_orth`` / ``second_norm``.
    """
    prefix = base.split("'")[0]
    spellings = (
        (base, prefix, first_norm),
        (base.capitalize(), prefix.capitalize(), first_norm.capitalize()),
    )
    variants = {}
    for surface, clitic, clitic_norm in spellings:
        # A fresh dict per variant, so the two entries never share objects.
        variants[surface] = [
            {ORTH: clitic + "'", NORM: clitic_norm},
            {ORTH: second_orth, NORM: second_norm},
        ]
    return variants
TOKENIZER_EXCEPTIONS = {
"Dr.": [{ORTH: "Dr."}]
}
# Apostrophe forms
TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("n'ap", "nou", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("l'ap", "li", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("y'ap", "yo", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("m'te", "mwen", "te", "te"))
TOKENIZER_EXCEPTIONS.update(make_variants("m'pral", "mwen", "pral", "pral"))
TOKENIZER_EXCEPTIONS.update(make_variants("w'ap", "ou", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("k'ap", "ki", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))
# Non-apostrophe contractions (with capitalized variants)
TOKENIZER_EXCEPTIONS.update({
"map": [
{ORTH: "m", NORM: "mwen"},
{ORTH: "ap", NORM: "ap"},
],
"Map": [
{ORTH: "M", NORM: "Mwen"},
{ORTH: "ap", NORM: "ap"},
],
"lem": [
{ORTH: "le", NORM: "le"},
{ORTH: "m", NORM: "mwen"},
],
"Lem": [
{ORTH: "Le", NORM: "Le"},
{ORTH: "m", NORM: "mwen"},
],
"lew": [
{ORTH: "le", NORM: "le"},
{ORTH: "w", NORM: "ou"},
],
"Lew": [
{ORTH: "Le", NORM: "Le"},
{ORTH: "w", NORM: "ou"},
],
"nap": [
{ORTH: "n", NORM: "nou"},
{ORTH: "ap", NORM: "ap"},
],
"Nap": [
{ORTH: "N", NORM: "Nou"},
{ORTH: "ap", NORM: "ap"},
],
"lap": [
{ORTH: "l", NORM: "li"},
{ORTH: "ap", NORM: "ap"},
],
"Lap": [
{ORTH: "L", NORM: "Li"},
{ORTH: "ap", NORM: "ap"},
],
"yap": [
{ORTH: "y", NORM: "yo"},
{ORTH: "ap", NORM: "ap"},
],
"Yap": [
{ORTH: "Y", NORM: "Yo"},
{ORTH: "ap", NORM: "ap"},
],
"mte": [
{ORTH: "m", NORM: "mwen"},
{ORTH: "te", NORM: "te"},
],
"Mte": [
{ORTH: "M", NORM: "Mwen"},
{ORTH: "te", NORM: "te"},
],
"mpral": [
{ORTH: "m", NORM: "mwen"},
{ORTH: "pral", NORM: "pral"},
],
"Mpral": [
{ORTH: "M", NORM: "Mwen"},
{ORTH: "pral", NORM: "pral"},
],
"wap": [
{ORTH: "w", NORM: "ou"},
{ORTH: "ap", NORM: "ap"},
],
"Wap": [
{ORTH: "W", NORM: "Ou"},
{ORTH: "ap", NORM: "ap"},
],
"kap": [
{ORTH: "k", NORM: "ki"},
{ORTH: "ap", NORM: "ap"},
],
"Kap": [
{ORTH: "K", NORM: "Ki"},
{ORTH: "ap", NORM: "ap"},
],
"tap": [
{ORTH: "t", NORM: "te"},
{ORTH: "ap", NORM: "ap"},
],
"Tap": [
{ORTH: "T", NORM: "Te"},
{ORTH: "ap", NORM: "ap"},
],
})

View File

@ -32,6 +32,7 @@ split_mode = null
""" """
@registry.tokenizers("spacy.ja.JapaneseTokenizer")
def create_tokenizer(split_mode: Optional[str] = None): def create_tokenizer(split_mode: Optional[str] = None):
def japanese_tokenizer_factory(nlp): def japanese_tokenizer_factory(nlp):
return JapaneseTokenizer(nlp.vocab, split_mode=split_mode) return JapaneseTokenizer(nlp.vocab, split_mode=split_mode)

View File

@ -1,16 +0,0 @@
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
class KurmanjiDefaults(BaseDefaults):
    """Language defaults for Kurmanji: stop words and lexical attribute getters."""

    stop_words = STOP_WORDS
    lex_attr_getters = LEX_ATTRS
class Kurmanji(Language):
    """Language subclass for Kurmanji, using the code "kmr" and KurmanjiDefaults."""

    lang = "kmr"
    Defaults = KurmanjiDefaults
__all__ = ["Kurmanji"]

View File

@ -1,17 +0,0 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.kmr.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Berê mirovan her tim li geşedana pêşerojê ye", # People's gaze is always on the development of the future
"Kawa Nemir di 14 salan de Ulysses wergerand Kurmancî.", # Kawa Nemir translated Ulysses into Kurmanji in 14 years.
"Mem Ararat hunermendekî Kurd yê bi nav û deng e.", # Mem Ararat is a famous Kurdish artist
"Firat Cewerî 40 sal e pirtûkên Kurdî dinivîsîne.", # Firat Ceweri has been writing Kurdish books for 40 years
"Rojnamegerê ciwan nûçeyeke balkêş li ser rewşa aborî nivîsand", # The young journalist wrote an interesting news article about the economic situation
"Sektora çandiniyê beşeke giring a belavkirina gaza serayê li seranserê cîhanê pêk tîne", # The agricultural sector constitutes an important part of greenhouse gas emissions worldwide
"Xwendekarên jêhatî di pêşbaziya matematîkê de serkeftî bûn", # Talented students succeeded in the mathematics competition
"Ji ber ji tunebûnê bavê min xwişkeke min nedan xwendin ew ji min re bû derd û kulek.", # Because of poverty, my father didn't send my sister to school, which became a pain and sorrow for me
]

View File

@ -1,138 +0,0 @@
from ...attrs import LIKE_NUM
# Cardinal number words, compared against the lowercased token text.
# The list must not contain "": an empty entry makes like_num("") true.
_num_words = [
    "sifir",
    "yek",
    "du",
    "sê",  # was an empty string; restored to match the ordinal "sêyem" below
    "çar",
    "pênc",
    "şeş",
    "heft",
    "heşt",
    "neh",
    "deh",
    "yazde",
    "dazde",
    "sêzde",
    "çarde",
    "pazde",
    "şazde",
    "hevde",
    "hejde",
    "nozde",
    "bîst",
    "sî",  # was an empty string; restored to match the ordinal "sîyem" below
    "çil",
    "pêncî",
    "şêst",
    "heftê",
    "heştê",
    "nod",
    "sed",
    "hezar",
    "milyon",
    "milyar",
]

# Ordinal number words, each in its "-em" and "-emîn" form.
_ordinal_words = [
    "yekem",
    "yekemîn",
    "duyem",
    "duyemîn",
    "sêyem",
    "sêyemîn",
    "çarem",
    "çaremîn",
    "pêncem",
    "pêncemîn",
    "şeşem",
    "şeşemîn",
    "heftem",
    "heftemîn",
    "heştem",
    "heştemîn",
    "nehem",
    "nehemîn",
    "dehem",
    "dehemîn",
    "yazdehem",
    "yazdehemîn",
    "dazdehem",
    "dazdehemîn",
    "sêzdehem",
    "sêzdehemîn",
    "çardehem",
    "çardehemîn",
    "pazdehem",
    "pazdehemîn",
    "şanzdehem",
    "şanzdehemîn",
    "hevdehem",
    "hevdehemîn",
    "hejdehem",
    "hejdehemîn",
    "nozdehem",
    "nozdehemîn",
    "bîstem",
    "bîstemîn",
    "sîyem",
    "sîyemîn",
    "çilem",
    "çilemîn",
    "pêncîyem",
    "pênciyemîn",
    "şêstem",
    "şêstemîn",
    "heftêyem",
    "heftêyemîn",
    "heştêyem",
    "heştêyemîn",
    "notem",
    "notemîn",
    "sedem",
    "sedemîn",
    "hezarem",
    "hezaremîn",
    "milyonem",
    "milyonemîn",
    "milyarem",
    "milyaremîn",
]


def like_num(text):
    """Check whether ``text`` resembles a number.

    One leading sign character ("+", "-", "±", "~") is dropped and "," / "."
    separators are removed. The result is True for digit-only strings,
    digit/digit fractions, cardinal or ordinal number words (matched
    case-insensitively), and digits followed by an ordinal ending
    (see is_digit). Returns False otherwise, including for the empty string.
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    text_lower = text.lower()
    if text_lower in _num_words:
        return True
    # Check ordinal number
    if text_lower in _ordinal_words:
        return True
    if is_digit(text_lower):
        return True
    return False


def is_digit(text):
    """Return True if ``text`` is digits followed by an ordinal ending.

    Recognised endings are "em", "yem", "emîn" and "yemîn", e.g. "3yem".
    """
    endings = ("em", "yem", "emîn", "yemîn")
    for ending in endings:
        to = len(ending)
        if text.endswith(ending) and text[:-to].isdigit():
            return True
    return False
LEX_ATTRS = {LIKE_NUM: like_num}

View File

@ -1,44 +0,0 @@
STOP_WORDS = set(
"""
û
li
bi
di
da
de
ji
ku
ew
ez
tu
em
hûn
ew
ev
min
te
me
we
wan
va
çi
çawa
çima
kengî
li ku
çend
çiqas
her
hin
gelek
hemû
kes
tişt
""".split()
)

View File

@ -20,6 +20,7 @@ DEFAULT_CONFIG = """
""" """
@registry.tokenizers("spacy.ko.KoreanTokenizer")
def create_tokenizer(): def create_tokenizer():
def korean_tokenizer_factory(nlp): def korean_tokenizer_factory(nlp):
return KoreanTokenizer(nlp.vocab) return KoreanTokenizer(nlp.vocab)

View File

@ -24,6 +24,12 @@ class MacedonianDefaults(BaseDefaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS stop_words = STOP_WORDS
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
if lookups is None:
lookups = Lookups()
return MacedonianLemmatizer(lookups)
class Macedonian(Language): class Macedonian(Language):
lang = "mk" lang = "mk"

View File

@ -13,6 +13,7 @@ DEFAULT_CONFIG = """
""" """
@registry.tokenizers("spacy.th.ThaiTokenizer")
def create_thai_tokenizer(): def create_thai_tokenizer():
def thai_tokenizer_factory(nlp): def thai_tokenizer_factory(nlp):
return ThaiTokenizer(nlp.vocab) return ThaiTokenizer(nlp.vocab)

View File

@ -22,6 +22,7 @@ use_pyvi = true
""" """
@registry.tokenizers("spacy.vi.VietnameseTokenizer")
def create_vietnamese_tokenizer(use_pyvi: bool = True): def create_vietnamese_tokenizer(use_pyvi: bool = True):
def vietnamese_tokenizer_factory(nlp): def vietnamese_tokenizer_factory(nlp):
return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi) return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi)

View File

@ -46,6 +46,7 @@ class Segmenter(str, Enum):
return list(cls.__members__.keys()) return list(cls.__members__.keys())
@registry.tokenizers("spacy.zh.ChineseTokenizer")
def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char): def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
def chinese_tokenizer_factory(nlp): def chinese_tokenizer_factory(nlp):
return ChineseTokenizer(nlp.vocab, segmenter=segmenter) return ChineseTokenizer(nlp.vocab, segmenter=segmenter)

View File

@ -5,7 +5,7 @@ import multiprocessing as mp
import random import random
import traceback import traceback
import warnings import warnings
from contextlib import ExitStack, contextmanager from contextlib import contextmanager
from copy import deepcopy from copy import deepcopy
from dataclasses import dataclass from dataclasses import dataclass
from itertools import chain, cycle from itertools import chain, cycle
@ -30,11 +30,8 @@ from typing import (
overload, overload,
) )
import numpy
import srsly import srsly
from cymem.cymem import Pool
from thinc.api import Config, CupyOps, Optimizer, get_current_ops from thinc.api import Config, CupyOps, Optimizer, get_current_ops
from thinc.util import convert_recursive
from . import about, ty, util from . import about, ty, util
from .compat import Literal from .compat import Literal
@ -104,6 +101,7 @@ class BaseDefaults:
writing_system = {"direction": "ltr", "has_case": True, "has_letters": True} writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
@registry.tokenizers("spacy.Tokenizer.v1")
def create_tokenizer() -> Callable[["Language"], Tokenizer]: def create_tokenizer() -> Callable[["Language"], Tokenizer]:
"""Registered function to create a tokenizer. Returns a factory that takes """Registered function to create a tokenizer. Returns a factory that takes
the nlp object and returns a Tokenizer instance using the language detaults. the nlp object and returns a Tokenizer instance using the language detaults.
@ -129,6 +127,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
return tokenizer_factory return tokenizer_factory
@registry.misc("spacy.LookupsDataLoader.v1")
def load_lookups_data(lang, tables): def load_lookups_data(lang, tables):
util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables) util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
lookups = load_lookups(lang=lang, tables=tables) lookups = load_lookups(lang=lang, tables=tables)
@ -141,7 +140,7 @@ class Language:
Defaults (class): Settings, data and factory methods for creating the `nlp` Defaults (class): Settings, data and factory methods for creating the `nlp`
object and processing pipeline. object and processing pipeline.
lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language codes, such as 'en' and 'eng'. lang (str): IETF language code, such as 'en'.
DOCS: https://spacy.io/api/language DOCS: https://spacy.io/api/language
""" """
@ -183,9 +182,6 @@ class Language:
DOCS: https://spacy.io/api/language#init DOCS: https://spacy.io/api/language#init
""" """
from .pipeline.factories import register_factories
register_factories()
# We're only calling this to import all factories provided via entry # We're only calling this to import all factories provided via entry
# points. The factory decorator applied to these functions takes care # points. The factory decorator applied to these functions takes care
# of the rest. # of the rest.
@ -1215,7 +1211,7 @@ class Language:
examples, examples,
): ):
eg.predicted = doc eg.predicted = doc
return _replace_numpy_floats(losses) return losses
def rehearse( def rehearse(
self, self,
@ -1466,7 +1462,7 @@ class Language:
results = scorer.score(examples, per_component=per_component) results = scorer.score(examples, per_component=per_component)
n_words = sum(len(eg.predicted) for eg in examples) n_words = sum(len(eg.predicted) for eg in examples)
results["speed"] = n_words / (end_time - start_time) results["speed"] = n_words / (end_time - start_time)
return _replace_numpy_floats(results) return results
def create_optimizer(self): def create_optimizer(self):
"""Create an optimizer, usually using the [training.optimizer] config.""" """Create an optimizer, usually using the [training.optimizer] config."""
@ -2095,38 +2091,6 @@ class Language:
util.replace_model_node(pipe.model, listener, new_model) # type: ignore[attr-defined] util.replace_model_node(pipe.model, listener, new_model) # type: ignore[attr-defined]
tok2vec.remove_listener(listener, pipe_name) tok2vec.remove_listener(listener, pipe_name)
@contextmanager
def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
"""Begin a block where all resources allocated during the block will
be freed at the end of it. If a resources was created within the
memory zone block, accessing it outside the block is invalid.
Behaviour of this invalid access is undefined. Memory zones should
not be nested.
The memory zone is helpful for services that need to process large
volumes of text with a defined memory budget.
Example
-------
>>> with nlp.memory_zone():
... for doc in nlp.pipe(texts):
... process_my_doc(doc)
>>> # use_doc(doc) <-- Invalid: doc was allocated in the memory zone
"""
if mem is None:
mem = Pool()
# The ExitStack allows programmatic nested context managers.
# We don't know how many we need, so it would be awkward to have
# them as nested blocks.
with ExitStack() as stack:
contexts = [stack.enter_context(self.vocab.memory_zone(mem))]
if hasattr(self.tokenizer, "memory_zone"):
contexts.append(stack.enter_context(self.tokenizer.memory_zone(mem)))
for _, pipe in self.pipeline:
if hasattr(pipe, "memory_zone"):
contexts.append(stack.enter_context(pipe.memory_zone(mem)))
yield mem
def to_disk( def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> None: ) -> None:
@ -2144,9 +2108,7 @@ class Language:
serializers["tokenizer"] = lambda p: self.tokenizer.to_disk( # type: ignore[union-attr] serializers["tokenizer"] = lambda p: self.tokenizer.to_disk( # type: ignore[union-attr]
p, exclude=["vocab"] p, exclude=["vocab"]
) )
serializers["meta.json"] = lambda p: srsly.write_json( serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta)
p, _replace_numpy_floats(self.meta)
)
serializers["config.cfg"] = lambda p: self.config.to_disk(p) serializers["config.cfg"] = lambda p: self.config.to_disk(p)
for name, proc in self._components: for name, proc in self._components:
if name in exclude: if name in exclude:
@ -2260,9 +2222,7 @@ class Language:
serializers: Dict[str, Callable[[], bytes]] = {} serializers: Dict[str, Callable[[], bytes]] = {}
serializers["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude) serializers["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"]) # type: ignore[union-attr] serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"]) # type: ignore[union-attr]
serializers["meta.json"] = lambda: srsly.json_dumps( serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
_replace_numpy_floats(self.meta)
)
serializers["config.cfg"] = lambda: self.config.to_bytes() serializers["config.cfg"] = lambda: self.config.to_bytes()
for name, proc in self._components: for name, proc in self._components:
if name in exclude: if name in exclude:
@ -2313,12 +2273,6 @@ class Language:
return self return self
def _replace_numpy_floats(meta_dict: dict) -> dict:
return convert_recursive(
lambda v: isinstance(v, numpy.floating), lambda v: float(v), dict(meta_dict)
)
@dataclass @dataclass
class FactoryMeta: class FactoryMeta:
"""Dataclass containing information about a component and its defaults """Dataclass containing information about a component and its defaults

View File

@ -35,7 +35,7 @@ cdef class Lexeme:
return self return self
@staticmethod @staticmethod
cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) noexcept nogil: cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
if name < (sizeof(flags_t) * 8): if name < (sizeof(flags_t) * 8):
Lexeme.c_set_flag(lex, name, value) Lexeme.c_set_flag(lex, name, value)
elif name == ID: elif name == ID:
@ -54,7 +54,7 @@ cdef class Lexeme:
lex.lang = value lex.lang = value
@staticmethod @staticmethod
cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) noexcept nogil: cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
if feat_name < (sizeof(flags_t) * 8): if feat_name < (sizeof(flags_t) * 8):
if Lexeme.c_check_flag(lex, feat_name): if Lexeme.c_check_flag(lex, feat_name):
return 1 return 1
@ -82,7 +82,7 @@ cdef class Lexeme:
return 0 return 0
@staticmethod @staticmethod
cdef inline bint c_check_flag(const LexemeC* lexeme, attr_id_t flag_id) noexcept nogil: cdef inline bint c_check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
cdef flags_t one = 1 cdef flags_t one = 1
if lexeme.flags & (one << flag_id): if lexeme.flags & (one << flag_id):
return True return True
@ -90,7 +90,7 @@ cdef class Lexeme:
return False return False
@staticmethod @staticmethod
cdef inline bint c_set_flag(LexemeC* lex, attr_id_t flag_id, bint value) noexcept nogil: cdef inline bint c_set_flag(LexemeC* lex, attr_id_t flag_id, bint value) nogil:
cdef flags_t one = 1 cdef flags_t one = 1
if value: if value:
lex.flags |= one << flag_id lex.flags |= one << flag_id

View File

@ -70,7 +70,7 @@ cdef class Lexeme:
if isinstance(other, Lexeme): if isinstance(other, Lexeme):
a = self.orth a = self.orth
b = other.orth b = other.orth
elif isinstance(other, int): elif isinstance(other, long):
a = self.orth a = self.orth
b = other b = other
elif isinstance(other, str): elif isinstance(other, str):
@ -104,7 +104,7 @@ cdef class Lexeme:
# skip PROB, e.g. from lexemes.jsonl # skip PROB, e.g. from lexemes.jsonl
if isinstance(value, float): if isinstance(value, float):
continue continue
elif isinstance(value, int): elif isinstance(value, (int, long)):
Lexeme.set_struct_attr(self.c, attr, value) Lexeme.set_struct_attr(self.c, attr, value)
else: else:
Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value)) Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))
@ -164,48 +164,45 @@ cdef class Lexeme:
vector = self.vector vector = self.vector
return numpy.sqrt((vector**2).sum()) return numpy.sqrt((vector**2).sum())
@property property vector:
def vector(self):
"""A real-valued meaning representation. """A real-valued meaning representation.
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the lexeme's semantics. representing the lexeme's semantics.
""" """
cdef int length = self.vocab.vectors_length def __get__(self):
if length == 0: cdef int length = self.vocab.vectors_length
raise ValueError(Errors.E010) if length == 0:
return self.vocab.get_vector(self.c.orth) raise ValueError(Errors.E010)
return self.vocab.get_vector(self.c.orth)
@vector.setter def __set__(self, vector):
def vector(self, vector): if len(vector) != self.vocab.vectors_length:
if len(vector) != self.vocab.vectors_length: raise ValueError(Errors.E073.format(new_length=len(vector),
raise ValueError(Errors.E073.format(new_length=len(vector), length=self.vocab.vectors_length))
length=self.vocab.vectors_length)) self.vocab.set_vector(self.c.orth, vector)
self.vocab.set_vector(self.c.orth, vector)
@property property rank:
def rank(self):
"""RETURNS (str): Sequential ID of the lexeme's lexical type, used """RETURNS (str): Sequential ID of the lexeme's lexical type, used
to index into tables, e.g. for word vectors.""" to index into tables, e.g. for word vectors."""
return self.c.id def __get__(self):
return self.c.id
@rank.setter def __set__(self, value):
def rank(self, value): self.c.id = value
self.c.id = value
@property property sentiment:
def sentiment(self):
"""RETURNS (float): A scalar value indicating the positivity or """RETURNS (float): A scalar value indicating the positivity or
negativity of the lexeme.""" negativity of the lexeme."""
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {}) def __get__(self):
return sentiment_table.get(self.c.orth, 0.0) sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
return sentiment_table.get(self.c.orth, 0.0)
@sentiment.setter def __set__(self, float x):
def sentiment(self, float x): if "lexeme_sentiment" not in self.vocab.lookups:
if "lexeme_sentiment" not in self.vocab.lookups: self.vocab.lookups.add_table("lexeme_sentiment")
self.vocab.lookups.add_table("lexeme_sentiment") sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment") sentiment_table[self.c.orth] = x
sentiment_table[self.c.orth] = x
@property @property
def orth_(self): def orth_(self):
@ -219,338 +216,306 @@ cdef class Lexeme:
"""RETURNS (str): The original verbatim text of the lexeme.""" """RETURNS (str): The original verbatim text of the lexeme."""
return self.orth_ return self.orth_
@property property lower:
def lower(self):
"""RETURNS (uint64): Lowercase form of the lexeme.""" """RETURNS (uint64): Lowercase form of the lexeme."""
return self.c.lower def __get__(self):
return self.c.lower
@lower.setter def __set__(self, attr_t x):
def lower(self, attr_t x): self.c.lower = x
self.c.lower = x
@property property norm:
def norm(self):
"""RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the """RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
lexeme text. lexeme text.
""" """
return self.c.norm def __get__(self):
return self.c.norm
@norm.setter def __set__(self, attr_t x):
def norm(self, attr_t x): if "lexeme_norm" not in self.vocab.lookups:
if "lexeme_norm" not in self.vocab.lookups: self.vocab.lookups.add_table("lexeme_norm")
self.vocab.lookups.add_table("lexeme_norm") norm_table = self.vocab.lookups.get_table("lexeme_norm")
norm_table = self.vocab.lookups.get_table("lexeme_norm") norm_table[self.c.orth] = self.vocab.strings[x]
norm_table[self.c.orth] = self.vocab.strings[x] self.c.norm = x
self.c.norm = x
@property property shape:
def shape(self):
"""RETURNS (uint64): Transform of the word's string, to show """RETURNS (uint64): Transform of the word's string, to show
orthographic features. orthographic features.
""" """
return self.c.shape def __get__(self):
return self.c.shape
@shape.setter def __set__(self, attr_t x):
def shape(self, attr_t x): self.c.shape = x
self.c.shape = x
@property property prefix:
def prefix(self):
"""RETURNS (uint64): Length-N substring from the start of the word. """RETURNS (uint64): Length-N substring from the start of the word.
Defaults to `N=1`. Defaults to `N=1`.
""" """
return self.c.prefix def __get__(self):
return self.c.prefix
@prefix.setter def __set__(self, attr_t x):
def prefix(self, attr_t x): self.c.prefix = x
self.c.prefix = x
@property property suffix:
def suffix(self):
"""RETURNS (uint64): Length-N substring from the end of the word. """RETURNS (uint64): Length-N substring from the end of the word.
Defaults to `N=3`. Defaults to `N=3`.
""" """
return self.c.suffix def __get__(self):
return self.c.suffix
@suffix.setter def __set__(self, attr_t x):
def suffix(self, attr_t x): self.c.suffix = x
self.c.suffix = x
@property property cluster:
def cluster(self):
"""RETURNS (int): Brown cluster ID.""" """RETURNS (int): Brown cluster ID."""
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) def __get__(self):
return cluster_table.get(self.c.orth, 0) cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
return cluster_table.get(self.c.orth, 0)
@cluster.setter def __set__(self, int x):
def cluster(self, int x): cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) cluster_table[self.c.orth] = x
cluster_table[self.c.orth] = x
@property property lang:
def lang(self):
"""RETURNS (uint64): Language of the parent vocabulary.""" """RETURNS (uint64): Language of the parent vocabulary."""
return self.c.lang def __get__(self):
return self.c.lang
@lang.setter def __set__(self, attr_t x):
def lang(self, attr_t x): self.c.lang = x
self.c.lang = x
@property property prob:
def prob(self):
"""RETURNS (float): Smoothed log probability estimate of the lexeme's """RETURNS (float): Smoothed log probability estimate of the lexeme's
type.""" type."""
prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) def __get__(self):
settings_table = self.vocab.lookups.get_table("lexeme_settings", {}) prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
default_oov_prob = settings_table.get("oov_prob", -20.0) settings_table = self.vocab.lookups.get_table("lexeme_settings", {})
return prob_table.get(self.c.orth, default_oov_prob) default_oov_prob = settings_table.get("oov_prob", -20.0)
return prob_table.get(self.c.orth, default_oov_prob)
@prob.setter def __set__(self, float x):
def prob(self, float x): prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) prob_table[self.c.orth] = x
prob_table[self.c.orth] = x
@property property lower_:
def lower_(self):
"""RETURNS (str): Lowercase form of the word.""" """RETURNS (str): Lowercase form of the word."""
return self.vocab.strings[self.c.lower] def __get__(self):
return self.vocab.strings[self.c.lower]
@lower_.setter def __set__(self, str x):
def lower_(self, str x): self.c.lower = self.vocab.strings.add(x)
self.c.lower = self.vocab.strings.add(x)
@property property norm_:
def norm_(self):
"""RETURNS (str): The lexeme's norm, i.e. a normalised form of the """RETURNS (str): The lexeme's norm, i.e. a normalised form of the
lexeme text. lexeme text.
""" """
return self.vocab.strings[self.c.norm] def __get__(self):
return self.vocab.strings[self.c.norm]
@norm_.setter def __set__(self, str x):
def norm_(self, str x): self.norm = self.vocab.strings.add(x)
self.norm = self.vocab.strings.add(x)
@property property shape_:
def shape_(self):
"""RETURNS (str): Transform of the word's string, to show """RETURNS (str): Transform of the word's string, to show
orthographic features. orthographic features.
""" """
return self.vocab.strings[self.c.shape] def __get__(self):
return self.vocab.strings[self.c.shape]
@shape_.setter def __set__(self, str x):
def shape_(self, str x): self.c.shape = self.vocab.strings.add(x)
self.c.shape = self.vocab.strings.add(x)
@property property prefix_:
def prefix_(self):
"""RETURNS (str): Length-N substring from the start of the word. """RETURNS (str): Length-N substring from the start of the word.
Defaults to `N=1`. Defaults to `N=1`.
""" """
return self.vocab.strings[self.c.prefix] def __get__(self):
return self.vocab.strings[self.c.prefix]
@prefix_.setter def __set__(self, str x):
def prefix_(self, str x): self.c.prefix = self.vocab.strings.add(x)
self.c.prefix = self.vocab.strings.add(x)
@property property suffix_:
def suffix_(self):
"""RETURNS (str): Length-N substring from the end of the word. """RETURNS (str): Length-N substring from the end of the word.
Defaults to `N=3`. Defaults to `N=3`.
""" """
return self.vocab.strings[self.c.suffix] def __get__(self):
return self.vocab.strings[self.c.suffix]
@suffix_.setter def __set__(self, str x):
def suffix_(self, str x): self.c.suffix = self.vocab.strings.add(x)
self.c.suffix = self.vocab.strings.add(x)
@property property lang_:
def lang_(self):
"""RETURNS (str): Language of the parent vocabulary.""" """RETURNS (str): Language of the parent vocabulary."""
return self.vocab.strings[self.c.lang] def __get__(self):
return self.vocab.strings[self.c.lang]
@lang_.setter def __set__(self, str x):
def lang_(self, str x): self.c.lang = self.vocab.strings.add(x)
self.c.lang = self.vocab.strings.add(x)
@property property flags:
def flags(self):
"""RETURNS (uint64): Container of the lexeme's binary flags.""" """RETURNS (uint64): Container of the lexeme's binary flags."""
return self.c.flags def __get__(self):
return self.c.flags
@flags.setter def __set__(self, flags_t x):
def flags(self, flags_t x): self.c.flags = x
self.c.flags = x
@property @property
def is_oov(self): def is_oov(self):
"""RETURNS (bool): Whether the lexeme is out-of-vocabulary.""" """RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
return self.orth not in self.vocab.vectors return self.orth not in self.vocab.vectors
@property property is_stop:
def is_stop(self):
"""RETURNS (bool): Whether the lexeme is a stop word.""" """RETURNS (bool): Whether the lexeme is a stop word."""
return Lexeme.c_check_flag(self.c, IS_STOP) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_STOP)
@is_stop.setter def __set__(self, bint x):
def is_stop(self, bint x): Lexeme.c_set_flag(self.c, IS_STOP, x)
Lexeme.c_set_flag(self.c, IS_STOP, x)
@property property is_alpha:
def is_alpha(self):
"""RETURNS (bool): Whether the lexeme consists of alphabetic """RETURNS (bool): Whether the lexeme consists of alphabetic
characters. Equivalent to `lexeme.text.isalpha()`. characters. Equivalent to `lexeme.text.isalpha()`.
""" """
return Lexeme.c_check_flag(self.c, IS_ALPHA) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_ALPHA)
@is_alpha.setter def __set__(self, bint x):
def is_alpha(self, bint x): Lexeme.c_set_flag(self.c, IS_ALPHA, x)
Lexeme.c_set_flag(self.c, IS_ALPHA, x)
@property property is_ascii:
def is_ascii(self):
"""RETURNS (bool): Whether the lexeme consists of ASCII characters. """RETURNS (bool): Whether the lexeme consists of ASCII characters.
Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`.
""" """
return Lexeme.c_check_flag(self.c, IS_ASCII) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_ASCII)
@is_ascii.setter def __set__(self, bint x):
def is_ascii(self, bint x): Lexeme.c_set_flag(self.c, IS_ASCII, x)
Lexeme.c_set_flag(self.c, IS_ASCII, x)
@property property is_digit:
def is_digit(self):
"""RETURNS (bool): Whether the lexeme consists of digits. Equivalent """RETURNS (bool): Whether the lexeme consists of digits. Equivalent
to `lexeme.text.isdigit()`. to `lexeme.text.isdigit()`.
""" """
return Lexeme.c_check_flag(self.c, IS_DIGIT) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_DIGIT)
@is_digit.setter def __set__(self, bint x):
def is_digit(self, bint x): Lexeme.c_set_flag(self.c, IS_DIGIT, x)
Lexeme.c_set_flag(self.c, IS_DIGIT, x)
@property property is_lower:
def is_lower(self):
"""RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to """RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
`lexeme.text.islower()`. `lexeme.text.islower()`.
""" """
return Lexeme.c_check_flag(self.c, IS_LOWER) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_LOWER)
@is_lower.setter def __set__(self, bint x):
def is_lower(self, bint x): Lexeme.c_set_flag(self.c, IS_LOWER, x)
Lexeme.c_set_flag(self.c, IS_LOWER, x)
@property property is_upper:
def is_upper(self):
"""RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to """RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
`lexeme.text.isupper()`. `lexeme.text.isupper()`.
""" """
return Lexeme.c_check_flag(self.c, IS_UPPER) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_UPPER)
@is_upper.setter def __set__(self, bint x):
def is_upper(self, bint x): Lexeme.c_set_flag(self.c, IS_UPPER, x)
Lexeme.c_set_flag(self.c, IS_UPPER, x)
@property property is_title:
def is_title(self):
"""RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to """RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
`lexeme.text.istitle()`. `lexeme.text.istitle()`.
""" """
return Lexeme.c_check_flag(self.c, IS_TITLE) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_TITLE)
@is_title.setter def __set__(self, bint x):
def is_title(self, bint x): Lexeme.c_set_flag(self.c, IS_TITLE, x)
Lexeme.c_set_flag(self.c, IS_TITLE, x)
@property property is_punct:
def is_punct(self):
"""RETURNS (bool): Whether the lexeme is punctuation.""" """RETURNS (bool): Whether the lexeme is punctuation."""
return Lexeme.c_check_flag(self.c, IS_PUNCT) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_PUNCT)
@is_punct.setter def __set__(self, bint x):
def is_punct(self, bint x): Lexeme.c_set_flag(self.c, IS_PUNCT, x)
Lexeme.c_set_flag(self.c, IS_PUNCT, x)
@property property is_space:
def is_space(self):
"""RETURNS (bool): Whether the lexeme consist of whitespace characters. """RETURNS (bool): Whether the lexeme consist of whitespace characters.
Equivalent to `lexeme.text.isspace()`. Equivalent to `lexeme.text.isspace()`.
""" """
return Lexeme.c_check_flag(self.c, IS_SPACE) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_SPACE)
@is_space.setter def __set__(self, bint x):
def is_space(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x)
Lexeme.c_set_flag(self.c, IS_SPACE, x)
@property property is_bracket:
def is_bracket(self):
"""RETURNS (bool): Whether the lexeme is a bracket.""" """RETURNS (bool): Whether the lexeme is a bracket."""
return Lexeme.c_check_flag(self.c, IS_BRACKET) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_BRACKET)
@is_bracket.setter def __set__(self, bint x):
def is_bracket(self, bint x): Lexeme.c_set_flag(self.c, IS_BRACKET, x)
Lexeme.c_set_flag(self.c, IS_BRACKET, x)
@property property is_quote:
def is_quote(self):
"""RETURNS (bool): Whether the lexeme is a quotation mark.""" """RETURNS (bool): Whether the lexeme is a quotation mark."""
return Lexeme.c_check_flag(self.c, IS_QUOTE) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_QUOTE)
@is_quote.setter def __set__(self, bint x):
def is_quote(self, bint x): Lexeme.c_set_flag(self.c, IS_QUOTE, x)
Lexeme.c_set_flag(self.c, IS_QUOTE, x)
@property property is_left_punct:
def is_left_punct(self):
"""RETURNS (bool): Whether the lexeme is left punctuation, e.g. (.""" """RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
@is_left_punct.setter def __set__(self, bint x):
def is_left_punct(self, bint x): Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
@property property is_right_punct:
def is_right_punct(self):
"""RETURNS (bool): Whether the lexeme is right punctuation, e.g. ).""" """RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
@is_right_punct.setter def __set__(self, bint x):
def is_right_punct(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
@property property is_currency:
def is_currency(self):
"""RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €.""" """RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
return Lexeme.c_check_flag(self.c, IS_CURRENCY) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_CURRENCY)
@is_currency.setter def __set__(self, bint x):
def is_currency(self, bint x): Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
@property property like_url:
def like_url(self):
"""RETURNS (bool): Whether the lexeme resembles a URL.""" """RETURNS (bool): Whether the lexeme resembles a URL."""
return Lexeme.c_check_flag(self.c, LIKE_URL) def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_URL)
@like_url.setter def __set__(self, bint x):
def like_url(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)
Lexeme.c_set_flag(self.c, LIKE_URL, x)
@property property like_num:
def like_num(self):
"""RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9", """RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
"10", "ten", etc. "10", "ten", etc.
""" """
return Lexeme.c_check_flag(self.c, LIKE_NUM) def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_NUM)
@like_num.setter def __set__(self, bint x):
def like_num(self, bint x): Lexeme.c_set_flag(self.c, LIKE_NUM, x)
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
@property property like_email:
def like_email(self):
"""RETURNS (bool): Whether the lexeme resembles an email address.""" """RETURNS (bool): Whether the lexeme resembles an email address."""
return Lexeme.c_check_flag(self.c, LIKE_EMAIL) def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
@like_email.setter def __set__(self, bint x):
def like_email(self, bint x): Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)

View File

@ -1,4 +1,4 @@
# cython: binding=True, infer_types=True, language_level=3 # cython: binding=True, infer_types=True
from cpython.object cimport PyObject from cpython.object cimport PyObject
from libc.stdint cimport int64_t from libc.stdint cimport int64_t
@ -27,5 +27,6 @@ cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int =
return levenshtein(input_text, pattern_text, max_edits) <= max_edits return levenshtein(input_text, pattern_text, max_edits) <= max_edits
@registry.misc("spacy.levenshtein_compare.v1")
def make_levenshtein_compare(): def make_levenshtein_compare():
return levenshtein_compare return levenshtein_compare

View File

@ -625,7 +625,7 @@ cdef action_t get_action(
const TokenC * token, const TokenC * token,
const attr_t * extra_attrs, const attr_t * extra_attrs,
const int8_t * predicate_matches const int8_t * predicate_matches
) noexcept nogil: ) nogil:
"""We need to consider: """We need to consider:
a) Does the token match the specification? [Yes, No] a) Does the token match the specification? [Yes, No]
b) What's the quantifier? [1, 0+, ?] b) What's the quantifier? [1, 0+, ?]
@ -740,7 +740,7 @@ cdef int8_t get_is_match(
const TokenC* token, const TokenC* token,
const attr_t* extra_attrs, const attr_t* extra_attrs,
const int8_t* predicate_matches const int8_t* predicate_matches
) noexcept nogil: ) nogil:
for i in range(state.pattern.nr_py): for i in range(state.pattern.nr_py):
if predicate_matches[state.pattern.py_predicates[i]] == -1: if predicate_matches[state.pattern.py_predicates[i]] == -1:
return 0 return 0
@ -755,14 +755,14 @@ cdef int8_t get_is_match(
return True return True
cdef inline int8_t get_is_final(PatternStateC state) noexcept nogil: cdef inline int8_t get_is_final(PatternStateC state) nogil:
if state.pattern[1].quantifier == FINAL_ID: if state.pattern[1].quantifier == FINAL_ID:
return 1 return 1
else: else:
return 0 return 0
cdef inline int8_t get_quantifier(PatternStateC state) noexcept nogil: cdef inline int8_t get_quantifier(PatternStateC state) nogil:
return state.pattern.quantifier return state.pattern.quantifier
@ -805,7 +805,7 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
return pattern return pattern
cdef attr_t get_ent_id(const TokenPatternC* pattern) noexcept nogil: cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
while pattern.quantifier != FINAL_ID: while pattern.quantifier != FINAL_ID:
pattern += 1 pattern += 1
id_attr = pattern[0].attrs[0] id_attr = pattern[0].attrs[0]

View File

@ -47,7 +47,7 @@ cdef class PhraseMatcher:
self._terminal_hash = 826361138722620965 self._terminal_hash = 826361138722620965
map_init(self.mem, self.c_map, 8) map_init(self.mem, self.c_map, 8)
if isinstance(attr, int): if isinstance(attr, (int, long)):
self.attr = attr self.attr = attr
else: else:
if attr is None: if attr is None:

View File

@ -7,6 +7,7 @@ from ..tokens import Doc
from ..util import registry from ..util import registry
@registry.layers("spacy.CharEmbed.v1")
def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]: def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]:
# nM: Number of dimensions per character. nC: Number of characters. # nM: Number of dimensions per character. nC: Number of characters.
return Model( return Model(

View File

@ -3,6 +3,7 @@ from thinc.api import Model, normal_init
from ..util import registry from ..util import registry
@registry.layers("spacy.PrecomputableAffine.v1")
def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1): def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
model = Model( model = Model(
"precomputable_affine", "precomputable_affine",

View File

@ -50,6 +50,7 @@ def models_with_nvtx_range(nlp, forward_color: int, backprop_color: int):
return nlp return nlp
@registry.callbacks("spacy.models_with_nvtx_range.v1")
def create_models_with_nvtx_range( def create_models_with_nvtx_range(
forward_color: int = -1, backprop_color: int = -1 forward_color: int = -1, backprop_color: int = -1
) -> Callable[["Language"], "Language"]: ) -> Callable[["Language"], "Language"]:
@ -109,6 +110,7 @@ def pipes_with_nvtx_range(
return nlp return nlp
@registry.callbacks("spacy.models_and_pipes_with_nvtx_range.v1")
def create_models_and_pipes_with_nvtx_range( def create_models_and_pipes_with_nvtx_range(
forward_color: int = -1, forward_color: int = -1,
backprop_color: int = -1, backprop_color: int = -1,

View File

@ -4,6 +4,7 @@ from ..attrs import LOWER
from ..util import registry from ..util import registry
@registry.layers("spacy.extract_ngrams.v1")
def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model: def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model:
model: Model = Model("extract_ngrams", forward) model: Model = Model("extract_ngrams", forward)
model.attrs["ngram_size"] = ngram_size model.attrs["ngram_size"] = ngram_size

View File

@ -6,6 +6,7 @@ from thinc.types import Ints1d, Ragged
from ..util import registry from ..util import registry
@registry.layers("spacy.extract_spans.v1")
def extract_spans() -> Model[Tuple[Ragged, Ragged], Ragged]: def extract_spans() -> Model[Tuple[Ragged, Ragged], Ragged]:
"""Extract spans from a sequence of source arrays, as specified by an array """Extract spans from a sequence of source arrays, as specified by an array
of (start, end) indices. The output is a ragged array of the of (start, end) indices. The output is a ragged array of the

View File

@ -6,9 +6,8 @@ from thinc.types import Ints2d
from ..tokens import Doc from ..tokens import Doc
def FeatureExtractor( @registry.layers("spacy.FeatureExtractor.v1")
columns: Union[List[str], List[int], List[Union[int, str]]] def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]:
) -> Model[List[Doc], List[Ints2d]]:
return Model("extract_features", forward, attrs={"columns": columns}) return Model("extract_features", forward, attrs={"columns": columns})

View File

@ -28,6 +28,7 @@ from ...vocab import Vocab
from ..extract_spans import extract_spans from ..extract_spans import extract_spans
@registry.architectures("spacy.EntityLinker.v2")
def build_nel_encoder( def build_nel_encoder(
tok2vec: Model, nO: Optional[int] = None tok2vec: Model, nO: Optional[int] = None
) -> Model[List[Doc], Floats2d]: ) -> Model[List[Doc], Floats2d]:
@ -91,6 +92,7 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callab
return out, lambda x: [] return out, lambda x: []
@registry.misc("spacy.KBFromFile.v1")
def load_kb( def load_kb(
kb_path: Path, kb_path: Path,
) -> Callable[[Vocab], KnowledgeBase]: ) -> Callable[[Vocab], KnowledgeBase]:
@ -102,6 +104,7 @@ def load_kb(
return kb_from_file return kb_from_file
@registry.misc("spacy.EmptyKB.v2")
def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]: def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
def empty_kb_factory(vocab: Vocab, entity_vector_length: int): def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length) return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
@ -109,6 +112,7 @@ def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
return empty_kb_factory return empty_kb_factory
@registry.misc("spacy.EmptyKB.v1")
def empty_kb( def empty_kb(
entity_vector_length: int, entity_vector_length: int,
) -> Callable[[Vocab], KnowledgeBase]: ) -> Callable[[Vocab], KnowledgeBase]:
@ -118,10 +122,12 @@ def empty_kb(
return empty_kb_factory return empty_kb_factory
@registry.misc("spacy.CandidateGenerator.v1")
def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]: def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
return get_candidates return get_candidates
@registry.misc("spacy.CandidateBatchGenerator.v1")
def create_candidates_batch() -> Callable[ def create_candidates_batch() -> Callable[
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
]: ]:

View File

@ -30,6 +30,7 @@ if TYPE_CHECKING:
from ...vocab import Vocab # noqa: F401 from ...vocab import Vocab # noqa: F401
@registry.architectures("spacy.PretrainVectors.v1")
def create_pretrain_vectors( def create_pretrain_vectors(
maxout_pieces: int, hidden_size: int, loss: str maxout_pieces: int, hidden_size: int, loss: str
) -> Callable[["Vocab", Model], Model]: ) -> Callable[["Vocab", Model], Model]:
@ -56,6 +57,7 @@ def create_pretrain_vectors(
return create_vectors_objective return create_vectors_objective
@registry.architectures("spacy.PretrainCharacters.v1")
def create_pretrain_characters( def create_pretrain_characters(
maxout_pieces: int, hidden_size: int, n_characters: int maxout_pieces: int, hidden_size: int, n_characters: int
) -> Callable[["Vocab", Model], Model]: ) -> Callable[["Vocab", Model], Model]:

View File

@ -11,6 +11,7 @@ from .._precomputable_affine import PrecomputableAffine
from ..tb_framework import TransitionModel from ..tb_framework import TransitionModel
@registry.architectures("spacy.TransitionBasedParser.v2")
def build_tb_parser_model( def build_tb_parser_model(
tok2vec: Model[List[Doc], List[Floats2d]], tok2vec: Model[List[Doc], List[Floats2d]],
state_type: Literal["parser", "ner"], state_type: Literal["parser", "ner"],

View File

@ -10,6 +10,7 @@ InT = List[Doc]
OutT = Floats2d OutT = Floats2d
@registry.architectures("spacy.SpanFinder.v1")
def build_finder_model( def build_finder_model(
tok2vec: Model[InT, List[Floats2d]], scorer: Model[OutT, OutT] tok2vec: Model[InT, List[Floats2d]], scorer: Model[OutT, OutT]
) -> Model[InT, OutT]: ) -> Model[InT, OutT]:

View File

@ -22,6 +22,7 @@ from ...util import registry
from ..extract_spans import extract_spans from ..extract_spans import extract_spans
@registry.layers("spacy.LinearLogistic.v1")
def build_linear_logistic(nO=None, nI=None) -> Model[Floats2d, Floats2d]: def build_linear_logistic(nO=None, nI=None) -> Model[Floats2d, Floats2d]:
"""An output layer for multi-label classification. It uses a linear layer """An output layer for multi-label classification. It uses a linear layer
followed by a logistic activation. followed by a logistic activation.
@ -29,6 +30,7 @@ def build_linear_logistic(nO=None, nI=None) -> Model[Floats2d, Floats2d]:
return chain(Linear(nO=nO, nI=nI, init_W=glorot_uniform_init), Logistic()) return chain(Linear(nO=nO, nI=nI, init_W=glorot_uniform_init), Logistic())
@registry.layers("spacy.mean_max_reducer.v1")
def build_mean_max_reducer(hidden_size: int) -> Model[Ragged, Floats2d]: def build_mean_max_reducer(hidden_size: int) -> Model[Ragged, Floats2d]:
"""Reduce sequences by concatenating their mean and max pooled vectors, """Reduce sequences by concatenating their mean and max pooled vectors,
and then combine the concatenated vectors with a hidden layer. and then combine the concatenated vectors with a hidden layer.
@ -44,6 +46,7 @@ def build_mean_max_reducer(hidden_size: int) -> Model[Ragged, Floats2d]:
) )
@registry.architectures("spacy.SpanCategorizer.v1")
def build_spancat_model( def build_spancat_model(
tok2vec: Model[List[Doc], List[Floats2d]], tok2vec: Model[List[Doc], List[Floats2d]],
reducer: Model[Ragged, Floats2d], reducer: Model[Ragged, Floats2d],

View File

@ -7,6 +7,7 @@ from ...tokens import Doc
from ...util import registry from ...util import registry
@registry.architectures("spacy.Tagger.v2")
def build_tagger_model( def build_tagger_model(
tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None, normalize=False tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None, normalize=False
) -> Model[List[Doc], List[Floats2d]]: ) -> Model[List[Doc], List[Floats2d]]:

View File

@ -44,6 +44,7 @@ from .tok2vec import get_tok2vec_width
NEG_VALUE = -5000 NEG_VALUE = -5000
@registry.architectures("spacy.TextCatCNN.v2")
def build_simple_cnn_text_classifier( def build_simple_cnn_text_classifier(
tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
) -> Model[List[Doc], Floats2d]: ) -> Model[List[Doc], Floats2d]:
@ -71,6 +72,7 @@ def resize_and_set_ref(model, new_nO, resizable_layer):
return model return model
@registry.architectures("spacy.TextCatBOW.v2")
def build_bow_text_classifier( def build_bow_text_classifier(
exclusive_classes: bool, exclusive_classes: bool,
ngram_size: int, ngram_size: int,
@ -86,6 +88,7 @@ def build_bow_text_classifier(
) )
@registry.architectures("spacy.TextCatBOW.v3")
def build_bow_text_classifier_v3( def build_bow_text_classifier_v3(
exclusive_classes: bool, exclusive_classes: bool,
ngram_size: int, ngram_size: int,
@ -139,6 +142,7 @@ def _build_bow_text_classifier(
return model return model
@registry.architectures("spacy.TextCatEnsemble.v2")
def build_text_classifier_v2( def build_text_classifier_v2(
tok2vec: Model[List[Doc], List[Floats2d]], tok2vec: Model[List[Doc], List[Floats2d]],
linear_model: Model[List[Doc], Floats2d], linear_model: Model[List[Doc], Floats2d],
@ -196,6 +200,7 @@ def init_ensemble_textcat(model, X, Y) -> Model:
return model return model
@registry.architectures("spacy.TextCatLowData.v1")
def build_text_classifier_lowdata( def build_text_classifier_lowdata(
width: int, dropout: Optional[float], nO: Optional[int] = None width: int, dropout: Optional[float], nO: Optional[int] = None
) -> Model[List[Doc], Floats2d]: ) -> Model[List[Doc], Floats2d]:
@ -216,6 +221,7 @@ def build_text_classifier_lowdata(
return model return model
@registry.architectures("spacy.TextCatParametricAttention.v1")
def build_textcat_parametric_attention_v1( def build_textcat_parametric_attention_v1(
tok2vec: Model[List[Doc], List[Floats2d]], tok2vec: Model[List[Doc], List[Floats2d]],
exclusive_classes: bool, exclusive_classes: bool,
@ -288,6 +294,7 @@ def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model:
return model return model
@registry.architectures("spacy.TextCatReduce.v1")
def build_reduce_text_classifier( def build_reduce_text_classifier(
tok2vec: Model, tok2vec: Model,
exclusive_classes: bool, exclusive_classes: bool,

View File

@ -29,6 +29,7 @@ from ..featureextractor import FeatureExtractor
from ..staticvectors import StaticVectors from ..staticvectors import StaticVectors
@registry.architectures("spacy.Tok2VecListener.v1")
def tok2vec_listener_v1(width: int, upstream: str = "*"): def tok2vec_listener_v1(width: int, upstream: str = "*"):
tok2vec = Tok2VecListener(upstream_name=upstream, width=width) tok2vec = Tok2VecListener(upstream_name=upstream, width=width)
return tok2vec return tok2vec
@ -45,6 +46,7 @@ def get_tok2vec_width(model: Model):
return nO return nO
@registry.architectures("spacy.HashEmbedCNN.v2")
def build_hash_embed_cnn_tok2vec( def build_hash_embed_cnn_tok2vec(
*, *,
width: int, width: int,
@ -100,6 +102,7 @@ def build_hash_embed_cnn_tok2vec(
) )
@registry.architectures("spacy.Tok2Vec.v2")
def build_Tok2Vec_model( def build_Tok2Vec_model(
embed: Model[List[Doc], List[Floats2d]], embed: Model[List[Doc], List[Floats2d]],
encode: Model[List[Floats2d], List[Floats2d]], encode: Model[List[Floats2d], List[Floats2d]],
@ -120,9 +123,10 @@ def build_Tok2Vec_model(
return tok2vec return tok2vec
@registry.architectures("spacy.MultiHashEmbed.v2")
def MultiHashEmbed( def MultiHashEmbed(
width: int, width: int,
attrs: Union[List[str], List[int], List[Union[str, int]]], attrs: List[Union[str, int]],
rows: List[int], rows: List[int],
include_static_vectors: bool, include_static_vectors: bool,
) -> Model[List[Doc], List[Floats2d]]: ) -> Model[List[Doc], List[Floats2d]]:
@ -188,7 +192,7 @@ def MultiHashEmbed(
) )
else: else:
model = chain( model = chain(
FeatureExtractor(attrs), FeatureExtractor(list(attrs)),
cast(Model[List[Ints2d], Ragged], list2ragged()), cast(Model[List[Ints2d], Ragged], list2ragged()),
with_array(concatenate(*embeddings)), with_array(concatenate(*embeddings)),
max_out, max_out,
@ -197,6 +201,7 @@ def MultiHashEmbed(
return model return model
@registry.architectures("spacy.CharacterEmbed.v2")
def CharacterEmbed( def CharacterEmbed(
width: int, width: int,
rows: int, rows: int,
@ -273,6 +278,7 @@ def CharacterEmbed(
return model return model
@registry.architectures("spacy.MaxoutWindowEncoder.v2")
def MaxoutWindowEncoder( def MaxoutWindowEncoder(
width: int, window_size: int, maxout_pieces: int, depth: int width: int, window_size: int, maxout_pieces: int, depth: int
) -> Model[List[Floats2d], List[Floats2d]]: ) -> Model[List[Floats2d], List[Floats2d]]:
@ -304,6 +310,7 @@ def MaxoutWindowEncoder(
return with_array(model, pad=receptive_field) return with_array(model, pad=receptive_field)
@registry.architectures("spacy.MishWindowEncoder.v2")
def MishWindowEncoder( def MishWindowEncoder(
width: int, window_size: int, depth: int width: int, window_size: int, depth: int
) -> Model[List[Floats2d], List[Floats2d]]: ) -> Model[List[Floats2d], List[Floats2d]]:
@ -326,6 +333,7 @@ def MishWindowEncoder(
return with_array(model) return with_array(model)
@registry.architectures("spacy.TorchBiLSTMEncoder.v1")
def BiLSTMEncoder( def BiLSTMEncoder(
width: int, depth: int, dropout: float width: int, depth: int, dropout: float
) -> Model[List[Floats2d], List[Floats2d]]: ) -> Model[List[Floats2d], List[Floats2d]]:

View File

@ -52,14 +52,14 @@ cdef SizesC get_c_sizes(model, int batch_size) except *:
return output return output
cdef ActivationsC alloc_activations(SizesC n) noexcept nogil: cdef ActivationsC alloc_activations(SizesC n) nogil:
cdef ActivationsC A cdef ActivationsC A
memset(&A, 0, sizeof(A)) memset(&A, 0, sizeof(A))
resize_activations(&A, n) resize_activations(&A, n)
return A return A
cdef void free_activations(const ActivationsC* A) noexcept nogil: cdef void free_activations(const ActivationsC* A) nogil:
free(A.token_ids) free(A.token_ids)
free(A.scores) free(A.scores)
free(A.unmaxed) free(A.unmaxed)
@ -67,7 +67,7 @@ cdef void free_activations(const ActivationsC* A) noexcept nogil:
free(A.is_valid) free(A.is_valid)
cdef void resize_activations(ActivationsC* A, SizesC n) noexcept nogil: cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
if n.states <= A._max_size: if n.states <= A._max_size:
A._curr_size = n.states A._curr_size = n.states
return return
@ -100,7 +100,7 @@ cdef void resize_activations(ActivationsC* A, SizesC n) noexcept nogil:
cdef void predict_states( cdef void predict_states(
CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n
) noexcept nogil: ) nogil:
resize_activations(A, n) resize_activations(A, n)
for i in range(n.states): for i in range(n.states):
states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats) states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
@ -159,7 +159,7 @@ cdef void sum_state_features(
int B, int B,
int F, int F,
int O int O
) noexcept nogil: ) nogil:
cdef int idx, b, f cdef int idx, b, f
cdef const float* feature cdef const float* feature
padding = cached padding = cached
@ -183,7 +183,7 @@ cdef void cpu_log_loss(
const int* is_valid, const int* is_valid,
const float* scores, const float* scores,
int O int O
) noexcept nogil: ) nogil:
"""Do multi-label log loss""" """Do multi-label log loss"""
cdef double max_, gmax, Z, gZ cdef double max_, gmax, Z, gZ
best = arg_max_if_gold(scores, costs, is_valid, O) best = arg_max_if_gold(scores, costs, is_valid, O)
@ -209,7 +209,7 @@ cdef void cpu_log_loss(
cdef int arg_max_if_gold( cdef int arg_max_if_gold(
const weight_t* scores, const weight_t* costs, const int* is_valid, int n const weight_t* scores, const weight_t* costs, const int* is_valid, int n
) noexcept nogil: ) nogil:
# Find minimum cost # Find minimum cost
cdef float cost = 1 cdef float cost = 1
for i in range(n): for i in range(n):
@ -224,7 +224,7 @@ cdef int arg_max_if_gold(
return best return best
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) noexcept nogil: cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
cdef int best = -1 cdef int best = -1
for i in range(n): for i in range(n):
if is_valid[i] >= 1: if is_valid[i] >= 1:

View File

@ -13,6 +13,7 @@ from ..vectors import Mode, Vectors
from ..vocab import Vocab from ..vocab import Vocab
@registry.layers("spacy.StaticVectors.v2")
def StaticVectors( def StaticVectors(
nO: Optional[int] = None, nO: Optional[int] = None,
nM: Optional[int] = None, nM: Optional[int] = None,

View File

@ -4,6 +4,7 @@ from ..util import registry
from .parser_model import ParserStepModel from .parser_model import ParserStepModel
@registry.layers("spacy.TransitionModel.v1")
def TransitionModel( def TransitionModel(
tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set() tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
): ):

View File

@ -57,20 +57,16 @@ cdef class Morphology:
field_feature_pairs = [] field_feature_pairs = []
for field in sorted(string_features): for field in sorted(string_features):
values = string_features[field] values = string_features[field]
self.strings.add(field, allow_transient=False),
field_id = self.strings[field]
for value in values.split(self.VALUE_SEP): for value in values.split(self.VALUE_SEP):
field_sep_value = field + self.FIELD_SEP + value
self.strings.add(field_sep_value, allow_transient=False),
field_feature_pairs.append(( field_feature_pairs.append((
field_id, self.strings.add(field),
self.strings[field_sep_value] self.strings.add(field + self.FIELD_SEP + value),
)) ))
cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs) cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs)
# the hash key for the tag is either the hash of the normalized UFEATS # the hash key for the tag is either the hash of the normalized UFEATS
# string or the hash of an empty placeholder # string or the hash of an empty placeholder
norm_feats_string = self.normalize_features(features) norm_feats_string = self.normalize_features(features)
tag.key = self.strings.add(norm_feats_string, allow_transient=False) tag.key = self.strings.add(norm_feats_string)
self.insert(tag) self.insert(tag)
return tag.key return tag.key

View File

@ -25,8 +25,3 @@ IDS = {
NAMES = {value: key for key, value in IDS.items()} NAMES = {value: key for key, value in IDS.items()}
# As of Cython 3.1, the global Python namespace no longer has the enum
# contents by default.
globals().update(IDS)

View File

@ -17,7 +17,7 @@ from ...typedefs cimport attr_t
from ...vocab cimport EMPTY_LEXEME from ...vocab cimport EMPTY_LEXEME
cdef inline bint is_space_token(const TokenC* token) noexcept nogil: cdef inline bint is_space_token(const TokenC* token) nogil:
return Lexeme.c_check_flag(token.lex, IS_SPACE) return Lexeme.c_check_flag(token.lex, IS_SPACE)
cdef struct ArcC: cdef struct ArcC:
@ -41,7 +41,7 @@ cdef cppclass StateC:
int offset int offset
int _b_i int _b_i
inline __init__(const TokenC* sent, int length) noexcept nogil: __init__(const TokenC* sent, int length) nogil:
this._sent = sent this._sent = sent
this._heads = <int*>calloc(length, sizeof(int)) this._heads = <int*>calloc(length, sizeof(int))
if not (this._sent and this._heads): if not (this._sent and this._heads):
@ -57,10 +57,10 @@ cdef cppclass StateC:
memset(&this._empty_token, 0, sizeof(TokenC)) memset(&this._empty_token, 0, sizeof(TokenC))
this._empty_token.lex = &EMPTY_LEXEME this._empty_token.lex = &EMPTY_LEXEME
inline __dealloc__(): __dealloc__():
free(this._heads) free(this._heads)
inline void set_context_tokens(int* ids, int n) noexcept nogil: void set_context_tokens(int* ids, int n) nogil:
cdef int i, j cdef int i, j
if n == 1: if n == 1:
if this.B(0) >= 0: if this.B(0) >= 0:
@ -131,14 +131,14 @@ cdef cppclass StateC:
else: else:
ids[i] = -1 ids[i] = -1
inline int S(int i) noexcept nogil const: int S(int i) nogil const:
if i >= this._stack.size(): if i >= this._stack.size():
return -1 return -1
elif i < 0: elif i < 0:
return -1 return -1
return this._stack.at(this._stack.size() - (i+1)) return this._stack.at(this._stack.size() - (i+1))
inline int B(int i) noexcept nogil const: int B(int i) nogil const:
if i < 0: if i < 0:
return -1 return -1
elif i < this._rebuffer.size(): elif i < this._rebuffer.size():
@ -150,19 +150,19 @@ cdef cppclass StateC:
else: else:
return b_i return b_i
inline const TokenC* B_(int i) noexcept nogil const: const TokenC* B_(int i) nogil const:
return this.safe_get(this.B(i)) return this.safe_get(this.B(i))
inline const TokenC* E_(int i) noexcept nogil const: const TokenC* E_(int i) nogil const:
return this.safe_get(this.E(i)) return this.safe_get(this.E(i))
inline const TokenC* safe_get(int i) noexcept nogil const: const TokenC* safe_get(int i) nogil const:
if i < 0 or i >= this.length: if i < 0 or i >= this.length:
return &this._empty_token return &this._empty_token
else: else:
return &this._sent[i] return &this._sent[i]
inline void map_get_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, vector[ArcC]* out) noexcept nogil const: void map_get_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, vector[ArcC]* out) nogil const:
cdef const vector[ArcC]* arcs cdef const vector[ArcC]* arcs
head_arcs_it = heads_arcs.const_begin() head_arcs_it = heads_arcs.const_begin()
while head_arcs_it != heads_arcs.const_end(): while head_arcs_it != heads_arcs.const_end():
@ -175,23 +175,23 @@ cdef cppclass StateC:
incr(arcs_it) incr(arcs_it)
incr(head_arcs_it) incr(head_arcs_it)
inline void get_arcs(vector[ArcC]* out) noexcept nogil const: void get_arcs(vector[ArcC]* out) nogil const:
this.map_get_arcs(this._left_arcs, out) this.map_get_arcs(this._left_arcs, out)
this.map_get_arcs(this._right_arcs, out) this.map_get_arcs(this._right_arcs, out)
inline int H(int child) noexcept nogil const: int H(int child) nogil const:
if child >= this.length or child < 0: if child >= this.length or child < 0:
return -1 return -1
else: else:
return this._heads[child] return this._heads[child]
inline int E(int i) noexcept nogil const: int E(int i) nogil const:
if this._ents.size() == 0: if this._ents.size() == 0:
return -1 return -1
else: else:
return this._ents.back().start return this._ents.back().start
inline int nth_child(const unordered_map[int, vector[ArcC]]& heads_arcs, int head, int idx) noexcept nogil const: int nth_child(const unordered_map[int, vector[ArcC]]& heads_arcs, int head, int idx) nogil const:
if idx < 1: if idx < 1:
return -1 return -1
@ -215,22 +215,22 @@ cdef cppclass StateC:
return -1 return -1
inline int L(int head, int idx) noexcept nogil const: int L(int head, int idx) nogil const:
return this.nth_child(this._left_arcs, head, idx) return this.nth_child(this._left_arcs, head, idx)
inline int R(int head, int idx) noexcept nogil const: int R(int head, int idx) nogil const:
return this.nth_child(this._right_arcs, head, idx) return this.nth_child(this._right_arcs, head, idx)
inline bint empty() noexcept nogil const: bint empty() nogil const:
return this._stack.size() == 0 return this._stack.size() == 0
inline bint eol() noexcept nogil const: bint eol() nogil const:
return this.buffer_length() == 0 return this.buffer_length() == 0
inline bint is_final() noexcept nogil const: bint is_final() nogil const:
return this.stack_depth() <= 0 and this.eol() return this.stack_depth() <= 0 and this.eol()
inline int cannot_sent_start(int word) noexcept nogil const: int cannot_sent_start(int word) nogil const:
if word < 0 or word >= this.length: if word < 0 or word >= this.length:
return 0 return 0
elif this._sent[word].sent_start == -1: elif this._sent[word].sent_start == -1:
@ -238,7 +238,7 @@ cdef cppclass StateC:
else: else:
return 0 return 0
inline int is_sent_start(int word) noexcept nogil const: int is_sent_start(int word) nogil const:
if word < 0 or word >= this.length: if word < 0 or word >= this.length:
return 0 return 0
elif this._sent[word].sent_start == 1: elif this._sent[word].sent_start == 1:
@ -248,20 +248,20 @@ cdef cppclass StateC:
else: else:
return 0 return 0
inline void set_sent_start(int word, int value) noexcept nogil: void set_sent_start(int word, int value) nogil:
if value >= 1: if value >= 1:
this._sent_starts.insert(word) this._sent_starts.insert(word)
inline bint has_head(int child) noexcept nogil const: bint has_head(int child) nogil const:
return this._heads[child] >= 0 return this._heads[child] >= 0
inline int l_edge(int word) noexcept nogil const: int l_edge(int word) nogil const:
return word return word
inline int r_edge(int word) noexcept nogil const: int r_edge(int word) nogil const:
return word return word
inline int n_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, int head) noexcept nogil const: int n_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, int head) nogil const:
cdef int n = 0 cdef int n = 0
head_arcs_it = heads_arcs.const_find(head) head_arcs_it = heads_arcs.const_find(head)
if head_arcs_it == heads_arcs.const_end(): if head_arcs_it == heads_arcs.const_end():
@ -277,28 +277,28 @@ cdef cppclass StateC:
return n return n
inline int n_L(int head) noexcept nogil const: int n_L(int head) nogil const:
return n_arcs(this._left_arcs, head) return n_arcs(this._left_arcs, head)
inline int n_R(int head) noexcept nogil const: int n_R(int head) nogil const:
return n_arcs(this._right_arcs, head) return n_arcs(this._right_arcs, head)
inline bint stack_is_connected() noexcept nogil const: bint stack_is_connected() nogil const:
return False return False
inline bint entity_is_open() noexcept nogil const: bint entity_is_open() nogil const:
if this._ents.size() == 0: if this._ents.size() == 0:
return False return False
else: else:
return this._ents.back().end == -1 return this._ents.back().end == -1
inline int stack_depth() noexcept nogil const: int stack_depth() nogil const:
return this._stack.size() return this._stack.size()
inline int buffer_length() noexcept nogil const: int buffer_length() nogil const:
return (this.length - this._b_i) + this._rebuffer.size() return (this.length - this._b_i) + this._rebuffer.size()
inline void push() noexcept nogil: void push() nogil:
b0 = this.B(0) b0 = this.B(0)
if this._rebuffer.size(): if this._rebuffer.size():
b0 = this._rebuffer.back() b0 = this._rebuffer.back()
@ -308,32 +308,32 @@ cdef cppclass StateC:
this._b_i += 1 this._b_i += 1
this._stack.push_back(b0) this._stack.push_back(b0)
inline void pop() noexcept nogil: void pop() nogil:
this._stack.pop_back() this._stack.pop_back()
inline void force_final() noexcept nogil: void force_final() nogil:
# This should only be used in desperate situations, as it may leave # This should only be used in desperate situations, as it may leave
# the analysis in an unexpected state. # the analysis in an unexpected state.
this._stack.clear() this._stack.clear()
this._b_i = this.length this._b_i = this.length
inline void unshift() noexcept nogil: void unshift() nogil:
s0 = this._stack.back() s0 = this._stack.back()
this._unshiftable[s0] = 1 this._unshiftable[s0] = 1
this._rebuffer.push_back(s0) this._rebuffer.push_back(s0)
this._stack.pop_back() this._stack.pop_back()
inline int is_unshiftable(int item) noexcept nogil const: int is_unshiftable(int item) nogil const:
if item >= this._unshiftable.size(): if item >= this._unshiftable.size():
return 0 return 0
else: else:
return this._unshiftable.at(item) return this._unshiftable.at(item)
inline void set_reshiftable(int item) noexcept nogil: void set_reshiftable(int item) nogil:
if item < this._unshiftable.size(): if item < this._unshiftable.size():
this._unshiftable[item] = 0 this._unshiftable[item] = 0
inline void add_arc(int head, int child, attr_t label) noexcept nogil: void add_arc(int head, int child, attr_t label) nogil:
if this.has_head(child): if this.has_head(child):
this.del_arc(this.H(child), child) this.del_arc(this.H(child), child)
cdef ArcC arc cdef ArcC arc
@ -346,7 +346,7 @@ cdef cppclass StateC:
this._right_arcs[arc.head].push_back(arc) this._right_arcs[arc.head].push_back(arc)
this._heads[child] = head this._heads[child] = head
inline void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) noexcept nogil: void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
arcs_it = heads_arcs.find(h_i) arcs_it = heads_arcs.find(h_i)
if arcs_it == heads_arcs.end(): if arcs_it == heads_arcs.end():
return return
@ -367,13 +367,13 @@ cdef cppclass StateC:
arc.label = 0 arc.label = 0
break break
inline void del_arc(int h_i, int c_i) noexcept nogil: void del_arc(int h_i, int c_i) nogil:
if h_i > c_i: if h_i > c_i:
this.map_del_arc(&this._left_arcs, h_i, c_i) this.map_del_arc(&this._left_arcs, h_i, c_i)
else: else:
this.map_del_arc(&this._right_arcs, h_i, c_i) this.map_del_arc(&this._right_arcs, h_i, c_i)
inline SpanC get_ent() noexcept nogil const: SpanC get_ent() nogil const:
cdef SpanC ent cdef SpanC ent
if this._ents.size() == 0: if this._ents.size() == 0:
ent.start = 0 ent.start = 0
@ -383,17 +383,17 @@ cdef cppclass StateC:
else: else:
return this._ents.back() return this._ents.back()
inline void open_ent(attr_t label) noexcept nogil: void open_ent(attr_t label) nogil:
cdef SpanC ent cdef SpanC ent
ent.start = this.B(0) ent.start = this.B(0)
ent.label = label ent.label = label
ent.end = -1 ent.end = -1
this._ents.push_back(ent) this._ents.push_back(ent)
inline void close_ent() noexcept nogil: void close_ent() nogil:
this._ents.back().end = this.B(0)+1 this._ents.back().end = this.B(0)+1
inline void clone(const StateC* src) noexcept nogil: void clone(const StateC* src) nogil:
this.length = src.length this.length = src.length
this._sent = src._sent this._sent = src._sent
this._stack = src._stack this._stack = src._stack

View File

@ -155,7 +155,7 @@ cdef GoldParseStateC create_gold_state(
return gs return gs
cdef void update_gold_state(GoldParseStateC* gs, const StateC* s) noexcept nogil: cdef void update_gold_state(GoldParseStateC* gs, const StateC* s) nogil:
for i in range(gs.length): for i in range(gs.length):
gs.state_bits[i] = set_state_flag( gs.state_bits[i] = set_state_flag(
gs.state_bits[i], gs.state_bits[i],
@ -203,7 +203,7 @@ cdef class ArcEagerGold:
def __init__(self, ArcEager moves, StateClass stcls, Example example): def __init__(self, ArcEager moves, StateClass stcls, Example example):
self.mem = Pool() self.mem = Pool()
heads, labels = example.get_aligned_parse(projectivize=True) heads, labels = example.get_aligned_parse(projectivize=True)
labels = [example.x.vocab.strings.add(label, allow_transient=False) if label is not None else MISSING_DEP for label in labels] labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
sent_starts = _get_aligned_sent_starts(example) sent_starts = _get_aligned_sent_starts(example)
assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts)) assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts) self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)
@ -239,12 +239,12 @@ def _get_aligned_sent_starts(example):
return [None] * len(example.x) return [None] * len(example.x)
cdef int check_state_gold(char state_bits, char flag) noexcept nogil: cdef int check_state_gold(char state_bits, char flag) nogil:
cdef char one = 1 cdef char one = 1
return 1 if (state_bits & (one << flag)) else 0 return 1 if (state_bits & (one << flag)) else 0
cdef int set_state_flag(char state_bits, char flag, int value) noexcept nogil: cdef int set_state_flag(char state_bits, char flag, int value) nogil:
cdef char one = 1 cdef char one = 1
if value: if value:
return state_bits | (one << flag) return state_bits | (one << flag)
@ -252,27 +252,27 @@ cdef int set_state_flag(char state_bits, char flag, int value) noexcept nogil:
return state_bits & ~(one << flag) return state_bits & ~(one << flag)
cdef int is_head_in_stack(const GoldParseStateC* gold, int i) noexcept nogil: cdef int is_head_in_stack(const GoldParseStateC* gold, int i) nogil:
return check_state_gold(gold.state_bits[i], HEAD_IN_STACK) return check_state_gold(gold.state_bits[i], HEAD_IN_STACK)
cdef int is_head_in_buffer(const GoldParseStateC* gold, int i) noexcept nogil: cdef int is_head_in_buffer(const GoldParseStateC* gold, int i) nogil:
return check_state_gold(gold.state_bits[i], HEAD_IN_BUFFER) return check_state_gold(gold.state_bits[i], HEAD_IN_BUFFER)
cdef int is_head_unknown(const GoldParseStateC* gold, int i) noexcept nogil: cdef int is_head_unknown(const GoldParseStateC* gold, int i) nogil:
return check_state_gold(gold.state_bits[i], HEAD_UNKNOWN) return check_state_gold(gold.state_bits[i], HEAD_UNKNOWN)
cdef int is_sent_start(const GoldParseStateC* gold, int i) noexcept nogil: cdef int is_sent_start(const GoldParseStateC* gold, int i) nogil:
return check_state_gold(gold.state_bits[i], IS_SENT_START) return check_state_gold(gold.state_bits[i], IS_SENT_START)
cdef int is_sent_start_unknown(const GoldParseStateC* gold, int i) noexcept nogil: cdef int is_sent_start_unknown(const GoldParseStateC* gold, int i) nogil:
return check_state_gold(gold.state_bits[i], SENT_START_UNKNOWN) return check_state_gold(gold.state_bits[i], SENT_START_UNKNOWN)
# Helper functions for the arc-eager oracle # Helper functions for the arc-eager oracle
cdef weight_t push_cost(const StateC* state, const GoldParseStateC* gold) noexcept nogil: cdef weight_t push_cost(const StateC* state, const GoldParseStateC* gold) nogil:
cdef weight_t cost = 0 cdef weight_t cost = 0
b0 = state.B(0) b0 = state.B(0)
if b0 < 0: if b0 < 0:
@ -285,7 +285,7 @@ cdef weight_t push_cost(const StateC* state, const GoldParseStateC* gold) noexce
return cost return cost
cdef weight_t pop_cost(const StateC* state, const GoldParseStateC* gold) noexcept nogil: cdef weight_t pop_cost(const StateC* state, const GoldParseStateC* gold) nogil:
cdef weight_t cost = 0 cdef weight_t cost = 0
s0 = state.S(0) s0 = state.S(0)
if s0 < 0: if s0 < 0:
@ -296,7 +296,7 @@ cdef weight_t pop_cost(const StateC* state, const GoldParseStateC* gold) noexcep
return cost return cost
cdef bint arc_is_gold(const GoldParseStateC* gold, int head, int child) noexcept nogil: cdef bint arc_is_gold(const GoldParseStateC* gold, int head, int child) nogil:
if is_head_unknown(gold, child): if is_head_unknown(gold, child):
return True return True
elif gold.heads[child] == head: elif gold.heads[child] == head:
@ -305,7 +305,7 @@ cdef bint arc_is_gold(const GoldParseStateC* gold, int head, int child) noexcept
return False return False
cdef bint label_is_gold(const GoldParseStateC* gold, int child, attr_t label) noexcept nogil: cdef bint label_is_gold(const GoldParseStateC* gold, int child, attr_t label) nogil:
if is_head_unknown(gold, child): if is_head_unknown(gold, child):
return True return True
elif label == 0: elif label == 0:
@ -316,7 +316,7 @@ cdef bint label_is_gold(const GoldParseStateC* gold, int child, attr_t label) no
return False return False
cdef bint _is_gold_root(const GoldParseStateC* gold, int word) noexcept nogil: cdef bint _is_gold_root(const GoldParseStateC* gold, int word) nogil:
return gold.heads[word] == word or is_head_unknown(gold, word) return gold.heads[word] == word or is_head_unknown(gold, word)
@ -336,7 +336,7 @@ cdef class Shift:
* Advance buffer * Advance buffer
""" """
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, attr_t label) noexcept nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
if st.stack_depth() == 0: if st.stack_depth() == 0:
return 1 return 1
elif st.buffer_length() < 2: elif st.buffer_length() < 2:
@ -349,11 +349,11 @@ cdef class Shift:
return 1 return 1
@staticmethod @staticmethod
cdef int transition(StateC* st, attr_t label) noexcept nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.push() st.push()
@staticmethod @staticmethod
cdef weight_t cost(const StateC* state, const void* _gold, attr_t label) noexcept nogil: cdef weight_t cost(const StateC* state, const void* _gold, attr_t label) nogil:
gold = <const GoldParseStateC*>_gold gold = <const GoldParseStateC*>_gold
return gold.push_cost return gold.push_cost
@ -375,7 +375,7 @@ cdef class Reduce:
cost by those arcs. cost by those arcs.
""" """
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, attr_t label) noexcept nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
if st.stack_depth() == 0: if st.stack_depth() == 0:
return False return False
elif st.buffer_length() == 0: elif st.buffer_length() == 0:
@ -386,14 +386,14 @@ cdef class Reduce:
return True return True
@staticmethod @staticmethod
cdef int transition(StateC* st, attr_t label) noexcept nogil: cdef int transition(StateC* st, attr_t label) nogil:
if st.has_head(st.S(0)) or st.stack_depth() == 1: if st.has_head(st.S(0)) or st.stack_depth() == 1:
st.pop() st.pop()
else: else:
st.unshift() st.unshift()
@staticmethod @staticmethod
cdef weight_t cost(const StateC* state, const void* _gold, attr_t label) noexcept nogil: cdef weight_t cost(const StateC* state, const void* _gold, attr_t label) nogil:
gold = <const GoldParseStateC*>_gold gold = <const GoldParseStateC*>_gold
if state.is_sent_start(state.B(0)): if state.is_sent_start(state.B(0)):
return 0 return 0
@ -421,7 +421,7 @@ cdef class LeftArc:
pop_cost - Arc(B[0], S[0], label) + (Arc(S[1], S[0]) if H(S[0]) else Arcs(S, S[0])) pop_cost - Arc(B[0], S[0], label) + (Arc(S[1], S[0]) if H(S[0]) else Arcs(S, S[0]))
""" """
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, attr_t label) noexcept nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
if st.stack_depth() == 0: if st.stack_depth() == 0:
return 0 return 0
elif st.buffer_length() == 0: elif st.buffer_length() == 0:
@ -434,7 +434,7 @@ cdef class LeftArc:
return 1 return 1
@staticmethod @staticmethod
cdef int transition(StateC* st, attr_t label) noexcept nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.add_arc(st.B(0), st.S(0), label) st.add_arc(st.B(0), st.S(0), label)
# If we change the stack, it's okay to remove the shifted mark, as # If we change the stack, it's okay to remove the shifted mark, as
# we can't get in an infinite loop this way. # we can't get in an infinite loop this way.
@ -442,7 +442,7 @@ cdef class LeftArc:
st.pop() st.pop()
@staticmethod @staticmethod
cdef inline weight_t cost(const StateC* state, const void* _gold, attr_t label) noexcept nogil: cdef inline weight_t cost(const StateC* state, const void* _gold, attr_t label) nogil:
gold = <const GoldParseStateC*>_gold gold = <const GoldParseStateC*>_gold
cdef weight_t cost = gold.pop_cost cdef weight_t cost = gold.pop_cost
s0 = state.S(0) s0 = state.S(0)
@ -474,7 +474,7 @@ cdef class RightArc:
push_cost + (not shifted[b0] and Arc(B[1:], B[0])) - Arc(S[0], B[0], label) push_cost + (not shifted[b0] and Arc(B[1:], B[0])) - Arc(S[0], B[0], label)
""" """
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, attr_t label) noexcept nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
if st.stack_depth() == 0: if st.stack_depth() == 0:
return 0 return 0
elif st.buffer_length() == 0: elif st.buffer_length() == 0:
@ -488,12 +488,12 @@ cdef class RightArc:
return 1 return 1
@staticmethod @staticmethod
cdef int transition(StateC* st, attr_t label) noexcept nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.add_arc(st.S(0), st.B(0), label) st.add_arc(st.S(0), st.B(0), label)
st.push() st.push()
@staticmethod @staticmethod
cdef inline weight_t cost(const StateC* state, const void* _gold, attr_t label) noexcept nogil: cdef inline weight_t cost(const StateC* state, const void* _gold, attr_t label) nogil:
gold = <const GoldParseStateC*>_gold gold = <const GoldParseStateC*>_gold
cost = gold.push_cost cost = gold.push_cost
s0 = state.S(0) s0 = state.S(0)
@ -525,7 +525,7 @@ cdef class Break:
* Arcs between S and B[1] * Arcs between S and B[1]
""" """
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, attr_t label) noexcept nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
if st.buffer_length() < 2: if st.buffer_length() < 2:
return False return False
elif st.B(1) != st.B(0) + 1: elif st.B(1) != st.B(0) + 1:
@ -538,11 +538,11 @@ cdef class Break:
return True return True
@staticmethod @staticmethod
cdef int transition(StateC* st, attr_t label) noexcept nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.set_sent_start(st.B(1), 1) st.set_sent_start(st.B(1), 1)
@staticmethod @staticmethod
cdef weight_t cost(const StateC* state, const void* _gold, attr_t label) noexcept nogil: cdef weight_t cost(const StateC* state, const void* _gold, attr_t label) nogil:
gold = <const GoldParseStateC*>_gold gold = <const GoldParseStateC*>_gold
cdef int b0 = state.B(0) cdef int b0 = state.B(0)
cdef int cost = 0 cdef int cost = 0
@ -785,7 +785,7 @@ cdef class ArcEager(TransitionSystem):
else: else:
return False return False
cdef int set_valid(self, int* output, const StateC* st) noexcept nogil: cdef int set_valid(self, int* output, const StateC* st) nogil:
cdef int[N_MOVES] is_valid cdef int[N_MOVES] is_valid
is_valid[SHIFT] = Shift.is_valid(st, 0) is_valid[SHIFT] = Shift.is_valid(st, 0)
is_valid[REDUCE] = Reduce.is_valid(st, 0) is_valid[REDUCE] = Reduce.is_valid(st, 0)

View File

@ -110,7 +110,7 @@ cdef void update_gold_state(GoldNERStateC* gs, const StateC* state) except *:
cdef do_func_t[N_MOVES] do_funcs cdef do_func_t[N_MOVES] do_funcs
cdef bint _entity_is_sunk(const StateC* state, Transition* golds) noexcept nogil: cdef bint _entity_is_sunk(const StateC* state, Transition* golds) nogil:
if not state.entity_is_open(): if not state.entity_is_open():
return False return False
@ -238,7 +238,7 @@ cdef class BiluoPushDown(TransitionSystem):
def add_action(self, int action, label_name, freq=None): def add_action(self, int action, label_name, freq=None):
cdef attr_t label_id cdef attr_t label_id
if not isinstance(label_name, int): if not isinstance(label_name, (int, long)):
label_id = self.strings.add(label_name) label_id = self.strings.add(label_name)
else: else:
label_id = label_name label_id = label_name
@ -347,21 +347,21 @@ cdef class BiluoPushDown(TransitionSystem):
cdef class Missing: cdef class Missing:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, attr_t label) noexcept nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
return False return False
@staticmethod @staticmethod
cdef int transition(StateC* s, attr_t label) noexcept nogil: cdef int transition(StateC* s, attr_t label) nogil:
pass pass
@staticmethod @staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) noexcept nogil: cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
return 9000 return 9000
cdef class Begin: cdef class Begin:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, attr_t label) noexcept nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob cdef int preset_ent_iob = st.B_(0).ent_iob
cdef attr_t preset_ent_label = st.B_(0).ent_type cdef attr_t preset_ent_label = st.B_(0).ent_type
if st.entity_is_open(): if st.entity_is_open():
@ -400,13 +400,13 @@ cdef class Begin:
return True return True
@staticmethod @staticmethod
cdef int transition(StateC* st, attr_t label) noexcept nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.open_ent(label) st.open_ent(label)
st.push() st.push()
st.pop() st.pop()
@staticmethod @staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) noexcept nogil: cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold gold = <GoldNERStateC*>_gold
b0 = s.B(0) b0 = s.B(0)
cdef int cost = 0 cdef int cost = 0
@ -439,7 +439,7 @@ cdef class Begin:
cdef class In: cdef class In:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, attr_t label) noexcept nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
if not st.entity_is_open(): if not st.entity_is_open():
return False return False
if st.buffer_length() < 2: if st.buffer_length() < 2:
@ -475,12 +475,12 @@ cdef class In:
return True return True
@staticmethod @staticmethod
cdef int transition(StateC* st, attr_t label) noexcept nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.push() st.push()
st.pop() st.pop()
@staticmethod @staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) noexcept nogil: cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold gold = <GoldNERStateC*>_gold
cdef int next_act = gold.ner[s.B(1)].move if s.B(1) >= 0 else OUT cdef int next_act = gold.ner[s.B(1)].move if s.B(1) >= 0 else OUT
cdef int g_act = gold.ner[s.B(0)].move cdef int g_act = gold.ner[s.B(0)].move
@ -510,7 +510,7 @@ cdef class In:
cdef class Last: cdef class Last:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, attr_t label) noexcept nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob cdef int preset_ent_iob = st.B_(0).ent_iob
cdef attr_t preset_ent_label = st.B_(0).ent_type cdef attr_t preset_ent_label = st.B_(0).ent_type
if label == 0: if label == 0:
@ -535,13 +535,13 @@ cdef class Last:
return True return True
@staticmethod @staticmethod
cdef int transition(StateC* st, attr_t label) noexcept nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.close_ent() st.close_ent()
st.push() st.push()
st.pop() st.pop()
@staticmethod @staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) noexcept nogil: cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold gold = <GoldNERStateC*>_gold
b0 = s.B(0) b0 = s.B(0)
ent_start = s.E(0) ent_start = s.E(0)
@ -581,7 +581,7 @@ cdef class Last:
cdef class Unit: cdef class Unit:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, attr_t label) noexcept nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob cdef int preset_ent_iob = st.B_(0).ent_iob
cdef attr_t preset_ent_label = st.B_(0).ent_type cdef attr_t preset_ent_label = st.B_(0).ent_type
if label == 0: if label == 0:
@ -609,14 +609,14 @@ cdef class Unit:
return True return True
@staticmethod @staticmethod
cdef int transition(StateC* st, attr_t label) noexcept nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.open_ent(label) st.open_ent(label)
st.close_ent() st.close_ent()
st.push() st.push()
st.pop() st.pop()
@staticmethod @staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) noexcept nogil: cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold gold = <GoldNERStateC*>_gold
cdef int g_act = gold.ner[s.B(0)].move cdef int g_act = gold.ner[s.B(0)].move
cdef attr_t g_tag = gold.ner[s.B(0)].label cdef attr_t g_tag = gold.ner[s.B(0)].label
@ -646,7 +646,7 @@ cdef class Unit:
cdef class Out: cdef class Out:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, attr_t label) noexcept nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob cdef int preset_ent_iob = st.B_(0).ent_iob
if st.entity_is_open(): if st.entity_is_open():
return False return False
@ -658,12 +658,12 @@ cdef class Out:
return True return True
@staticmethod @staticmethod
cdef int transition(StateC* st, attr_t label) noexcept nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.push() st.push()
st.pop() st.pop()
@staticmethod @staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) noexcept nogil: cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = <GoldNERStateC*>_gold gold = <GoldNERStateC*>_gold
cdef int g_act = gold.ner[s.B(0)].move cdef int g_act = gold.ner[s.B(0)].move
cdef weight_t cost = 0 cdef weight_t cost = 0

View File

@ -94,7 +94,7 @@ cdef bool _has_head_as_ancestor(int tokenid, int head, const vector[int]& heads)
return False return False
cdef string heads_to_string(const vector[int]& heads) noexcept nogil: cdef string heads_to_string(const vector[int]& heads) nogil:
cdef vector[int].const_iterator citer cdef vector[int].const_iterator citer
cdef string cycle_str cdef string cycle_str
@ -183,7 +183,7 @@ cpdef deprojectivize(Doc doc):
new_label, head_label = label.split(DELIMITER) new_label, head_label = label.split(DELIMITER)
new_head = _find_new_head(doc[i], head_label) new_head = _find_new_head(doc[i], head_label)
doc.c[i].head = new_head.i - i doc.c[i].head = new_head.i - i
doc.c[i].dep = doc.vocab.strings.add(new_label, allow_transient=False) doc.c[i].dep = doc.vocab.strings.add(new_label)
set_children_from_heads(doc.c, 0, doc.length) set_children_from_heads(doc.c, 0, doc.length)
return doc return doc

View File

@ -15,22 +15,22 @@ cdef struct Transition:
weight_t score weight_t score
bint (*is_valid)(const StateC* state, attr_t label) noexcept nogil bint (*is_valid)(const StateC* state, attr_t label) nogil
weight_t (*get_cost)(const StateC* state, const void* gold, attr_t label) noexcept nogil weight_t (*get_cost)(const StateC* state, const void* gold, attr_t label) nogil
int (*do)(StateC* state, attr_t label) noexcept nogil int (*do)(StateC* state, attr_t label) nogil
ctypedef weight_t (*get_cost_func_t)( ctypedef weight_t (*get_cost_func_t)(
const StateC* state, const void* gold, attr_tlabel const StateC* state, const void* gold, attr_tlabel
) noexcept nogil ) nogil
ctypedef weight_t (*move_cost_func_t)( ctypedef weight_t (*move_cost_func_t)(
const StateC* state, const void* gold const StateC* state, const void* gold
) noexcept nogil ) nogil
ctypedef weight_t (*label_cost_func_t)( ctypedef weight_t (*label_cost_func_t)(
const StateC* state, const void* gold, attr_t label const StateC* state, const void* gold, attr_t label
) noexcept nogil ) nogil
ctypedef int (*do_func_t)(StateC* state, attr_t label) noexcept nogil ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL
@ -53,7 +53,7 @@ cdef class TransitionSystem:
cdef Transition init_transition(self, int clas, int move, attr_t label) except * cdef Transition init_transition(self, int clas, int move, attr_t label) except *
cdef int set_valid(self, int* output, const StateC* st) noexcept nogil cdef int set_valid(self, int* output, const StateC* st) nogil
cdef int set_costs(self, int* is_valid, weight_t* costs, cdef int set_costs(self, int* is_valid, weight_t* costs,
const StateC* state, gold) except -1 const StateC* state, gold) except -1

View File

@ -149,7 +149,7 @@ cdef class TransitionSystem:
action = self.lookup_transition(move_name) action = self.lookup_transition(move_name)
return action.is_valid(stcls.c, action.label) return action.is_valid(stcls.c, action.label)
cdef int set_valid(self, int* is_valid, const StateC* st) noexcept nogil: cdef int set_valid(self, int* is_valid, const StateC* st) nogil:
cdef int i cdef int i
for i in range(self.n_moves): for i in range(self.n_moves):
is_valid[i] = self.c[i].is_valid(st, self.c[i].label) is_valid[i] = self.c[i].is_valid(st, self.c[i].label)
@ -191,7 +191,8 @@ cdef class TransitionSystem:
def add_action(self, int action, label_name): def add_action(self, int action, label_name):
cdef attr_t label_id cdef attr_t label_id
if not isinstance(label_name, int): if not isinstance(label_name, int) and \
not isinstance(label_name, long):
label_id = self.strings.add(label_name) label_id = self.strings.add(label_name)
else: else:
label_id = label_name label_id = label_name

View File

@ -1,5 +1,3 @@
import importlib
import sys
from pathlib import Path from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
@ -24,6 +22,19 @@ TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]]
MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]] MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]
@Language.factory(
"attribute_ruler",
default_config={
"validate": False,
"scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
},
)
def make_attribute_ruler(
nlp: Language, name: str, validate: bool, scorer: Optional[Callable]
):
return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer)
def attribute_ruler_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: def attribute_ruler_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
def morph_key_getter(token, attr): def morph_key_getter(token, attr):
return getattr(token, attr).key return getattr(token, attr).key
@ -43,6 +54,7 @@ def attribute_ruler_score(examples: Iterable[Example], **kwargs) -> Dict[str, An
return results return results
@registry.scorers("spacy.attribute_ruler_scorer.v1")
def make_attribute_ruler_scorer(): def make_attribute_ruler_scorer():
return attribute_ruler_score return attribute_ruler_score
@ -343,11 +355,3 @@ def _split_morph_attrs(attrs: dict) -> Tuple[dict, dict]:
else: else:
morph_attrs[k] = v morph_attrs[k] = v
return other_attrs, morph_attrs return other_attrs, morph_attrs
# Setup backwards compatibility hook for factories
def __getattr__(name):
if name == "make_attribute_ruler":
module = importlib.import_module("spacy.pipeline.factories")
return module.make_attribute_ruler
raise AttributeError(f"module {__name__} has no attribute {name}")

View File

@ -1,6 +1,4 @@
# cython: infer_types=True, binding=True # cython: infer_types=True, binding=True
import importlib
import sys
from collections import defaultdict from collections import defaultdict
from typing import Callable, Optional from typing import Callable, Optional
@ -41,6 +39,188 @@ subword_features = true
DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"] DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"parser",
assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
default_config={
"moves": None,
"update_with_oracle_cut_size": 100,
"learn_tokens": False,
"min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL,
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
},
default_score_weights={
"dep_uas": 0.5,
"dep_las": 0.5,
"dep_las_per_type": None,
"sents_p": None,
"sents_r": None,
"sents_f": 0.0,
},
)
def make_parser(
nlp: Language,
name: str,
model: Model,
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
learn_tokens: bool,
min_action_freq: int,
scorer: Optional[Callable],
):
"""Create a transition-based DependencyParser component. The dependency parser
jointly learns sentence segmentation and labelled dependency parsing, and can
optionally learn to merge tokens that had been over-segmented by the tokenizer.
The parser uses a variant of the non-monotonic arc-eager transition-system
described by Honnibal and Johnson (2014), with the addition of a "break"
transition to perform the sentence segmentation. Nivre's pseudo-projective
dependency transformation is used to allow the parser to predict
non-projective parses.
The parser is trained using an imitation learning objective. The parser follows
the actions predicted by the current weights, and at each state, determines
which actions are compatible with the optimal parse that could be reached
from the current state. The weights such that the scores assigned to the
set of optimal actions is increased, while scores assigned to other
actions are decreased. Note that more than one action may be optimal for
a given state.
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
learn_tokens (bool): Whether to learn to merge subtokens that are split
relative to the gold standard. Experimental.
min_action_freq (int): The minimum frequency of labelled actions to retain.
Rarer labelled actions have their label backed-off to "dep". While this
primarily affects the label accuracy, it can also affect the attachment
structure, as the labels are used to represent the pseudo-projectivity
transformation.
scorer (Optional[Callable]): The scoring method.
"""
return DependencyParser(
nlp.vocab,
model,
name,
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
multitasks=[],
learn_tokens=learn_tokens,
min_action_freq=min_action_freq,
beam_width=1,
beam_density=0.0,
beam_update_prob=0.0,
# At some point in the future we can try to implement support for
# partial annotations, perhaps only in the beam objective.
incorrect_spans_key=None,
scorer=scorer,
)
@Language.factory(
"beam_parser",
assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
default_config={
"beam_width": 8,
"beam_density": 0.01,
"beam_update_prob": 0.5,
"moves": None,
"update_with_oracle_cut_size": 100,
"learn_tokens": False,
"min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL,
"scorer": {"@scorers": "spacy.parser_scorer.v1"},
},
default_score_weights={
"dep_uas": 0.5,
"dep_las": 0.5,
"dep_las_per_type": None,
"sents_p": None,
"sents_r": None,
"sents_f": 0.0,
},
)
def make_beam_parser(
nlp: Language,
name: str,
model: Model,
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
learn_tokens: bool,
min_action_freq: int,
beam_width: int,
beam_density: float,
beam_update_prob: float,
scorer: Optional[Callable],
):
"""Create a transition-based DependencyParser component that uses beam-search.
The dependency parser jointly learns sentence segmentation and labelled
dependency parsing, and can optionally learn to merge tokens that had been
over-segmented by the tokenizer.
The parser uses a variant of the non-monotonic arc-eager transition-system
described by Honnibal and Johnson (2014), with the addition of a "break"
transition to perform the sentence segmentation. Nivre's pseudo-projective
dependency transformation is used to allow the parser to predict
non-projective parses.
The parser is trained using a global objective. That is, it learns to assign
probabilities to whole parses.
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
beam_width (int): The number of candidate analyses to maintain.
beam_density (float): The minimum ratio between the scores of the first and
last candidates in the beam. This allows the parser to avoid exploring
candidates that are too far behind. This is mostly intended to improve
efficiency, but it can also improve accuracy as deeper search is not
always better.
beam_update_prob (float): The chance of making a beam update, instead of a
greedy update. Greedy updates are an approximation for the beam updates,
and are faster to compute.
learn_tokens (bool): Whether to learn to merge subtokens that are split
relative to the gold standard. Experimental.
min_action_freq (int): The minimum frequency of labelled actions to retain.
Rarer labelled actions have their label backed-off to "dep". While this
primarily affects the label accuracy, it can also affect the attachment
structure, as the labels are used to represent the pseudo-projectivity
transformation.
"""
return DependencyParser(
nlp.vocab,
model,
name,
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
beam_width=beam_width,
beam_density=beam_density,
beam_update_prob=beam_update_prob,
multitasks=[],
learn_tokens=learn_tokens,
min_action_freq=min_action_freq,
# At some point in the future we can try to implement support for
# partial annotations, perhaps only in the beam objective.
incorrect_spans_key=None,
scorer=scorer,
)
def parser_score(examples, **kwargs): def parser_score(examples, **kwargs):
"""Score a batch of examples. """Score a batch of examples.
@ -66,6 +246,7 @@ def parser_score(examples, **kwargs):
return results return results
@registry.scorers("spacy.parser_scorer.v1")
def make_parser_scorer(): def make_parser_scorer():
return parser_score return parser_score
@ -165,14 +346,3 @@ cdef class DependencyParser(Parser):
# because we instead have a label frequency cut-off and back off rare # because we instead have a label frequency cut-off and back off rare
# labels to 'dep'. # labels to 'dep'.
pass pass
# Setup backwards compatibility hook for factories
def __getattr__(name):
if name == "make_parser":
module = importlib.import_module("spacy.pipeline.factories")
return module.make_parser
elif name == "make_beam_parser":
module = importlib.import_module("spacy.pipeline.factories")
return module.make_beam_parser
raise AttributeError(f"module {__name__} has no attribute {name}")

View File

@ -1,5 +1,3 @@
import importlib
import sys
from collections import Counter from collections import Counter
from itertools import islice from itertools import islice
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, cast from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, cast
@ -41,6 +39,43 @@ subword_features = true
DEFAULT_EDIT_TREE_LEMMATIZER_MODEL = Config().from_str(default_model_config)["model"] DEFAULT_EDIT_TREE_LEMMATIZER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"trainable_lemmatizer",
assigns=["token.lemma"],
requires=[],
default_config={
"model": DEFAULT_EDIT_TREE_LEMMATIZER_MODEL,
"backoff": "orth",
"min_tree_freq": 3,
"overwrite": False,
"top_k": 1,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_edit_tree_lemmatizer(
nlp: Language,
name: str,
model: Model,
backoff: Optional[str],
min_tree_freq: int,
overwrite: bool,
top_k: int,
scorer: Optional[Callable],
):
"""Construct an EditTreeLemmatizer component."""
return EditTreeLemmatizer(
nlp.vocab,
model,
name,
backoff=backoff,
min_tree_freq=min_tree_freq,
overwrite=overwrite,
top_k=top_k,
scorer=scorer,
)
class EditTreeLemmatizer(TrainablePipe): class EditTreeLemmatizer(TrainablePipe):
""" """
Lemmatizer that lemmatizes each word using a predicted edit tree. Lemmatizer that lemmatizes each word using a predicted edit tree.
@ -386,11 +421,3 @@ class EditTreeLemmatizer(TrainablePipe):
self.tree2label[tree_id] = len(self.cfg["labels"]) self.tree2label[tree_id] = len(self.cfg["labels"])
self.cfg["labels"].append(tree_id) self.cfg["labels"].append(tree_id)
return self.tree2label[tree_id] return self.tree2label[tree_id]
# Setup backwards compatibility hook for factories
def __getattr__(name):
if name == "make_edit_tree_lemmatizer":
module = importlib.import_module("spacy.pipeline.factories")
return module.make_edit_tree_lemmatizer
raise AttributeError(f"module {__name__} has no attribute {name}")

View File

@ -1,6 +1,4 @@
import importlib
import random import random
import sys
from itertools import islice from itertools import islice
from pathlib import Path from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Union from typing import Any, Callable, Dict, Iterable, List, Optional, Union
@ -13,6 +11,7 @@ from .. import util
from ..errors import Errors from ..errors import Errors
from ..kb import Candidate, KnowledgeBase from ..kb import Candidate, KnowledgeBase
from ..language import Language from ..language import Language
from ..ml import empty_kb
from ..scorer import Scorer from ..scorer import Scorer
from ..tokens import Doc, Span from ..tokens import Doc, Span
from ..training import Example, validate_examples, validate_get_examples from ..training import Example, validate_examples, validate_get_examples
@ -42,10 +41,117 @@ subword_features = true
DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"entity_linker",
requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
assigns=["token.ent_kb_id"],
default_config={
"model": DEFAULT_NEL_MODEL,
"labels_discard": [],
"n_sents": 0,
"incl_prior": True,
"incl_context": True,
"entity_vector_length": 64,
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
"generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
"overwrite": True,
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
"use_gold_ents": True,
"candidates_batch_size": 1,
"threshold": None,
},
default_score_weights={
"nel_micro_f": 1.0,
"nel_micro_r": None,
"nel_micro_p": None,
},
)
def make_entity_linker(
nlp: Language,
name: str,
model: Model,
*,
labels_discard: Iterable[str],
n_sents: int,
incl_prior: bool,
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
get_candidates_batch: Callable[
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
],
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
overwrite: bool,
scorer: Optional[Callable],
use_gold_ents: bool,
candidates_batch_size: int,
threshold: Optional[float] = None,
):
"""Construct an EntityLinker component.
model (Model[List[Doc], Floats2d]): A model that learns document vector
representations. Given a batch of Doc objects, it should return a single
array, with one row per item in the batch.
labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
n_sents (int): The number of neighbouring sentences to take into account.
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
incl_context (bool): Whether or not to include the local context in the model.
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
get_candidates_batch (
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
scorer (Optional[Callable]): The scoring method.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
component must provide entity annotations.
candidates_batch_size (int): Size of batches for entity candidate generation.
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
prediction is discarded. If None, predictions are not filtered by any threshold.
"""
if not model.attrs.get("include_span_maker", False):
# The only difference in arguments here is that use_gold_ents and threshold aren't available.
return EntityLinker_v1(
nlp.vocab,
model,
name,
labels_discard=labels_discard,
n_sents=n_sents,
incl_prior=incl_prior,
incl_context=incl_context,
entity_vector_length=entity_vector_length,
get_candidates=get_candidates,
overwrite=overwrite,
scorer=scorer,
)
return EntityLinker(
nlp.vocab,
model,
name,
labels_discard=labels_discard,
n_sents=n_sents,
incl_prior=incl_prior,
incl_context=incl_context,
entity_vector_length=entity_vector_length,
get_candidates=get_candidates,
get_candidates_batch=get_candidates_batch,
generate_empty_kb=generate_empty_kb,
overwrite=overwrite,
scorer=scorer,
use_gold_ents=use_gold_ents,
candidates_batch_size=candidates_batch_size,
threshold=threshold,
)
def entity_linker_score(examples, **kwargs): def entity_linker_score(examples, **kwargs):
return Scorer.score_links(examples, negative_labels=[EntityLinker.NIL], **kwargs) return Scorer.score_links(examples, negative_labels=[EntityLinker.NIL], **kwargs)
@registry.scorers("spacy.entity_linker_scorer.v1")
def make_entity_linker_scorer(): def make_entity_linker_scorer():
return entity_linker_score return entity_linker_score
@ -129,6 +235,7 @@ class EntityLinker(TrainablePipe):
self.cfg: Dict[str, Any] = {"overwrite": overwrite} self.cfg: Dict[str, Any] = {"overwrite": overwrite}
self.distance = CosineDistance(normalize=False) self.distance = CosineDistance(normalize=False)
self.kb = generate_empty_kb(self.vocab, entity_vector_length) self.kb = generate_empty_kb(self.vocab, entity_vector_length)
self.scorer = scorer
self.use_gold_ents = use_gold_ents self.use_gold_ents = use_gold_ents
self.candidates_batch_size = candidates_batch_size self.candidates_batch_size = candidates_batch_size
self.threshold = threshold self.threshold = threshold
@ -136,37 +243,6 @@ class EntityLinker(TrainablePipe):
if candidates_batch_size < 1: if candidates_batch_size < 1:
raise ValueError(Errors.E1044) raise ValueError(Errors.E1044)
def _score_with_ents_set(examples: Iterable[Example], **kwargs):
# Because of how spaCy works, we can't just score immediately, because Language.evaluate
# calls pipe() on the predicted docs, which won't have entities if there is no NER in the pipeline.
if not scorer:
return scorer
if not self.use_gold_ents:
return scorer(examples, **kwargs)
else:
examples = self._ensure_ents(examples)
docs = self.pipe(
(eg.predicted for eg in examples),
)
for eg, doc in zip(examples, docs):
eg.predicted = doc
return scorer(examples, **kwargs)
self.scorer = _score_with_ents_set
def _ensure_ents(self, examples: Iterable[Example]) -> Iterable[Example]:
    """Return examples whose predicted docs carry the aligned gold entities.

    When `self.use_gold_ents` is false the input is handed back untouched.
    Otherwise every example is copied and the entities returned by
    `get_aligned_ents_and_ner()` are written to the copy's `predicted.ents`,
    so the caller's examples are not modified by this method.

    examples (Iterable[Example]): The examples to prepare.
    RETURNS (Iterable[Example]): The input itself, or a list of patched copies.
    """
    if self.use_gold_ents:
        patched = []
        for example in examples:
            # Read the aligned entities before copying, as the original did.
            gold_ents, _ner_tags = example.get_aligned_ents_and_ner()
            clone = example.copy()
            clone.predicted.ents = gold_ents
            patched.append(clone)
        return patched
    return examples
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
"""Define the KB of this pipe by providing a function that will """Define the KB of this pipe by providing a function that will
create it using this object's vocab.""" create it using this object's vocab."""
@ -208,9 +284,11 @@ class EntityLinker(TrainablePipe):
nO = self.kb.entity_vector_length nO = self.kb.entity_vector_length
doc_sample = [] doc_sample = []
vector_sample = [] vector_sample = []
examples = self._ensure_ents(islice(get_examples(), 10)) for eg in islice(get_examples(), 10):
for eg in examples:
doc = eg.x doc = eg.x
if self.use_gold_ents:
ents, _ = eg.get_aligned_ents_and_ner()
doc.ents = ents
doc_sample.append(doc) doc_sample.append(doc)
vector_sample.append(self.model.ops.alloc1f(nO)) vector_sample.append(self.model.ops.alloc1f(nO))
assert len(doc_sample) > 0, Errors.E923.format(name=self.name) assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
@ -276,17 +354,31 @@ class EntityLinker(TrainablePipe):
losses.setdefault(self.name, 0.0) losses.setdefault(self.name, 0.0)
if not examples: if not examples:
return losses return losses
examples = self._ensure_ents(examples)
validate_examples(examples, "EntityLinker.update") validate_examples(examples, "EntityLinker.update")
set_dropout_rate(self.model, drop)
docs = [eg.predicted for eg in examples]
# save to restore later
old_ents = [doc.ents for doc in docs]
for doc, ex in zip(docs, examples):
if self.use_gold_ents:
ents, _ = ex.get_aligned_ents_and_ner()
doc.ents = ents
else:
# only keep matching ents
doc.ents = ex.get_matching_ents()
# make sure we have something to learn from, if not, short-circuit # make sure we have something to learn from, if not, short-circuit
if not self.batch_has_learnable_example(examples): if not self.batch_has_learnable_example(examples):
return losses return losses
set_dropout_rate(self.model, drop)
docs = [eg.predicted for eg in examples]
sentence_encodings, bp_context = self.model.begin_update(docs) sentence_encodings, bp_context = self.model.begin_update(docs)
# now restore the ents
for doc, old in zip(docs, old_ents):
doc.ents = old
loss, d_scores = self.get_loss( loss, d_scores = self.get_loss(
sentence_encodings=sentence_encodings, examples=examples sentence_encodings=sentence_encodings, examples=examples
) )
@ -294,13 +386,11 @@ class EntityLinker(TrainablePipe):
if sgd is not None: if sgd is not None:
self.finish_update(sgd) self.finish_update(sgd)
losses[self.name] += loss losses[self.name] += loss
return losses return losses
def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d): def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
validate_examples(examples, "EntityLinker.get_loss") validate_examples(examples, "EntityLinker.get_loss")
entity_encodings = [] entity_encodings = []
# We assume that get_loss is called with gold ents set in the examples if need be
eidx = 0 # indices in gold entities to keep eidx = 0 # indices in gold entities to keep
keep_ents = [] # indices in sentence_encodings to keep keep_ents = [] # indices in sentence_encodings to keep
@ -571,11 +661,3 @@ class EntityLinker(TrainablePipe):
def add_label(self, label): def add_label(self, label):
raise NotImplementedError raise NotImplementedError
# Setup backwards compatibility hook for factories
def __getattr__(name):
    """Module-level attribute hook kept for backwards compatibility.

    Only `make_entity_linker` is resolved; it is fetched lazily from
    `spacy.pipeline.factories`. Any other name raises AttributeError.
    """
    if name != "make_entity_linker":
        raise AttributeError(f"module {__name__} has no attribute {name}")
    # Import on demand so the factories module is only loaded when requested.
    factories = importlib.import_module("spacy.pipeline.factories")
    return factories.make_entity_linker

View File

@ -1,5 +1,3 @@
import importlib
import sys
import warnings import warnings
from collections import defaultdict from collections import defaultdict
from pathlib import Path from pathlib import Path
@ -21,10 +19,51 @@ DEFAULT_ENT_ID_SEP = "||"
PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
@Language.factory(
    "entity_ruler",
    assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
    default_config={
        "phrase_matcher_attr": None,
        "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
        "validate": False,
        "overwrite_ents": False,
        "ent_id_sep": DEFAULT_ENT_ID_SEP,
        "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
    },
    default_score_weights={
        "ents_f": 1.0,
        "ents_p": 0.0,
        "ents_r": 0.0,
        "ents_per_type": None,
    },
)
def make_entity_ruler(
    nlp: Language,
    name: str,
    phrase_matcher_attr: Optional[Union[int, str]],
    matcher_fuzzy_compare: Callable,
    validate: bool,
    overwrite_ents: bool,
    ent_id_sep: str,
    scorer: Optional[Callable],
):
    """Create the component registered as "entity_ruler".

    Every setting is forwarded unchanged to `EntityRuler` under the same
    keyword name; `nlp` and `name` are passed positionally.

    RETURNS (EntityRuler): The constructed component.
    """
    ruler_settings = {
        "phrase_matcher_attr": phrase_matcher_attr,
        "matcher_fuzzy_compare": matcher_fuzzy_compare,
        "validate": validate,
        "overwrite_ents": overwrite_ents,
        "ent_id_sep": ent_id_sep,
        "scorer": scorer,
    }
    return EntityRuler(nlp, name, **ruler_settings)
def entity_ruler_score(examples, **kwargs): def entity_ruler_score(examples, **kwargs):
return get_ner_prf(examples) return get_ner_prf(examples)
@registry.scorers("spacy.entity_ruler_scorer.v1")
def make_entity_ruler_scorer(): def make_entity_ruler_scorer():
return entity_ruler_score return entity_ruler_score
@ -500,11 +539,3 @@ class EntityRuler(Pipe):
srsly.write_jsonl(path, self.patterns) srsly.write_jsonl(path, self.patterns)
else: else:
to_disk(path, serializers, {}) to_disk(path, serializers, {})
# Setup backwards compatibility hook for factories
def __getattr__(name):
    """Module-level attribute hook kept for backwards compatibility.

    Only `make_entity_ruler` is resolved; it is fetched lazily from
    `spacy.pipeline.factories`. Any other name raises AttributeError.
    """
    if name != "make_entity_ruler":
        raise AttributeError(f"module {__name__} has no attribute {name}")
    # Import on demand so the factories module is only loaded when requested.
    factories = importlib.import_module("spacy.pipeline.factories")
    return factories.make_entity_ruler

View File

@ -1,929 +0,0 @@
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
from thinc.api import Model
from thinc.types import Floats2d, Ragged
from ..kb import Candidate, KnowledgeBase
from ..language import Language
from ..pipeline._parser_internals.transition_system import TransitionSystem
from ..pipeline.attributeruler import AttributeRuler
from ..pipeline.dep_parser import DEFAULT_PARSER_MODEL, DependencyParser
from ..pipeline.edit_tree_lemmatizer import (
DEFAULT_EDIT_TREE_LEMMATIZER_MODEL,
EditTreeLemmatizer,
)
# Import factory default configurations
from ..pipeline.entity_linker import DEFAULT_NEL_MODEL, EntityLinker, EntityLinker_v1
from ..pipeline.entityruler import DEFAULT_ENT_ID_SEP, EntityRuler
from ..pipeline.functions import DocCleaner, TokenSplitter
from ..pipeline.lemmatizer import Lemmatizer
from ..pipeline.morphologizer import DEFAULT_MORPH_MODEL, Morphologizer
from ..pipeline.multitask import DEFAULT_MT_MODEL, MultitaskObjective
from ..pipeline.ner import DEFAULT_NER_MODEL, EntityRecognizer
from ..pipeline.sentencizer import Sentencizer
from ..pipeline.senter import DEFAULT_SENTER_MODEL, SentenceRecognizer
from ..pipeline.span_finder import DEFAULT_SPAN_FINDER_MODEL, SpanFinder
from ..pipeline.span_ruler import DEFAULT_SPANS_KEY as SPAN_RULER_DEFAULT_SPANS_KEY
from ..pipeline.span_ruler import (
SpanRuler,
prioritize_existing_ents_filter,
prioritize_new_ents_filter,
)
from ..pipeline.spancat import (
DEFAULT_SPANCAT_MODEL,
DEFAULT_SPANCAT_SINGLELABEL_MODEL,
DEFAULT_SPANS_KEY,
SpanCategorizer,
Suggester,
)
from ..pipeline.tagger import DEFAULT_TAGGER_MODEL, Tagger
from ..pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL, TextCategorizer
from ..pipeline.textcat_multilabel import (
DEFAULT_MULTI_TEXTCAT_MODEL,
MultiLabel_TextCategorizer,
)
from ..pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL, Tok2Vec
from ..tokens.doc import Doc
from ..tokens.span import Span
from ..vocab import Vocab
# Global flag to track if factories have been registered
FACTORIES_REGISTERED = False
def register_factories() -> None:
    """Register all built-in pipeline component factories, once.

    The registrations are listed as (factory name, factory function, options)
    entries and applied in order with `Language.factory(name, **options)(func)`,
    the same call shape the `@Language.factory` decorator produces. After the
    first successful run the module flag `FACTORIES_REGISTERED` is set and
    later calls return immediately.
    """
    global FACTORIES_REGISTERED
    if FACTORIES_REGISTERED:
        return

    # Each entry keeps its own literal dicts/lists; nothing is shared between
    # factories so one registration can never alias another's defaults.
    registrations = [
        (
            "attribute_ruler",
            make_attribute_ruler,
            dict(
                default_config={
                    "validate": False,
                    "scorer": {"@scorers": "spacy.attribute_ruler_scorer.v1"},
                },
            ),
        ),
        (
            "entity_linker",
            make_entity_linker,
            dict(
                requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
                assigns=["token.ent_kb_id"],
                default_config={
                    "model": DEFAULT_NEL_MODEL,
                    "labels_discard": [],
                    "n_sents": 0,
                    "incl_prior": True,
                    "incl_context": True,
                    "entity_vector_length": 64,
                    "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
                    "get_candidates_batch": {
                        "@misc": "spacy.CandidateBatchGenerator.v1"
                    },
                    "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
                    "overwrite": True,
                    "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
                    "use_gold_ents": True,
                    "candidates_batch_size": 1,
                    "threshold": None,
                },
                default_score_weights={
                    "nel_micro_f": 1.0,
                    "nel_micro_r": None,
                    "nel_micro_p": None,
                },
            ),
        ),
        (
            "entity_ruler",
            make_entity_ruler,
            dict(
                assigns=["doc.ents", "token.ent_type", "token.ent_iob"],
                default_config={
                    "phrase_matcher_attr": None,
                    "matcher_fuzzy_compare": {
                        "@misc": "spacy.levenshtein_compare.v1"
                    },
                    "validate": False,
                    "overwrite_ents": False,
                    "ent_id_sep": DEFAULT_ENT_ID_SEP,
                    "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
                },
                default_score_weights={
                    "ents_f": 1.0,
                    "ents_p": 0.0,
                    "ents_r": 0.0,
                    "ents_per_type": None,
                },
            ),
        ),
        (
            "lemmatizer",
            make_lemmatizer,
            dict(
                assigns=["token.lemma"],
                default_config={
                    "model": None,
                    "mode": "lookup",
                    "overwrite": False,
                    "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
                },
                default_score_weights={"lemma_acc": 1.0},
            ),
        ),
        (
            "textcat",
            make_textcat,
            dict(
                assigns=["doc.cats"],
                default_config={
                    "threshold": 0.0,
                    "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
                    "scorer": {"@scorers": "spacy.textcat_scorer.v2"},
                },
                default_score_weights={
                    "cats_score": 1.0,
                    "cats_score_desc": None,
                    "cats_micro_p": None,
                    "cats_micro_r": None,
                    "cats_micro_f": None,
                    "cats_macro_p": None,
                    "cats_macro_r": None,
                    "cats_macro_f": None,
                    "cats_macro_auc": None,
                    "cats_f_per_type": None,
                },
            ),
        ),
        (
            "token_splitter",
            make_token_splitter,
            dict(
                default_config={"min_length": 25, "split_length": 10},
                retokenizes=True,
            ),
        ),
        (
            "doc_cleaner",
            make_doc_cleaner,
            dict(
                default_config={
                    "attrs": {"tensor": None, "_.trf_data": None},
                    "silent": True,
                },
            ),
        ),
        (
            "tok2vec",
            make_tok2vec,
            dict(
                assigns=["doc.tensor"],
                default_config={"model": DEFAULT_TOK2VEC_MODEL},
            ),
        ),
        (
            "senter",
            make_senter,
            dict(
                assigns=["token.is_sent_start"],
                default_config={
                    "model": DEFAULT_SENTER_MODEL,
                    "overwrite": False,
                    "scorer": {"@scorers": "spacy.senter_scorer.v1"},
                },
                default_score_weights={
                    "sents_f": 1.0,
                    "sents_p": 0.0,
                    "sents_r": 0.0,
                },
            ),
        ),
        (
            "morphologizer",
            make_morphologizer,
            dict(
                assigns=["token.morph", "token.pos"],
                default_config={
                    "model": DEFAULT_MORPH_MODEL,
                    "overwrite": True,
                    "extend": False,
                    "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
                    "label_smoothing": 0.0,
                },
                default_score_weights={
                    "pos_acc": 0.5,
                    "morph_acc": 0.5,
                    "morph_per_feat": None,
                },
            ),
        ),
        (
            "spancat",
            make_spancat,
            dict(
                assigns=["doc.spans"],
                default_config={
                    "threshold": 0.5,
                    "spans_key": DEFAULT_SPANS_KEY,
                    "max_positive": None,
                    "model": DEFAULT_SPANCAT_MODEL,
                    "suggester": {
                        "@misc": "spacy.ngram_suggester.v1",
                        "sizes": [1, 2, 3],
                    },
                    "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
                },
                default_score_weights={
                    "spans_sc_f": 1.0,
                    "spans_sc_p": 0.0,
                    "spans_sc_r": 0.0,
                },
            ),
        ),
        (
            "spancat_singlelabel",
            make_spancat_singlelabel,
            dict(
                assigns=["doc.spans"],
                default_config={
                    "spans_key": DEFAULT_SPANS_KEY,
                    "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
                    "negative_weight": 1.0,
                    "suggester": {
                        "@misc": "spacy.ngram_suggester.v1",
                        "sizes": [1, 2, 3],
                    },
                    "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
                    "allow_overlap": True,
                },
                default_score_weights={
                    "spans_sc_f": 1.0,
                    "spans_sc_p": 0.0,
                    "spans_sc_r": 0.0,
                },
            ),
        ),
        (
            "future_entity_ruler",
            make_future_entity_ruler,
            dict(
                assigns=["doc.ents"],
                default_config={
                    "phrase_matcher_attr": None,
                    "validate": False,
                    "overwrite_ents": False,
                    "scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
                    "ent_id_sep": "__unused__",
                    "matcher_fuzzy_compare": {
                        "@misc": "spacy.levenshtein_compare.v1"
                    },
                },
                default_score_weights={
                    "ents_f": 1.0,
                    "ents_p": 0.0,
                    "ents_r": 0.0,
                    "ents_per_type": None,
                },
            ),
        ),
        (
            "span_ruler",
            make_span_ruler,
            dict(
                assigns=["doc.spans"],
                default_config={
                    "spans_key": SPAN_RULER_DEFAULT_SPANS_KEY,
                    "spans_filter": None,
                    "annotate_ents": False,
                    "ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"},
                    "phrase_matcher_attr": None,
                    "matcher_fuzzy_compare": {
                        "@misc": "spacy.levenshtein_compare.v1"
                    },
                    "validate": False,
                    "overwrite": True,
                    "scorer": {
                        "@scorers": "spacy.overlapping_labeled_spans_scorer.v1",
                        "spans_key": SPAN_RULER_DEFAULT_SPANS_KEY,
                    },
                },
                default_score_weights={
                    f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_f": 1.0,
                    f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_p": 0.0,
                    f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_r": 0.0,
                    f"spans_{SPAN_RULER_DEFAULT_SPANS_KEY}_per_type": None,
                },
            ),
        ),
        (
            "trainable_lemmatizer",
            make_edit_tree_lemmatizer,
            dict(
                assigns=["token.lemma"],
                requires=[],
                default_config={
                    "model": DEFAULT_EDIT_TREE_LEMMATIZER_MODEL,
                    "backoff": "orth",
                    "min_tree_freq": 3,
                    "overwrite": False,
                    "top_k": 1,
                    "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
                },
                default_score_weights={"lemma_acc": 1.0},
            ),
        ),
        (
            "textcat_multilabel",
            make_multilabel_textcat,
            dict(
                assigns=["doc.cats"],
                default_config={
                    "threshold": 0.5,
                    "model": DEFAULT_MULTI_TEXTCAT_MODEL,
                    "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
                },
                default_score_weights={
                    "cats_score": 1.0,
                    "cats_score_desc": None,
                    "cats_micro_p": None,
                    "cats_micro_r": None,
                    "cats_micro_f": None,
                    "cats_macro_p": None,
                    "cats_macro_r": None,
                    "cats_macro_f": None,
                    "cats_macro_auc": None,
                    "cats_f_per_type": None,
                },
            ),
        ),
        (
            "span_finder",
            make_span_finder,
            dict(
                assigns=["doc.spans"],
                default_config={
                    "threshold": 0.5,
                    "model": DEFAULT_SPAN_FINDER_MODEL,
                    "spans_key": DEFAULT_SPANS_KEY,
                    "max_length": 25,
                    "min_length": None,
                    "scorer": {"@scorers": "spacy.span_finder_scorer.v1"},
                },
                default_score_weights={
                    f"spans_{DEFAULT_SPANS_KEY}_f": 1.0,
                    f"spans_{DEFAULT_SPANS_KEY}_p": 0.0,
                    f"spans_{DEFAULT_SPANS_KEY}_r": 0.0,
                },
            ),
        ),
        (
            "ner",
            make_ner,
            dict(
                assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
                default_config={
                    "moves": None,
                    "update_with_oracle_cut_size": 100,
                    "model": DEFAULT_NER_MODEL,
                    "incorrect_spans_key": None,
                    "scorer": {"@scorers": "spacy.ner_scorer.v1"},
                },
                default_score_weights={
                    "ents_f": 1.0,
                    "ents_p": 0.0,
                    "ents_r": 0.0,
                    "ents_per_type": None,
                },
            ),
        ),
        (
            "beam_ner",
            make_beam_ner,
            dict(
                assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
                default_config={
                    "moves": None,
                    "update_with_oracle_cut_size": 100,
                    "model": DEFAULT_NER_MODEL,
                    "beam_density": 0.01,
                    "beam_update_prob": 0.5,
                    "beam_width": 32,
                    "incorrect_spans_key": None,
                    "scorer": {"@scorers": "spacy.ner_scorer.v1"},
                },
                default_score_weights={
                    "ents_f": 1.0,
                    "ents_p": 0.0,
                    "ents_r": 0.0,
                    "ents_per_type": None,
                },
            ),
        ),
        (
            "parser",
            make_parser,
            dict(
                assigns=[
                    "token.dep",
                    "token.head",
                    "token.is_sent_start",
                    "doc.sents",
                ],
                default_config={
                    "moves": None,
                    "update_with_oracle_cut_size": 100,
                    "learn_tokens": False,
                    "min_action_freq": 30,
                    "model": DEFAULT_PARSER_MODEL,
                    "scorer": {"@scorers": "spacy.parser_scorer.v1"},
                },
                default_score_weights={
                    "dep_uas": 0.5,
                    "dep_las": 0.5,
                    "dep_las_per_type": None,
                    "sents_p": None,
                    "sents_r": None,
                    "sents_f": 0.0,
                },
            ),
        ),
        (
            "beam_parser",
            make_beam_parser,
            dict(
                assigns=[
                    "token.dep",
                    "token.head",
                    "token.is_sent_start",
                    "doc.sents",
                ],
                default_config={
                    "moves": None,
                    "update_with_oracle_cut_size": 100,
                    "learn_tokens": False,
                    "min_action_freq": 30,
                    "beam_width": 8,
                    "beam_density": 0.0001,
                    "beam_update_prob": 0.5,
                    "model": DEFAULT_PARSER_MODEL,
                    "scorer": {"@scorers": "spacy.parser_scorer.v1"},
                },
                default_score_weights={
                    "dep_uas": 0.5,
                    "dep_las": 0.5,
                    "dep_las_per_type": None,
                    "sents_p": None,
                    "sents_r": None,
                    "sents_f": 0.0,
                },
            ),
        ),
        (
            "tagger",
            make_tagger,
            dict(
                assigns=["token.tag"],
                default_config={
                    "model": DEFAULT_TAGGER_MODEL,
                    "overwrite": False,
                    "scorer": {"@scorers": "spacy.tagger_scorer.v1"},
                    "neg_prefix": "!",
                    "label_smoothing": 0.0,
                },
                default_score_weights={
                    "tag_acc": 1.0,
                    "pos_acc": 0.0,
                    "tag_micro_p": None,
                    "tag_micro_r": None,
                    "tag_micro_f": None,
                },
            ),
        ),
        (
            "nn_labeller",
            make_nn_labeller,
            dict(
                default_config={
                    "labels": None,
                    "target": "dep_tag_offset",
                    "model": DEFAULT_MT_MODEL,
                },
            ),
        ),
        (
            "sentencizer",
            make_sentencizer,
            dict(
                assigns=["token.is_sent_start", "doc.sents"],
                default_config={
                    "punct_chars": None,
                    "overwrite": False,
                    "scorer": {"@scorers": "spacy.senter_scorer.v1"},
                },
                default_score_weights={
                    "sents_f": 1.0,
                    "sents_p": 0.0,
                    "sents_r": 0.0,
                },
            ),
        ),
    ]

    for factory_name, factory_func, options in registrations:
        Language.factory(factory_name, **options)(factory_func)

    # Only mark as done after every registration above has gone through.
    FACTORIES_REGISTERED = True
# We can't have function implementations for these factories in Cython, because
# we need to build a Pydantic model for them dynamically, reading their argument
# structure from the signature. In Cython 3, this doesn't work because the
# from __future__ import annotations semantics are used, which means the types
# are stored as strings.
def make_sentencizer(
    nlp: Language,
    name: str,
    punct_chars: Optional[List[str]],
    overwrite: bool,
    scorer: Optional[Callable],
):
    """Create a `Sentencizer` from its factory settings.

    nlp (Language): Part of the factory signature; not read here.
    name (str): Component instance name, passed positionally.
    punct_chars (Optional[List[str]]): Forwarded as `punct_chars`.
    overwrite (bool): Forwarded as `overwrite`.
    scorer (Optional[Callable]): Forwarded as `scorer`.
    RETURNS (Sentencizer): The constructed component.
    """
    sentencizer_settings = {
        "punct_chars": punct_chars,
        "overwrite": overwrite,
        "scorer": scorer,
    }
    return Sentencizer(name, **sentencizer_settings)
def make_attribute_ruler(
    nlp: Language, name: str, validate: bool, scorer: Optional[Callable]
):
    """Create an `AttributeRuler` bound to the pipeline's vocab.

    nlp (Language): Pipeline whose `vocab` is handed to the component.
    name (str): Component instance name.
    validate (bool): Forwarded as `validate`.
    scorer (Optional[Callable]): Forwarded as `scorer`.
    RETURNS (AttributeRuler): The constructed component.
    """
    ruler_settings = {"validate": validate, "scorer": scorer}
    return AttributeRuler(nlp.vocab, name, **ruler_settings)
def make_entity_linker(
    nlp: Language,
    name: str,
    model: Model,
    *,
    labels_discard: Iterable[str],
    n_sents: int,
    incl_prior: bool,
    incl_context: bool,
    entity_vector_length: int,
    get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
    get_candidates_batch: Callable[
        [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
    ],
    generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
    overwrite: bool,
    scorer: Optional[Callable],
    use_gold_ents: bool,
    candidates_batch_size: int,
    threshold: Optional[float] = None,
):
    """Create an entity linker, choosing the implementation from the model.

    If `model.attrs` has a truthy "include_span_maker" entry the current
    `EntityLinker` is built with every setting. Otherwise `EntityLinker_v1`
    is built, and it only receives the settings both classes are given here:
    `get_candidates_batch`, `generate_empty_kb`, `use_gold_ents`,
    `candidates_batch_size` and `threshold` are not passed to it.

    RETURNS: An `EntityLinker` or `EntityLinker_v1` instance.
    """
    # Settings handed to both implementations.
    shared_settings = {
        "labels_discard": labels_discard,
        "n_sents": n_sents,
        "incl_prior": incl_prior,
        "incl_context": incl_context,
        "entity_vector_length": entity_vector_length,
        "get_candidates": get_candidates,
        "overwrite": overwrite,
        "scorer": scorer,
    }
    if model.attrs.get("include_span_maker", False):
        return EntityLinker(
            nlp.vocab,
            model,
            name,
            get_candidates_batch=get_candidates_batch,
            generate_empty_kb=generate_empty_kb,
            use_gold_ents=use_gold_ents,
            candidates_batch_size=candidates_batch_size,
            threshold=threshold,
            **shared_settings,
        )
    return EntityLinker_v1(nlp.vocab, model, name, **shared_settings)
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    overwrite: bool,
    scorer: Optional[Callable],
):
    """Create a `Lemmatizer` bound to the pipeline's vocab.

    nlp (Language): Pipeline whose `vocab` is handed to the component.
    model (Optional[Model]): Passed positionally after the vocab; may be None.
    name (str): Component instance name.
    mode (str): Forwarded as `mode`.
    overwrite (bool): Forwarded as `overwrite`.
    scorer (Optional[Callable]): Forwarded as `scorer`.
    RETURNS (Lemmatizer): The constructed component.
    """
    lemmatizer_settings = {"mode": mode, "overwrite": overwrite, "scorer": scorer}
    return Lemmatizer(nlp.vocab, model, name, **lemmatizer_settings)
def make_textcat(
    nlp: Language,
    name: str,
    model: Model[List[Doc], List[Floats2d]],
    threshold: float,
    scorer: Optional[Callable],
) -> TextCategorizer:
    """Create a `TextCategorizer` bound to the pipeline's vocab.

    nlp (Language): Pipeline whose `vocab` is handed to the component.
    name (str): Component instance name.
    model (Model): Model passed positionally after the vocab.
    threshold (float): Forwarded as `threshold`.
    scorer (Optional[Callable]): Forwarded as `scorer`.
    RETURNS (TextCategorizer): The constructed component.
    """
    textcat_settings = {"threshold": threshold, "scorer": scorer}
    return TextCategorizer(nlp.vocab, model, name, **textcat_settings)
def make_token_splitter(
    nlp: Language, name: str, *, min_length: int = 0, split_length: int = 0
):
    """Create a `TokenSplitter` from its two length settings.

    nlp (Language): Part of the factory signature; not read here.
    name (str): Part of the factory signature; not read here.
    min_length (int): Forwarded as `min_length`.
    split_length (int): Forwarded as `split_length`.
    RETURNS (TokenSplitter): The constructed component.
    """
    splitter_settings = {"min_length": min_length, "split_length": split_length}
    return TokenSplitter(**splitter_settings)
def make_doc_cleaner(nlp: Language, name: str, *, attrs: Dict[str, Any], silent: bool):
    """Create a `DocCleaner` for the given attribute mapping.

    nlp (Language): Part of the factory signature; not read here.
    name (str): Part of the factory signature; not read here.
    attrs (Dict[str, Any]): Passed positionally to `DocCleaner`.
    silent (bool): Forwarded as `silent`.
    RETURNS (DocCleaner): The constructed component.
    """
    cleaner = DocCleaner(attrs, silent=silent)
    return cleaner
def make_tok2vec(nlp: Language, name: str, model: Model) -> Tok2Vec:
    """Create a `Tok2Vec` component bound to the pipeline's vocab.

    nlp (Language): Pipeline whose `vocab` is handed to the component.
    name (str): Component instance name.
    model (Model): Model passed positionally after the vocab.
    RETURNS (Tok2Vec): The constructed component.
    """
    shared_vocab = nlp.vocab
    return Tok2Vec(shared_vocab, model, name)
def make_spancat(
    nlp: Language,
    name: str,
    suggester: Suggester,
    model: Model[Tuple[List[Doc], Ragged], Floats2d],
    spans_key: str,
    scorer: Optional[Callable],
    threshold: float,
    max_positive: Optional[int],
) -> SpanCategorizer:
    """Create the `SpanCategorizer` variant registered as "spancat".

    The caller's settings are forwarded under the same names. Three options
    are fixed by this factory rather than configurable: `negative_weight` is
    None, `allow_overlap` is True and `add_negative_label` is False.

    RETURNS (SpanCategorizer): The constructed component.
    """
    # Values this factory pins regardless of configuration.
    fixed_settings = {
        "negative_weight": None,
        "allow_overlap": True,
        "add_negative_label": False,
    }
    configured_settings = {
        "model": model,
        "suggester": suggester,
        "name": name,
        "spans_key": spans_key,
        "max_positive": max_positive,
        "threshold": threshold,
        "scorer": scorer,
    }
    return SpanCategorizer(nlp.vocab, **configured_settings, **fixed_settings)
def make_spancat_singlelabel(
    nlp: Language,
    name: str,
    suggester: Suggester,
    model: Model[Tuple[List[Doc], Ragged], Floats2d],
    spans_key: str,
    negative_weight: float,
    allow_overlap: bool,
    scorer: Optional[Callable],
) -> SpanCategorizer:
    """Create the `SpanCategorizer` variant registered as "spancat_singlelabel".

    The caller's settings are forwarded under the same names. Three options
    are fixed by this factory rather than configurable: `max_positive` is 1,
    `add_negative_label` is True and `threshold` is None.

    RETURNS (SpanCategorizer): The constructed component.
    """
    # Values this factory pins regardless of configuration.
    fixed_settings = {
        "max_positive": 1,
        "add_negative_label": True,
        "threshold": None,
    }
    configured_settings = {
        "model": model,
        "suggester": suggester,
        "name": name,
        "spans_key": spans_key,
        "negative_weight": negative_weight,
        "allow_overlap": allow_overlap,
        "scorer": scorer,
    }
    return SpanCategorizer(nlp.vocab, **configured_settings, **fixed_settings)
def make_future_entity_ruler(
    nlp: Language,
    name: str,
    phrase_matcher_attr: Optional[Union[int, str]],
    matcher_fuzzy_compare: Callable,
    validate: bool,
    overwrite_ents: bool,
    scorer: Optional[Callable],
    ent_id_sep: str,
):
    """Create a `SpanRuler` configured to write entities.

    `overwrite_ents` selects the entity filter: `prioritize_new_ents_filter`
    when true, `prioritize_existing_ents_filter` otherwise. The ruler is
    always built with `spans_key=None`, `spans_filter=None`,
    `annotate_ents=True` and `overwrite=False`. `ent_id_sep` is accepted but
    not read by this function.

    RETURNS (SpanRuler): The constructed component.
    """
    chosen_filter = (
        prioritize_new_ents_filter
        if overwrite_ents
        else prioritize_existing_ents_filter
    )
    ruler_settings = {
        "spans_key": None,
        "spans_filter": None,
        "annotate_ents": True,
        "ents_filter": chosen_filter,
        "phrase_matcher_attr": phrase_matcher_attr,
        "matcher_fuzzy_compare": matcher_fuzzy_compare,
        "validate": validate,
        "overwrite": False,
        "scorer": scorer,
    }
    return SpanRuler(nlp, name, **ruler_settings)
def make_entity_ruler(
    nlp: Language,
    name: str,
    phrase_matcher_attr: Optional[Union[int, str]],
    matcher_fuzzy_compare: Callable,
    validate: bool,
    overwrite_ents: bool,
    ent_id_sep: str,
    scorer: Optional[Callable],
):
    """Create an `EntityRuler` from its factory settings.

    Every setting is forwarded unchanged under the same keyword name; `nlp`
    and `name` are passed positionally.

    RETURNS (EntityRuler): The constructed component.
    """
    ruler_settings = {
        "phrase_matcher_attr": phrase_matcher_attr,
        "matcher_fuzzy_compare": matcher_fuzzy_compare,
        "validate": validate,
        "overwrite_ents": overwrite_ents,
        "ent_id_sep": ent_id_sep,
        "scorer": scorer,
    }
    return EntityRuler(nlp, name, **ruler_settings)
def make_span_ruler(
    nlp: Language,
    name: str,
    spans_key: Optional[str],
    spans_filter: Optional[Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]]],
    annotate_ents: bool,
    ents_filter: Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]],
    phrase_matcher_attr: Optional[Union[int, str]],
    matcher_fuzzy_compare: Callable,
    validate: bool,
    overwrite: bool,
    scorer: Optional[Callable],
):
    """Create a `SpanRuler` from its factory settings.

    Every setting is forwarded unchanged under the same keyword name; `nlp`
    and `name` are passed positionally.

    RETURNS (SpanRuler): The constructed component.
    """
    ruler_settings = {
        "spans_key": spans_key,
        "spans_filter": spans_filter,
        "annotate_ents": annotate_ents,
        "ents_filter": ents_filter,
        "phrase_matcher_attr": phrase_matcher_attr,
        "matcher_fuzzy_compare": matcher_fuzzy_compare,
        "validate": validate,
        "overwrite": overwrite,
        "scorer": scorer,
    }
    return SpanRuler(nlp, name, **ruler_settings)
def make_edit_tree_lemmatizer(
    nlp: Language,
    name: str,
    model: Model,
    backoff: Optional[str],
    min_tree_freq: int,
    overwrite: bool,
    top_k: int,
    scorer: Optional[Callable],
):
    """Create an `EditTreeLemmatizer` bound to the pipeline's vocab.

    The vocab, model and name are passed positionally; all remaining
    settings are forwarded unchanged under the same keyword name.

    RETURNS (EditTreeLemmatizer): The constructed component.
    """
    lemmatizer_settings = {
        "backoff": backoff,
        "min_tree_freq": min_tree_freq,
        "overwrite": overwrite,
        "top_k": top_k,
        "scorer": scorer,
    }
    return EditTreeLemmatizer(nlp.vocab, model, name, **lemmatizer_settings)
def make_multilabel_textcat(
    nlp: Language,
    name: str,
    model: Model[List[Doc], List[Floats2d]],
    threshold: float,
    scorer: Optional[Callable],
) -> MultiLabel_TextCategorizer:
    """Create a `MultiLabel_TextCategorizer` bound to the pipeline's vocab.

    nlp (Language): Pipeline whose `vocab` is handed to the component.
    name (str): Component instance name.
    model (Model): Model passed positionally after the vocab.
    threshold (float): Forwarded as `threshold`.
    scorer (Optional[Callable]): Forwarded as `scorer`.
    RETURNS (MultiLabel_TextCategorizer): The constructed component.
    """
    textcat_settings = {"threshold": threshold, "scorer": scorer}
    return MultiLabel_TextCategorizer(nlp.vocab, model, name, **textcat_settings)
def make_span_finder(
    nlp: Language,
    name: str,
    model: Model[Iterable[Doc], Floats2d],
    spans_key: str,
    threshold: float,
    max_length: Optional[int],
    min_length: Optional[int],
    scorer: Optional[Callable],
) -> SpanFinder:
    """Create a `SpanFinder` from its factory settings.

    The `nlp` object itself is the only positional argument; every other
    setting is forwarded unchanged under the same keyword name.

    RETURNS (SpanFinder): The constructed component.
    """
    finder_settings = {
        "model": model,
        "threshold": threshold,
        "name": name,
        "scorer": scorer,
        "max_length": max_length,
        "min_length": min_length,
        "spans_key": spans_key,
    }
    return SpanFinder(nlp, **finder_settings)
def make_ner(
    nlp: Language,
    name: str,
    model: Model,
    moves: Optional[TransitionSystem],
    update_with_oracle_cut_size: int,
    incorrect_spans_key: Optional[str],
    scorer: Optional[Callable],
):
    """Create an `EntityRecognizer` without any beam settings.

    The vocab and model are passed positionally; the remaining settings are
    forwarded unchanged under the same keyword name.

    RETURNS (EntityRecognizer): The constructed component.
    """
    recognizer_settings = {
        "name": name,
        "moves": moves,
        "update_with_oracle_cut_size": update_with_oracle_cut_size,
        "incorrect_spans_key": incorrect_spans_key,
        "scorer": scorer,
    }
    return EntityRecognizer(nlp.vocab, model, **recognizer_settings)
def make_beam_ner(
    nlp: Language,
    name: str,
    model: Model,
    moves: Optional[TransitionSystem],
    update_with_oracle_cut_size: int,
    beam_width: int,
    beam_density: float,
    beam_update_prob: float,
    incorrect_spans_key: Optional[str],
    scorer: Optional[Callable],
):
    """Create an `EntityRecognizer` with explicit beam settings.

    Differs from `make_ner` only in that `beam_width`, `beam_density` and
    `beam_update_prob` are also forwarded. The vocab and model are passed
    positionally; everything else goes through under the same keyword name.

    RETURNS (EntityRecognizer): The constructed component.
    """
    beam_settings = {
        "beam_width": beam_width,
        "beam_density": beam_density,
        "beam_update_prob": beam_update_prob,
    }
    recognizer_settings = {
        "name": name,
        "moves": moves,
        "update_with_oracle_cut_size": update_with_oracle_cut_size,
        "incorrect_spans_key": incorrect_spans_key,
        "scorer": scorer,
    }
    return EntityRecognizer(
        nlp.vocab, model, **recognizer_settings, **beam_settings
    )
def make_parser(
    nlp: Language,
    name: str,
    model: Model,
    moves: Optional[TransitionSystem],
    update_with_oracle_cut_size: int,
    learn_tokens: bool,
    min_action_freq: int,
    scorer: Optional[Callable],
):
    """Create a `DependencyParser` without any beam settings.

    The vocab and model are passed positionally; the remaining settings are
    forwarded unchanged under the same keyword name.

    RETURNS (DependencyParser): The constructed component.
    """
    parser_settings = {
        "name": name,
        "moves": moves,
        "update_with_oracle_cut_size": update_with_oracle_cut_size,
        "learn_tokens": learn_tokens,
        "min_action_freq": min_action_freq,
        "scorer": scorer,
    }
    return DependencyParser(nlp.vocab, model, **parser_settings)
def make_beam_parser(
    nlp: Language,
    name: str,
    model: Model,
    moves: Optional[TransitionSystem],
    update_with_oracle_cut_size: int,
    learn_tokens: bool,
    min_action_freq: int,
    beam_width: int,
    beam_density: float,
    beam_update_prob: float,
    scorer: Optional[Callable],
):
    """Create a `DependencyParser` with explicit beam settings.

    Differs from `make_parser` only in that `beam_width`, `beam_density` and
    `beam_update_prob` are also forwarded. The vocab and model are passed
    positionally; everything else goes through under the same keyword name.

    RETURNS (DependencyParser): The constructed component.
    """
    beam_settings = {
        "beam_width": beam_width,
        "beam_density": beam_density,
        "beam_update_prob": beam_update_prob,
    }
    parser_settings = {
        "name": name,
        "moves": moves,
        "update_with_oracle_cut_size": update_with_oracle_cut_size,
        "learn_tokens": learn_tokens,
        "min_action_freq": min_action_freq,
        "scorer": scorer,
    }
    return DependencyParser(nlp.vocab, model, **parser_settings, **beam_settings)
def make_tagger(
    nlp: Language,
    name: str,
    model: Model,
    overwrite: bool,
    scorer: Optional[Callable],
    neg_prefix: str,
    label_smoothing: float,
):
    """Create a `Tagger` bound to the pipeline's vocab.

    The vocab and model are passed positionally; the remaining settings are
    forwarded unchanged under the same keyword name.

    RETURNS (Tagger): The constructed component.
    """
    tagger_settings = {
        "name": name,
        "overwrite": overwrite,
        "scorer": scorer,
        "neg_prefix": neg_prefix,
        "label_smoothing": label_smoothing,
    }
    return Tagger(nlp.vocab, model, **tagger_settings)
def make_nn_labeller(
    nlp: Language, name: str, model: Model, labels: Optional[dict], target: str
):
    """Create a `MultitaskObjective` bound to the pipeline's vocab.

    nlp (Language): Pipeline whose `vocab` is handed to the component.
    name (str): Component instance name.
    model (Model): Model passed positionally after the vocab.
    labels (Optional[dict]): Accepted by the signature but not forwarded.
    target (str): Forwarded as `target`.
    RETURNS (MultitaskObjective): The constructed component.
    """
    shared_vocab = nlp.vocab
    return MultitaskObjective(shared_vocab, model, name, target=target)
def make_morphologizer(
    nlp: Language,
    model: Model,
    name: str,
    overwrite: bool,
    extend: bool,
    label_smoothing: float,
    scorer: Optional[Callable],
):
    """Create a `Morphologizer` bound to the pipeline's vocab.

    The vocab, model and name are passed positionally; the remaining
    settings are forwarded unchanged under the same keyword name.

    RETURNS (Morphologizer): The constructed component.
    """
    morph_settings = {
        "overwrite": overwrite,
        "extend": extend,
        "label_smoothing": label_smoothing,
        "scorer": scorer,
    }
    return Morphologizer(nlp.vocab, model, name, **morph_settings)
def make_senter(
    nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]
):
    """Create a `SentenceRecognizer` bound to the pipeline's vocab.

    nlp (Language): Pipeline whose `vocab` is handed to the component.
    name (str): Component instance name.
    model (Model): Model passed positionally after the vocab.
    overwrite (bool): Forwarded as `overwrite`.
    scorer (Optional[Callable]): Forwarded as `scorer`.
    RETURNS (SentenceRecognizer): The constructed component.
    """
    senter_settings = {"overwrite": overwrite, "scorer": scorer}
    return SentenceRecognizer(nlp.vocab, model, name, **senter_settings)

View File

@ -1,5 +1,3 @@
import importlib
import sys
import warnings import warnings
from typing import Any, Dict from typing import Any, Dict
@ -75,6 +73,17 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
return doc return doc
@Language.factory(
    "token_splitter",
    default_config={"min_length": 25, "split_length": 10},
    retokenizes=True,
)
def make_token_splitter(
    nlp: Language, name: str, *, min_length: int = 0, split_length: int = 0
):
    """Create the component registered as "token_splitter".

    nlp (Language): Part of the factory signature; not read here.
    name (str): Part of the factory signature; not read here.
    min_length (int): Forwarded as `min_length`.
    split_length (int): Forwarded as `split_length`.
    RETURNS (TokenSplitter): The constructed component.
    """
    splitter_settings = {"min_length": min_length, "split_length": split_length}
    return TokenSplitter(**splitter_settings)
class TokenSplitter: class TokenSplitter:
def __init__(self, min_length: int = 0, split_length: int = 0): def __init__(self, min_length: int = 0, split_length: int = 0):
self.min_length = min_length self.min_length = min_length
@ -132,6 +141,14 @@ class TokenSplitter:
util.from_disk(path, serializers, []) util.from_disk(path, serializers, [])
@Language.factory(
    "doc_cleaner",
    default_config={"attrs": {"tensor": None, "_.trf_data": None}, "silent": True},
)
def make_doc_cleaner(nlp: Language, name: str, *, attrs: Dict[str, Any], silent: bool):
    """Create the component registered as "doc_cleaner".

    nlp (Language): Part of the factory signature; not read here.
    name (str): Part of the factory signature; not read here.
    attrs (Dict[str, Any]): Passed positionally to `DocCleaner`.
    silent (bool): Forwarded as `silent`.
    RETURNS (DocCleaner): The constructed component.
    """
    cleaner = DocCleaner(attrs, silent=silent)
    return cleaner
class DocCleaner: class DocCleaner:
def __init__(self, attrs: Dict[str, Any], *, silent: bool = True): def __init__(self, attrs: Dict[str, Any], *, silent: bool = True):
self.cfg: Dict[str, Any] = {"attrs": dict(attrs), "silent": silent} self.cfg: Dict[str, Any] = {"attrs": dict(attrs), "silent": silent}
@ -184,14 +201,3 @@ class DocCleaner:
"cfg": lambda p: self.cfg.update(srsly.read_json(p)), "cfg": lambda p: self.cfg.update(srsly.read_json(p)),
} }
util.from_disk(path, serializers, []) util.from_disk(path, serializers, [])
# Setup backwards compatibility hook for factories
def __getattr__(name):
    """Module-level attribute hook kept for backwards compatibility.

    `make_doc_cleaner` and `make_token_splitter` are resolved lazily from
    `spacy.pipeline.factories`. Any other name raises AttributeError.
    """
    if name in ("make_doc_cleaner", "make_token_splitter"):
        # Import on demand so the factories module is only loaded when requested.
        factories = importlib.import_module("spacy.pipeline.factories")
        return getattr(factories, name)
    raise AttributeError(f"module {__name__} has no attribute {name}")

View File

@ -1,5 +1,3 @@
import importlib
import sys
import warnings import warnings
from pathlib import Path from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
@ -18,10 +16,35 @@ from ..vocab import Vocab
from .pipe import Pipe from .pipe import Pipe
@Language.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={
        "model": None,
        "mode": "lookup",
        "overwrite": False,
        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
    },
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    overwrite: bool,
    scorer: Optional[Callable],
):
    """Create the component registered as "lemmatizer".

    nlp (Language): Pipeline whose `vocab` is handed to the component.
    model (Optional[Model]): Passed positionally after the vocab; may be None.
    name (str): Component instance name.
    mode (str): Forwarded as `mode`.
    overwrite (bool): Forwarded as `overwrite`.
    scorer (Optional[Callable]): Forwarded as `scorer`.
    RETURNS (Lemmatizer): The constructed component.
    """
    lemmatizer_settings = {"mode": mode, "overwrite": overwrite, "scorer": scorer}
    return Lemmatizer(nlp.vocab, model, name, **lemmatizer_settings)
def lemmatizer_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: def lemmatizer_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
return Scorer.score_token_attr(examples, "lemma", **kwargs) return Scorer.score_token_attr(examples, "lemma", **kwargs)
@registry.scorers("spacy.lemmatizer_scorer.v1")
def make_lemmatizer_scorer(): def make_lemmatizer_scorer():
return lemmatizer_score return lemmatizer_score
@ -218,10 +241,7 @@ class Lemmatizer(Pipe):
if not form: if not form:
pass pass
elif form in index or not form.isalpha(): elif form in index or not form.isalpha():
if form in index: forms.append(form)
forms.insert(0, form)
else:
forms.append(form)
else: else:
oov_forms.append(form) oov_forms.append(form)
# Remove duplicates but preserve the ordering of applied "rules" # Remove duplicates but preserve the ordering of applied "rules"
@ -314,11 +334,3 @@ class Lemmatizer(Pipe):
util.from_bytes(bytes_data, deserialize, exclude) util.from_bytes(bytes_data, deserialize, exclude)
self._validate_tables() self._validate_tables()
return self return self
# Setup backwards compatibility hook for factories
def __getattr__(name):
if name == "make_lemmatizer":
module = importlib.import_module("spacy.pipeline.factories")
return module.make_lemmatizer
raise AttributeError(f"module {__name__} has no attribute {name}")

View File

@ -1,6 +1,4 @@
# cython: infer_types=True, binding=True # cython: infer_types=True, binding=True
import importlib
import sys
from itertools import islice from itertools import islice
from typing import Callable, Dict, Optional, Union from typing import Callable, Dict, Optional, Union
@ -49,6 +47,25 @@ maxout_pieces = 3
DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"] DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"morphologizer",
assigns=["token.morph", "token.pos"],
default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False,
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, "label_smoothing": 0.0},
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
)
def make_morphologizer(
nlp: Language,
model: Model,
name: str,
overwrite: bool,
extend: bool,
label_smoothing: float,
scorer: Optional[Callable],
):
return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, label_smoothing=label_smoothing, scorer=scorer)
def morphologizer_score(examples, **kwargs): def morphologizer_score(examples, **kwargs):
def morph_key_getter(token, attr): def morph_key_getter(token, attr):
return getattr(token, attr).key return getattr(token, attr).key
@ -64,6 +81,7 @@ def morphologizer_score(examples, **kwargs):
return results return results
@registry.scorers("spacy.morphologizer_scorer.v1")
def make_morphologizer_scorer(): def make_morphologizer_scorer():
return morphologizer_score return morphologizer_score
@ -291,11 +309,3 @@ class Morphologizer(Tagger):
if self.model.ops.xp.isnan(loss): if self.model.ops.xp.isnan(loss):
raise ValueError(Errors.E910.format(name=self.name)) raise ValueError(Errors.E910.format(name=self.name))
return float(loss), d_scores return float(loss), d_scores
# Setup backwards compatibility hook for factories
def __getattr__(name):
if name == "make_morphologizer":
module = importlib.import_module("spacy.pipeline.factories")
return module.make_morphologizer
raise AttributeError(f"module {__name__} has no attribute {name}")

View File

@ -1,6 +1,4 @@
# cython: infer_types=True, binding=True # cython: infer_types=True, binding=True
import importlib
import sys
from typing import Optional from typing import Optional
import numpy import numpy
@ -32,6 +30,14 @@ subword_features = true
DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"] DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"nn_labeller",
default_config={"labels": None, "target": "dep_tag_offset", "model": DEFAULT_MT_MODEL}
)
def make_nn_labeller(nlp: Language, name: str, model: Model, labels: Optional[dict], target: str):
return MultitaskObjective(nlp.vocab, model, name)
class MultitaskObjective(Tagger): class MultitaskObjective(Tagger):
"""Experimental: Assist training of a parser or tagger, by training a """Experimental: Assist training of a parser or tagger, by training a
side-objective. side-objective.
@ -207,11 +213,3 @@ class ClozeMultitask(TrainablePipe):
def add_label(self, label): def add_label(self, label):
raise NotImplementedError raise NotImplementedError
# Setup backwards compatibility hook for factories
def __getattr__(name):
if name == "make_nn_labeller":
module = importlib.import_module("spacy.pipeline.factories")
return module.make_nn_labeller
raise AttributeError(f"module {__name__} has no attribute {name}")

View File

@ -1,6 +1,4 @@
# cython: infer_types=True, binding=True # cython: infer_types=True, binding=True
import importlib
import sys
from collections import defaultdict from collections import defaultdict
from typing import Callable, Optional from typing import Callable, Optional
@ -38,10 +36,154 @@ subword_features = true
DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"] DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"ner",
assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
default_config={
"moves": None,
"update_with_oracle_cut_size": 100,
"model": DEFAULT_NER_MODEL,
"incorrect_spans_key": None,
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
},
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
)
def make_ner(
nlp: Language,
name: str,
model: Model,
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
incorrect_spans_key: Optional[str],
scorer: Optional[Callable],
):
"""Create a transition-based EntityRecognizer component. The entity recognizer
identifies non-overlapping labelled spans of tokens.
The transition-based algorithm used encodes certain assumptions that are
effective for "traditional" named entity recognition tasks, but may not be
a good fit for every span identification problem. Specifically, the loss
function optimizes for whole entity accuracy, so if your inter-annotator
agreement on boundary tokens is low, the component will likely perform poorly
on your problem. The transition-based algorithm also assumes that the most
decisive information about your entities will be close to their initial tokens.
If your entities are long and characterised by tokens in their middle, the
component will likely do poorly on your task.
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
incorrect_spans_key (Optional[str]): Identifies spans that are known
to be incorrect entity annotations. The incorrect entity annotations
can be stored in the span group, under this key.
scorer (Optional[Callable]): The scoring method.
"""
return EntityRecognizer(
nlp.vocab,
model,
name,
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
incorrect_spans_key=incorrect_spans_key,
multitasks=[],
beam_width=1,
beam_density=0.0,
beam_update_prob=0.0,
scorer=scorer,
)
@Language.factory(
"beam_ner",
assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
default_config={
"moves": None,
"update_with_oracle_cut_size": 100,
"model": DEFAULT_NER_MODEL,
"beam_density": 0.01,
"beam_update_prob": 0.5,
"beam_width": 32,
"incorrect_spans_key": None,
"scorer": None,
},
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
)
def make_beam_ner(
nlp: Language,
name: str,
model: Model,
moves: Optional[TransitionSystem],
update_with_oracle_cut_size: int,
beam_width: int,
beam_density: float,
beam_update_prob: float,
incorrect_spans_key: Optional[str],
scorer: Optional[Callable],
):
"""Create a transition-based EntityRecognizer component that uses beam-search.
The entity recognizer identifies non-overlapping labelled spans of tokens.
The transition-based algorithm used encodes certain assumptions that are
effective for "traditional" named entity recognition tasks, but may not be
a good fit for every span identification problem. Specifically, the loss
function optimizes for whole entity accuracy, so if your inter-annotator
agreement on boundary tokens is low, the component will likely perform poorly
on your problem. The transition-based algorithm also assumes that the most
decisive information about your entities will be close to their initial tokens.
If your entities are long and characterised by tokens in their middle, the
component will likely do poorly on your task.
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (Optional[TransitionSystem]): This defines how the parse-state is created,
updated and evaluated. If 'moves' is None, a new instance is
created with `self.TransitionSystem()`. Defaults to `None`.
update_with_oracle_cut_size (int): During training, cut long sequences into
shorter segments by creating intermediate states based on the gold-standard
history. The model is not very sensitive to this parameter, so you usually
won't need to change it. 100 is a good default.
beam_width (int): The number of candidate analyses to maintain.
beam_density (float): The minimum ratio between the scores of the first and
last candidates in the beam. This allows the parser to avoid exploring
candidates that are too far behind. This is mostly intended to improve
efficiency, but it can also improve accuracy as deeper search is not
always better.
beam_update_prob (float): The chance of making a beam update, instead of a
greedy update. Greedy updates are an approximation for the beam updates,
and are faster to compute.
incorrect_spans_key (Optional[str]): Optional key into span groups of
entities known to be non-entities.
scorer (Optional[Callable]): The scoring method.
"""
return EntityRecognizer(
nlp.vocab,
model,
name,
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
multitasks=[],
beam_width=beam_width,
beam_density=beam_density,
beam_update_prob=beam_update_prob,
incorrect_spans_key=incorrect_spans_key,
scorer=scorer,
)
def ner_score(examples, **kwargs): def ner_score(examples, **kwargs):
return get_ner_prf(examples, **kwargs) return get_ner_prf(examples, **kwargs)
@registry.scorers("spacy.ner_scorer.v1")
def make_ner_scorer(): def make_ner_scorer():
return ner_score return ner_score
@ -119,14 +261,3 @@ cdef class EntityRecognizer(Parser):
score_dict[(start, end, label)] += score score_dict[(start, end, label)] += score
entity_scores.append(score_dict) entity_scores.append(score_dict)
return entity_scores return entity_scores
# Setup backwards compatibility hook for factories
def __getattr__(name):
if name == "make_ner":
module = importlib.import_module("spacy.pipeline.factories")
return module.make_ner
elif name == "make_beam_ner":
module = importlib.import_module("spacy.pipeline.factories")
return module.make_beam_ner
raise AttributeError(f"module {__name__} has no attribute {name}")

View File

@ -21,6 +21,13 @@ cdef class Pipe:
DOCS: https://spacy.io/api/pipe DOCS: https://spacy.io/api/pipe
""" """
@classmethod
def __init_subclass__(cls, **kwargs):
"""Raise a warning if an inheriting class implements 'begin_training'
(from v2) instead of the new 'initialize' method (from v3)"""
if hasattr(cls, "begin_training"):
warnings.warn(Warnings.W088.format(name=cls.__name__))
def __call__(self, Doc doc) -> Doc: def __call__(self, Doc doc) -> Doc:
"""Apply the pipe to one document. The document is modified in place, """Apply the pipe to one document. The document is modified in place,
and returned. This usually happens under the hood when the nlp object and returned. This usually happens under the hood when the nlp object

View File

@ -1,6 +1,4 @@
# cython: infer_types=True, binding=True # cython: infer_types=True, binding=True
import importlib
import sys
from typing import Callable, List, Optional from typing import Callable, List, Optional
import srsly import srsly
@ -16,6 +14,22 @@ from .senter import senter_score
BACKWARD_OVERWRITE = False BACKWARD_OVERWRITE = False
@Language.factory(
"sentencizer",
assigns=["token.is_sent_start", "doc.sents"],
default_config={"punct_chars": None, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
def make_sentencizer(
nlp: Language,
name: str,
punct_chars: Optional[List[str]],
overwrite: bool,
scorer: Optional[Callable],
):
return Sentencizer(name, punct_chars=punct_chars, overwrite=overwrite, scorer=scorer)
class Sentencizer(Pipe): class Sentencizer(Pipe):
"""Segment the Doc into sentences using a rule-based strategy. """Segment the Doc into sentences using a rule-based strategy.
@ -167,11 +181,3 @@ class Sentencizer(Pipe):
self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
self.overwrite = cfg.get("overwrite", self.overwrite) self.overwrite = cfg.get("overwrite", self.overwrite)
return self return self
# Setup backwards compatibility hook for factories
def __getattr__(name):
if name == "make_sentencizer":
module = importlib.import_module("spacy.pipeline.factories")
return module.make_sentencizer
raise AttributeError(f"module {__name__} has no attribute {name}")

View File

@ -1,6 +1,4 @@
# cython: infer_types=True, binding=True # cython: infer_types=True, binding=True
import importlib
import sys
from itertools import islice from itertools import islice
from typing import Callable, Optional from typing import Callable, Optional
@ -36,6 +34,16 @@ subword_features = true
DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"] DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"senter",
assigns=["token.is_sent_start"],
default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]):
return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
def senter_score(examples, **kwargs): def senter_score(examples, **kwargs):
def has_sents(doc): def has_sents(doc):
return doc.has_annotation("SENT_START") return doc.has_annotation("SENT_START")
@ -45,6 +53,7 @@ def senter_score(examples, **kwargs):
return results return results
@registry.scorers("spacy.senter_scorer.v1")
def make_senter_scorer(): def make_senter_scorer():
return senter_score return senter_score
@ -176,11 +185,3 @@ class SentenceRecognizer(Tagger):
def add_label(self, label, values=None): def add_label(self, label, values=None):
raise NotImplementedError raise NotImplementedError
# Setup backwards compatibility hook for factories
def __getattr__(name):
if name == "make_senter":
module = importlib.import_module("spacy.pipeline.factories")
return module.make_senter
raise AttributeError(f"module {__name__} has no attribute {name}")

View File

@ -1,5 +1,3 @@
import importlib
import sys
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
from thinc.api import Config, Model, Optimizer, set_dropout_rate from thinc.api import Config, Model, Optimizer, set_dropout_rate
@ -43,6 +41,63 @@ depth = 4
DEFAULT_SPAN_FINDER_MODEL = Config().from_str(span_finder_default_config)["model"] DEFAULT_SPAN_FINDER_MODEL = Config().from_str(span_finder_default_config)["model"]
@Language.factory(
"span_finder",
assigns=["doc.spans"],
default_config={
"threshold": 0.5,
"model": DEFAULT_SPAN_FINDER_MODEL,
"spans_key": DEFAULT_SPANS_KEY,
"max_length": 25,
"min_length": None,
"scorer": {"@scorers": "spacy.span_finder_scorer.v1"},
},
default_score_weights={
f"spans_{DEFAULT_SPANS_KEY}_f": 1.0,
f"spans_{DEFAULT_SPANS_KEY}_p": 0.0,
f"spans_{DEFAULT_SPANS_KEY}_r": 0.0,
},
)
def make_span_finder(
nlp: Language,
name: str,
model: Model[Iterable[Doc], Floats2d],
spans_key: str,
threshold: float,
max_length: Optional[int],
min_length: Optional[int],
scorer: Optional[Callable],
) -> "SpanFinder":
"""Create a SpanFinder component. The component predicts whether a token is
the start or the end of a potential span.
model (Model[List[Doc], Floats2d]): A model instance that
is given a list of documents and predicts a probability for each token.
spans_key (str): Key of the doc.spans dict to save the spans under. During
initialization and training, the component will look for spans on the
reference document under the same key.
threshold (float): Minimum probability to consider a prediction positive.
max_length (Optional[int]): Maximum length of the produced spans, defaults
to None meaning unlimited length.
min_length (Optional[int]): Minimum length of the produced spans, defaults
to None meaning shortest span length is 1.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
spans allowed.
"""
return SpanFinder(
nlp,
model=model,
threshold=threshold,
name=name,
scorer=scorer,
max_length=max_length,
min_length=min_length,
spans_key=spans_key,
)
@registry.scorers("spacy.span_finder_scorer.v1")
def make_span_finder_scorer(): def make_span_finder_scorer():
return span_finder_score return span_finder_score
@ -278,11 +333,3 @@ class SpanFinder(TrainablePipe):
self.model.initialize(X=docs, Y=Y) self.model.initialize(X=docs, Y=Y)
else: else:
self.model.initialize() self.model.initialize()
# Setup backwards compatibility hook for factories
def __getattr__(name):
if name == "make_span_finder":
module = importlib.import_module("spacy.pipeline.factories")
return module.make_span_finder
raise AttributeError(f"module {__name__} has no attribute {name}")

View File

@ -1,5 +1,3 @@
import importlib
import sys
import warnings import warnings
from functools import partial from functools import partial
from pathlib import Path from pathlib import Path
@ -34,6 +32,105 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
DEFAULT_SPANS_KEY = "ruler" DEFAULT_SPANS_KEY = "ruler"
@Language.factory(
"future_entity_ruler",
assigns=["doc.ents"],
default_config={
"phrase_matcher_attr": None,
"validate": False,
"overwrite_ents": False,
"scorer": {"@scorers": "spacy.entity_ruler_scorer.v1"},
"ent_id_sep": "__unused__",
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
},
default_score_weights={
"ents_f": 1.0,
"ents_p": 0.0,
"ents_r": 0.0,
"ents_per_type": None,
},
)
def make_entity_ruler(
nlp: Language,
name: str,
phrase_matcher_attr: Optional[Union[int, str]],
matcher_fuzzy_compare: Callable,
validate: bool,
overwrite_ents: bool,
scorer: Optional[Callable],
ent_id_sep: str,
):
if overwrite_ents:
ents_filter = prioritize_new_ents_filter
else:
ents_filter = prioritize_existing_ents_filter
return SpanRuler(
nlp,
name,
spans_key=None,
spans_filter=None,
annotate_ents=True,
ents_filter=ents_filter,
phrase_matcher_attr=phrase_matcher_attr,
matcher_fuzzy_compare=matcher_fuzzy_compare,
validate=validate,
overwrite=False,
scorer=scorer,
)
@Language.factory(
"span_ruler",
assigns=["doc.spans"],
default_config={
"spans_key": DEFAULT_SPANS_KEY,
"spans_filter": None,
"annotate_ents": False,
"ents_filter": {"@misc": "spacy.first_longest_spans_filter.v1"},
"phrase_matcher_attr": None,
"matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"},
"validate": False,
"overwrite": True,
"scorer": {
"@scorers": "spacy.overlapping_labeled_spans_scorer.v1",
"spans_key": DEFAULT_SPANS_KEY,
},
},
default_score_weights={
f"spans_{DEFAULT_SPANS_KEY}_f": 1.0,
f"spans_{DEFAULT_SPANS_KEY}_p": 0.0,
f"spans_{DEFAULT_SPANS_KEY}_r": 0.0,
f"spans_{DEFAULT_SPANS_KEY}_per_type": None,
},
)
def make_span_ruler(
nlp: Language,
name: str,
spans_key: Optional[str],
spans_filter: Optional[Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]]],
annotate_ents: bool,
ents_filter: Callable[[Iterable[Span], Iterable[Span]], Iterable[Span]],
phrase_matcher_attr: Optional[Union[int, str]],
matcher_fuzzy_compare: Callable,
validate: bool,
overwrite: bool,
scorer: Optional[Callable],
):
return SpanRuler(
nlp,
name,
spans_key=spans_key,
spans_filter=spans_filter,
annotate_ents=annotate_ents,
ents_filter=ents_filter,
phrase_matcher_attr=phrase_matcher_attr,
matcher_fuzzy_compare=matcher_fuzzy_compare,
validate=validate,
overwrite=overwrite,
scorer=scorer,
)
def prioritize_new_ents_filter( def prioritize_new_ents_filter(
entities: Iterable[Span], spans: Iterable[Span] entities: Iterable[Span], spans: Iterable[Span]
) -> List[Span]: ) -> List[Span]:
@ -60,6 +157,7 @@ def prioritize_new_ents_filter(
return entities + new_entities return entities + new_entities
@registry.misc("spacy.prioritize_new_ents_filter.v1")
def make_prioritize_new_ents_filter(): def make_prioritize_new_ents_filter():
return prioritize_new_ents_filter return prioritize_new_ents_filter
@ -90,6 +188,7 @@ def prioritize_existing_ents_filter(
return entities + new_entities return entities + new_entities
@registry.misc("spacy.prioritize_existing_ents_filter.v1")
def make_preserve_existing_ents_filter(): def make_preserve_existing_ents_filter():
return prioritize_existing_ents_filter return prioritize_existing_ents_filter
@ -109,6 +208,7 @@ def overlapping_labeled_spans_score(
return Scorer.score_spans(examples, **kwargs) return Scorer.score_spans(examples, **kwargs)
@registry.scorers("spacy.overlapping_labeled_spans_scorer.v1")
def make_overlapping_labeled_spans_scorer(spans_key: str = DEFAULT_SPANS_KEY): def make_overlapping_labeled_spans_scorer(spans_key: str = DEFAULT_SPANS_KEY):
return partial(overlapping_labeled_spans_score, spans_key=spans_key) return partial(overlapping_labeled_spans_score, spans_key=spans_key)
@ -495,14 +595,3 @@ class SpanRuler(Pipe):
"patterns": lambda p: srsly.write_jsonl(p, self.patterns), "patterns": lambda p: srsly.write_jsonl(p, self.patterns),
} }
util.to_disk(path, serializers, {}) util.to_disk(path, serializers, {})
# Setup backwards compatibility hook for factories
def __getattr__(name):
if name == "make_span_ruler":
module = importlib.import_module("spacy.pipeline.factories")
return module.make_span_ruler
elif name == "make_entity_ruler":
module = importlib.import_module("spacy.pipeline.factories")
return module.make_future_entity_ruler
raise AttributeError(f"module {__name__} has no attribute {name}")

View File

@ -1,5 +1,3 @@
import importlib
import sys
from dataclasses import dataclass from dataclasses import dataclass
from functools import partial from functools import partial
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast
@ -136,6 +134,7 @@ def preset_spans_suggester(
return output return output
@registry.misc("spacy.ngram_suggester.v1")
def build_ngram_suggester(sizes: List[int]) -> Suggester: def build_ngram_suggester(sizes: List[int]) -> Suggester:
"""Suggest all spans of the given lengths. Spans are returned as a ragged """Suggest all spans of the given lengths. Spans are returned as a ragged
array of integers. The array has two columns, indicating the start and end array of integers. The array has two columns, indicating the start and end
@ -144,6 +143,7 @@ def build_ngram_suggester(sizes: List[int]) -> Suggester:
return partial(ngram_suggester, sizes=sizes) return partial(ngram_suggester, sizes=sizes)
@registry.misc("spacy.ngram_range_suggester.v1")
def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester: def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester:
"""Suggest all spans of the given lengths between a given min and max value - both inclusive. """Suggest all spans of the given lengths between a given min and max value - both inclusive.
Spans are returned as a ragged array of integers. The array has two columns, Spans are returned as a ragged array of integers. The array has two columns,
@ -152,6 +152,7 @@ def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester:
return build_ngram_suggester(sizes) return build_ngram_suggester(sizes)
@registry.misc("spacy.preset_spans_suggester.v1")
def build_preset_spans_suggester(spans_key: str) -> Suggester: def build_preset_spans_suggester(spans_key: str) -> Suggester:
"""Suggest all spans that are already stored in doc.spans[spans_key]. """Suggest all spans that are already stored in doc.spans[spans_key].
This is useful when an upstream component is used to set the spans This is useful when an upstream component is used to set the spans
@ -159,6 +160,136 @@ def build_preset_spans_suggester(spans_key: str) -> Suggester:
return partial(preset_spans_suggester, spans_key=spans_key) return partial(preset_spans_suggester, spans_key=spans_key)
@Language.factory(
"spancat",
assigns=["doc.spans"],
default_config={
"threshold": 0.5,
"spans_key": DEFAULT_SPANS_KEY,
"max_positive": None,
"model": DEFAULT_SPANCAT_MODEL,
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
},
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
)
def make_spancat(
nlp: Language,
name: str,
suggester: Suggester,
model: Model[Tuple[List[Doc], Ragged], Floats2d],
spans_key: str,
scorer: Optional[Callable],
threshold: float,
max_positive: Optional[int],
) -> "SpanCategorizer":
"""Create a SpanCategorizer component and configure it for multi-label
classification to be able to assign multiple labels for each span.
The span categorizer consists of two
parts: a suggester function that proposes candidate spans, and a labeller
model that predicts one or more labels for each span.
name (str): The component instance name, used to add entries to the
losses during training.
suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
Spans are returned as a ragged array with two integer columns, for the
start and end positions.
model (Model[Tuple[List[Doc], Ragged], Floats2d]): A model instance that
is given a list of documents and (start, end) indices representing
candidate span offsets. The model predicts a probability for each category
for each span.
spans_key (str): Key of the doc.spans dict to save the spans under. During
initialization and training, the component will look for spans on the
reference document under the same key.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
spans allowed.
threshold (float): Minimum probability to consider a prediction positive.
Spans with a positive prediction will be saved on the Doc. Defaults to
0.5.
max_positive (Optional[int]): Maximum number of labels to consider positive
per span. Defaults to None, indicating no limit.
"""
return SpanCategorizer(
nlp.vocab,
model=model,
suggester=suggester,
name=name,
spans_key=spans_key,
negative_weight=None,
allow_overlap=True,
max_positive=max_positive,
threshold=threshold,
scorer=scorer,
add_negative_label=False,
)
@Language.factory(
"spancat_singlelabel",
assigns=["doc.spans"],
default_config={
"spans_key": DEFAULT_SPANS_KEY,
"model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
"negative_weight": 1.0,
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
"allow_overlap": True,
},
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
)
def make_spancat_singlelabel(
nlp: Language,
name: str,
suggester: Suggester,
model: Model[Tuple[List[Doc], Ragged], Floats2d],
spans_key: str,
negative_weight: float,
allow_overlap: bool,
scorer: Optional[Callable],
) -> "SpanCategorizer":
"""Create a SpanCategorizer component and configure it for multi-class
classification. With this configuration each span can get at most one
label. The span categorizer consists of two
parts: a suggester function that proposes candidate spans, and a labeller
model that predicts one or more labels for each span.
name (str): The component instance name, used to add entries to the
losses during training.
suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
Spans are returned as a ragged array with two integer columns, for the
start and end positions.
model (Model[Tuple[List[Doc], Ragged], Floats2d]): A model instance that
is given a list of documents and (start, end) indices representing
candidate span offsets. The model predicts a probability for each category
for each span.
spans_key (str): Key of the doc.spans dict to save the spans under. During
initialization and training, the component will look for spans on the
reference document under the same key.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
spans allowed.
negative_weight (float): Multiplier for the loss terms.
Can be used to downweight the negative samples if there are too many.
allow_overlap (bool): If True the data is assumed to contain overlapping spans.
Otherwise it produces non-overlapping spans greedily prioritizing
higher assigned label scores.
"""
return SpanCategorizer(
nlp.vocab,
model=model,
suggester=suggester,
name=name,
spans_key=spans_key,
negative_weight=negative_weight,
allow_overlap=allow_overlap,
max_positive=1,
add_negative_label=True,
threshold=None,
scorer=scorer,
)
def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
kwargs = dict(kwargs) kwargs = dict(kwargs)
attr_prefix = "spans_" attr_prefix = "spans_"
@ -172,6 +303,7 @@ def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
return Scorer.score_spans(examples, **kwargs) return Scorer.score_spans(examples, **kwargs)
@registry.scorers("spacy.spancat_scorer.v1")
def make_spancat_scorer(): def make_spancat_scorer():
return spancat_score return spancat_score
@ -653,14 +785,3 @@ class SpanCategorizer(TrainablePipe):
spans.attrs["scores"] = numpy.array(attrs_scores) spans.attrs["scores"] = numpy.array(attrs_scores)
return spans return spans
# Setup backwards compatibility hook for factories
def __getattr__(name):
if name == "make_spancat":
module = importlib.import_module("spacy.pipeline.factories")
return module.make_spancat
elif name == "make_spancat_singlelabel":
module = importlib.import_module("spacy.pipeline.factories")
return module.make_spancat_singlelabel
raise AttributeError(f"module {__name__} has no attribute {name}")

Some files were not shown because too many files have changed in this diff Show More