Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-02 18:06:46 +03:00)

Compare commits: master ... v4.0.0.dev — 165 commits

Commits in this comparison (SHA1 only; the author and date columns were empty in this view):

f4c8fdfaad 818fdb537e e32a394ff0 5992e927b9 c27679f210 287deee02c b2ca7253d2 f5918d4353
5bd141013b 8696861c8c fbc14aea45 304b9331e6 afb22ad491 1052cba9f3 2d4067d021 70e2f2a14a
ce9ea9629f bbf38d4d0f 9e97c730be 36ee709390 e722284ff4 ce4ea5ffa7 c621e251b8 82ef6783a8
81beaea70e 2891e27421 9972333ef9 7351f6bbeb 7718886fa3 532225b955 7b689bde44 57203fa0fc
5e8bafa5bb 9b36729cbd 42fe4edfd7 e2591cda36 e5ec45cb7e 05803cfe76 1b2d66f98e b4e457d9fe
2702db9fef eaaac5a08c f293386d3e 4f37e4031c 96f2e30c4b 846472129c 47a82c6164 0e3b6a87d6
536798f9e3 b615964be7 8b2732e276 122f3b32ad bf92ca4f10 2468742cb8 68089f65cd 17c4a3d646
95619b6736 096794dd74 4990cfefb4 d82e167aea 50c5e9a2dd 8a5814bf2c 5d0f48fe69 b734e5314d
a653dec654 3102e2e27a 9340eb8ad2 6ae7618418 520279ff7c 41b3a0d932 8ca71f9591 749e446ee3
04f41854c1 1ea31552be da75896ef5 df4c069a13 e27c60a702 dd3f138830 b95123060a cbc2ae933e
cf85b81f34 5089efa2d0 eec5ccd72f c47ec5b5c6 89f974d4f5 cd95b29053 6920fb7baf 360ccf628a
c6cca4c00a fb7f018ded 1b5aba9e22 6b07be2110 ec45f704b1 1678a98449 16609517f1 fd911fe2af
8548d4d16e de360bc981 6348a7a4b4 b052b1b47f a183db3cef 5e297aa20e c2f3e699ca 2c2e66e145
fc2723925b 6ff5eb256c b2fd9490e3 a231bf65af b510fbd0aa 326b541312 6852adc8b7 20b63943f5
d30ba9b7b8 2f08deea2a 207565a788 f9308aae13 ca75190a3d f5aabaf7d6 d60997febb 6b9af38eeb
60379cec65 8267aa1b65 799d226676 04fea09ffd e79910d57e d0fc871a1c 68b8fa2df2 cae4589f5a
a4bd890f32 0e2b7fb28b 103b24fb25 446a3ecf34 c6704f368c d4922f25fc e3027c65b8 5157e4e823
efdbb722c5 60c050e82b 977b847cce 4a615cacd2 698b8b495f 98a916e01a 4bce8fa755 2a558a7cdc
1eb7ce5ef7 740c33fe58 8dd1fa9896 c44d243f25 bb0e178878 1a5be63715 d757dec5c4 551e73ccfc
5d54c0e32a e581eeac34 b2d05f9f66 1ff683a50b ba18d2913d 851a7ca4fa 1605ef7319 7f3842f54d
2f05c6824c 10b7223021 5586fd9311 0e71bd973f 75f7c15187

Changed files (diffs are shown master -> v4.0.0.dev, so removed lines are master's content and added lines are the v4 branch's):
.github/workflows/cibuildwheel.yml (vendored, 99 changes — file deleted on v4.0.0.dev)

@@ -1,99 +0,0 @@
-name: Build
-
-on:
-  push:
-    tags:
-      # ytf did they invent their own syntax that's almost regex?
-      # ** matches 'zero or more of any character'
-      - 'release-v[0-9]+.[0-9]+.[0-9]+**'
-      - 'prerelease-v[0-9]+.[0-9]+.[0-9]+**'
-jobs:
-  build_wheels:
-    name: Build wheels on ${{ matrix.os }}
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        # macos-13 is an intel runner, macos-14 is apple silicon
-        os: [ubuntu-latest, windows-latest, macos-13, macos-14, ubuntu-24.04-arm]
-
-    steps:
-      - uses: actions/checkout@v4
-      # aarch64 (arm) is built via qemu emulation
-      # QEMU is sadly too slow. We need to wait for public ARM support
-      #- name: Set up QEMU
-      #  if: runner.os == 'Linux'
-      #  uses: docker/setup-qemu-action@v3
-      #  with:
-      #    platforms: all
-      - name: Build wheels
-        uses: pypa/cibuildwheel@v2.21.3
-        env:
-          CIBW_ARCHS_LINUX: auto
-        with:
-          package-dir: .
-          output-dir: wheelhouse
-          config-file: "{package}/pyproject.toml"
-      - uses: actions/upload-artifact@v4
-        with:
-          name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
-          path: ./wheelhouse/*.whl
-
-  build_sdist:
-    name: Build source distribution
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Build sdist
-        run: pipx run build --sdist
-      - uses: actions/upload-artifact@v4
-        with:
-          name: cibw-sdist
-          path: dist/*.tar.gz
-  create_release:
-    needs: [build_wheels, build_sdist]
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-      checks: write
-      actions: read
-      issues: read
-      packages: write
-      pull-requests: read
-      repository-projects: read
-      statuses: read
-    steps:
-      - name: Get the tag name and determine if it's a prerelease
-        id: get_tag_info
-        run: |
-          FULL_TAG=${GITHUB_REF#refs/tags/}
-          if [[ $FULL_TAG == release-* ]]; then
-            TAG_NAME=${FULL_TAG#release-}
-            IS_PRERELEASE=false
-          elif [[ $FULL_TAG == prerelease-* ]]; then
-            TAG_NAME=${FULL_TAG#prerelease-}
-            IS_PRERELEASE=true
-          else
-            echo "Tag does not match expected patterns" >&2
-            exit 1
-          fi
-          echo "FULL_TAG=$TAG_NAME" >> $GITHUB_ENV
-          echo "TAG_NAME=$TAG_NAME" >> $GITHUB_ENV
-          echo "IS_PRERELEASE=$IS_PRERELEASE" >> $GITHUB_ENV
-      - uses: actions/download-artifact@v4
-        with:
-          # unpacks all CIBW artifacts into dist/
-          pattern: cibw-*
-          path: dist
-          merge-multiple: true
-      - name: Create Draft Release
-        id: create_release
-        uses: softprops/action-gh-release@v2
-        if: startsWith(github.ref, 'refs/tags/')
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        with:
-          name: ${{ env.TAG_NAME }}
-          draft: true
-          prerelease: ${{ env.IS_PRERELEASE }}
-          files: "./dist/*"
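The deleted create_release step above parses the pushed tag in plain bash. For readability, here is a rough Python equivalent of that logic — a sketch only, not part of the repo; the function name is made up. (Note that the original step exports TAG_NAME's value under both FULL_TAG and TAG_NAME, which is preserved here only as a comment.)

# Hypothetical Python rendering of the bash tag-parsing step above.
def parse_release_tag(github_ref: str) -> tuple[str, bool]:
    """Return (tag_name, is_prerelease) for a ref like
    'refs/tags/release-v3.8.0' or 'refs/tags/prerelease-v4.0.0.dev3'."""
    full_tag = github_ref.removeprefix("refs/tags/")
    if full_tag.startswith("release-"):
        return full_tag.removeprefix("release-"), False
    if full_tag.startswith("prerelease-"):
        return full_tag.removeprefix("prerelease-"), True
    # mirrors: echo "Tag does not match expected patterns" >&2; exit 1
    raise ValueError(f"Tag does not match expected patterns: {full_tag}")

assert parse_release_tag("refs/tags/release-v3.8.0") == ("v3.8.0", False)
assert parse_release_tag("refs/tags/prerelease-v4.0.0.dev3") == ("v4.0.0.dev3", True)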
.github/workflows/publish_pypi.yml (vendored, 29 changes — file deleted on v4.0.0.dev)

@@ -1,29 +0,0 @@
-# The cibuildwheel action triggers on creation of a release, this
-# triggers on publication.
-# The expected workflow is to create a draft release and let the wheels
-# upload, and then hit 'publish', which uploads to PyPi.
-
-on:
-  release:
-    types:
-      - published
-
-jobs:
-  upload_pypi:
-    runs-on: ubuntu-latest
-    environment:
-      name: pypi
-      url: https://pypi.org/p/spacy
-    permissions:
-      id-token: write
-      contents: read
-    if: github.event_name == 'release' && github.event.action == 'published'
-    # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this)
-    # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
-    steps:
-      - uses: robinraju/release-downloader@v1
-        with:
-          tag: ${{ github.event.release.tag_name }}
-          fileName: '*'
-          out-file-path: 'dist'
-      - uses: pypa/gh-action-pypi-publish@release/v1
.github/workflows/tests.yml (vendored, 90 changes)

@@ -2,8 +2,6 @@ name: tests
 on:
   push:
-    tags-ignore:
-      - '**'
     branches-ignore:
       - "spacy.io"
       - "nightly.spacy.io"

@@ -12,6 +10,7 @@ on:
       - "*.md"
       - "*.mdx"
       - "website/**"
+      - ".github/workflows/**"
   pull_request:
     types: [opened, synchronize, reopened, edited]
     paths-ignore:

@@ -31,7 +30,7 @@ jobs:
       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
-          python-version: "3.10"
+          python-version: "3.9"

       - name: black
         run: |

@@ -45,12 +44,11 @@ jobs:
         run: |
           python -m pip install flake8==5.0.4
           python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
-      # Unfortunately cython-lint isn't working after the shift to Cython 3.
-      #- name: cython-lint
-      #  run: |
-      #    python -m pip install cython-lint -c requirements.txt
-      #    # E501: line too log, W291: trailing whitespace, E266: too many leading '#' for block comment
-      #    cython-lint spacy --ignore E501,W291,E266
+      - name: cython-lint
+        run: |
+          python -m pip install cython-lint -c requirements.txt
+          # E501: line too log, W291: trailing whitespace, E266: too many leading '#' for block comment
+          cython-lint spacy --ignore E501,W291,E266

   tests:
     name: Test

@@ -59,7 +57,14 @@ jobs:
       fail-fast: true
       matrix:
         os: [ubuntu-latest, windows-latest, macos-latest]
-        python_version: ["3.9", "3.12", "3.13"]
+        python_version: ["3.12"]
+        include:
+          - os: ubuntu-20.04
+            python_version: "3.9"
+          - os: windows-latest
+            python_version: "3.10"
+          - os: macos-latest
+            python_version: "3.11"

     runs-on: ${{ matrix.os }}

@@ -84,7 +89,6 @@ jobs:
       - name: Run mypy
         run: |
           python -m mypy spacy
-        if: matrix.python_version != '3.7'

       - name: Delete source directory and .egg-info
         run: |

@@ -106,22 +110,22 @@ jobs:
       - name: Test import
         run: python -W error -c "import spacy"

-      - name: "Test download CLI"
-        run: |
-          python -m spacy download ca_core_news_sm
-          python -m spacy download ca_core_news_md
-          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
-
-      - name: "Test download_url in info CLI"
-        run: |
-          python -W error -m spacy info ca_core_news_sm | grep -q download_url
-        if: matrix.python_version == '3.9'
-
-      - name: "Test no warnings on load (#11713)"
-        run: |
-          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
+      # - name: "Test download CLI"
+      #   run: |
+      #     python -m spacy download ca_core_news_sm
+      #     python -m spacy download ca_core_news_md
+      #     python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+      #   if: matrix.python_version == '3.9'
+      #
+      # - name: "Test download_url in info CLI"
+      #   run: |
+      #     python -W error -m spacy info ca_core_news_sm | grep -q download_url
+      #   if: matrix.python_version == '3.9'
+      #
+      # - name: "Test no warnings on load (#11713)"
+      #   run: |
+      #     python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+      #   if: matrix.python_version == '3.9'

       - name: "Test convert CLI"
         run: |

@@ -145,19 +149,17 @@ jobs:
           python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
         if: matrix.python_version == '3.9'

-      - name: "Test assemble CLI"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-          python -m spacy assemble ner_source_sm.cfg output_dir
-        env:
-          PYTHONWARNINGS: "error,ignore::DeprecationWarning"
-        if: matrix.python_version == '3.9'
-
-      - name: "Test assemble CLI vectors warning"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-        if: matrix.python_version == '3.9'
+      # - name: "Test assemble CLI"
+      #   run: |
+      #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      #     PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+      #   if: matrix.python_version == '3.9'
+      #
+      # - name: "Test assemble CLI vectors warning"
+      #   run: |
+      #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+      #     python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+      #   if: matrix.python_version == '3.9'

       - name: "Install test requirements"
         run: |

@@ -166,10 +168,4 @@ jobs:
       - name: "Run CPU tests"
         run: |
           python -m pytest --pyargs spacy -W error
-        if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.11')"
-
-      - name: "Run CPU tests with thinc-apple-ops"
-        run: |
-          python -m pip install 'spacy[apple]'
-          python -m pytest --pyargs spacy
-        if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11'
+        if: matrix.python_version == '3.11'
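The new test matrix above mixes a cross product with include entries. Since each include pins a python_version that no base combination carries, GitHub Actions adds each one as an extra job rather than extending an existing one. A small illustrative Python sketch of that expansion (not part of the workflow):

# Sketch: how the matrix above expands to concrete jobs.
import itertools

os_list = ["ubuntu-latest", "windows-latest", "macos-latest"]
python_versions = ["3.12"]
include = [
    {"os": "ubuntu-20.04", "python_version": "3.9"},
    {"os": "windows-latest", "python_version": "3.10"},
    {"os": "macos-latest", "python_version": "3.11"},
]

# Base cross product: 3 OSes x 1 Python version = 3 jobs.
jobs = [{"os": o, "python_version": v}
        for o, v in itertools.product(os_list, python_versions)]
# Each include would overwrite python_version in any base job, so each
# becomes a new combination: 3 + 3 = 6 jobs total.
jobs.extend(include)
for job in jobs:
    print(job)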
.github/workflows/universe_validation.yml (vendored, 2 changes)

@@ -25,7 +25,7 @@ jobs:
       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
-          python-version: "3.7"
+          python-version: "3.9"

       - name: Validate website/meta/universe.json
         run: |
.pre-commit-config.yaml

@@ -1,13 +1,13 @@
 repos:
   - repo: https://github.com/ambv/black
     rev: 22.3.0
     hooks:
-      - id: black
-        language_version: python3.7
-        additional_dependencies: ['click==8.0.4']
+      - id: black
+        language_version: python3.9
+        additional_dependencies: ["click==8.0.4"]
   - repo: https://github.com/pycqa/flake8
     rev: 5.0.4
     hooks:
-      - id: flake8
-        args:
-          - "--config=setup.cfg"
+      - id: flake8
+        args:
+          - "--config=setup.cfg"
CONTRIBUTING.md

@@ -35,7 +35,7 @@ so that more people can benefit from it.

 When opening an issue, use a **descriptive title** and include your
 **environment** (operating system, Python version, spaCy version). Our
-[issue templates](https://github.com/explosion/spaCy/issues/new/choose) help you
+[issue template](https://github.com/explosion/spaCy/issues/new) helps you
 remember the most important details to include. If you've discovered a bug, you
 can also submit a [regression test](#fixing-bugs) straight away. When you're
 opening an issue to report the bug, simply refer to your pull request in the

@@ -276,7 +276,7 @@ except: # noqa: E722

 ### Python conventions

-All Python code must be written **compatible with Python 3.6+**. More detailed
+All Python code must be written **compatible with Python 3.9+**. More detailed
 code conventions can be found in the [developer docs](https://github.com/explosion/spaCy/blob/master/extra/DEVELOPER_DOCS/Code%20Conventions.md).

 #### I/O and handling paths

@@ -449,8 +449,8 @@ and plugins in spaCy v3.0, and we can't wait to see what you build with it!
   [`spacy`](https://github.com/topics/spacy?o=desc&s=stars) and
   [`spacy-extensions`](https://github.com/topics/spacy-extension?o=desc&s=stars)
   to make it easier to find. Those are also the topics we're linking to from the
-  spaCy website. If you're sharing your project on X, feel free to tag
-  [@spacy_io](https://x.com/spacy_io) so we can check it out.
+  spaCy website. If you're sharing your project on Twitter, feel free to tag
+  [@spacy_io](https://twitter.com/spacy_io) so we can check it out.

 - Once your extension is published, you can open a
   [PR](https://github.com/explosion/spaCy/pulls) to suggest it for the
MANIFEST.in

@@ -4,6 +4,5 @@ include README.md
 include pyproject.toml
 include spacy/py.typed
 recursive-include spacy/cli *.yml
-recursive-include spacy/tests *.json
 recursive-include licenses *
 recursive-exclude spacy *.cpp
Makefile (2 changes)

@@ -5,7 +5,7 @@ override SPACY_EXTRAS = spacy-lookups-data==1.0.3
 endif

 ifndef PYVER
-override PYVER = 3.8
+override PYVER = 3.9
 endif

 VENV := ./env$(PYVER)
README.md (50 changes)

@@ -16,7 +16,7 @@ model packaging, deployment and workflow management. spaCy is commercial
 open-source software, released under the
 [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).

-💫 **Version 3.8 out now!**
+💫 **Version 3.7 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)

 [](https://github.com/explosion/spaCy/actions/workflows/tests.yml)

@@ -28,29 +28,29 @@ open-source software, released under the
 <br />
 [](https://pypi.org/project/spacy/)
 [](https://anaconda.org/conda-forge/spacy)
 [](https://twitter.com/spacy_io)

 ## 📖 Documentation

-| Documentation | |
-| ------------- | ------------- |
-| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
-| 📚 **[Usage Guides]** | How to use spaCy and its features. |
-| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
-| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
-| 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
-| ⏩ **[GPU Processing]** | Use spaCy with CUDA-compatible GPU processing. |
-| 📦 **[Models]** | Download trained pipelines for spaCy. |
-| 🦙 **[Large Language Models]** | Integrate LLMs into spaCy pipelines. |
-| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
-| ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
-| 👩🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
-| 📰 **[Blog]** | Read about current spaCy and Prodigy development, releases, talks and more from Explosion. |
-| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
-| 🔴 **[Live Stream]** | Join Matt as he works on spaCy and chat about NLP, live every week. |
-| 🛠 **[Changelog]** | Changes and version history. |
-| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
-| 👕 **[Swag]** | Support us and our work with unique, custom-designed swag! |
-| <a href="https://explosion.ai/tailored-solutions"><img src="https://github.com/explosion/spaCy/assets/13643239/36d2a42e-98c0-4599-90e1-788ef75181be" width="150" alt="Tailored Solutions"/></a> | Custom NLP consulting, implementation and strategic advice by spaCy’s core development team. Streamlined, production-ready, predictable and maintainable. Send us an email or take our 5-minute questionnaire, and well'be in touch! **[Learn more →](https://explosion.ai/tailored-solutions)** |
+| Documentation | |
+| ------------- | ------------- |
+| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
+| 📚 **[Usage Guides]** | How to use spaCy and its features. |
+| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
+| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
+| 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
+| ⏩ **[GPU Processing]** | Use spaCy with CUDA-compatible GPU processing. |
+| 📦 **[Models]** | Download trained pipelines for spaCy. |
+| 🦙 **[Large Language Models]** | Integrate LLMs into spaCy pipelines. |
+| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
+| ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
+| 👩🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
+| 📰 **[Blog]** | Read about current spaCy and Prodigy development, releases, talks and more from Explosion. |
+| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
+| 🛠 **[Changelog]** | Changes and version history. |
+| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
+| 👕 **[Swag]** | Support us and our work with unique, custom-designed swag! |
+| <a href="https://explosion.ai/tailored-solutions"><img src="https://github.com/explosion/spaCy/assets/13643239/36d2a42e-98c0-4599-90e1-788ef75181be" width="150" alt="Tailored Solutions"/></a> | Custom NLP consulting, implementation and strategic advice by spaCy’s core development team. Streamlined, production-ready, predictable and maintainable. Send us an email or take our 5-minute questionnaire, and well'be in touch! **[Learn more →](https://explosion.ai/tailored-solutions)** |

 [spacy 101]: https://spacy.io/usage/spacy-101
 [new in v3.0]: https://spacy.io/usage/v3

@@ -62,7 +62,6 @@ open-source software, released under the
 [universe]: https://spacy.io/universe
 [spacy vs code extension]: https://github.com/explosion/spacy-vscode
 [videos]: https://www.youtube.com/c/ExplosionAI
-[live stream]: https://www.youtube.com/playlist?list=PLBmcuObd5An5_iAxNYLJa_xWmNzsYce8c
 [online course]: https://course.spacy.io
 [blog]: https://explosion.ai
 [project templates]: https://github.com/explosion/projects

@@ -80,14 +79,13 @@ more people can benefit from it.
 | Type | Platforms |
 | ------------------------------- | --------------------------------------- |
 | 🚨 **Bug Reports** | [GitHub Issue Tracker] |
-| 🎁 **Feature Requests & Ideas** | [GitHub Discussions] · [Live Stream] |
+| 🎁 **Feature Requests & Ideas** | [GitHub Discussions] |
 | 👩💻 **Usage Questions** | [GitHub Discussions] · [Stack Overflow] |
-| 🗯 **General Discussion** | [GitHub Discussions] · [Live Stream] |
+| 🗯 **General Discussion** | [GitHub Discussions] |

 [github issue tracker]: https://github.com/explosion/spaCy/issues
 [github discussions]: https://github.com/explosion/spaCy/discussions
 [stack overflow]: https://stackoverflow.com/questions/tagged/spacy
-[live stream]: https://www.youtube.com/playlist?list=PLBmcuObd5An5_iAxNYLJa_xWmNzsYce8c

 ## Features

@@ -117,7 +115,7 @@ For detailed installation instructions, see the

 - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
   Studio)
-- **Python version**: Python >=3.7, <3.13 (only 64 bit)
+- **Python version**: Python 3.9+ (only 64 bit)
 - **Package managers**: [pip] · [conda] (via `conda-forge`)

 [pip]: https://pypi.org/project/spacy/
(deleted bash release script; file name not shown in this view)

@@ -1,20 +0,0 @@
-#!/usr/bin/env bash
-
-set -e
-
-# Insist repository is clean
-git diff-index --quiet HEAD
-
-version=$(grep "__version__ = " spacy/about.py)
-version=${version/__version__ = }
-version=${version/\'/}
-version=${version/\'/}
-version=${version/\"/}
-version=${version/\"/}
-
-echo "Pushing release-v"$version
-
-git tag -d release-v$version || true
-git push origin :release-v$version || true
-git tag release-v$version
-git push origin release-v$version
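The chained shell substitutions above strip the "__version__ = " prefix and the surrounding quotes from the matching line of spacy/about.py. A minimal Python sketch of the same extraction, shown only to make the bash readable (the helper name is hypothetical):

# Sketch: equivalent of the grep + ${var/.../} substitutions above.
import re
from pathlib import Path

def read_version(about_path: str = "spacy/about.py") -> str:
    text = Path(about_path).read_text()
    match = re.search(r"^__version__ = ['\"]([^'\"]+)['\"]", text, re.MULTILINE)
    if match is None:
        raise ValueError(f"No __version__ found in {about_path}")
    return match.group(1)

# e.g. read_version() -> "4.0.0.dev3", so the script would push the
# tag "release-v4.0.0.dev3".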
build-constraints.txt

@@ -1,2 +1,2 @@
 # build version constraints for use with wheelwright
-numpy>=2.0.0,<3.0.0
+numpy>=1.25.0; python_version>='3.9'
(developer docs overview of Explosion repos; file name not shown in this view)

@@ -31,7 +31,6 @@ These are repos that can be used by spaCy but aren't part of a default insta
 - [spacy-stanza](https://github.com/explosion/spacy-stanza): This is a wrapper that allows the use of Stanford's Stanza library in spaCy.
 - [spacy-streamlit](https://github.com/explosion/spacy-streamlit): A wrapper for the Streamlit dashboard building library to help with integrating [displaCy](https://spacy.io/api/top-level/#displacy).
 - [spacymoji](https://github.com/explosion/spacymoji): A library to add extra support for emoji to spaCy, such as including character names.
 - [thinc-apple-ops](https://github.com/explosion/thinc-apple-ops): A special backend for OSX that uses Apple's native libraries for improved performance.
 - [os-signpost](https://github.com/explosion/os-signpost): A Python package that allows you to use the `OSSignposter` API in OSX for performance analysis.
-- [spacy-ray](https://github.com/explosion/spacy-ray): A wrapper to integrate spaCy with Ray, a distributed training framework. Currently a work in progress.

@@ -79,4 +78,3 @@ Repos that don't fit in any of the above categories.
 - [tokenizations](https://github.com/explosion/tokenizations): A library originally by Yohei Tamura to align strings with tolerance to some variations in features like case and diacritics, used for aligning tokens and wordpieces. Adopted and maintained by Explosion, but usually spacy-alignments is used instead.
 - [conll-2012](https://github.com/explosion/conll-2012): A repo to hold some slightly cleaned up versions of the official scripts for the CoNLL 2012 shared task involving coreference resolution. Used in the coref project.
-- [fastapi-explosion-extras](https://github.com/explosion/fastapi-explosion-extras): Some small tweaks to FastAPI used at Explosion.
pyproject.toml

@@ -1,67 +1,15 @@
 [build-system]
 requires = [
     "setuptools",
-    "cython>=3.0,<4.0",
+    "cython>=0.25,<3.0",
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.3.4,<8.4.0",
-    "numpy>=2.0.0,<3.0.0"
+    "thinc>=9.0.0,<9.1.0",
+    "numpy>=1.15.0; python_version < '3.9'",
+    "numpy>=1.25.0; python_version >= '3.9'",
 ]
 build-backend = "setuptools.build_meta"

-[tool.cibuildwheel]
-build = "*"
-skip = "pp* cp36* cp37* cp38* *-win32 *i686*"
-test-skip = ""
-free-threaded-support = false
-
-archs = ["native"]
-
-build-frontend = "default"
-config-settings = {}
-dependency-versions = "pinned"
-environment = { PIP_CONSTRAINT = "build-constraints.txt" }
-
-environment-pass = []
-build-verbosity = 0
-
-before-all = "curl https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal --default-toolchain stable"
-before-build = "pip install -r requirements.txt && python setup.py clean"
-repair-wheel-command = ""
-
-test-command = ""
-before-test = ""
-test-requires = []
-test-extras = []
-
-container-engine = "docker"
-
-manylinux-x86_64-image = "manylinux2014"
-manylinux-i686-image = "manylinux2014"
-manylinux-aarch64-image = "manylinux2014"
-manylinux-ppc64le-image = "manylinux2014"
-manylinux-s390x-image = "manylinux2014"
-manylinux-pypy_x86_64-image = "manylinux2014"
-manylinux-pypy_i686-image = "manylinux2014"
-manylinux-pypy_aarch64-image = "manylinux2014"
-
-musllinux-x86_64-image = "musllinux_1_2"
-musllinux-i686-image = "musllinux_1_2"
-musllinux-aarch64-image = "musllinux_1_2"
-musllinux-ppc64le-image = "musllinux_1_2"
-musllinux-s390x-image = "musllinux_1_2"
-
-[tool.cibuildwheel.linux]
-repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}"
-
-[tool.cibuildwheel.macos]
-repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}"
-
-[tool.cibuildwheel.windows]
-
-[tool.cibuildwheel.pyodide]
-

 [tool.isort]
 profile = "black"
requirements.txt

@@ -1,34 +1,36 @@
 # Our libraries
-spacy-legacy>=3.0.11,<3.1.0
+spacy-legacy>=4.0.0.dev1,<4.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.3.4,<8.4.0
+thinc>=9.0.0,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
-typer-slim>=0.3.0,<1.0.0
+typer>=0.3.0,<1.0.0
 weasel>=0.1.0,<0.5.0
 # Third party dependencies
-numpy>=2.0.0,<3.0.0
+numpy>=1.15.0; python_version < "3.9"
+numpy>=1.19.0; python_version >= "3.9"
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
 pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
 jinja2
 langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
 # Development dependencies
 pre-commit>=2.13.0
-cython>=3.0,<4.0
+cython>=0.25,<3.0
 pytest>=5.2.0,!=7.1.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8"
+mypy>=1.5.0,<1.6.0; platform_machine != "aarch64"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests
setup.cfg (25 changes)

@@ -21,7 +21,6 @@ classifiers =
     Programming Language :: Python :: 3.10
     Programming Language :: Python :: 3.11
     Programming Language :: Python :: 3.12
-    Programming Language :: Python :: 3.13
     Topic :: Scientific/Engineering
 project_urls =
     Release notes = https://github.com/explosion/spaCy/releases

@@ -30,32 +29,21 @@ project_urls =
 [options]
 zip_safe = false
 include_package_data = true
-python_requires = >=3.9,<3.14
-# NOTE: This section is superseded by pyproject.toml and will be removed in
-# spaCy v4
-setup_requires =
-    cython>=3.0,<4.0
-    numpy>=2.0.0,<3.0.0; python_version < "3.9"
-    numpy>=2.0.0,<3.0.0; python_version >= "3.9"
-    # We also need our Cython packages here to compile against
-    cymem>=2.0.2,<2.1.0
-    preshed>=3.0.2,<3.1.0
-    murmurhash>=0.28.0,<1.1.0
-    thinc>=8.3.4,<8.4.0
+python_requires = >=3.9
 install_requires =
     # Our libraries
-    spacy-legacy>=3.0.11,<3.1.0
+    spacy-legacy>=4.0.0.dev1,<4.1.0
     spacy-loggers>=1.0.0,<2.0.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.3.4,<8.4.0
+    thinc>=9.0.0,<9.1.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     weasel>=0.1.0,<0.5.0
     # Third-party dependencies
-    typer-slim>=0.3.0,<1.0.0
+    typer>=0.3.0,<1.0.0
     tqdm>=4.38.0,<5.0.0
+    numpy>=1.15.0; python_version < "3.9"
+    numpy>=1.19.0; python_version >= "3.9"

@@ -65,6 +53,7 @@ install_requires =
     # Official Python utilities
     setuptools
     packaging>=20.0
+    langcodes>=3.2.0,<4.0.0

 [options.entry_points]
 console_scripts =

@@ -113,14 +102,12 @@ cuda12x =
     cupy-cuda12x>=11.5.0,<13.0.0
 cuda-autodetect =
     cupy-wheel>=11.0.0,<13.0.0
-apple =
-    thinc-apple-ops>=1.0.0,<2.0.0
 # Language tokenizers with external dependencies
 ja =
     sudachipy>=0.5.2,!=0.6.1
     sudachidict_core>=20211220
 ko =
     natto-py>=0.9.0
     mecab-ko>=1.0.0
 th =
     pythainlp>=2.0
setup.py (5 changes)

@@ -37,7 +37,6 @@ MOD_NAMES = [
     "spacy.pipeline.dep_parser",
     "spacy.pipeline._edit_tree_internals.edit_trees",
     "spacy.pipeline.morphologizer",
-    "spacy.pipeline.multitask",
     "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
     "spacy.pipeline.trainable_pipe",

@@ -48,6 +47,7 @@ MOD_NAMES = [
     "spacy.pipeline._parser_internals.arc_eager",
     "spacy.pipeline._parser_internals.ner",
     "spacy.pipeline._parser_internals.nonproj",
+    "spacy.pipeline._parser_internals.search",
     "spacy.pipeline._parser_internals._state",
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",

@@ -61,12 +61,13 @@ MOD_NAMES = [
     "spacy.tokens.span_group",
     "spacy.tokens.graph",
     "spacy.tokens.morphanalysis",
-    "spacy.tokens._retokenize",
+    "spacy.tokens.retokenizer",
     "spacy.matcher.matcher",
     "spacy.matcher.phrasematcher",
     "spacy.matcher.dependencymatcher",
     "spacy.symbols",
     "spacy.vectors",
+    "spacy.tests.parser._search",
 ]
 COMPILE_OPTIONS = {
     "msvc": ["/Ox", "/EHsc"],
spacy/__init__.py

@@ -17,7 +17,6 @@ from .cli.info import info  # noqa: F401
 from .errors import Errors
 from .glossary import explain  # noqa: F401
 from .language import Language
-from .registrations import REGISTRY_POPULATED, populate_registry
 from .util import logger, registry  # noqa: F401
 from .vocab import Vocab
spacy/about.py

@@ -1,5 +1,9 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.8.7"
+__version__ = "4.0.0.dev3"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
+__projects__ = "https://github.com/explosion/projects"
+__projects_branch__ = "v3"
+__lookups_tag__ = "v1.0.3"
+__lookups_url__ = f"https://raw.githubusercontent.com/explosion/spacy-lookups-data/{__lookups_tag__}/spacy_lookups_data/data/"
spacy/attrs.pxd (129 changes)

@@ -1,99 +1,50 @@
 # Reserve 64 values for flag features
+from . cimport symbols


 cdef enum attr_id_t:
-    NULL_ATTR
-    IS_ALPHA
-    IS_ASCII
-    IS_DIGIT
-    IS_LOWER
-    IS_PUNCT
-    IS_SPACE
-    IS_TITLE
-    IS_UPPER
-    LIKE_URL
-    LIKE_NUM
-    LIKE_EMAIL
-    IS_STOP
-    IS_OOV_DEPRECATED
-    IS_BRACKET
-    IS_QUOTE
-    IS_LEFT_PUNCT
-    IS_RIGHT_PUNCT
-    IS_CURRENCY
+    NULL_ATTR = 0
+    IS_ALPHA = symbols.IS_ALPHA
+    IS_ASCII = symbols.IS_ASCII
+    IS_DIGIT = symbols.IS_DIGIT
+    IS_LOWER = symbols.IS_LOWER
+    IS_PUNCT = symbols.IS_PUNCT
+    IS_SPACE = symbols.IS_SPACE
+    IS_TITLE = symbols.IS_TITLE
+    IS_UPPER = symbols.IS_UPPER
+    LIKE_URL = symbols.LIKE_URL
+    LIKE_NUM = symbols.LIKE_NUM
+    LIKE_EMAIL = symbols.LIKE_EMAIL
+    IS_STOP = symbols.IS_STOP
+    IS_BRACKET = symbols.IS_BRACKET
+    IS_QUOTE = symbols.IS_QUOTE
+    IS_LEFT_PUNCT = symbols.IS_LEFT_PUNCT
+    IS_RIGHT_PUNCT = symbols.IS_RIGHT_PUNCT
+    IS_CURRENCY = symbols.IS_CURRENCY

-    FLAG19 = 19
-    FLAG20
-    FLAG21
-    FLAG22
-    FLAG23
-    FLAG24
-    FLAG25
-    FLAG26
-    FLAG27
-    FLAG28
-    FLAG29
-    FLAG30
-    FLAG31
-    FLAG32
-    FLAG33
-    FLAG34
-    FLAG35
-    FLAG36
-    FLAG37
-    FLAG38
-    FLAG39
-    FLAG40
-    FLAG41
-    FLAG42
-    FLAG43
-    FLAG44
-    FLAG45
-    FLAG46
-    FLAG47
-    FLAG48
-    FLAG49
-    FLAG50
-    FLAG51
-    FLAG52
-    FLAG53
-    FLAG54
-    FLAG55
-    FLAG56
-    FLAG57
-    FLAG58
-    FLAG59
-    FLAG60
-    FLAG61
-    FLAG62
-    FLAG63
-
-    ID
-    ORTH
-    LOWER
-    NORM
-    SHAPE
-    PREFIX
-    SUFFIX
+    ID = symbols.ID
+    ORTH = symbols.ORTH
+    LOWER = symbols.LOWER
+    NORM = symbols.NORM
+    SHAPE = symbols.SHAPE
+    PREFIX = symbols.PREFIX
+    SUFFIX = symbols.SUFFIX

-    LENGTH
-    CLUSTER
-    LEMMA
-    POS
-    TAG
-    DEP
-    ENT_IOB
-    ENT_TYPE
-    HEAD
-    SENT_START
-    SPACY
-    PROB
+    LENGTH = symbols.LENGTH
+    CLUSTER = symbols.CLUSTER
+    LEMMA = symbols.LEMMA
+    POS = symbols.POS
+    TAG = symbols.TAG
+    DEP = symbols.DEP
+    ENT_IOB = symbols.ENT_IOB
+    ENT_TYPE = symbols.ENT_TYPE
+    HEAD = symbols.HEAD
+    SENT_START = symbols.SENT_START
+    SPACY = symbols.SPACY
+    PROB = symbols.PROB

-    LANG
-    MORPH
+    LANG = symbols.LANG
+    ENT_KB_ID = symbols.ENT_KB_ID
+    MORPH = symbols.MORPH
+    ENT_ID = symbols.ENT_ID

-    IDX
-    SENT_END
+    IDX = symbols.IDX
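The effect of this change is that the attribute IDs in spacy.attrs are no longer an independent enum but are defined in terms of spacy.symbols, so the two modules agree on the integer behind each name by construction. A tiny illustrative check — a sketch assuming the v4 branch above, not a documented API guarantee:

# Sketch: on the v4 branch, each attrs ID reuses the matching symbols ID.
from spacy import attrs, symbols

assert attrs.IS_ALPHA == symbols.IS_ALPHA
assert attrs.ORTH == symbols.ORTH
assert attrs.LEMMA == symbols.LEMMA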
spacy/attrs.pyx (120 changes)

@@ -17,57 +17,11 @@ IDS = {
     "LIKE_NUM": LIKE_NUM,
     "LIKE_EMAIL": LIKE_EMAIL,
     "IS_STOP": IS_STOP,
-    "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
     "IS_BRACKET": IS_BRACKET,
     "IS_QUOTE": IS_QUOTE,
     "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
     "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
     "IS_CURRENCY": IS_CURRENCY,
-    "FLAG19": FLAG19,
-    "FLAG20": FLAG20,
-    "FLAG21": FLAG21,
-    "FLAG22": FLAG22,
-    "FLAG23": FLAG23,
-    "FLAG24": FLAG24,
-    "FLAG25": FLAG25,
-    "FLAG26": FLAG26,
-    "FLAG27": FLAG27,
-    "FLAG28": FLAG28,
-    "FLAG29": FLAG29,
-    "FLAG30": FLAG30,
-    "FLAG31": FLAG31,
-    "FLAG32": FLAG32,
-    "FLAG33": FLAG33,
-    "FLAG34": FLAG34,
-    "FLAG35": FLAG35,
-    "FLAG36": FLAG36,
-    "FLAG37": FLAG37,
-    "FLAG38": FLAG38,
-    "FLAG39": FLAG39,
-    "FLAG40": FLAG40,
-    "FLAG41": FLAG41,
-    "FLAG42": FLAG42,
-    "FLAG43": FLAG43,
-    "FLAG44": FLAG44,
-    "FLAG45": FLAG45,
-    "FLAG46": FLAG46,
-    "FLAG47": FLAG47,
-    "FLAG48": FLAG48,
-    "FLAG49": FLAG49,
-    "FLAG50": FLAG50,
-    "FLAG51": FLAG51,
-    "FLAG52": FLAG52,
-    "FLAG53": FLAG53,
-    "FLAG54": FLAG54,
-    "FLAG55": FLAG55,
-    "FLAG56": FLAG56,
-    "FLAG57": FLAG57,
-    "FLAG58": FLAG58,
-    "FLAG59": FLAG59,
-    "FLAG60": FLAG60,
-    "FLAG61": FLAG61,
-    "FLAG62": FLAG62,
-    "FLAG63": FLAG63,
     "ID": ID,
     "ORTH": ORTH,
     "LOWER": LOWER,

@@ -93,12 +47,11 @@ IDS = {
 }


-# ATTR IDs, in order of the symbol
-NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
+NAMES = {v: k for k, v in IDS.items()}
 locals().update(IDS)


-def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
+def intify_attrs(stringy_attrs, strings_map=None):
     """
     Normalize a dictionary of attributes, converting them to ints.

@@ -110,75 +63,6 @@ def intify_attrs(stringy_attrs, strings_map=None):
     converted to ints.
     """
     inty_attrs = {}
-    if _do_deprecated:
-        if "F" in stringy_attrs:
-            stringy_attrs["ORTH"] = stringy_attrs.pop("F")
-        if "L" in stringy_attrs:
-            stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
-        if "pos" in stringy_attrs:
-            stringy_attrs["TAG"] = stringy_attrs.pop("pos")
-        if "morph" in stringy_attrs:
-            morphs = stringy_attrs.pop("morph")  # no-cython-lint
-        if "number" in stringy_attrs:
-            stringy_attrs.pop("number")
-        if "tenspect" in stringy_attrs:
-            stringy_attrs.pop("tenspect")
-        morph_keys = [
-            "PunctType",
-            "PunctSide",
-            "Other",
-            "Degree",
-            "AdvType",
-            "Number",
-            "VerbForm",
-            "PronType",
-            "Aspect",
-            "Tense",
-            "PartType",
-            "Poss",
-            "Hyph",
-            "ConjType",
-            "NumType",
-            "Foreign",
-            "VerbType",
-            "NounType",
-            "Gender",
-            "Mood",
-            "Negative",
-            "Tense",
-            "Voice",
-            "Abbr",
-            "Derivation",
-            "Echo",
-            "Foreign",
-            "NameType",
-            "NounType",
-            "NumForm",
-            "NumValue",
-            "PartType",
-            "Polite",
-            "StyleVariant",
-            "PronType",
-            "AdjType",
-            "Person",
-            "Variant",
-            "AdpType",
-            "Reflex",
-            "Negative",
-            "Mood",
-            "Aspect",
-            "Case",
-            "Polarity",
-            "PrepCase",
-            "Animacy",  # U20
-        ]
-        for key in morph_keys:
-            if key in stringy_attrs:
-                stringy_attrs.pop(key)
-            elif key.lower() in stringy_attrs:
-                stringy_attrs.pop(key.lower())
-            elif key.upper() in stringy_attrs:
-                stringy_attrs.pop(key.upper())
     for name, value in stringy_attrs.items():
         int_key = intify_attr(name)
         if int_key is not None:
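Two API shifts are visible in this file: NAMES used to be a list indexed by position (which only works while IDs form a dense 0..n range), and is now a dict keyed by attribute ID; and intify_attrs loses its _do_deprecated path. A short usage sketch, assuming the v4 branch above:

# Sketch of the attrs.pyx changes above (v4 branch assumed).
from spacy.attrs import IDS, NAMES, intify_attrs

orth_id = IDS["ORTH"]
assert NAMES[orth_id] == "ORTH"  # dict lookup by ID, not list indexing

# intify_attrs no longer accepts _do_deprecated; it simply maps the
# string attribute names to their integer IDs.
assert intify_attrs({"ORTH": "hello"}) == {orth_id: "hello"}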
spacy/cli/__init__.py

@@ -14,6 +14,7 @@ from .debug_config import debug_config  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
 from .debug_diff import debug_diff  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
+from .distill import distill  # noqa: F401
 from .download import download  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .find_function import find_function  # noqa: F401
spacy/cli/_util.py

@@ -11,6 +11,7 @@ from typing import (
     Dict,
     Iterable,
     List,
+    Literal,
     Optional,
     Tuple,
     Union,

@@ -28,7 +29,7 @@ from wasabi import Printer, msg
 from weasel import app as project_cli

 from .. import about
-from ..compat import Literal
+from ..errors import RENAMED_LANGUAGE_CODES
 from ..schemas import validate
 from ..util import (
     ENV_VARS,

@@ -148,6 +149,16 @@ def _parse_override(value: Any) -> Any:
         return str(value)


+def _handle_renamed_language_codes(lang: Optional[str]) -> None:
+    # Throw error for renamed language codes in v4
+    if lang in RENAMED_LANGUAGE_CODES:
+        msg.fail(
+            title="Renamed language code",
+            text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in spaCy v4. Update the language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.",
+            exits=1,
+        )
+
+
 @contextmanager
 def show_validation_error(
     file_path: Optional[Union[str, Path]] = None,

@@ -192,6 +203,13 @@ def show_validation_error(
     msg.fail("Config validation error", e, exits=1)


+def import_code_paths(code_paths: str) -> None:
+    """Helper to import comma-separated list of code paths."""
+    code_paths = [Path(p.strip()) for p in string_to_list(code_paths)]
+    for code_path in code_paths:
+        import_code(code_path)
+
+
 def import_code(code_path: Optional[Union[Path, str]]) -> None:
     """Helper to import Python file provided in training commands / commands
     using the config. This makes custom registered functions available.
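Illustrative usage of the two helpers added above. The file names here are hypothetical, and the exact contents of RENAMED_LANGUAGE_CODES (which lives in spacy.errors on this branch) are assumed, not quoted:

# Sketch: how the new _util helpers are meant to be used.
from spacy.cli._util import _handle_renamed_language_codes, import_code_paths

# No-op for codes that were not renamed:
_handle_renamed_language_codes("en")

# Imports each file from a comma-separated --code value, in order:
import_code_paths("my_components.py,my_loggers.py")  # hypothetical files

# Assuming RENAMED_LANGUAGE_CODES maps e.g. "xx" -> "mul", this would print
# a "Renamed language code" error and exit with status 1 (msg.fail(exits=1)):
_handle_renamed_language_codes("xx")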
spacy/cli/assemble.py

@@ -11,7 +11,7 @@ from ._util import (
     Arg,
     Opt,
     app,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     show_validation_error,
 )

@@ -26,7 +26,7 @@ def assemble_cli(
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
     output_path: Path = Arg(..., help="Output directory to store assembled pipeline in"),
-    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     # fmt: on
 ):

@@ -46,7 +46,7 @@ def assemble_cli(
     if not config_path or (str(config_path) != "-" and not config_path.exists()):
         msg.fail("Config file not found", config_path, exits=1)
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     with show_validation_error(config_path):
         config = util.load_config(config_path, overrides=overrides, interpolate=False)
     msg.divider("Initializing pipeline")
spacy/cli/convert.py

@@ -16,7 +16,7 @@ from ..training.converters import (
     iob_to_docs,
     json_to_docs,
 )
-from ._util import Arg, Opt, app, walk_directory
+from ._util import Arg, Opt, _handle_renamed_language_codes, app, walk_directory

 # Converters are matched by file extension except for ner/iob, which are
 # matched by file extension and content. To add a converter, add a new

@@ -116,6 +116,10 @@ def convert(
     input_path = Path(input_path)
     if not msg:
         msg = Printer(no_print=silent)
+
+    # Throw error for renamed language codes in v4
+    _handle_renamed_language_codes(lang)
+
     ner_map = srsly.read_json(ner_map) if ner_map is not None else None
     doc_files = []
     for input_loc in walk_directory(input_path, converter):
spacy/cli/debug_config.py

@@ -13,7 +13,7 @@ from ._util import (
     Arg,
     Opt,
     debug_cli,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     show_validation_error,
 )

@@ -27,7 +27,7 @@ def debug_config_cli(
     # fmt: off
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
     show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
     # fmt: on

@@ -44,7 +44,7 @@ def debug_config_cli(
     DOCS: https://spacy.io/api/cli#debug-config
     """
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     debug_config(
         config_path, overrides=overrides, show_funcs=show_funcs, show_vars=show_vars
     )
spacy/cli/debug_data.py

@@ -7,6 +7,7 @@ from typing import (
     Dict,
     Iterable,
     List,
+    Literal,
     Optional,
     Sequence,
     Set,

@@ -22,7 +23,6 @@ import typer
 from wasabi import MESSAGES, Printer, msg

 from .. import util
-from ..compat import Literal
 from ..language import Language
 from ..morphology import Morphology
 from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe

@@ -40,7 +40,7 @@ from ._util import (
     _format_number,
     app,
     debug_cli,
-    import_code,
+    import_code_paths,
     parse_config_overrides,
     show_validation_error,
 )

@@ -72,7 +72,7 @@ def debug_data_cli(
     # fmt: off
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
     ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
     verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
     no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"),

@@ -92,7 +92,7 @@ def debug_data_cli(
         "--help for an overview of the other available debugging commands."
     )
     overrides = parse_config_overrides(ctx.args)
-    import_code(code_path)
+    import_code_paths(code_path)
     debug_data(
         config_path,
         config_overrides=overrides,

@@ -1073,8 +1073,7 @@ def _get_distribution(docs, normalize: bool = True) -> Counter:
     word_counts: Counter = Counter()
     for doc in docs:
         for token in doc:
-            # Normalize the text
-            t = token.text.lower().replace("``", '"').replace("''", '"')
+            t = token.text.lower()
             word_counts[t] += 1
     if normalize:
         total = sum(word_counts.values(), 0.0)
spacy/cli/debug_model.py

@@ -170,7 +170,7 @@ def debug_model(
     msg.divider(f"STEP 3 - prediction")
     msg.info(str(prediction))

-    msg.good(f"Successfully ended analysis - model looks good.")
+    msg.good(f"Succesfully ended analysis - model looks good.")


 def _sentences():
spacy/cli/distill.py (new file, 98 lines)

@@ -0,0 +1,98 @@
+import logging
+import sys
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+import typer
+from wasabi import msg
+
+from .. import util
+from ..pipeline.trainable_pipe import TrainablePipe
+from ..schemas import ConfigSchemaDistill
+from ..training.initialize import init_nlp_student
+from ..training.loop import distill as distill_nlp
+from ._util import (
+    Arg,
+    Opt,
+    app,
+    import_code_paths,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)
+
+
+@app.command(
+    "distill",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def distill_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    teacher_model: str = Arg(..., help="Teacher model name or path"),
+    student_config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
+    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
+    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
+    # fmt: on
+):
+    """
+    Distill a spaCy pipeline from a teacher model.
+
+    DOCS: https://spacy.io/api/cli#distill
+    """
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    overrides = parse_config_overrides(ctx.args)
+    import_code_paths(code_path)
+    distill(
+        teacher_model,
+        student_config_path,
+        output_path,
+        use_gpu=use_gpu,
+        overrides=overrides,
+    )
+
+
+def distill(
+    teacher_model: Union[str, Path],
+    student_config_path: Union[str, Path],
+    output_path: Optional[Union[str, Path]] = None,
+    *,
+    use_gpu: int = -1,
+    overrides: Dict[str, Any] = util.SimpleFrozenDict(),
+):
+    student_config_path = util.ensure_path(student_config_path)
+    output_path = util.ensure_path(output_path)
+    # Make sure all files and paths exist if they are needed
+    if not student_config_path or (
+        str(student_config_path) != "-" and not student_config_path.exists()
+    ):
+        msg.fail("Student config file not found", student_config_path, exits=1)
+    if not output_path:
+        msg.info("No output directory provided")
+    else:
+        if not output_path.exists():
+            output_path.mkdir(parents=True)
+            msg.good(f"Created output directory: {output_path}")
+        msg.info(f"Saving to output directory: {output_path}")
+    setup_gpu(use_gpu)
+    teacher = util.load_model(teacher_model)
+    with show_validation_error(student_config_path):
+        config = util.load_config(
+            student_config_path, overrides=overrides, interpolate=False
+        )
+    msg.divider("Initializing student pipeline")
+    with show_validation_error(student_config_path, hint_fill=False):
+        student = init_nlp_student(config, teacher, use_gpu=use_gpu)
+
+    msg.good("Initialized student pipeline")
+    msg.divider("Distilling student pipeline from teacher")
+    distill_nlp(
+        teacher,
+        student,
+        output_path,
+        use_gpu=use_gpu,
+        stdout=sys.stdout,
+        stderr=sys.stderr,
+    )
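A hypothetical invocation of the new entry point above — the model name, config, and data paths are made up for illustration, and the student config is assumed to contain a [distillation] block:

# Sketch: calling the new distill command from Python.
from spacy.cli.distill import distill

distill(
    "en_core_web_lg",          # teacher: installed pipeline name or path
    "student_config.cfg",      # student config (hypothetical file)
    output_path="./distilled",
    use_gpu=-1,                # CPU
    overrides={"paths.train": "train.spacy", "paths.dev": "dev.spacy"},
)

# Equivalent CLI call:
#   python -m spacy distill en_core_web_lg student_config.cfg --output ./distilled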
spacy/cli/download.py

@@ -7,9 +7,10 @@ import typer
 from wasabi import msg

 from .. import about
-from ..errors import OLD_MODEL_SHORTCUTS
 from ..util import (
+    get_installed_models,
     get_minor_version,
+    get_package_version,
     is_in_interactive,
     is_in_jupyter,
     is_package,

@@ -76,15 +77,17 @@ def download(
         version = components[-1]
     else:
         model_name = model
-        if model in OLD_MODEL_SHORTCUTS:
-            msg.warn(
-                f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please "
-                f"use the full pipeline package name '{OLD_MODEL_SHORTCUTS[model]}' instead."
-            )
-            model_name = OLD_MODEL_SHORTCUTS[model]
         compatibility = get_compatibility()
         version = get_version(model_name, compatibility)

+    # If we already have this version installed, skip downloading
+    installed = get_installed_models()
+    if model_name in installed:
+        installed_version = get_package_version(model_name)
+        if installed_version == version:
+            msg.warn(f"{model_name} v{version} already installed, skipping")
+            return
+
     filename = get_model_filename(model_name, version, sdist)

     download_model(filename, pip_args)
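The added early return above avoids re-downloading a pipeline that is already present at the requested version. Pulled out as a plain function for illustration (a sketch, not the actual code path):

# Sketch of the skip-if-installed check added to download().
from spacy.util import get_installed_models, get_package_version

def already_installed(model_name: str, wanted_version: str) -> bool:
    # get_installed_models() lists installed spaCy pipeline packages;
    # get_package_version() reads the installed package's version.
    if model_name in get_installed_models():
        return get_package_version(model_name) == wanted_version
    return False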
@ -10,7 +10,7 @@ from .. import displacy, util
|
|||
from ..scorer import Scorer
|
||||
from ..tokens import Doc
|
||||
from ..training import Corpus
|
||||
from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu
|
||||
from ._util import Arg, Opt, app, benchmark_cli, import_code_paths, setup_gpu
|
||||
|
||||
|
||||
@benchmark_cli.command(
|
||||
|
@@ -22,7 +22,7 @@ def evaluate_cli(
    model: str = Arg(..., help="Model name or path"),
    data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
    output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
    displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
@@ -43,7 +43,7 @@ def evaluate_cli(

    DOCS: https://spacy.io/api/cli#benchmark-accuracy
    """
    import_code(code_path)
    import_code_paths(code_path)
    evaluate(
        model,
        data_path,
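The `import_code` to `import_code_paths` rename recurs across the CLI hunks in this diff (evaluate, pretrain, train): a single `--code` path becomes a comma-separated string of paths. Below is a minimal sketch of what such a comma-separated importer could look like; the splitting and error handling are assumptions for illustration, not spaCy's actual helper.

    # Hedged sketch only; spaCy's real import_code_paths may differ.
    import importlib.util
    import sys
    from pathlib import Path

    def import_code_paths(code_paths: str) -> None:
        # Split the comma-separated string and import each file as a module,
        # so that any functions it registers become available.
        for code_path in [p.strip() for p in code_paths.split(",") if p.strip()]:
            path = Path(code_path)
            if not path.exists():
                raise FileNotFoundError(f"Code file not found: {path}")
            spec = importlib.util.spec_from_file_location(path.stem, str(path))
            module = importlib.util.module_from_spec(spec)
            sys.modules[path.stem] = module
            spec.loader.exec_module(module)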
@@ -1,3 +1,4 @@
import importlib.metadata
import json
import platform
from pathlib import Path

@@ -7,7 +8,6 @@ import srsly
from wasabi import MarkdownRenderer, Printer

from .. import about, util
from ..compat import importlib_metadata
from ._util import Arg, Opt, app, string_to_list
from .download import get_latest_version, get_model_filename
@@ -137,7 +137,7 @@ def info_installed_model_url(model: str) -> Optional[str]:
    dist-info available.
    """
    try:
        dist = importlib_metadata.distribution(model)
        dist = importlib.metadata.distribution(model)
        text = dist.read_text("direct_url.json")
        if isinstance(text, str):
            data = json.loads(text)
@@ -9,13 +9,14 @@ from thinc.api import Config
from wasabi import Printer, diff_strings

from .. import util
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
from ..schemas import RecommendationSchema
from ..util import SimpleFrozenList
from ._util import (
    COMMAND,
    Arg,
    Opt,
    _handle_renamed_language_codes,
    import_code,
    init_cli,
    show_validation_error,
@@ -50,7 +51,7 @@ class InitValues:
def init_config_cli(
    # fmt: off
    output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
    lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"),
    lang: str = Opt(InitValues.lang, "--lang", "-l", help="Code of the language to use"),
    pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
    optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
    gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
@@ -90,6 +91,7 @@ def init_fill_config_cli(
    # fmt: off
    base_path: Path = Arg(..., help="Path to base config to fill", exists=True, dir_okay=False),
    output_file: Path = Arg("-", help="Path to output .cfg file (or - for stdout)", allow_dash=True),
    distillation: bool = Opt(False, "--distillation", "-dt", help="Include config for distillation (with 'spacy distill')"),
    pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
    diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"),
    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
@@ -105,13 +107,20 @@ def init_fill_config_cli(
    DOCS: https://spacy.io/api/cli#init-fill-config
    """
    import_code(code_path)
    fill_config(output_file, base_path, pretraining=pretraining, diff=diff)
    fill_config(
        output_file,
        base_path,
        distillation=distillation,
        pretraining=pretraining,
        diff=diff,
    )


def fill_config(
    output_file: Path,
    base_path: Path,
    *,
    distillation: bool = False,
    pretraining: bool = False,
    diff: bool = False,
    silent: bool = False,
@@ -130,6 +139,9 @@ def fill_config(
    # replaced with their actual config after loading, so we have to re-add them
    sourced = util.get_sourced_components(config)
    filled["components"].update(sourced)
    if distillation:
        distillation_config = util.load_config(DEFAULT_CONFIG_DISTILL_PATH)
        filled = distillation_config.merge(filled)
    if pretraining:
        validate_config_for_pretrain(filled, msg)
        pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
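As a quick illustration of the merge step above: thinc's `Config.merge` returns a new config in which the values of the argument (here, the user's filled config) take precedence over the receiver (the distillation defaults). The section contents below are invented for the example.

    # Illustrative only; section contents are made up for the demo.
    from thinc.api import Config

    filled = Config().from_str("[distillation]\ndropout = 0.2\n")
    defaults = Config().from_str("[distillation]\ndropout = 0.1\nmax_epochs = 1\n")
    merged = defaults.merge(filled)
    assert merged["distillation"]["dropout"] == 0.2   # user value wins
    assert merged["distillation"]["max_epochs"] == 1  # default preserved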
@@ -165,6 +177,10 @@ def init_config(
    msg = Printer(no_print=silent)
    with TEMPLATE_PATH.open("r") as f:
        template = Template(f.read())

    # Throw error for renamed language codes in v4
    _handle_renamed_language_codes(lang)

    # Filter out duplicates since tok2vec and transformer are added by template
    pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
    defaults = RECOMMENDATIONS["__default__"]
@@ -12,6 +12,7 @@ from ..training.initialize import convert_vectors, init_nlp
from ._util import (
    Arg,
    Opt,
    _handle_renamed_language_codes,
    import_code,
    init_cli,
    parse_config_overrides,

@@ -29,7 +30,6 @@ def init_vectors_cli(
    prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
    truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
    mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
    name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
    attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
@@ -39,8 +39,11 @@ def init_vectors_cli(
    you can use in the [initialize] block of your config to initialize
    a model with vectors.
    """
    if verbose:
        util.logger.setLevel(logging.DEBUG)
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)

    # Throw error for renamed language codes in v4
    _handle_renamed_language_codes(lang)

    msg.info(f"Creating blank nlp object for language '{lang}'")
    nlp = util.get_lang_class(lang)()
    if jsonl_loc is not None:
@@ -50,7 +53,6 @@ def init_vectors_cli(
        vectors_loc,
        truncate=truncate,
        prune=prune,
        name=name,
        mode=mode,
        attr=attr,
    )
@@ -1,3 +1,4 @@
import importlib.metadata
import os
import re
import shutil

@@ -13,7 +14,6 @@ from thinc.api import Config
from wasabi import MarkdownRenderer, Printer, get_raw_input

from .. import about, util
from ..compat import importlib_metadata
from ..schemas import ModelMetaSchema, validate
from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
@@ -23,14 +23,13 @@ def package_cli(
    # fmt: off
    input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False),
    output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
    code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
    code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be included in the package"),
    meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
    create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"),
    name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
    version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
    build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."),
    force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"),
    require_parent: bool = Opt(True, "--require-parent/--no-require-parent", "-R", "-R", help="Include the parent package (e.g. spacy) in the requirements"),
    # fmt: on
):
    """
@@ -61,7 +60,6 @@ def package_cli(
        create_sdist=create_sdist,
        create_wheel=create_wheel,
        force=force,
        require_parent=require_parent,
        silent=False,
    )
@@ -76,7 +74,6 @@ def package(
    create_meta: bool = False,
    create_sdist: bool = True,
    create_wheel: bool = False,
    require_parent: bool = False,
    force: bool = False,
    silent: bool = True,
) -> None:
@@ -116,7 +113,7 @@ def package(
    if not meta_path.exists() or not meta_path.is_file():
        msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path)
    meta = get_meta(input_dir, meta, require_parent=require_parent)
    meta = get_meta(input_dir, meta)
    if meta["requirements"]:
        msg.good(
            f"Including {len(meta['requirements'])} package requirement(s) from "
@@ -189,7 +186,6 @@ def package(
        imports.append(code_path.stem)
        shutil.copy(str(code_path), str(package_path))
    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))

    create_file(main_path / "setup.py", TEMPLATE_SETUP)
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
    init_py = TEMPLATE_INIT.format(
@@ -254,9 +250,9 @@ def has_build() -> bool:
    # in an editable install), so an import check is not sufficient; instead
    # check that there is a package version
    try:
        importlib_metadata.version("build")
        importlib.metadata.version("build")
        return True
    except importlib_metadata.PackageNotFoundError:  # type: ignore[attr-defined]
    except importlib.metadata.PackageNotFoundError:  # type: ignore[attr-defined]
        return False
@@ -306,8 +302,6 @@ def get_third_party_dependencies(
            modules.add(func_info["module"].split(".")[0])  # type: ignore[union-attr]
    dependencies = []
    for module_name in modules:
        if module_name == about.__title__:
            continue
        if module_name in distributions:
            dist = distributions.get(module_name)
            if dist:
@@ -338,9 +332,7 @@ def create_file(file_path: Path, contents: str) -> None:


def get_meta(
    model_path: Union[str, Path],
    existing_meta: Dict[str, Any],
    require_parent: bool = False,
    model_path: Union[str, Path], existing_meta: Dict[str, Any]
) -> Dict[str, Any]:
    meta: Dict[str, Any] = {
        "lang": "en",
@@ -360,7 +352,6 @@ def get_meta(
        "width": nlp.vocab.vectors_length,
        "vectors": len(nlp.vocab.vectors),
        "keys": nlp.vocab.vectors.n_keys,
        "name": nlp.vocab.vectors.name,
    }
    if about.__title__ != "spacy":
        meta["parent_package"] = about.__title__
@@ -369,8 +360,6 @@ def get_meta(
    existing_reqs = [util.split_requirement(req)[0] for req in meta["requirements"]]
    reqs = get_third_party_dependencies(nlp.config, exclude=existing_reqs)
    meta["requirements"].extend(reqs)
    if require_parent and about.__title__ not in meta["requirements"]:
        meta["requirements"].append(about.__title__ + meta["spacy_version"])
    return meta
@@ -545,11 +534,8 @@ def list_files(data_dir):


def list_requirements(meta):
    # Up to version 3.7, we included the parent package
    # in requirements by default. This behaviour is removed
    # in 3.8, with a setting to include the parent package in
    # the requirements list in the meta if desired.
    requirements = []
    parent_package = meta.get('parent_package', 'spacy')
    requirements = [parent_package + meta['spacy_version']]
    if 'setup_requires' in meta:
        requirements += meta['setup_requires']
    if 'requirements' in meta:
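To make the direction of this change concrete, the snippet below reconstructs what the first requirement entry looks like under the branch that pins the parent package; the meta values are invented for the example.

    # Meta values are invented for illustration.
    meta = {"parent_package": "spacy", "spacy_version": ">=3.7.0,<3.8.0"}
    parent_package = meta.get("parent_package", "spacy")
    requirements = [parent_package + meta["spacy_version"]]
    print(requirements)  # ['spacy>=3.7.0,<3.8.0']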
@@ -11,7 +11,7 @@ from ._util import (
    Arg,
    Opt,
    app,
    import_code,
    import_code_paths,
    parse_config_overrides,
    setup_gpu,
    show_validation_error,
@@ -27,7 +27,7 @@ def pretrain_cli(
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
    output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
@@ -56,7 +56,7 @@ def pretrain_cli(
    DOCS: https://spacy.io/api/cli#pretrain
    """
    config_overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    import_code_paths(code_path)
    verify_cli_args(config_path, output_dir, resume_path, epoch_resume)
    setup_gpu(use_gpu)
    msg.info(f"Loading config from: {config_path}")
@@ -238,7 +238,7 @@ grad_factor = 1.0
{% if "entity_linker" in components -%}
[components.entity_linker]
factory = "entity_linker"
get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
get_candidates = {"@misc":"spacy.CandidateGenerator.v2"}
incl_context = true
incl_prior = true

@@ -517,7 +517,7 @@ width = ${components.tok2vec.model.encode.width}
{% if "entity_linker" in components -%}
[components.entity_linker]
factory = "entity_linker"
get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
get_candidates = {"@misc":"spacy.CandidateGenerator.v2"}
incl_context = true
incl_prior = true
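The quickstart template now points at spacy.CandidateGenerator.v2, which presumably matches the batched KB API shown later in this diff. Below is a hedged sketch of how a user-defined generator with that call shape might be registered; the registry name "my_candidates.v1" and the exact signature are assumptions, not part of this diff.

    # Assumption-heavy sketch; only the batched call shape is taken from this diff.
    from typing import Iterable, Iterator

    import spacy
    from spacy.kb import Candidate, KnowledgeBase
    from spacy.tokens import SpanGroup

    @spacy.registry.misc("my_candidates.v1")
    def create_candidate_getter():
        def get_candidates(
            kb: KnowledgeBase, mentions: Iterator[SpanGroup]
        ) -> Iterator[Iterable[Iterable[Candidate]]]:
            # Delegate to the KB's own batched lookup.
            return kb.get_candidates(mentions)

        return get_candidates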
@@ -13,7 +13,7 @@ from ._util import (
    Arg,
    Opt,
    app,
    import_code,
    import_code_paths,
    parse_config_overrides,
    setup_gpu,
    show_validation_error,
@@ -28,7 +28,7 @@ def train_cli(
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
    # fmt: on
@@ -50,7 +50,7 @@ def train_cli(
    if verbose:
        util.logger.setLevel(logging.DEBUG)
    overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    import_code_paths(code_path)
    train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
@@ -23,19 +23,6 @@ try:
except ImportError:
    cupy = None

if sys.version_info[:2] >= (3, 8):  # Python 3.8+
    from typing import Literal, Protocol, runtime_checkable
else:
    from typing_extensions import Literal, Protocol, runtime_checkable  # noqa: F401

# Important note: The importlib_metadata "backport" includes functionality
# that's not part of the built-in importlib.metadata. We should treat this
# import like the built-in and only use what's available there.
try:  # Python 3.8+
    import importlib.metadata as importlib_metadata
except ImportError:
    from catalogue import _importlib_metadata as importlib_metadata  # type: ignore[no-redef]  # noqa: F401

from thinc.api import Optimizer  # noqa: F401

pickle = pickle
spacy/default_config_distillation.cfg (new file, 34 lines)
@@ -0,0 +1,34 @@
[paths]
raw_text = null

[distillation]
corpus = "corpora.distillation"
dropout = 0.1
max_epochs = 1
max_steps = 0
student_to_teacher = {}

[distillation.batcher]
@batchers = "spacy.batch_by_words.v1"
size = 3000
discard_oversize = false
tolerance = 0.2

[distillation.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 1e-4

[corpora]

[corpora.distillation]
@readers = "spacy.PlainTextCorpus.v1"
path = ${paths.raw_text}
min_length = 0
max_length = 0
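For reference, a config like the new default above could be loaded and overridden programmatically; the corpus path below is invented for the example, while load_config and its interpolate flag mirror the CLI hunks earlier in this diff.

    # The raw-text override is invented for illustration.
    from spacy import util

    config = util.load_config(
        "spacy/default_config_distillation.cfg",
        overrides={"paths.raw_text": "distill_corpus.txt"},
        interpolate=False,
    )
    print(config["distillation"]["corpus"])  # "corpora.distillation"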
@@ -1,6 +1,7 @@
import warnings
from typing import Literal

from .compat import Literal
from . import about


class ErrorsWithCodes(type):
@@ -83,7 +84,7 @@ class Warnings(metaclass=ErrorsWithCodes):
            "ignoring the duplicate entry.")
    W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
            "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
    W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
    W024 = ("Entity '{entity}' - alias '{alias}' combination already exists in "
            "the Knowledge Base.")
    W026 = ("Unable to set all sentence boundaries from dependency parses. If "
            "you are constructing a parse tree incrementally by setting "
@@ -104,13 +105,14 @@ class Warnings(metaclass=ErrorsWithCodes):
            "table. This may degrade the performance of the model to some "
            "degree. If this is intentional or the language you're using "
            "doesn't have a normalization table, please ignore this warning. "
            "If this is surprising, make sure you have the spacy-lookups-data "
            "package installed and load the table in your config. The "
            "languages with lexeme normalization tables are currently: "
            "{langs}\n\nLoad the table in your config with:\n\n"
            "If this is surprising, make sure you are loading the table in "
            "your config. The languages with lexeme normalization tables are "
            "currently: {langs}\n\nAn example of how to load a table in "
            "your config :\n\n"
            "[initialize.lookups]\n"
            "@misc = \"spacy.LookupsDataLoader.v1\"\n"
            "@misc = \"spacy.LookupsDataLoaderFromURL.v1\"\n"
            "lang = ${{nlp.lang}}\n"
            f'url = "{about.__lookups_url__}"\n'
            "tables = [\"lexeme_norm\"]\n")
    W035 = ("Discarding subpattern '{pattern}' due to an unrecognized "
            "attribute or operator.")
@@ -132,13 +134,6 @@ class Warnings(metaclass=ErrorsWithCodes):
            "and make it independent. For example, `replace_listeners = "
            "[\"model.tok2vec\"]` See the documentation for details: "
            "https://spacy.io/usage/training#config-components-listeners")
    W088 = ("The pipeline component {name} implements a `begin_training` "
            "method, which won't be called by spaCy. As of v3.0, `begin_training` "
            "has been renamed to `initialize`, so you likely want to rename the "
            "component method. See the documentation for details: "
            "https://spacy.io/api/language#initialize")
    W089 = ("As of spaCy v3.0, the `nlp.begin_training` method has been renamed "
            "to `nlp.initialize`.")
    W090 = ("Could not locate any {format} files in path '{path}'.")
    W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
    W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
@@ -222,6 +217,11 @@ class Warnings(metaclass=ErrorsWithCodes):
    W126 = ("These keys are unsupported: {unsupported}")
    W127 = ("Not all `Language.pipe` worker processes completed successfully")

    # v4 warning strings
    W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
            "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
            "to return `True` in `.supports_prior_probs`.")


class Errors(metaclass=ErrorsWithCodes):
    E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
@@ -256,9 +256,7 @@ class Errors(metaclass=ErrorsWithCodes):
            "https://spacy.io/usage/models")
    E011 = ("Unknown operator: '{op}'. Options: {opts}")
    E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
    E016 = ("MultitaskObjective target should be function or one of: dep, "
            "tag, ent, dep_tag_offset, ent_tag.")
    E017 = ("Can only add unicode or bytes. Got type: {value_type}")
    E017 = ("Can only add 'str' inputs to StringStore. Got type: {value_type}")
    E018 = ("Can't retrieve string for hash '{hash_value}'. This usually "
            "refers to an issue with the `Vocab` or `StringStore`.")
    E019 = ("Can't create transition with unknown action ID: {action}. Action "
@@ -470,13 +468,13 @@ class Errors(metaclass=ErrorsWithCodes):
            "same, but found '{nlp}' and '{vocab}' respectively.")
    E152 = ("The attribute {attr} is not supported for token patterns. "
            "Please use the option `validate=True` with the Matcher, PhraseMatcher, "
            "EntityRuler or AttributeRuler for more details.")
            "SpanRuler or AttributeRuler for more details.")
    E153 = ("The value type {vtype} is not supported for token patterns. "
            "Please use the option validate=True with Matcher, PhraseMatcher, "
            "EntityRuler or AttributeRuler for more details.")
            "SpanRuler or AttributeRuler for more details.")
    E154 = ("One of the attributes or values is not supported for token "
            "patterns. Please use the option `validate=True` with the Matcher, "
            "PhraseMatcher, or EntityRuler for more details.")
            "PhraseMatcher, or SpanRuler for more details.")
    E155 = ("The pipeline needs to include a {pipe} in order to use "
            "Matcher or PhraseMatcher with the attribute {attr}. "
            "Try using `nlp()` instead of `nlp.make_doc()` or `list(nlp.pipe())` "
@@ -500,7 +498,7 @@ class Errors(metaclass=ErrorsWithCodes):
            "Current DocBin: {current}\nOther DocBin: {other}")
    E169 = ("Can't find module: {module}")
    E170 = ("Cannot apply transition {name}: invalid for the current state.")
    E171 = ("Matcher.add received invalid 'on_match' callback argument: expected "
    E171 = ("{name}.add received invalid 'on_match' callback argument: expected "
            "callable or None, but got: {arg_type}")
    E175 = ("Can't remove rule for unknown match pattern ID: {key}")
    E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
@@ -739,13 +737,6 @@ class Errors(metaclass=ErrorsWithCodes):
            "method in component '{name}'. If you want to use this "
            "method, make sure it's overwritten on the subclass.")
    E940 = ("Found NaN values in scores.")
    E941 = ("Can't find model '{name}'. It looks like you're trying to load a "
            "model from a shortcut, which is obsolete as of spaCy v3.0. To "
            "load the model, use its full name instead:\n\n"
            "nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
            "models, see the models directory: https://spacy.io/models and if "
            "you want to create a blank model, use spacy.blank: "
            "nlp = spacy.blank(\"{name}\")")
    E942 = ("Executing `after_{name}` callback failed. Expected the function to "
            "return an initialized nlp object but got: {value}. Maybe "
            "you forgot to return the modified object in your function?")
@@ -759,7 +750,7 @@ class Errors(metaclass=ErrorsWithCodes):
            "loaded nlp object, but got: {source}")
    E947 = ("`Matcher.add` received invalid `greedy` argument: expected "
            "a string value from {expected} but got: '{arg}'")
    E948 = ("`Matcher.add` received invalid 'patterns' argument: expected "
    E948 = ("`{name}.add` received invalid 'patterns' argument: expected "
            "a list, but got: {arg_type}")
    E949 = ("Unable to align tokens for the predicted and reference docs. It "
            "is only possible to align the docs when both texts are the same "
@@ -933,8 +924,6 @@ class Errors(metaclass=ErrorsWithCodes):
    E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
             "Non-UD tags should use the `tag` property.")
    E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
    E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
             "exist.")
    E1024 = ("A pattern with {attr_type} '{label}' is not present in "
             "'{component}' patterns.")
    E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
@@ -945,7 +934,7 @@ class Errors(metaclass=ErrorsWithCodes):
    E1029 = ("Edit tree cannot be applied to form.")
    E1030 = ("Edit tree identifier out of range.")
    E1031 = ("Could not find gold transition - see logs above.")
    E1032 = ("`{var}` should not be {forbidden}, but received {value}.")
    E1032 = ("Span {var} {value} is out of bounds for {obj} with length {length}.")
    E1033 = ("Dimension {name} invalid -- only nO, nF, nP")
    E1034 = ("Node index {i} out of bounds ({length})")
    E1035 = ("Token index {i} out of bounds ({length})")
@@ -962,7 +951,6 @@ class Errors(metaclass=ErrorsWithCodes):
             "case pass an empty list for the previously not specified argument to avoid this error.")
    E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
             "{value}.")
    E1044 = ("Expected `candidates_batch_size` to be >= 1, but got: {value}")
    E1045 = ("Encountered {parent} subclass without `{parent}.{method}` "
             "method in '{name}'. If you want to use this method, make "
             "sure it's overwritten on the subclass.")
@@ -989,15 +977,35 @@ class Errors(metaclass=ErrorsWithCodes):
             "reduction. Please enable one of `use_reduce_first`, "
             "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")

    # v4 error strings
    E4000 = ("Expected a Doc as input, but got: '{type}'")
    E4001 = ("Expected input to be one of the following types: ({expected_types}), "
             "but got '{received_type}'")
    E4002 = ("Pipe '{name}' requires a teacher pipe for distillation.")
    E4003 = ("Training examples for distillation must have the exact same tokens in the "
             "reference and predicted docs.")
    E4004 = ("Backprop is not supported when is_train is not set.")
    E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
    E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
    E4007 = ("Span {var} {value} must be {op} Span {existing_var} "
             "{existing_value}.")
    E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
    E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.")
    E4010 = ("Required lemmatizer table(s) {missing_tables} not found in "
             "[initialize] or in registered lookups (spacy-lookups-data). An "
             "example for how to load lemmatizer tables in [initialize]:\n\n"
             "[initialize.components]\n\n"
             "[initialize.components.{pipe_name}]\n\n"
             "[initialize.components.{pipe_name}.lookups]\n"
             '@misc = "spacy.LookupsDataLoaderFromURL.v1"\n'
             "lang = ${{nlp.lang}}\n"
             f'url = "{about.__lookups_url__}"\n'
             "tables = {tables}\n"
             "# or required tables only: tables = {required_tables}\n")
    E4011 = ("Server error ({status_code}), couldn't fetch {url}")


# Deprecated model shortcuts, only used in errors and warnings
OLD_MODEL_SHORTCUTS = {
    "en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm",
    "pt": "pt_core_news_sm", "fr": "fr_core_news_sm", "it": "it_core_news_sm",
    "nl": "nl_core_news_sm", "el": "el_core_news_sm", "nb": "nb_core_news_sm",
    "lt": "lt_core_news_sm", "xx": "xx_ent_wiki_sm"
}

RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}

# fmt: on
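A small sketch of how RENAMED_LANGUAGE_CODES might be consulted: the helper name `_handle_renamed_language_codes` appears in the init hunks earlier in this diff, but the body below is an assumption for illustration, not the actual implementation.

    # Assumed behaviour only; the real helper is not shown in this diff.
    RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}

    def _handle_renamed_language_codes(lang):
        # Raise if the user passes a pre-v4 code that has been renamed.
        if lang in RENAMED_LANGUAGE_CODES:
            raise ValueError(
                f"Language code '{lang}' was renamed to "
                f"'{RENAMED_LANGUAGE_CODES[lang]}' in spaCy v4; use the new code."
            )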
@@ -1,11 +1,10 @@
from .candidate import Candidate, get_candidates, get_candidates_batch
from .candidate import Candidate, InMemoryCandidate
from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB

__all__ = [
    "Candidate",
    "KnowledgeBase",
    "InMemoryCandidate",
    "InMemoryLookupKB",
    "get_candidates",
    "get_candidates_batch",
]
@@ -1,15 +1,17 @@
from libcpp.vector cimport vector

from ..typedefs cimport hash_t
from .kb cimport KnowledgeBase
from .kb_in_memory cimport InMemoryLookupKB


# Object used by the Entity Linker that summarizes one entity-alias candidate
# combination.
cdef class Candidate:
    cdef readonly KnowledgeBase kb
    cdef hash_t entity_hash
    cdef float entity_freq
    cdef vector[float] entity_vector
    cdef hash_t alias_hash
    cdef float prior_prob
    pass


cdef class InMemoryCandidate(Candidate):
    cdef readonly hash_t _entity_hash
    cdef readonly hash_t _alias_hash
    cdef vector[float] _entity_vector
    cdef float _prior_prob
    cdef readonly InMemoryLookupKB _kb
    cdef float _entity_freq
@@ -1,90 +1,98 @@
# cython: infer_types=True

from typing import Iterable
from .kb_in_memory cimport InMemoryLookupKB

from .kb cimport KnowledgeBase

from ..tokens import Span
from ..errors import Errors


cdef class Candidate:
    """A `Candidate` object refers to a textual mention (`alias`) that may or
    may not be resolved to a specific `entity` from a Knowledge Base. This
    will be used as input for the entity linking algorithm which will
    disambiguate the various candidates to the correct one.
    Each candidate (alias, entity) pair is assigned a certain prior probability.
    """A `Candidate` object refers to a textual mention that may or may not be resolved
    to a specific entity from a Knowledge Base. This will be used as input for the entity linking
    algorithm which will disambiguate the various candidates to the correct one.
    Each candidate, which represents a possible link between one textual mention and one entity in the knowledge base,
    is assigned a certain prior probability.

    DOCS: https://spacy.io/api/kb/#candidate-init
    """

    def __init__(self):
        # Make sure abstract Candidate is not instantiated.
        if self.__class__ == Candidate:
            raise TypeError(
                Errors.E1046.format(cls_name=self.__class__.__name__)
            )

    @property
    def entity_id(self) -> int:
        """RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID,
        otherwise the hash of the entity ID string)."""
        raise NotImplementedError

    @property
    def entity_id_(self) -> str:
        """RETURNS (str): String representation of entity ID."""
        raise NotImplementedError

    @property
    def entity_vector(self) -> vector[float]:
        """RETURNS (vector[float]): Entity vector."""
        raise NotImplementedError


cdef class InMemoryCandidate(Candidate):
    """Candidate for InMemoryLookupKB."""

    def __init__(
        self,
        KnowledgeBase kb,
        entity_hash,
        entity_freq,
        entity_vector,
        alias_hash,
        prior_prob
        kb: InMemoryLookupKB,
        entity_hash: int,
        alias_hash: int,
        entity_vector: vector[float],
        prior_prob: float,
        entity_freq: float
    ):
        self.kb = kb
        self.entity_hash = entity_hash
        self.entity_freq = entity_freq
        self.entity_vector = entity_vector
        self.alias_hash = alias_hash
        self.prior_prob = prior_prob
        """
        kb (InMemoryLookupKB]): InMemoryLookupKB instance.
        entity_id (int): Entity ID as hash that can be looked up with InMemoryKB.vocab.strings.__getitem__().
        entity_freq (int): Entity frequency in KB corpus.
        entity_vector (List[float]): Entity embedding.
        alias_hash (int): Alias hash.
        prior_prob (float): Prior probability of entity for this alias. I. e. the probability that, independent of
            the context, this alias - which matches one of this entity's aliases - resolves to one this entity.
        """
        super().__init__()

        self._entity_hash = entity_hash
        self._entity_vector = entity_vector
        self._prior_prob = prior_prob
        self._kb = kb
        self._alias_hash = alias_hash
        self._entity_freq = entity_freq

    @property
    def entity(self) -> int:
        """RETURNS (uint64): hash of the entity's KB ID/name"""
        return self.entity_hash
    def entity_id(self) -> int:
        return self._entity_hash

    @property
    def entity_(self) -> str:
        """RETURNS (str): ID/name of this entity in the KB"""
        return self.kb.vocab.strings[self.entity_hash]

    @property
    def alias(self) -> int:
        """RETURNS (uint64): hash of the alias"""
        return self.alias_hash

    @property
    def alias_(self) -> str:
        """RETURNS (str): ID of the original alias"""
        return self.kb.vocab.strings[self.alias_hash]

    @property
    def entity_freq(self) -> float:
        return self.entity_freq

    @property
    def entity_vector(self) -> Iterable[float]:
        return self.entity_vector
    def entity_vector(self) -> vector[float]:
        return self._entity_vector

    @property
    def prior_prob(self) -> float:
        return self.prior_prob
        """RETURNS (float): Prior probability that this alias, which matches one of this entity's synonyms, resolves to
        this entity."""
        return self._prior_prob

    @property
    def alias(self) -> str:
        """RETURNS (str): Alias."""
        return self._kb.vocab.strings[self._alias_hash]

def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
    """
    Return candidate entities for a given mention and fetching appropriate
    entries from the index.
    kb (KnowledgeBase): Knowledge base to query.
    mention (Span): Entity mention for which to identify candidates.
    RETURNS (Iterable[Candidate]): Identified candidates.
    """
    return kb.get_candidates(mention)
    @property
    def entity_id_(self) -> str:
        return self._kb.vocab.strings[self._entity_hash]


def get_candidates_batch(
    kb: KnowledgeBase, mentions: Iterable[Span]
) -> Iterable[Iterable[Candidate]]:
    """
    Return candidate entities for the given mentions and fetching appropriate entries
    from the index.
    kb (KnowledgeBase): Knowledge base to query.
    mention (Iterable[Span]): Entity mentions for which to identify candidates.
    RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
    """
    return kb.get_candidates_batch(mentions)
    @property
    def entity_freq(self) -> float:
        """RETURNS (float): Entity frequency in KB corpus."""
        return self._entity_freq
@@ -1,14 +1,14 @@
# cython: infer_types=True

from pathlib import Path
from typing import Iterable, Tuple, Union
from typing import Iterable, Iterator, Tuple, Union

from cymem.cymem cimport Pool

from ..errors import Errors
from ..tokens import Span
from ..tokens import SpanGroup
from ..util import SimpleFrozenList
from .candidate import Candidate
from .candidate cimport Candidate


cdef class KnowledgeBase:

@@ -19,6 +19,8 @@ cdef class KnowledgeBase:

    DOCS: https://spacy.io/api/kb
    """
    CandidatesForMentionT = Iterable[Candidate]
    CandidatesForDocT = Iterable[CandidatesForMentionT]

    def __init__(self, vocab: Vocab, entity_vector_length: int):
        """Create a KnowledgeBase."""

@@ -32,27 +34,15 @@ cdef class KnowledgeBase:
        self.entity_vector_length = entity_vector_length
        self.mem = Pool()

    def get_candidates_batch(
        self, mentions: Iterable[Span]
    ) -> Iterable[Iterable[Candidate]]:
    def get_candidates(self, mentions: Iterator[SpanGroup]) -> Iterator[CandidatesForDocT]:
        """
        Return candidate entities for specified texts. Each candidate defines
        the entity, the original alias, and the prior probability of that
        alias resolving to that entity.
        If no candidate is found for a given text, an empty list is returned.
        mentions (Iterable[Span]): Mentions for which to get candidates.
        RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
        """
        return [self.get_candidates(span) for span in mentions]

    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
        """
        Return candidate entities for specified text. Each candidate defines
        the entity, the original alias,
        and the prior probability of that alias resolving to that entity.
        If the no candidate is found for a given text, an empty list is returned.
        mention (Span): Mention for which to get candidates.
        RETURNS (Iterable[Candidate]): Identified candidates.
        Return candidate entities for the specified groups of mentions (as SpanGroup) per Doc.
        Each candidate for a mention defines at least the entity and the entity's embedding vector. Depending on the KB
        implementation, further properties - such as the prior probability of the specified mention text resolving to
        that entity - might be included.
        If no candidates are found for a given mention, an empty list is returned.
        mentions (Iterator[SpanGroup]): Mentions for which to get candidates.
        RETURNS (Iterator[Iterable[Iterable[Candidate]]]): Identified candidates per mention/doc/doc batch.
        """
        raise NotImplementedError(
            Errors.E1045.format(
@@ -128,3 +118,10 @@
                parent="KnowledgeBase", method="from_disk", name=self.__name__
            )
        )

    @property
    def supports_prior_probs(self) -> bool:
        """RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions."""
        raise NotImplementedError(
            Errors.E1045.format(parent="KnowledgeBase", method="supports_prior_probs", name=self.__name__)
        )
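To make the new call shape concrete, here is a hedged end-to-end sketch against InMemoryLookupKB as changed in this diff: one SpanGroup of mentions per Doc goes in, and candidates come back per doc and per mention. The entity and alias values are invented, and the exact v4 API may differ in detail.

    # Invented entity/alias data; sketch of the batched v4-style lookup.
    import spacy
    from spacy.kb import InMemoryLookupKB
    from spacy.tokens import SpanGroup

    nlp = spacy.blank("en")
    kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
    kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
    kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[0.8])

    doc = nlp("Douglas wrote books.")
    mentions = SpanGroup(doc, spans=[doc[0:1]])  # one SpanGroup per Doc
    for candidates_for_doc in kb.get_candidates(iter([mentions])):
        for candidates_for_mention in candidates_for_doc:
            for candidate in candidates_for_mention:
                print(candidate.entity_id_, candidate.prior_prob)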
@@ -1,5 +1,5 @@
# cython: infer_types=True
from typing import Any, Callable, Dict, Iterable
from typing import Any, Callable, Dict, Iterable, Iterator

import srsly

@@ -12,7 +12,7 @@ from preshed.maps cimport PreshMap
import warnings
from pathlib import Path

from ..tokens import Span
from ..tokens import SpanGroup

from ..typedefs cimport hash_t

@@ -23,7 +23,7 @@ from ..util import SimpleFrozenList, ensure_path
from ..vocab cimport Vocab
from .kb cimport KnowledgeBase

from .candidate import Candidate as Candidate
from .candidate import InMemoryCandidate


cdef class InMemoryLookupKB(KnowledgeBase):
@@ -255,10 +255,11 @@ cdef class InMemoryLookupKB(KnowledgeBase):
        alias_entry.probs = probs
        self._aliases_table[alias_index] = alias_entry

    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
        return self.get_alias_candidates(mention.text)  # type: ignore
    def get_candidates(self, mentions: Iterator[SpanGroup]) -> Iterator[Iterable[Iterable[InMemoryCandidate]]]:
        for mentions_for_doc in mentions:
            yield [self._get_alias_candidates(span.text) for span in mentions_for_doc]

    def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
    def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
        """
        Return candidate entities for an alias. Each candidate defines the
        entity, the original alias, and the prior probability of that alias
@@ -271,18 +272,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
        alias_index = <int64_t>self._alias_index.get(alias_hash)
        alias_entry = self._aliases_table[alias_index]

        return [Candidate(kb=self,
                          entity_hash=self._entries[entry_index].entity_hash,
                          entity_freq=self._entries[entry_index].freq,
                          entity_vector=self._vectors_table[
                              self._entries[entry_index].vector_index
                          ],
                          alias_hash=alias_hash,
                          prior_prob=prior_prob)
                for (entry_index, prior_prob) in zip(
                    alias_entry.entry_indices, alias_entry.probs
                )
                if entry_index != 0]
        return [
            InMemoryCandidate(
                kb=self,
                entity_hash=self._entries[entry_index].entity_hash,
                alias_hash=alias_hash,
                entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
                prior_prob=prior_prob,
                entity_freq=self._entries[entry_index].freq
            )
            for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
            if entry_index != 0
        ]

    def get_vector(self, str entity):
        cdef hash_t entity_hash = self.vocab.strings[entity]

@@ -316,6 +317,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):

        return 0.0

    def supports_prior_probs(self) -> bool:
        return True

    def to_bytes(self, **kwargs):
        """Serialize the current state to a binary string.
        """
@@ -1,16 +0,0 @@
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS


class TibetanDefaults(BaseDefaults):
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS


class Tibetan(Language):
    lang = "bo"
    Defaults = TibetanDefaults


__all__ = ["Tibetan"]
@@ -1,16 +0,0 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.bo.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "དོན་དུ་རྒྱ་མཚོ་བླ་མ་ཞེས་བྱ་ཞིང༌།",
    "ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག",
    "སོག་པོ་ཨལ་ཐན་རྒྱལ་པོས་རྒྱལ་དབང་བསོད་ནམས་རྒྱ་མཚོར་ཆེ་བསྟོད་ཀྱི་མཚན་གསོལ་བ་ཞིག་ཡིན་ཞིང༌།",
    "རྗེས་སུ་རྒྱལ་བ་དགེ་འདུན་གྲུབ་དང༌། དགེ་འདུན་རྒྱ་མཚོ་སོ་སོར་ཡང་ཏཱ་ལའི་བླ་མའི་སྐུ་ཕྲེང་དང་པོ་དང༌།",
    "གཉིས་པའི་མཚན་དེ་གསོལ་ཞིང༌།༸རྒྱལ་དབང་སྐུ་ཕྲེང་ལྔ་པས་དགའ་ལྡན་ཕོ་བྲང་གི་སྲིད་དབང་བཙུགས་པ་ནས་ཏཱ་ལའི་བླ་མ་ནི་བོད་ཀྱི་ཆོས་སྲིད་གཉིས་ཀྱི་དབུ་ཁྲིད་དུ་གྱུར་ཞིང་།",
    "ད་ལྟའི་བར་ཏཱ་ལའི་བླ་མ་སྐུ་ཕྲེང་བཅུ་བཞི་བྱོན་ཡོད།",
]
@@ -1,65 +0,0 @@
from ...attrs import LIKE_NUM

# reference 1: https://en.wikipedia.org/wiki/Tibetan_numerals

_num_words = [
    "ཀླད་ཀོར་",
    "གཅིག་",
    "གཉིས་",
    "གསུམ་",
    "བཞི་",
    "ལྔ་",
    "དྲུག་",
    "བདུན་",
    "བརྒྱད་",
    "དགུ་",
    "བཅུ་",
    "བཅུ་གཅིག་",
    "བཅུ་གཉིས་",
    "བཅུ་གསུམ་",
    "བཅུ་བཞི་",
    "བཅུ་ལྔ་",
    "བཅུ་དྲུག་",
    "བཅུ་བདུན་",
    "བཅུ་པརྒྱད",
    "བཅུ་དགུ་",
    "ཉི་ཤུ་",
    "སུམ་ཅུ",
    "བཞི་བཅུ",
    "ལྔ་བཅུ",
    "དྲུག་ཅུ",
    "བདུན་ཅུ",
    "བརྒྱད་ཅུ",
    "དགུ་བཅུ",
    "བརྒྱ་",
    "སྟོང་",
    "ཁྲི་",
    "ས་ཡ་",
    " བྱེ་བ་",
    "དུང་ཕྱུར་",
    "ཐེར་འབུམ་",
    "ཐེར་འབུམ་ཆེན་པོ་",
    "ཁྲག་ཁྲིག་",
    "ཁྲག་ཁྲིག་ཆེན་པོ་",
]


def like_num(text):
    """
    Check if text resembles a number
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text in _num_words:
        return True
    return False


LEX_ATTRS = {LIKE_NUM: like_num}
@@ -1,198 +0,0 @@
# Source: https://zenodo.org/records/10148636

STOP_WORDS = set(
    """
འི་
།
དུ་
གིས་
སོགས་
ཏེ
གི་
རྣམས་
ནི
ཀུན་
ཡི་
འདི
ཀྱི་
སྙེད་
པས་
གཞན་
ཀྱིས་
ཡི
ལ
ནི་
དང་
སོགས
ཅིང་
ར
དུ
མི་
སུ་
བཅས་
ཡོངས་
ལས
ཙམ་
གྱིས་
དེ་
ཡང་
མཐའ་དག་
ཏུ་
ཉིད་
ས
ཏེ་
གྱི་
སྤྱི
དེ
ཀ་
ཡིན་
ཞིང་
འདི་
རུང་
རང་
ཞིག་
སྟེ
སྟེ་
ན་རེ
ངམ
ཤིང་
དག་
ཏོ
རེ་
འང་
ཀྱང་
ལགས་པ
ཚུ
དོ
ཡིན་པ
རེ
ན་རེ་
ཨེ་
ཚང་མ
ཐམས་ཅད་
དམ་
འོ་
ཅིག་
གྱིན་
ཡིན
ན
ཁོ་ན་
འམ་
ཀྱིན་
ལོ
ཀྱིས
བས་
ལགས་
ཤིག
གིས
ཀི་
སྣ་ཚོགས་
རྣམས
སྙེད་པ
ཡིས་
གྱི
གི
བམ་
ཤིག་
རེ་རེ་
ནམ
མིན་
ནམ་
ངམ་
རུ་
འགའ་
ཀུན
ཤས་
ཏུ
ཡིས
གིན་
གམ་
འོ
ཡིན་པ་
མིན
ལགས
གྱིས
ཅང་
འགའ
སམ་
ཞིག
འང
ལས་ཆེ་
འཕྲལ་
བར་
རུ
དང
ཡ
འག
སམ
ཀ
ཅུང་ཟད་
ཅིག
ཉིད
དུ་མ
མ
ཡིན་བ
འམ
མམ
དམ
དག
ཁོ་ན
ཀྱི
ལམ
ཕྱི་
ནང་
ཙམ
ནོ་
སོ་
རམ་
བོ་
ཨང་
ཕྱི
ཏོ་
ཚོ
ལ་ལ་
ཚོ་
ཅིང
མ་གི་
གེ
གོ
ཡིན་ལུགས་
རོ་
བོ
ལགས་པ་
པས
རབ་
འི
རམ
བས
གཞན
སྙེད་པ་
འབའ་
མཾ་
པོ
ག་
ག
གམ
སྤྱི་
བམ
མོ་
ཙམ་པ་
ཤ་སྟག་
མམ་
རེ་རེ
སྙེད
ཏམ་
ངོ
གྲང་
ཏ་རེ
ཏམ
ཁ་
ངེ་
ཅོག་
རིལ་
ཉུང་ཤས་
གིང་
ཚ་
ཀྱང
""".split()
)
@@ -1,18 +0,0 @@
from typing import Optional

from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


class ScottishDefaults(BaseDefaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    stop_words = STOP_WORDS


class Scottish(Language):
    lang = "gd"
    Defaults = ScottishDefaults


__all__ = ["Scottish"]
@@ -1,388 +0,0 @@
STOP_WORDS = set(
    """
'ad
'ar
'd # iad
'g # ag
'ga
'gam
'gan
'gar
'gur
'm # am
'n # an
'n seo
'na
'nad
'nam
'nan
'nar
'nuair
'nur
's
'sa
'san
'sann
'se
'sna
a
a'
a'd # agad
a'm # agam
a-chèile
a-seo
a-sin
a-siud
a chionn
a chionn 's
a chèile
a chéile
a dh'
a h-uile
a seo
ac' # aca
aca
aca-san
acasan
ach
ag
agad
agad-sa
agads'
agadsa
agaibh
agaibhse
againn
againne
agam
agam-sa
agams'
agamsa
agus
aice
aice-se
aicese
aig
aig' # aige
aige
aige-san
aigesan
air
air-san
air neo
airsan
am
an
an seo
an sin
an siud
an uair
ann
ann a
ann a'
ann a shin
ann am
ann an
annad
annam
annam-s'
annamsa
anns
anns an
annta
aon
ar
as
asad
asda
asta
b'
bho
bhon
bhuaidhe # bhuaithe
bhuainn
bhuaipe
bhuaithe
bhuapa
bhur
brì
bu
c'à
car son
carson
cha
chan
chionn
choir
chon
chun
chèile
chéile
chòir
cia mheud
ciamar
co-dhiubh
cuide
cuin
cuin'
cuine
cà
cà'
càil
càit
càit'
càite
cò
cò mheud
có
d'
da
de
dh'
dha
dhaibh
dhaibh-san
dhaibhsan
dhan
dhasan
dhe
dhen
dheth
dhi
dhiom
dhiot
dhith
dhiubh
dhomh
dhomh-s'
dhomhsa
dhu'sa # dhut-sa
dhuibh
dhuibhse
dhuinn
dhuinne
dhuit
dhut
dhutsa
dhut-sa
dhà
dhà-san
dhàsan
dhòmhsa
diubh
do
docha
don
dà
dè
dè mar
dé
dé mar
dòch'
dòcha
e
eadar
eatarra
eatorra
eile
esan
fa
far
feud
fhad
fheudar
fhearr
fhein
fheudar
fheàrr
fhèin
fhéin
fhìn
fo
fodha
fodhainn
foipe
fon
fèin
ga
gach
gam
gan
ge brith
ged
gu
gu dè
gu ruige
gun
gur
gus
i
iad
iadsan
innte
is
ise
le
leam
leam-sa
leamsa
leat
leat-sa
leatha
leatsa
leibh
leis
leis-san
leoth'
leotha
leotha-san
linn
m'
m'a
ma
mac
man
mar
mas
mathaid
mi
mis'
mise
mo
mu
mu 'n
mun
mur
mura
mus
na
na b'
na bu
na iad
nach
nad
nam
nan
nar
nas
neo
no
nuair
o
o'n
oir
oirbh
oirbh-se
oirnn
oirnne
oirre
on
orm
orm-sa
ormsa
orra
orra-san
orrasan
ort
os
r'
ri
ribh
rinn
ris
rithe
rithe-se
rium
rium-sa
riums'
riumsa
riut
riuth'
riutha
riuthasan
ro
ro'n
roimh
roimhe
romhainn
romham
romhpa
ron
ruibh
ruinn
ruinne
sa
san
sann
se
seach
seo
seothach
shin
sibh
sibh-se
sibhse
sin
sineach
sinn
sinne
siod
siodach
siud
siudach
sna # ann an
sè
t'
tarsaing
tarsainn
tarsuinn
thar
thoigh
thro
thu
thuc'
thuca
thugad
thugaibh
thugainn
thugam
thugamsa
thuice
thuige
thus'
thusa
timcheall
toigh
toil
tro
tro' # troimh
troimh
troimhe
tron
tu
tusa
uair
ud
ugaibh
ugam-s'
ugam-sa
uice
uige
uige-san
umad
unnta # ann an
ur
urrainn
à
às
àsan
á
ás
è
ì
ò
ó
""".split(
        "\n"
    )
)
File diff suppressed because it is too large
@@ -1,5 +1,5 @@
The list of Croatian lemmas was extracted from the reldi-tagger repository (https://github.com/clarinsi/reldi-tagger).
Reldi-tagger is licensed under the Apache 2.0 licence.
Reldi-tagger is licesned under the Apache 2.0 licence.

@InProceedings{ljubesic16-new,
author = {Nikola Ljubešić and Filip Klubička and Željko Agić and Ivo-Pavao Jazbec},
@@ -1,52 +0,0 @@
from typing import Callable, Optional

from thinc.api import Model

from ...language import BaseDefaults, Language
from .lemmatizer import HaitianCreoleLemmatizer
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP


class HaitianCreoleDefaults(BaseDefaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    lex_attr_getters = LEX_ATTRS
    syntax_iterators = SYNTAX_ITERATORS
    stop_words = STOP_WORDS
    tag_map = TAG_MAP

class HaitianCreole(Language):
    lang = "ht"
    Defaults = HaitianCreoleDefaults

@HaitianCreole.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={
        "model": None,
        "mode": "rule",
        "overwrite": False,
        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
    },
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    overwrite: bool,
    scorer: Optional[Callable],
):
    return HaitianCreoleLemmatizer(
        nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
    )

__all__ = ["HaitianCreole"]
@@ -1,18 +0,0 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.ht.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple ap panse achte yon demaraj nan Wayòm Ini pou $1 milya dola",
    "Machin otonòm fè responsablite asirans lan ale sou men fabrikan yo",
    "San Francisco ap konsidere entèdi robo ki livre sou twotwa yo",
    "Lond se yon gwo vil nan Wayòm Ini",
    "Kote ou ye?",
    "Kilès ki prezidan Lafrans?",
    "Ki kapital Etazini?",
    "Kile Barack Obama te fèt?",
]
@@ -1,51 +0,0 @@
-from typing import List, Tuple
-
-from ...pipeline import Lemmatizer
-from ...tokens import Token
-from ...lookups import Lookups
-
-
-class HaitianCreoleLemmatizer(Lemmatizer):
-    """
-    Minimal Haitian Creole lemmatizer.
-    Returns a word's base form based on rules and lookup,
-    or defaults to the original form.
-    """
-
-    def is_base_form(self, token: Token) -> bool:
-        morph = token.morph.to_dict()
-        upos = token.pos_.lower()
-
-        # Consider unmarked forms to be base
-        if upos in {"noun", "verb", "adj", "adv"}:
-            if not morph:
-                return True
-            if upos == "noun" and morph.get("Number") == "Sing":
-                return True
-            if upos == "verb" and morph.get("VerbForm") == "Inf":
-                return True
-            if upos == "adj" and morph.get("Degree") == "Pos":
-                return True
-        return False
-
-    def rule_lemmatize(self, token: Token) -> List[str]:
-        string = token.text.lower()
-        pos = token.pos_.lower()
-        cache_key = (token.orth, token.pos)
-        if cache_key in self.cache:
-            return self.cache[cache_key]
-
-        forms = []
-
-        # fallback rule: just return lowercased form
-        forms.append(string)
-
-        self.cache[cache_key] = forms
-        return forms
-
-    @classmethod
-    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
-        if mode == "rule":
-            required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
-            return (required, [])
-        return super().get_lookups_config(mode)
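For orientation, the deleted rule lemmatizer above does no real morphology: rule mode just lowercases the token and caches the result by (orth, pos). A minimal standalone sketch of that cache-and-fallback logic (plain Python, not the spaCy API):

    # Mirrors the cache-and-fallback behaviour of rule_lemmatize above.
    cache = {}

    def rule_lemmatize(text, pos):
        key = (text, pos)  # stands in for (token.orth, token.pos)
        if key in cache:
            return cache[key]
        forms = [text.lower()]  # fallback rule: just the lowercased form
        cache[key] = forms
        return forms

    assert rule_lemmatize("Kote", "ADV") == ["kote"]
    assert rule_lemmatize("Kote", "ADV") is cache[("Kote", "ADV")]  # served from cache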
@@ -1,78 +0,0 @@
-from ...attrs import LIKE_NUM, NORM
-
-# Cardinal numbers in Creole
-_num_words = set(
-    """
-zewo youn en de twa kat senk sis sèt uit nèf dis
-onz douz trèz katoz kenz sèz disèt dizwit diznèf
-vent trant karant sinkant swasant swasann-dis
-san mil milyon milya
-""".split()
-)
-
-# Ordinal numbers in Creole (some are French-influenced, some simplified)
-_ordinal_words = set(
-    """
-premye dezyèm twazyèm katryèm senkyèm sizyèm sètvyèm uitvyèm nèvyèm dizyèm
-onzèm douzyèm trèzyèm katozyèm kenzèm sèzyèm disetyèm dizwityèm diznèvyèm
-ventyèm trantyèm karantyèm sinkantyèm swasantyèm
-swasann-disyèm santyèm milyèm milyonnyèm milyadyèm
-""".split()
-)
-
-NORM_MAP = {
-    "'m": "mwen",
-    "'w": "ou",
-    "'l": "li",
-    "'n": "nou",
-    "'y": "yo",
-    "’m": "mwen",
-    "’w": "ou",
-    "’l": "li",
-    "’n": "nou",
-    "’y": "yo",
-    "m": "mwen",
-    "n": "nou",
-    "l": "li",
-    "y": "yo",
-    "w": "ou",
-    "t": "te",
-    "k": "ki",
-    "p": "pa",
-    "M": "Mwen",
-    "N": "Nou",
-    "L": "Li",
-    "Y": "Yo",
-    "W": "Ou",
-    "T": "Te",
-    "K": "Ki",
-    "P": "Pa",
-}
-
-def like_num(text):
-    text = text.strip().lower()
-    if text.startswith(("+", "-", "±", "~")):
-        text = text[1:]
-    text = text.replace(",", "").replace(".", "")
-    if text.isdigit():
-        return True
-    if text.count("/") == 1:
-        num, denom = text.split("/")
-        if num.isdigit() and denom.isdigit():
-            return True
-    if text in _num_words:
-        return True
-    if text in _ordinal_words:
-        return True
-    # Handle things like "3yèm", "10yèm", "25yèm", etc.
-    if text.endswith("yèm") and text[:-3].isdigit():
-        return True
-    return False
-
-def norm_custom(text):
-    return NORM_MAP.get(text, text.lower())
-
-LEX_ATTRS = {
-    LIKE_NUM: like_num,
-    NORM: norm_custom,
-}
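The deleted like_num accepts plain digits, comma/period-grouped digits, simple fractions, the Creole cardinal and ordinal words, and digit+"yèm" ordinals. A standalone sketch with a reduced word list (illustrative subset, not the full sets above):

    _num_words = {"zewo", "youn", "de", "twa", "senk", "dis", "san", "mil"}
    _ordinal_words = {"premye", "dezyèm", "twazyèm"}

    def like_num(text):
        text = text.strip().lower()
        if text.startswith(("+", "-", "±", "~")):
            text = text[1:]
        text = text.replace(",", "").replace(".", "")
        if text.isdigit():
            return True
        if text.count("/") == 1:
            num, denom = text.split("/")
            if num.isdigit() and denom.isdigit():
                return True
        if text in _num_words or text in _ordinal_words:
            return True
        return text.endswith("yèm") and text[:-3].isdigit()

    print([like_num(t) for t in ["3,000", "1/2", "senk", "25yèm", "mwen"]])
    # [True, True, True, True, False]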
@@ -1,43 +0,0 @@
-from ..char_classes import (
-    ALPHA,
-    ALPHA_LOWER,
-    ALPHA_UPPER,
-    CONCAT_QUOTES,
-    HYPHENS,
-    LIST_PUNCT,
-    LIST_QUOTES,
-    LIST_ELLIPSES,
-    LIST_ICONS,
-    merge_chars,
-)
-
-ELISION = "'’".replace(" ", "")
-
-_prefixes_elision = "m n l y t k w"
-_prefixes_elision += " " + _prefixes_elision.upper()
-
-TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
-    r"(?:({pe})[{el}])(?=[{a}])".format(
-        a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
-    )
-]
-
-TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
-    r"(?<=[0-9])%",  # numbers like 10%
-    r"(?<=[0-9])(?:{h})".format(h=HYPHENS),  # hyphens after numbers
-    r"(?<=[{a}])['’]".format(a=ALPHA),  # apostrophes after letters
-    r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA),  # contractions
-    r"(?<=[{a}0-9])\)",  # right parenthesis after letter/number
-    r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA),  # period after letter if space or end of string
-    r"(?<=\))[\.\?!]",  # punctuation immediately after right parenthesis
-]
-
-TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
-    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
-    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
-        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
-    ),
-    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
-    r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
-]
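The prefix rule above splits a single-letter pronoun plus apostrophe (m n l y t k w, either case) off the following word, so "m'ap" tokenizes as "m'" + "ap". A regex-level sketch with the character classes inlined (simplified stand-ins for spaCy's ALPHA class, which is broader):

    import re

    # Simplified version of the elision prefix rule above.
    prefix = re.compile(r"^(?:[mnlytkwMNLYTKW]['’])(?=[a-zA-Zàèò])")
    for word in ["m'ap", "w'ap", "pap"]:
        m = prefix.match(word)
        print(word, "->", m.group(0) if m else None)
    # m'ap -> m'   w'ap -> w'   pap -> None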
@@ -1,50 +0,0 @@
-STOP_WORDS = set(
-    """
-a ak an ankò ant apre ap atò avan avanlè
-byen bò byenke
-
-chak
-
-de depi deja deja
-
-e en epi èske
-
-fò fòk
-
-gen genyen
-
-ki kisa kilès kote koukou konsa konbyen konn konnen kounye kouman
-
-la l laa le lè li lye lò
-
-m m' mwen
-
-nan nap nou n'
-
-ou oumenm
-
-pa paske pami pandan pito pou pral preske pwiske
-
-se selman si sou sòt
-
-ta tap tankou te toujou tou tan tout toutotan twòp tèl
-
-w w' wi wè
-
-y y' yo yon yonn
-
-non o oh eh
-
-sa san si swa si
-
-men mèsi oswa osinon
-
-"""
-    .split()
-)
-
-# Add common contractions, with and without apostrophe variants
-contractions = ["m'", "n'", "w'", "y'", "l'", "t'", "k'"]
-for apostrophe in ["'", "’", "‘"]:
-    for word in contractions:
-        STOP_WORDS.add(word.replace("'", apostrophe))
@@ -1,74 +0,0 @@
-from typing import Iterator, Tuple, Union
-
-from ...errors import Errors
-from ...symbols import NOUN, PRON, PROPN
-from ...tokens import Doc, Span
-
-
-def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
-    """
-    Detect base noun phrases from a dependency parse for Haitian Creole.
-    Works on both Doc and Span objects.
-    """
-
-    # Core nominal dependencies common in Haitian Creole
-    labels = [
-        "nsubj",
-        "obj",
-        "obl",
-        "nmod",
-        "appos",
-        "ROOT",
-    ]
-
-    # Modifiers to optionally include in chunk (to the right)
-    post_modifiers = ["compound", "flat", "flat:name", "fixed"]
-
-    doc = doclike.doc
-    if not doc.has_annotation("DEP"):
-        raise ValueError(Errors.E029)
-
-    np_deps = {doc.vocab.strings.add(label) for label in labels}
-    np_mods = {doc.vocab.strings.add(mod) for mod in post_modifiers}
-    conj_label = doc.vocab.strings.add("conj")
-    np_label = doc.vocab.strings.add("NP")
-    adp_pos = doc.vocab.strings.add("ADP")
-    cc_pos = doc.vocab.strings.add("CCONJ")
-
-    prev_end = -1
-    for i, word in enumerate(doclike):
-        if word.pos not in (NOUN, PROPN, PRON):
-            continue
-        if word.left_edge.i <= prev_end:
-            continue
-
-        if word.dep in np_deps:
-            right_end = word
-            # expand to include known modifiers to the right
-            for child in word.rights:
-                if child.dep in np_mods:
-                    right_end = child.right_edge
-                elif child.pos == NOUN:
-                    right_end = child.right_edge
-
-            left_index = word.left_edge.i
-            # Skip prepositions at the start
-            if word.left_edge.pos == adp_pos:
-                left_index += 1
-
-            prev_end = right_end.i
-            yield left_index, right_end.i + 1, np_label
-
-        elif word.dep == conj_label:
-            head = word.head
-            while head.dep == conj_label and head.head.i < head.i:
-                head = head.head
-            if head.dep in np_deps:
-                left_index = word.left_edge.i
-                if word.left_edge.pos == cc_pos:
-                    left_index += 1
-                prev_end = word.i
-                yield left_index, word.i + 1, np_label
-
-
-SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
@@ -1,21 +0,0 @@
-from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
-
-TAG_MAP = {
-    "NOUN": {"pos": NOUN},
-    "VERB": {"pos": VERB},
-    "AUX": {"pos": AUX},
-    "ADJ": {"pos": ADJ},
-    "ADV": {"pos": ADV},
-    "PRON": {"pos": PRON},
-    "DET": {"pos": DET},
-    "ADP": {"pos": ADP},
-    "SCONJ": {"pos": SCONJ},
-    "CCONJ": {"pos": CCONJ},
-    "PART": {"pos": PART},
-    "INTJ": {"pos": INTJ},
-    "NUM": {"pos": NUM},
-    "PROPN": {"pos": PROPN},
-    "PUNCT": {"pos": PUNCT},
-    "SYM": {"pos": SYM},
-    "X": {"pos": X},
-}
@@ -1,121 +0,0 @@
-from spacy.symbols import ORTH, NORM
-
-def make_variants(base, first_norm, second_orth, second_norm):
-    return {
-        base: [
-            {ORTH: base.split("'")[0] + "'", NORM: first_norm},
-            {ORTH: second_orth, NORM: second_norm},
-        ],
-        base.capitalize(): [
-            {ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()},
-            {ORTH: second_orth, NORM: second_norm},
-        ]
-    }
-
-TOKENIZER_EXCEPTIONS = {
-    "Dr.": [{ORTH: "Dr."}]
-}
-
-# Apostrophe forms
-TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
-TOKENIZER_EXCEPTIONS.update(make_variants("n'ap", "nou", "ap", "ap"))
-TOKENIZER_EXCEPTIONS.update(make_variants("l'ap", "li", "ap", "ap"))
-TOKENIZER_EXCEPTIONS.update(make_variants("y'ap", "yo", "ap", "ap"))
-TOKENIZER_EXCEPTIONS.update(make_variants("m'te", "mwen", "te", "te"))
-TOKENIZER_EXCEPTIONS.update(make_variants("m'pral", "mwen", "pral", "pral"))
-TOKENIZER_EXCEPTIONS.update(make_variants("w'ap", "ou", "ap", "ap"))
-TOKENIZER_EXCEPTIONS.update(make_variants("k'ap", "ki", "ap", "ap"))
-TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
-TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))
-
-# Non-apostrophe contractions (with capitalized variants)
-TOKENIZER_EXCEPTIONS.update({
-    "map": [
-        {ORTH: "m", NORM: "mwen"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Map": [
-        {ORTH: "M", NORM: "Mwen"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "lem": [
-        {ORTH: "le", NORM: "le"},
-        {ORTH: "m", NORM: "mwen"},
-    ],
-    "Lem": [
-        {ORTH: "Le", NORM: "Le"},
-        {ORTH: "m", NORM: "mwen"},
-    ],
-    "lew": [
-        {ORTH: "le", NORM: "le"},
-        {ORTH: "w", NORM: "ou"},
-    ],
-    "Lew": [
-        {ORTH: "Le", NORM: "Le"},
-        {ORTH: "w", NORM: "ou"},
-    ],
-    "nap": [
-        {ORTH: "n", NORM: "nou"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Nap": [
-        {ORTH: "N", NORM: "Nou"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "lap": [
-        {ORTH: "l", NORM: "li"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Lap": [
-        {ORTH: "L", NORM: "Li"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "yap": [
-        {ORTH: "y", NORM: "yo"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Yap": [
-        {ORTH: "Y", NORM: "Yo"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "mte": [
-        {ORTH: "m", NORM: "mwen"},
-        {ORTH: "te", NORM: "te"},
-    ],
-    "Mte": [
-        {ORTH: "M", NORM: "Mwen"},
-        {ORTH: "te", NORM: "te"},
-    ],
-    "mpral": [
-        {ORTH: "m", NORM: "mwen"},
-        {ORTH: "pral", NORM: "pral"},
-    ],
-    "Mpral": [
-        {ORTH: "M", NORM: "Mwen"},
-        {ORTH: "pral", NORM: "pral"},
-    ],
-    "wap": [
-        {ORTH: "w", NORM: "ou"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Wap": [
-        {ORTH: "W", NORM: "Ou"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "kap": [
-        {ORTH: "k", NORM: "ki"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Kap": [
-        {ORTH: "K", NORM: "Ki"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "tap": [
-        {ORTH: "t", NORM: "te"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-    "Tap": [
-        {ORTH: "T", NORM: "Te"},
-        {ORTH: "ap", NORM: "ap"},
-    ],
-})
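Each make_variants call above expands into two exception entries, the base form and its capitalized variant, each split into pronoun-plus-apostrophe and the trailing particle. A sketch of the expansion with string keys standing in for the ORTH/NORM symbols:

    def make_variants(base, first_norm, second_orth, second_norm):
        # Same shape as the deleted helper; "ORTH"/"NORM" stand in for the symbols.
        return {
            base: [
                {"ORTH": base.split("'")[0] + "'", "NORM": first_norm},
                {"ORTH": second_orth, "NORM": second_norm},
            ],
            base.capitalize(): [
                {"ORTH": base.split("'")[0].capitalize() + "'", "NORM": first_norm.capitalize()},
                {"ORTH": second_orth, "NORM": second_norm},
            ],
        }

    print(make_variants("m'ap", "mwen", "ap", "ap"))
    # {"m'ap": [{'ORTH': "m'", 'NORM': 'mwen'}, {'ORTH': 'ap', 'NORM': 'ap'}],
    #  "M'ap": [{'ORTH': "M'", 'NORM': 'Mwen'}, {'ORTH': 'ap', 'NORM': 'ap'}]}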
@@ -7,7 +7,7 @@ class IcelandicDefaults(BaseDefaults):
 
 
 class Icelandic(Language):
-    lang = "is"
+    lang = "isl"
     Defaults = IcelandicDefaults
 
 
@@ -32,6 +32,7 @@ split_mode = null
 """
 
 
+@registry.tokenizers("spacy.ja.JapaneseTokenizer")
 def create_tokenizer(split_mode: Optional[str] = None):
     def japanese_tokenizer_factory(nlp):
         return JapaneseTokenizer(nlp.vocab, split_mode=split_mode)
@@ -1,16 +0,0 @@
-from ...language import BaseDefaults, Language
-from .lex_attrs import LEX_ATTRS
-from .stop_words import STOP_WORDS
-
-
-class KurmanjiDefaults(BaseDefaults):
-    stop_words = STOP_WORDS
-    lex_attr_getters = LEX_ATTRS
-
-
-class Kurmanji(Language):
-    lang = "kmr"
-    Defaults = KurmanjiDefaults
-
-
-__all__ = ["Kurmanji"]
@@ -1,17 +0,0 @@
-"""
-Example sentences to test spaCy and its language models.
-
->>> from spacy.lang.kmr.examples import sentences
->>> docs = nlp.pipe(sentences)
-"""
-
-sentences = [
-    "Berê mirovan her tim li geşedana pêşerojê ye",  # People's gaze is always on the development of the future
-    "Kawa Nemir di 14 salan de Ulysses wergerand Kurmancî.",  # Kawa Nemir translated Ulysses into Kurmanji in 14 years.
-    "Mem Ararat hunermendekî Kurd yê bi nav û deng e.",  # Mem Ararat is a famous Kurdish artist
-    "Firat Cewerî 40 sal e pirtûkên Kurdî dinivîsîne.",  # Firat Ceweri has been writing Kurdish books for 40 years
-    "Rojnamegerê ciwan nûçeyeke balkêş li ser rewşa aborî nivîsand",  # The young journalist wrote an interesting news article about the economic situation
-    "Sektora çandiniyê beşeke giring a belavkirina gaza serayê li seranserê cîhanê pêk tîne",  # The agricultural sector constitutes an important part of greenhouse gas emissions worldwide
-    "Xwendekarên jêhatî di pêşbaziya matematîkê de serkeftî bûn",  # Talented students succeeded in the mathematics competition
-    "Ji ber ji tunebûnê bavê min xwişkeke min nedan xwendin ew ji min re bû derd û kulek.",  # Because of poverty, my father didn't send my sister to school, which became a pain and sorrow for me
-]
@@ -1,138 +0,0 @@
-from ...attrs import LIKE_NUM
-
-_num_words = [
-    "sifir",
-    "yek",
-    "du",
-    "sê",
-    "çar",
-    "pênc",
-    "şeş",
-    "heft",
-    "heşt",
-    "neh",
-    "deh",
-    "yazde",
-    "dazde",
-    "sêzde",
-    "çarde",
-    "pazde",
-    "şazde",
-    "hevde",
-    "hejde",
-    "nozde",
-    "bîst",
-    "sî",
-    "çil",
-    "pêncî",
-    "şêst",
-    "heftê",
-    "heştê",
-    "nod",
-    "sed",
-    "hezar",
-    "milyon",
-    "milyar",
-]
-
-_ordinal_words = [
-    "yekem",
-    "yekemîn",
-    "duyem",
-    "duyemîn",
-    "sêyem",
-    "sêyemîn",
-    "çarem",
-    "çaremîn",
-    "pêncem",
-    "pêncemîn",
-    "şeşem",
-    "şeşemîn",
-    "heftem",
-    "heftemîn",
-    "heştem",
-    "heştemîn",
-    "nehem",
-    "nehemîn",
-    "dehem",
-    "dehemîn",
-    "yazdehem",
-    "yazdehemîn",
-    "dazdehem",
-    "dazdehemîn",
-    "sêzdehem",
-    "sêzdehemîn",
-    "çardehem",
-    "çardehemîn",
-    "pazdehem",
-    "pazdehemîn",
-    "şanzdehem",
-    "şanzdehemîn",
-    "hevdehem",
-    "hevdehemîn",
-    "hejdehem",
-    "hejdehemîn",
-    "nozdehem",
-    "nozdehemîn",
-    "bîstem",
-    "bîstemîn",
-    "sîyem",
-    "sîyemîn",
-    "çilem",
-    "çilemîn",
-    "pêncîyem",
-    "pênciyemîn",
-    "şêstem",
-    "şêstemîn",
-    "heftêyem",
-    "heftêyemîn",
-    "heştêyem",
-    "heştêyemîn",
-    "notem",
-    "notemîn",
-    "sedem",
-    "sedemîn",
-    "hezarem",
-    "hezaremîn",
-    "milyonem",
-    "milyonemîn",
-    "milyarem",
-    "milyaremîn",
-]
-
-
-def like_num(text):
-    if text.startswith(("+", "-", "±", "~")):
-        text = text[1:]
-    text = text.replace(",", "").replace(".", "")
-    if text.isdigit():
-        return True
-    if text.count("/") == 1:
-        num, denom = text.split("/")
-        if num.isdigit() and denom.isdigit():
-            return True
-    text_lower = text.lower()
-    if text_lower in _num_words:
-        return True
-
-    # Check ordinal number
-    if text_lower in _ordinal_words:
-        return True
-
-    if is_digit(text_lower):
-        return True
-
-    return False
-
-
-def is_digit(text):
-    endings = ("em", "yem", "emîn", "yemîn")
-    for ending in endings:
-        to = len(ending)
-        if text.endswith(ending) and text[:-to].isdigit():
-            return True
-
-    return False
-
-
-LEX_ATTRS = {LIKE_NUM: like_num}
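The deleted is_digit helper treats a digit string followed by one of the Kurmanji ordinal endings as number-like. A standalone copy for illustration:

    # Mirrors the deleted kmr is_digit check.
    def is_digit(text):
        endings = ("em", "yem", "emîn", "yemîn")
        for ending in endings:
            to = len(ending)
            if text.endswith(ending) and text[:-to].isdigit():
                return True
        return False

    print(is_digit("21em"), is_digit("100emîn"), is_digit("yekem"))
    # True True False ("yekem" is handled by the ordinal word list instead)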
@@ -1,44 +0,0 @@
-STOP_WORDS = set(
-    """
-û
-li
-bi
-di
-da
-de
-ji
-ku
-ew
-ez
-tu
-em
-hûn
-ew
-ev
-min
-te
-wî
-wê
-me
-we
-wan
-vê
-vî
-va
-çi
-kî
-kê
-çawa
-çima
-kengî
-li ku
-çend
-çiqas
-her
-hin
-gelek
-hemû
-kes
-tişt
-""".split()
-)
@@ -17,33 +17,23 @@ DEFAULT_CONFIG = """
 
 [nlp.tokenizer]
 @tokenizers = "spacy.ko.KoreanTokenizer"
+mecab_args = ""
 """
 
 
-def create_tokenizer():
+@registry.tokenizers("spacy.ko.KoreanTokenizer")
+def create_tokenizer(mecab_args: str):
     def korean_tokenizer_factory(nlp):
-        return KoreanTokenizer(nlp.vocab)
+        return KoreanTokenizer(nlp.vocab, mecab_args=mecab_args)
 
     return korean_tokenizer_factory
 
 
 class KoreanTokenizer(DummyTokenizer):
-    def __init__(self, vocab: Vocab):
+    def __init__(self, vocab: Vocab, *, mecab_args: str = ""):
         self.vocab = vocab
-        self._mecab = try_mecab_import()  # type: ignore[func-returns-value]
-        self._mecab_tokenizer = None
-
-    @property
-    def mecab_tokenizer(self):
-        # This is a property so that initializing a pipeline with blank:ko is
-        # possible without actually requiring mecab-ko, e.g. to run
-        # `spacy init vectors ko` for a pipeline that will have a different
-        # tokenizer in the end. The languages need to match for the vectors
-        # to be imported and there's no way to pass a custom config to
-        # `init vectors`.
-        if self._mecab_tokenizer is None:
-            self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
-        return self._mecab_tokenizer
+        mecab = try_mecab_import()
+        self.mecab_tokenizer = mecab.Tagger(mecab_args)
 
     def __reduce__(self):
         return KoreanTokenizer, (self.vocab,)
@@ -66,13 +56,15 @@ class KoreanTokenizer(DummyTokenizer):
     def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
         # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
         # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
-        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
-            if node.is_eos():
+        for line in self.mecab_tokenizer.parse(text).split("\n"):
+            if line == "EOS":
                 break
-            surface = node.surface
-            feature = node.feature
-            tag, _, expr = feature.partition(",")
-            lemma, _, remainder = expr.partition("/")
+            surface, _, expr = line.partition("\t")
+            features = expr.split("/")[0].split(",")
+            tag = features[0]
+            lemma = "*"
+            if len(features) >= 8:
+                lemma = features[7]
             if lemma == "*":
                 lemma = surface
             yield {"surface": surface, "lemma": lemma, "tag": tag}
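The rewritten detailed_tokens parses mecab-ko's plain-text output, one tab-separated line per token with comma-separated features, instead of natto-py node objects. A standalone sketch of the field handling on a hypothetical output line (the line itself is illustrative; only the layout matters):

    # Hypothetical mecab-ko output line; field 0 is the tag, field 7 the expression.
    line = "먹었다\tVV+EP+EF,*,T,먹었다,Inflect,VV,EF,먹/VV/*+었/EP/*+다/EF/*"

    surface, _, expr = line.partition("\t")
    features = expr.split("/")[0].split(",")
    tag = features[0]
    lemma = features[7] if len(features) >= 8 else "*"
    if lemma == "*":
        lemma = surface
    print({"surface": surface, "tag": tag, "lemma": lemma})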
@@ -95,20 +87,94 @@ class Korean(Language):
     Defaults = KoreanDefaults
 
 
-def try_mecab_import() -> None:
+def try_mecab_import():
     try:
-        from natto import MeCab
+        import mecab_ko as MeCab
 
         return MeCab
     except ImportError:
         raise ImportError(
             'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
-            "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
-            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
-            "and [natto-py](https://github.com/buruzaemon/natto-py)"
+            "the python package `mecab-ko`: pip install mecab-ko"
         ) from None
 
 
+@registry.tokenizers("spacy.KoreanNattoTokenizer.v1")
+def create_natto_tokenizer():
+    def korean_natto_tokenizer_factory(nlp):
+        return KoreanNattoTokenizer(nlp.vocab)
+
+    return korean_natto_tokenizer_factory
+
+
+class KoreanNattoTokenizer(DummyTokenizer):
+    def __init__(self, vocab: Vocab):
+        self.vocab = vocab
+        self._mecab = self._try_mecab_import()  # type: ignore[func-returns-value]
+        self._mecab_tokenizer = None
+
+    @property
+    def mecab_tokenizer(self):
+        # This is a property so that initializing a pipeline with blank:ko is
+        # possible without actually requiring mecab-ko, e.g. to run
+        # `spacy init vectors ko` for a pipeline that will have a different
+        # tokenizer in the end. The languages need to match for the vectors
+        # to be imported and there's no way to pass a custom config to
+        # `init vectors`.
+        if self._mecab_tokenizer is None:
+            self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
+        return self._mecab_tokenizer
+
+    def __reduce__(self):
+        return KoreanNattoTokenizer, (self.vocab,)
+
+    def __call__(self, text: str) -> Doc:
+        dtokens = list(self.detailed_tokens(text))
+        surfaces = [dt["surface"] for dt in dtokens]
+        doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
+        for token, dtoken in zip(doc, dtokens):
+            first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
+            token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
+            if token.tag_ in TAG_MAP:
+                token.pos = TAG_MAP[token.tag_][POS]
+            else:
+                token.pos = X
+            token.lemma_ = dtoken["lemma"]
+        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
+        return doc
+
+    def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
+        # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
+        # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
+        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
+            if node.is_eos():
+                break
+            surface = node.surface
+            feature = node.feature
+            tag, _, expr = feature.partition(",")
+            lemma, _, remainder = expr.partition("/")
+            if lemma == "*" or lemma == "":
+                lemma = surface
+            yield {"surface": surface, "lemma": lemma, "tag": tag}
+
+    def score(self, examples):
+        validate_examples(examples, "KoreanTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
+    def _try_mecab_import(self):
+        try:
+            from natto import MeCab
+
+            return MeCab
+        except ImportError:
+            raise ImportError(
+                'The Korean Natto tokenizer ("spacy.ko.KoreanNattoTokenizer") requires '
+                "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
+                "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
+                "and [natto-py](https://github.com/buruzaemon/natto-py)"
+            ) from None
+
+
 def check_spaces(text, tokens):
     prev_end = -1
     start = 0
@@ -24,6 +24,12 @@ class MacedonianDefaults(BaseDefaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
 
+    @classmethod
+    def create_lemmatizer(cls, nlp=None, lookups=None):
+        if lookups is None:
+            lookups = Lookups()
+        return MacedonianLemmatizer(lookups)
+
 
 class Macedonian(Language):
     lang = "mk"
@@ -3,10 +3,10 @@ from ...language import Language
 
 class MultiLanguage(Language):
     """Language class to be used for models that support multiple languages.
-    This module allows models to specify their language ID as 'xx'.
+    This module allows models to specify their language ID as 'mul'.
     """
 
-    lang = "xx"
+    lang = "mul"
 
 
 __all__ = ["MultiLanguage"]
@@ -13,6 +13,7 @@ DEFAULT_CONFIG = """
 """
 
 
+@registry.tokenizers("spacy.th.ThaiTokenizer")
 def create_thai_tokenizer():
     def thai_tokenizer_factory(nlp):
         return ThaiTokenizer(nlp.vocab)
@@ -16,10 +16,6 @@ URL_PATTERN = (
     r"(?:\S+(?::\S*)?@)?"
     r"(?:"
     # IP address exclusion
-    # private & local networks
-    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
-    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
-    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
     # IP address dotted notation octets
     # excludes loopback network 0.0.0.0
     # excludes reserved space >= 224.0.0.0
@@ -22,6 +22,7 @@ use_pyvi = true
 """
 
 
+@registry.tokenizers("spacy.vi.VietnameseTokenizer")
 def create_vietnamese_tokenizer(use_pyvi: bool = True):
     def vietnamese_tokenizer_factory(nlp):
         return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi)
@@ -31,7 +31,7 @@ segmenter = "char"
 [initialize]
 
 [initialize.tokenizer]
-pkuseg_model = null
+pkuseg_model = "spacy_ontonotes"
 pkuseg_user_dict = "default"
 """
 
@@ -46,6 +46,7 @@ class Segmenter(str, Enum):
         return list(cls.__members__.keys())
 
 
+@registry.tokenizers("spacy.zh.ChineseTokenizer")
 def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
     def chinese_tokenizer_factory(nlp):
         return ChineseTokenizer(nlp.vocab, segmenter=segmenter)
@ -5,7 +5,7 @@ import multiprocessing as mp
|
|||
import random
|
||||
import traceback
|
||||
import warnings
|
||||
from contextlib import ExitStack, contextmanager
|
||||
from contextlib import contextmanager
|
||||
from copy import deepcopy
|
||||
from dataclasses import dataclass
|
||||
from itertools import chain, cycle
|
||||
|
@@ -18,6 +18,7 @@ from typing import (
     Iterable,
     Iterator,
     List,
+    Literal,
     NoReturn,
     Optional,
     Pattern,
@@ -30,14 +31,10 @@ from typing import (
     overload,
 )
 
-import numpy
 import srsly
-from cymem.cymem import Pool
 from thinc.api import Config, CupyOps, Optimizer, get_current_ops
-from thinc.util import convert_recursive
 
 from . import about, ty, util
-from .compat import Literal
 from .errors import Errors, Warnings
 from .git_info import GIT_VERSION
 from .lang.punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
@@ -55,7 +52,7 @@ from .scorer import Scorer
 from .tokenizer import Tokenizer
 from .tokens import Doc
 from .tokens.underscore import Underscore
-from .training import Example, validate_examples
+from .training import Example, validate_distillation_examples, validate_examples
 from .training.initialize import init_tok2vec, init_vocab
 from .util import (
     _DEFAULT_EMPTY_PIPES,
@@ -77,6 +74,9 @@ PipeCallable = Callable[[Doc], Doc]
 # This is the base config will all settings (training etc.)
 DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
 DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
+# This is the base config for the [distillation] block and currently not included
+# in the main config and only added via the 'init fill-config' command
+DEFAULT_CONFIG_DISTILL_PATH = Path(__file__).parent / "default_config_distillation.cfg"
 # This is the base config for the [pretraining] block and currently not included
 # in the main config and only added via the 'init fill-config' command
 DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg"
@@ -104,6 +104,7 @@ class BaseDefaults:
     writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
 
 
+@registry.tokenizers("spacy.Tokenizer.v1")
 def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     """Registered function to create a tokenizer. Returns a factory that takes
     the nlp object and returns a Tokenizer instance using the language detaults.
@@ -129,19 +130,13 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     return tokenizer_factory
 
 
-def load_lookups_data(lang, tables):
-    util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
-    lookups = load_lookups(lang=lang, tables=tables)
-    return lookups
-
-
 class Language:
     """A text-processing pipeline. Usually you'll load this once per process,
     and pass the instance around your application.
 
     Defaults (class): Settings, data and factory methods for creating the `nlp`
         object and processing pipeline.
-    lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language codes, such as 'en' and 'eng'.
+    lang (str): IETF language code, such as 'en'.
 
     DOCS: https://spacy.io/api/language
     """
@@ -183,9 +178,6 @@ class Language:
 
         DOCS: https://spacy.io/api/language#init
         """
-        from .pipeline.factories import register_factories
-
-        register_factories()
         # We're only calling this to import all factories provided via entry
         # points. The factory decorator applied to these functions takes care
         # of the rest.
@@ -202,8 +194,7 @@ class Language:
         if not isinstance(vocab, Vocab) and vocab is not True:
             raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
         if vocab is True:
-            vectors_name = meta.get("vectors", {}).get("name")
-            vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
+            vocab = create_vocab(self.lang, self.Defaults)
             if not create_vectors:
                 vectors_cfg = {"vectors": self._config["nlp"]["vectors"]}
                 create_vectors = registry.resolve(vectors_cfg)["vectors"]
@@ -261,7 +252,6 @@ class Language:
                 "width": self.vocab.vectors_length,
                 "vectors": len(self.vocab.vectors),
                 "keys": self.vocab.vectors.n_keys,
-                "name": self.vocab.vectors.name,
                 "mode": self.vocab.vectors.mode,
             }
         self._meta["labels"] = dict(self.pipe_labels)
@@ -772,8 +762,8 @@ class Language:
         *,
         before: Optional[Union[str, int]] = None,
         after: Optional[Union[str, int]] = None,
-        first: Optional[bool] = None,
-        last: Optional[bool] = None,
+        first: Optional[Literal[True]] = None,
+        last: Optional[Literal[True]] = None,
         source: Optional["Language"] = None,
         config: Dict[str, Any] = SimpleFrozenDict(),
         raw_config: Optional[Config] = None,
@@ -792,8 +782,8 @@ class Language:
             component directly before.
         after (Union[str, int]): Name or index of the component to insert new
             component directly after.
-        first (bool): If True, insert component first in the pipeline.
-        last (bool): If True, insert component last in the pipeline.
+        first (Optional[Literal[True]]): If True, insert component first in the pipeline.
+        last (Optional[Literal[True]]): If True, insert component last in the pipeline.
         source (Language): Optional loaded nlp object to copy the pipeline
             component from.
         config (Dict[str, Any]): Config parameters to use for this component.
@@ -839,18 +829,22 @@ class Language:
         self,
         before: Optional[Union[str, int]] = None,
         after: Optional[Union[str, int]] = None,
-        first: Optional[bool] = None,
-        last: Optional[bool] = None,
+        first: Optional[Literal[True]] = None,
+        last: Optional[Literal[True]] = None,
     ) -> int:
         """Determine where to insert a pipeline component based on the before/
         after/first/last values.
 
         before (str): Name or index of the component to insert directly before.
         after (str): Name or index of component to insert directly after.
-        first (bool): If True, insert component first in the pipeline.
-        last (bool): If True, insert component last in the pipeline.
+        first (Optional[Literal[True]]): If True, insert component first in the pipeline.
+        last (Optional[Literal[True]]): If True, insert component last in the pipeline.
         RETURNS (int): The index of the new pipeline component.
         """
+        if first is not None and first is not True:
+            raise ValueError(Errors.E4009.format(attr="first", value=first))
+        if last is not None and last is not True:
+            raise ValueError(Errors.E4009.format(attr="last", value=last))
         all_args = {"before": before, "after": after, "first": first, "last": last}
         if sum(arg is not None for arg in [before, after, first, last]) >= 2:
             raise ValueError(
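With the Literal[True] signature and the E4009 guard above, first/last only accept True or None; a value like first=False, which previously behaved inconsistently, is now rejected outright. A sketch of the effect, assuming the v4 API (E4009 is the error code added on this branch):

    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe("sentencizer", first=True)  # still fine
    try:
        nlp.add_pipe("tagger", first=False)  # now raises instead of being ignored
    except ValueError as err:
        print("rejected:", err)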
@@ -1060,6 +1054,116 @@ class Language:
             raise ValueError(Errors.E005.format(name=name, returned_type=type(doc)))
         return doc
 
+    def distill(
+        self,
+        teacher: "Language",
+        examples: Iterable[Example],
+        *,
+        drop: float = 0.0,
+        sgd: Union[Optimizer, None, Literal[False]] = None,
+        losses: Optional[Dict[str, float]] = None,
+        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
+        exclude: Iterable[str] = SimpleFrozenList(),
+        annotates: Iterable[str] = SimpleFrozenList(),
+        student_to_teacher: Optional[Dict[str, str]] = None,
+    ):
+        """Distill the models in a student pipeline from a teacher pipeline.
+        teacher (Language): Teacher to distill from.
+        examples (Iterable[Example]): Distillation examples. The reference
+            (teacher) and predicted (student) docs must have the same number of
+            tokens and the same orthography.
+        drop (float): The dropout rate.
+        sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
+            be created via create_optimizer if 'None'. No optimizer will
+            be used when set to 'False'.
+        losses (Optional(Dict[str, float])): Dictionary to update with the loss,
+            keyed by component.
+        component_cfg (Optional[Dict[str, Dict[str, Any]]]): Config parameters
+            for specific pipeline components, keyed by component name.
+        exclude (Iterable[str]): Names of components that shouldn't be updated.
+        annotates (Iterable[str]): Names of components that should set
+            annotations on the predicted examples after updating.
+        student_to_teacher (Optional[Dict[str, str]]): Map student pipe name to
+            teacher pipe name, only needed for pipes where the student pipe
+            name does not match the teacher pipe name.
+        RETURNS (Dict[str, float]): The updated losses dictionary
+
+        DOCS: https://spacy.io/api/language#distill
+        """
+        if student_to_teacher is None:
+            student_to_teacher = {}
+        if losses is None:
+            losses = {}
+        if isinstance(examples, list) and len(examples) == 0:
+            return losses
+
+        validate_distillation_examples(examples, "Language.distill")
+        examples = _copy_examples(examples, copy_x=True, copy_y=True)
+
+        if sgd is None:
+            if self._optimizer is None:
+                self._optimizer = self.create_optimizer()
+            sgd = self._optimizer
+
+        if component_cfg is None:
+            component_cfg = {}
+        pipe_kwargs = {}
+        for student_name, student_proc in self.pipeline:
+            component_cfg.setdefault(student_name, {})
+            pipe_kwargs[student_name] = deepcopy(component_cfg[student_name])
+            component_cfg[student_name].setdefault("drop", drop)
+            pipe_kwargs[student_name].setdefault("batch_size", self.batch_size)
+
+        teacher_pipes = dict(teacher.pipeline)
+        for student_name, student_proc in self.pipeline:
+            if student_name in annotates:
+                for doc, eg in zip(
+                    _pipe(
+                        (eg.predicted for eg in examples),
+                        proc=student_proc,
+                        name=student_name,
+                        default_error_handler=self.default_error_handler,
+                        kwargs=pipe_kwargs[student_name],
+                    ),
+                    examples,
+                ):
+                    eg.predicted = doc
+
+            if (
+                student_name not in exclude
+                and isinstance(student_proc, ty.DistillableComponent)
+                and student_proc.is_distillable
+            ):
+                # A missing teacher pipe is not an error, some student pipes
+                # do not need a teacher, such as tok2vec layer losses.
+                teacher_name = (
+                    student_to_teacher[student_name]
+                    if student_name in student_to_teacher
+                    else student_name
+                )
+                teacher_pipe = teacher_pipes.get(teacher_name, None)
+                student_proc.distill(
+                    teacher_pipe,
+                    examples,
+                    sgd=None,
+                    losses=losses,
+                    **component_cfg[student_name],
+                )
+
+        # Only finish the update after all component updates are done. Some
+        # components may share weights (such as tok2vec) and we only want
+        # to apply weight updates after all gradients are accumulated.
+        for student_name, student_proc in self.pipeline:
+            if (
+                student_name not in exclude
+                and isinstance(student_proc, ty.DistillableComponent)
+                and student_proc.is_distillable
+                and sgd not in (None, False)
+            ):
+                student_proc.finish_update(sgd)
+
+        return losses
+
     def disable_pipes(self, *names) -> "DisabledPipes":
         """Disable one or more pipeline components. If used as a context
         manager, the pipeline will be restored to the initial state at the end
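A hedged sketch of driving the new distill API: both pipelines must tokenize identically, and only raw text is needed since the teacher provides the supervision. The model names below are placeholders, not shipped artifacts:

    import spacy
    from spacy.training import Example

    teacher = spacy.load("en_teacher_model")   # placeholder name
    student = spacy.load("en_student_model")   # placeholder name

    texts = ["Distillation only needs raw text."]
    examples = [Example(student.make_doc(t), teacher.make_doc(t)) for t in texts]
    losses = student.distill(teacher, examples, drop=0.1)
    print(losses)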
@@ -1148,7 +1252,7 @@ class Language:
         _: Optional[Any] = None,
         *,
         drop: float = 0.0,
-        sgd: Optional[Optimizer] = None,
+        sgd: Union[Optimizer, None, Literal[False]] = None,
         losses: Optional[Dict[str, float]] = None,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         exclude: Iterable[str] = SimpleFrozenList(),
@@ -1159,7 +1263,9 @@ class Language:
         examples (Iterable[Example]): A batch of examples
         _: Should not be set - serves to catch backwards-incompatible scripts.
         drop (float): The dropout rate.
-        sgd (Optimizer): An optimizer.
+        sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
+            be created via create_optimizer if 'None'. No optimizer will
+            be used when set to 'False'.
         losses (Dict[str, float]): Dictionary to update with the loss, keyed by
             component.
         component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
@@ -1192,17 +1298,12 @@ class Language:
             component_cfg[name].setdefault("drop", drop)
             pipe_kwargs[name].setdefault("batch_size", self.batch_size)
         for name, proc in self.pipeline:
-            # ignore statements are used here because mypy ignores hasattr
-            if name not in exclude and hasattr(proc, "update"):
-                proc.update(examples, sgd=None, losses=losses, **component_cfg[name])  # type: ignore
-            if sgd not in (None, False):
-                if (
-                    name not in exclude
-                    and isinstance(proc, ty.TrainableComponent)
-                    and proc.is_trainable
-                    and proc.model not in (True, False, None)
-                ):
-                    proc.finish_update(sgd)
+            if (
+                name not in exclude
+                and isinstance(proc, ty.TrainableComponent)
+                and proc.is_trainable
+            ):
+                proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
             if name in annotates:
                 for doc, eg in zip(
                     _pipe(
@@ -1215,7 +1316,19 @@ class Language:
                     examples,
                 ):
                     eg.predicted = doc
-        return _replace_numpy_floats(losses)
+        # Only finish the update after all component updates are done. Some
+        # components may share weights (such as tok2vec) and we only want
+        # to apply weight updates after all gradients are accumulated.
+        for name, proc in self.pipeline:
+            if (
+                name not in exclude
+                and isinstance(proc, ty.TrainableComponent)
+                and proc.is_trainable
+                and sgd not in (None, False)
+            ):
+                proc.finish_update(sgd)
+
+        return losses
 
     def rehearse(
         self,
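Deferring finish_update until every component has produced its gradients matters when components share weights (a listener-shared tok2vec, for example); it also lets a caller accumulate gradients across several batches by passing sgd=False and applying the optimizer manually. A sketch, assuming nlp, batches, and losses are already in scope:

    optimizer = nlp.create_optimizer()
    for batch in batches:
        nlp.update(batch, sgd=False, losses=losses)  # accumulate gradients only
    # apply all accumulated gradients in one step
    for name, proc in nlp.pipeline:
        if hasattr(proc, "finish_update"):
            proc.finish_update(optimizer)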
@@ -1281,25 +1394,20 @@ class Language:
                 sgd(key, W, dW)  # type: ignore[call-arg, misc]
         return losses
 
-    def begin_training(
-        self,
-        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
-        *,
-        sgd: Optional[Optimizer] = None,
-    ) -> Optimizer:
-        warnings.warn(Warnings.W089, DeprecationWarning)
-        return self.initialize(get_examples, sgd=sgd)
-
     def initialize(
         self,
         get_examples: Optional[Callable[[], Iterable[Example]]] = None,
         *,
+        labels: Optional[Dict[str, Any]] = None,
         sgd: Optional[Optimizer] = None,
     ) -> Optimizer:
         """Initialize the pipe for training, using data examples if available.
 
         get_examples (Callable[[], Iterable[Example]]): Optional function that
             returns gold-standard Example objects.
+        labels (Optional[Dict[str, Any]]): Labels to pass to pipe initialization,
+            using the names of the pipes as keys. Overrides labels that are in
+            the model configuration.
         sgd (Optional[Optimizer]): An optimizer to use for updates. If not
             provided, will be created using the .create_optimizer() method.
         RETURNS (thinc.api.Optimizer): The optimizer.
@@ -1347,6 +1455,8 @@ class Language:
         for name, proc in self.pipeline:
             if isinstance(proc, ty.InitializableComponent):
                 p_settings = I["components"].get(name, {})
+                if labels is not None and name in labels:
+                    p_settings["labels"] = labels[name]
                 p_settings = validate_init_settings(
                     proc.initialize, p_settings, section="components", name=name
                 )
@@ -1466,7 +1576,7 @@ class Language:
         results = scorer.score(examples, per_component=per_component)
         n_words = sum(len(eg.predicted) for eg in examples)
         results["speed"] = n_words / (end_time - start_time)
-        return _replace_numpy_floats(results)
+        return results
 
     def create_optimizer(self):
         """Create an optimizer, usually using the [training.optimizer] config."""
@@ -1820,6 +1930,7 @@ class Language:
         # using the nlp.config with all defaults.
         config = util.copy_config(config)
         orig_pipeline = config.pop("components", {})
+        orig_distill = config.pop("distillation", None)
        orig_pretraining = config.pop("pretraining", None)
         config["components"] = {}
         if auto_fill:
@@ -1828,6 +1939,9 @@ class Language:
             filled = config
         filled["components"] = orig_pipeline
         config["components"] = orig_pipeline
+        if orig_distill is not None:
+            filled["distillation"] = orig_distill
+            config["distillation"] = orig_distill
         if orig_pretraining is not None:
             filled["pretraining"] = orig_pretraining
             config["pretraining"] = orig_pretraining
@@ -2095,38 +2209,6 @@ class Language:
             util.replace_model_node(pipe.model, listener, new_model)  # type: ignore[attr-defined]
         tok2vec.remove_listener(listener, pipe_name)
 
-    @contextmanager
-    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
-        """Begin a block where all resources allocated during the block will
-        be freed at the end of it. If a resources was created within the
-        memory zone block, accessing it outside the block is invalid.
-        Behaviour of this invalid access is undefined. Memory zones should
-        not be nested.
-
-        The memory zone is helpful for services that need to process large
-        volumes of text with a defined memory budget.
-
-        Example
-        -------
-        >>> with nlp.memory_zone():
-        ...     for doc in nlp.pipe(texts):
-        ...         process_my_doc(doc)
-        >>> # use_doc(doc) <-- Invalid: doc was allocated in the memory zone
-        """
-        if mem is None:
-            mem = Pool()
-        # The ExitStack allows programmatic nested context managers.
-        # We don't know how many we need, so it would be awkward to have
-        # them as nested blocks.
-        with ExitStack() as stack:
-            contexts = [stack.enter_context(self.vocab.memory_zone(mem))]
-            if hasattr(self.tokenizer, "memory_zone"):
-                contexts.append(stack.enter_context(self.tokenizer.memory_zone(mem)))
-            for _, pipe in self.pipeline:
-                if hasattr(pipe, "memory_zone"):
-                    contexts.append(stack.enter_context(pipe.memory_zone(mem)))
-            yield mem
-
     def to_disk(
         self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> None:
@@ -2144,9 +2226,7 @@ class Language:
         serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(  # type: ignore[union-attr]
             p, exclude=["vocab"]
         )
-        serializers["meta.json"] = lambda p: srsly.write_json(
-            p, _replace_numpy_floats(self.meta)
-        )
+        serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta)
         serializers["config.cfg"] = lambda p: self.config.to_disk(p)
         for name, proc in self._components:
             if name in exclude:
@@ -2214,9 +2294,6 @@ class Language:
             if path.exists():
                 data = srsly.read_json(path)
                 self.meta.update(data)
-                # self.meta always overrides meta["vectors"] with the metadata
-                # from self.vocab.vectors, so set the name directly
-                self.vocab.vectors.name = data.get("vectors", {}).get("name")
 
         def deserialize_vocab(path: Path) -> None:
             if path.exists():
@@ -2260,9 +2337,7 @@ class Language:
         serializers: Dict[str, Callable[[], bytes]] = {}
         serializers["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
         serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])  # type: ignore[union-attr]
-        serializers["meta.json"] = lambda: srsly.json_dumps(
-            _replace_numpy_floats(self.meta)
-        )
+        serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
         serializers["config.cfg"] = lambda: self.config.to_bytes()
         for name, proc in self._components:
             if name in exclude:
@@ -2287,9 +2362,6 @@ class Language:
         def deserialize_meta(b):
             data = srsly.json_loads(b)
             self.meta.update(data)
-            # self.meta always overrides meta["vectors"] with the metadata
-            # from self.vocab.vectors, so set the name directly
-            self.vocab.vectors.name = data.get("vectors", {}).get("name")
 
         deserializers: Dict[str, Callable[[bytes], Any]] = {}
         deserializers["config.cfg"] = lambda b: self.config.from_bytes(
@@ -2313,12 +2385,6 @@ class Language:
         return self
 
 
-def _replace_numpy_floats(meta_dict: dict) -> dict:
-    return convert_recursive(
-        lambda v: isinstance(v, numpy.floating), lambda v: float(v), dict(meta_dict)
-    )
-
-
 @dataclass
 class FactoryMeta:
     """Dataclass containing information about a component and its defaults
@@ -2362,13 +2428,18 @@ class DisabledPipes(list):
         self[:] = []
 
 
-def _copy_examples(examples: Iterable[Example]) -> List[Example]:
+def _copy_examples(
+    examples: Iterable[Example], *, copy_x: bool = True, copy_y: bool = False
+) -> List[Example]:
     """Make a copy of a batch of examples, copying the predicted Doc as well.
     This is used in contexts where we need to take ownership of the examples
     so that they can be mutated, for instance during Language.evaluate and
     Language.update.
     """
-    return [Example(eg.x.copy(), eg.y) for eg in examples]
+    return [
+        Example(eg.x.copy() if copy_x else eg.x, eg.y.copy() if copy_y else eg.y)
+        for eg in examples
+    ]
 
 
 def _apply_pipes(
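The new flags make the copy direction explicit: existing callers keep the old behaviour (copy the predicted doc only), while distill above passes copy_x=True, copy_y=True because both docs get mutated. In sketch form, assuming a batch of Example objects:

    copied = _copy_examples(examples)                             # eg.x copied, eg.y shared
    copied = _copy_examples(examples, copy_x=True, copy_y=True)   # both copied (distill path)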
@@ -12,7 +12,6 @@ from .attrs cimport (
     SUFFIX,
     attr_id_t,
 )
-from .strings cimport StringStore
 from .structs cimport LexemeC
 from .typedefs cimport attr_t, flags_t, hash_t, len_t, tag_t
 from .vocab cimport Vocab
@@ -35,7 +34,7 @@ cdef class Lexeme:
         return self
 
     @staticmethod
-    cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) noexcept nogil:
+    cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
         if name < (sizeof(flags_t) * 8):
             Lexeme.c_set_flag(lex, name, value)
         elif name == ID:
@@ -54,7 +53,7 @@ cdef class Lexeme:
             lex.lang = value
 
     @staticmethod
-    cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) noexcept nogil:
+    cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
         if feat_name < (sizeof(flags_t) * 8):
             if Lexeme.c_check_flag(lex, feat_name):
                 return 1
@@ -82,7 +81,7 @@ cdef class Lexeme:
         return 0
 
     @staticmethod
-    cdef inline bint c_check_flag(const LexemeC* lexeme, attr_id_t flag_id) noexcept nogil:
+    cdef inline bint c_check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
         cdef flags_t one = 1
         if lexeme.flags & (one << flag_id):
             return True
@@ -90,7 +89,7 @@ cdef class Lexeme:
         return False
 
     @staticmethod
-    cdef inline bint c_set_flag(LexemeC* lex, attr_id_t flag_id, bint value) noexcept nogil:
+    cdef inline bint c_set_flag(LexemeC* lex, attr_id_t flag_id, bint value) nogil:
         cdef flags_t one = 1
         if value:
             lex.flags |= one << flag_id
@@ -19,7 +19,6 @@ class Lexeme:
     def vector_norm(self) -> float: ...
     vector: Floats1d
     rank: int
-    sentiment: float
     @property
     def orth_(self) -> str: ...
     @property
@@ -57,7 +57,7 @@ cdef class Lexeme:
         """
         self.vocab = vocab
         self.orth = orth
-        self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
+        self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
         if self.c.orth != orth:
             raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))
 
@@ -70,7 +70,7 @@ cdef class Lexeme:
         if isinstance(other, Lexeme):
             a = self.orth
             b = other.orth
-        elif isinstance(other, int):
+        elif isinstance(other, long):
             a = self.orth
             b = other
         elif isinstance(other, str):
@@ -104,7 +104,7 @@ cdef class Lexeme:
             # skip PROB, e.g. from lexemes.jsonl
             if isinstance(value, float):
                 continue
-            elif isinstance(value, int):
+            elif isinstance(value, (int, long)):
                 Lexeme.set_struct_attr(self.c, attr, value)
             else:
                 Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))
@@ -193,20 +193,6 @@ cdef class Lexeme:
     def rank(self, value):
         self.c.id = value
 
-    @property
-    def sentiment(self):
-        """RETURNS (float): A scalar value indicating the positivity or
-        negativity of the lexeme."""
-        sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
-        return sentiment_table.get(self.c.orth, 0.0)
-
-    @sentiment.setter
-    def sentiment(self, float x):
-        if "lexeme_sentiment" not in self.vocab.lookups:
-            self.vocab.lookups.add_table("lexeme_sentiment")
-        sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
-        sentiment_table[self.c.orth] = x
-
     @property
     def orth_(self):
         """RETURNS (str): The original verbatim text of the lexeme
@@ -2,16 +2,40 @@ from collections import OrderedDict
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
+import requests
 import srsly
 from preshed.bloom import BloomFilter
 
 from .errors import Errors
 from .strings import get_string_id
-from .util import SimpleFrozenDict, ensure_path, load_language_data, registry
+from .util import SimpleFrozenDict, ensure_path, load_language_data, logger, registry
 
 UNSET = object()
 
 
+@registry.misc("spacy.LookupsDataLoader.v1")
+def load_lookups_data(lang, tables):
+    logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
+    lookups = load_lookups(lang=lang, tables=tables)
+    return lookups
+
+
+@registry.misc("spacy.LookupsDataLoaderFromURL.v1")
+def load_lookups_data_from_url(lang, tables, url):
+    logger.debug(f"Loading lookups from {url}: {tables}")
+    lookups = Lookups()
+    for table in tables:
+        table_url = url + lang + "_" + table + ".json"
+        r = requests.get(table_url)
+        if r.status_code != 200:
+            raise ValueError(
+                Errors.E4011.format(status_code=r.status_code, url=table_url)
+            )
+        table_data = r.json()
+        lookups.add_table(table, table_data)
+    return lookups
+
+
 def load_lookups(lang: str, tables: List[str], strict: bool = True) -> "Lookups":
     """Load the data from the spacy-lookups-data package for a given language,
     if available. Returns an empty `Lookups` container if there's no data or if the package
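A hedged sketch of exercising the new remote loader directly; the URL is a placeholder, and per the code above each table must be served as a file named like en_lemma_lookup.json under it:

    from spacy.lookups import load_lookups_data_from_url  # defined above on this branch

    lookups = load_lookups_data_from_url(
        "en", ["lemma_lookup"], "https://example.com/lookups/"  # placeholder URL
    )
    print(lookups.tables)  # ['lemma_lookup'] if the fetch succeeded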
@@ -175,9 +175,9 @@ cdef class DependencyMatcher:
         on_match (callable): Optional callback executed on match.
         """
         if on_match is not None and not hasattr(on_match, "__call__"):
-            raise ValueError(Errors.E171.format(arg_type=type(on_match)))
-        if patterns is None or not isinstance(patterns, List):  # old API
-            raise ValueError(Errors.E948.format(arg_type=type(patterns)))
+            raise ValueError(Errors.E171.format(name="DependencyMatcher", arg_type=type(on_match)))
+        if patterns is None or not isinstance(patterns, List):
+            raise ValueError(Errors.E948.format(name="DependencyMatcher", arg_type=type(patterns)))
         for pattern in patterns:
             if len(pattern) == 0:
                 raise ValueError(Errors.E012.format(key=key))
@@ -1,4 +1,4 @@
-# cython: binding=True, infer_types=True, language_level=3
+# cython: binding=True, infer_types=True
 from cpython.object cimport PyObject
 from libc.stdint cimport int64_t
 
@ -27,5 +27,6 @@ cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int =
|
|||
return levenshtein(input_text, pattern_text, max_edits) <= max_edits
|
||||
|
||||
|
||||
@registry.misc("spacy.levenshtein_compare.v1")
|
||||
def make_levenshtein_compare():
|
||||
return levenshtein_compare
|
||||
|
|
|
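The registered levenshtein_compare above backs fuzzy token matching; a small sketch, where the import path follows this diff and the strings are illustrative (with the default fuzzy=-1, a length-based edit budget is used instead):

    from spacy.matcher.levenshtein import levenshtein_compare

    # fuzzy=1 allows at most one edit between the input and the pattern.
    assert levenshtein_compare("spaCy", "spacy", 1)
    assert not levenshtein_compare("spaCy", "Python", 1)
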
@@ -5,13 +5,13 @@ from typing import (
     Iterable,
     Iterator,
     List,
+    Literal,
     Optional,
     Tuple,
     Union,
     overload,
 )

-from ..compat import Literal
 from ..tokens import Doc, Span
 from ..vocab import Vocab

@@ -20,6 +20,12 @@ from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..typedefs cimport attr_t

+from ..errors import Errors, MatchPatternError, Warnings
+from ..schemas import validate_token_pattern
+from .levenshtein import levenshtein_compare
+
 from ..strings cimport get_string_id

 from ..attrs import IDS
-from ..errors import Errors, MatchPatternError, Warnings
-from ..schemas import validate_token_pattern

@@ -113,9 +119,9 @@ cdef class Matcher:
         """
         errors = {}
         if on_match is not None and not hasattr(on_match, "__call__"):
-            raise ValueError(Errors.E171.format(arg_type=type(on_match)))
-        if patterns is None or not isinstance(patterns, List):  # old API
-            raise ValueError(Errors.E948.format(arg_type=type(patterns)))
+            raise ValueError(Errors.E171.format(name="Matcher", arg_type=type(on_match)))
+        if patterns is None or not isinstance(patterns, List):
+            raise ValueError(Errors.E948.format(name="Matcher", arg_type=type(patterns)))
         if greedy is not None and greedy not in ["FIRST", "LONGEST"]:
             raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=greedy))
         for i, pattern in enumerate(patterns):

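As a usage note, a minimal sketch of the calling convention these checks enforce; the pattern content is illustrative:

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.blank("en")
    matcher = Matcher(nlp.vocab)

    # Correct: a list of patterns, each pattern a list of token specs.
    matcher.add("GREETING", [[{"LOWER": "hello"}, {"LOWER": "world"}]])

    # Incorrect: a bare pattern raises E948, and the message now names the
    # component ("Matcher") thanks to the change above.
    # matcher.add("GREETING", [{"LOWER": "hello"}])
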
@@ -275,6 +281,10 @@ cdef class Matcher:
         # non-overlapping ones this `match` can be either (start, end) or
         # (start, end, alignments) depending on `with_alignments=` option.
         for key, *match in matches:
+            # Adjust span matches to doc offsets
+            if isinstance(doclike, Span):
+                match[0] += doclike.start
+                match[1] += doclike.start
             span_filter = self._filter.get(key)
             if span_filter is not None:
                 pairs = pairs_by_id.get(key, [])

@@ -305,9 +315,6 @@ cdef class Matcher:
         if as_spans:
             final_results = []
             for key, start, end, *_ in final_matches:
-                if isinstance(doclike, Span):
-                    start += doclike.start
-                    end += doclike.start
                 final_results.append(Span(doc, start, end, label=key))
         elif with_alignments:
             # convert alignments List[Dict[str, int]] --> List[int]

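These two hunks move the Span-offset adjustment from the as_spans branch into the main match loop, so plain (match_id, start, end) tuples are also doc-relative when matching over a Span. A sketch of the observable behavior (the text is illustrative):

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.blank("en")
    matcher = Matcher(nlp.vocab)
    matcher.add("WORLD", [[{"LOWER": "world"}]])

    doc = nlp("hello world again")
    span = doc[1:3]  # "world again"

    # With this change, start/end index into the parent Doc, not the Span,
    # so doc[start:end] is valid for matches found on a Span.
    for match_id, start, end in matcher(span):
        print(doc[start:end].text)  # expected: "world"
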
@@ -625,7 +632,7 @@ cdef action_t get_action(
     const TokenC * token,
     const attr_t * extra_attrs,
     const int8_t * predicate_matches
-) noexcept nogil:
+) nogil:
     """We need to consider:
     a) Does the token match the specification? [Yes, No]
     b) What's the quantifier? [1, 0+, ?]

@@ -740,7 +747,7 @@ cdef int8_t get_is_match(
     const TokenC* token,
     const attr_t* extra_attrs,
     const int8_t* predicate_matches
-) noexcept nogil:
+) nogil:
     for i in range(state.pattern.nr_py):
         if predicate_matches[state.pattern.py_predicates[i]] == -1:
             return 0

@@ -755,14 +762,14 @@ cdef int8_t get_is_match(
     return True


-cdef inline int8_t get_is_final(PatternStateC state) noexcept nogil:
+cdef inline int8_t get_is_final(PatternStateC state) nogil:
     if state.pattern[1].quantifier == FINAL_ID:
         return 1
     else:
         return 0


-cdef inline int8_t get_quantifier(PatternStateC state) noexcept nogil:
+cdef inline int8_t get_quantifier(PatternStateC state) nogil:
     return state.pattern.quantifier


@@ -805,7 +812,7 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
     return pattern


-cdef attr_t get_ent_id(const TokenPatternC* pattern) noexcept nogil:
+cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
     while pattern.quantifier != FINAL_ID:
         pattern += 1
     id_attr = pattern[0].attrs[0]

@@ -1,6 +1,5 @@
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union, overload
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, overload

-from ..compat import Literal
 from ..tokens import Doc, Span
 from ..vocab import Vocab
 from .matcher import Matcher

@@ -21,6 +20,15 @@ class PhraseMatcher:
             Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
         ] = ...,
     ) -> None: ...
+    def _add_from_arrays(
+        self,
+        key: str,
+        specs: List[List[int]],
+        *,
+        on_match: Optional[
+            Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
+        ] = ...,
+    ) -> None: ...
     def remove(self, key: str) -> None: ...
     @overload
     def __call__(

@@ -1,4 +1,7 @@
 # cython: infer_types=True
+from collections import defaultdict
+from typing import List
+
 from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set

 import warnings

@@ -39,7 +42,7 @@ cdef class PhraseMatcher:
         """
         self.vocab = vocab
         self._callbacks = {}
-        self._docs = {}
+        self._docs = defaultdict(set)
         self._validate = validate

         self.mem = Pool()

@@ -47,7 +50,7 @@ cdef class PhraseMatcher:
         self._terminal_hash = 826361138722620965
         map_init(self.mem, self.c_map, 8)

-        if isinstance(attr, int):
+        if isinstance(attr, (int, long)):
             self.attr = attr
         else:
             if attr is None:

@@ -155,66 +158,23 @@ cdef class PhraseMatcher:
         del self._callbacks[key]
         del self._docs[key]

-    def add(self, key, docs, *_docs, on_match=None):
-        """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
-        key, an on_match callback, and one or more patterns.
-
-        Since spaCy v2.2.2, PhraseMatcher.add takes a list of patterns as the
-        second argument, with the on_match callback as an optional keyword
-        argument.
+    def _add_from_arrays(self, key, specs, *, on_match=None):
+        """Add a preprocessed list of specs, with an optional callback.

         key (str): The match ID.
-        docs (list): List of `Doc` objects representing match patterns.
+        specs (List[List[int]]): A list of lists of hashes to match.
         on_match (callable): Callback executed on match.
-        *_docs (Doc): For backwards compatibility: list of patterns to add
-            as variable arguments. Will be ignored if a list of patterns is
-            provided as the second argument.
-
-        DOCS: https://spacy.io/api/phrasematcher#add
         """
-        if docs is None or hasattr(docs, "__call__"):  # old API
-            on_match = docs
-            docs = _docs
-
-        _ = self.vocab[key]
-        self._callbacks[key] = on_match
-        self._docs.setdefault(key, set())
-
         cdef MapStruct* current_node
         cdef MapStruct* internal_node
         cdef void* result

-        if isinstance(docs, Doc):
-            raise ValueError(Errors.E179.format(key=key))
-        for doc in docs:
-            if len(doc) == 0:
-                continue
-            if isinstance(doc, Doc):
-                attrs = (TAG, POS, MORPH, LEMMA, DEP)
-                has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
-                for attr in attrs:
-                    if self.attr == attr and not has_annotation[attr]:
-                        if attr == TAG:
-                            pipe = "tagger"
-                        elif attr in (POS, MORPH):
-                            pipe = "morphologizer or tagger+attribute_ruler"
-                        elif attr == LEMMA:
-                            pipe = "lemmatizer"
-                        elif attr == DEP:
-                            pipe = "parser"
-                        error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
-                        raise ValueError(error_msg)
-                if self._validate and any(has_annotation.values()) \
-                        and self.attr not in attrs:
-                    string_attr = self.vocab.strings[self.attr]
-                    warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
-                keyword = self._convert_to_array(doc)
-            else:
-                keyword = doc
-            self._docs[key].add(tuple(keyword))
+        self._callbacks[key] = on_match
+        for spec in specs:
+            self._docs[key].add(tuple(spec))

             current_node = self.c_map
-            for token in keyword:
+            for token in spec:
                 if token == self._terminal_hash:
                     warnings.warn(Warnings.W021)
                     break

@@ -233,6 +193,56 @@ cdef class PhraseMatcher:
             result = internal_node
             map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)

+    def add(self, key, docs, *, on_match=None):
+        """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
+        key, a list of one or more patterns, and (optionally) an on_match callback.
+
+        key (str): The match ID.
+        docs (list): List of `Doc` objects representing match patterns.
+        on_match (callable): Callback executed on match.
+
+        If any of the input Docs are invalid, no internal state will be updated.
+
+        DOCS: https://spacy.io/api/phrasematcher#add
+        """
+        if isinstance(docs, Doc):
+            raise ValueError(Errors.E179.format(key=key))
+        if docs is None or not isinstance(docs, List):
+            raise ValueError(Errors.E948.format(name="PhraseMatcher", arg_type=type(docs)))
+        if on_match is not None and not hasattr(on_match, "__call__"):
+            raise ValueError(Errors.E171.format(name="PhraseMatcher", arg_type=type(on_match)))
+
+        _ = self.vocab[key]
+        specs = []
+
+        for doc in docs:
+            if len(doc) == 0:
+                continue
+            if not isinstance(doc, Doc):
+                raise ValueError(Errors.E4000.format(type=type(doc)))
+
+            attrs = (TAG, POS, MORPH, LEMMA, DEP)
+            has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
+            for attr in attrs:
+                if self.attr == attr and not has_annotation[attr]:
+                    if attr == TAG:
+                        pipe = "tagger"
+                    elif attr in (POS, MORPH):
+                        pipe = "morphologizer or tagger+attribute_ruler"
+                    elif attr == LEMMA:
+                        pipe = "lemmatizer"
+                    elif attr == DEP:
+                        pipe = "parser"
+                    error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
+                    raise ValueError(error_msg)
+            if self._validate and any(has_annotation.values()) \
+                    and self.attr not in attrs:
+                string_attr = self.vocab.strings[self.attr]
+                warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
+            specs.append(self._convert_to_array(doc))
+
+        self._add_from_arrays(key, specs, on_match=on_match)
+
     def __call__(self, object doclike, *, as_spans=False):
         """Find all sequences matching the supplied patterns on the `Doc`.

@@ -345,7 +355,7 @@ def unpickle_matcher(vocab, docs, callbacks, attr):
     matcher = PhraseMatcher(vocab, attr=attr)
     for key, specs in docs.items():
         callback = callbacks.get(key, None)
-        matcher.add(key, specs, on_match=callback)
+        matcher._add_from_arrays(key, specs, on_match=callback)
    return matcher

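Taken together, these hunks split pattern validation and conversion (add) from trie construction (_add_from_arrays), and make on_match keyword-only, dropping the pre-v2.2.2 variadic call signature. A sketch of the surviving public API (pattern text and callback are illustrative):

    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.blank("en")
    matcher = PhraseMatcher(nlp.vocab)

    def on_match(matcher, doc, i, matches):
        print("matched:", matches[i])

    # v4 style: a list of Doc patterns, callback only as a keyword argument.
    matcher.add("OBAMA", [nlp.make_doc("Barack Obama")], on_match=on_match)

    # The old form matcher.add("OBAMA", None, nlp.make_doc(...)) is gone;
    # non-list patterns raise E948, non-Doc entries raise E4000.
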
@@ -3,6 +3,7 @@ from thinc.api import Model, normal_init
 from ..util import registry


+@registry.layers("spacy.PrecomputableAffine.v1")
 def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
     model = Model(
         "precomputable_affine",

@@ -23,6 +23,7 @@ DEFAULT_NVTX_ANNOTATABLE_PIPE_METHODS = [
     "update",
     "rehearse",
     "get_loss",
+    "get_teacher_student_loss",
     "initialize",
     "begin_update",
     "finish_update",

@@ -50,6 +51,7 @@ def models_with_nvtx_range(nlp, forward_color: int, backprop_color: int):
     return nlp


+@registry.callbacks("spacy.models_with_nvtx_range.v1")
 def create_models_with_nvtx_range(
     forward_color: int = -1, backprop_color: int = -1
 ) -> Callable[["Language"], "Language"]:

@@ -109,6 +111,7 @@ def pipes_with_nvtx_range(
     return nlp


+@registry.callbacks("spacy.models_and_pipes_with_nvtx_range.v1")
 def create_models_and_pipes_with_nvtx_range(
     forward_color: int = -1,
     backprop_color: int = -1,

@@ -7,6 +7,7 @@ from ..tokens import Doc
 from ..util import registry


+@registry.layers("spacy.CharEmbed.v1")
 def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]:
     # nM: Number of dimensions per character. nC: Number of characters.
     return Model(

@@ -4,6 +4,7 @@ from ..attrs import LOWER
 from ..util import registry


+@registry.layers("spacy.extract_ngrams.v1")
 def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model:
     model: Model = Model("extract_ngrams", forward)
     model.attrs["ngram_size"] = ngram_size

@@ -6,6 +6,7 @@ from thinc.types import Ints1d, Ragged
 from ..util import registry


+@registry.layers("spacy.extract_spans.v1")
 def extract_spans() -> Model[Tuple[Ragged, Ragged], Ragged]:
     """Extract spans from a sequence of source arrays, as specified by an array
     of (start, end) indices. The output is a ragged array of the

@@ -6,9 +6,8 @@ from thinc.types import Ints2d
 from ..tokens import Doc


-def FeatureExtractor(
-    columns: Union[List[str], List[int], List[Union[int, str]]]
-) -> Model[List[Doc], List[Ints2d]]:
+@registry.layers("spacy.FeatureExtractor.v1")
+def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]:
     return Model("extract_features", forward, attrs={"columns": columns})


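With FeatureExtractor registered, it can be resolved from the layers registry (or referenced as @layers = "spacy.FeatureExtractor.v1" in a config); a sketch, with the column names as illustrative attribute strings accepted by the new signature:

    from spacy.util import registry

    # Look up the registered constructor and build the layer.
    make_feature_extractor = registry.layers.get("spacy.FeatureExtractor.v1")
    feature_extractor = make_feature_extractor(columns=["NORM", "PREFIX", "SUFFIX"])
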
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Callable, Iterable, List, Optional, Tuple
+from typing import Callable, Iterable, Iterator, List, Optional, Tuple

 from thinc.api import (
     Linear,

@@ -15,19 +15,17 @@ from thinc.api import (
 from thinc.types import Floats2d

 from ...errors import Errors
-from ...kb import (
-    Candidate,
-    InMemoryLookupKB,
-    KnowledgeBase,
-    get_candidates,
-    get_candidates_batch,
-)
-from ...tokens import Doc, Span
+from ...kb import Candidate, InMemoryLookupKB, KnowledgeBase
+from ...tokens import Doc, Span, SpanGroup
 from ...util import registry
 from ...vocab import Vocab
 from ..extract_spans import extract_spans

+CandidatesForMentionT = Iterable[Candidate]
+CandidatesForDocT = Iterable[CandidatesForMentionT]
+

 @registry.architectures("spacy.EntityLinker.v2")
 def build_nel_encoder(
     tok2vec: Model, nO: Optional[int] = None
 ) -> Model[List[Doc], Floats2d]:

@@ -91,6 +89,7 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callab
     return out, lambda x: []


+@registry.misc("spacy.KBFromFile.v1")
 def load_kb(
     kb_path: Path,
 ) -> Callable[[Vocab], KnowledgeBase]:

@@ -102,6 +101,7 @@ def load_kb(
     return kb_from_file


+@registry.misc("spacy.EmptyKB.v2")
 def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
     def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
         return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)

@@ -109,6 +109,7 @@ def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
     return empty_kb_factory


+@registry.misc("spacy.EmptyKB.v1")
 def empty_kb(
     entity_vector_length: int,
 ) -> Callable[[Vocab], KnowledgeBase]:

@@ -118,11 +119,39 @@ def empty_kb(
     return empty_kb_factory


-def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
+@registry.misc("spacy.CandidateGenerator.v1")
+def create_get_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
     return get_candidates


-def create_candidates_batch() -> Callable[
-    [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+@registry.misc("spacy.CandidateGenerator.v2")
+def create_get_candidates_v2() -> Callable[
+    [KnowledgeBase, Iterator[SpanGroup]], Iterator[CandidatesForDocT]
 ]:
-    return get_candidates_batch
+    return get_candidates_v2
+
+
+def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
+    """
+    Return candidate entities for the given mention from the KB.
+    kb (KnowledgeBase): Knowledge base to query.
+    mention (Span): Entity mention.
+    RETURNS (Iterable[Candidate]): Identified candidates for specified mention.
+    """
+    cands_per_doc = next(
+        get_candidates_v2(kb, iter([SpanGroup(mention.doc, spans=[mention])]))
+    )
+    assert isinstance(cands_per_doc, list)
+    return next(cands_per_doc[0])
+
+
+def get_candidates_v2(
+    kb: KnowledgeBase, mentions: Iterator[SpanGroup]
+) -> Iterator[Iterable[Iterable[Candidate]]]:
+    """
+    Return candidate entities for the given mentions from the KB.
+    kb (KnowledgeBase): Knowledge base to query.
+    mentions (Iterator[SpanGroup]): Mentions per doc.
+    RETURNS (Iterator[Iterable[Iterable[Candidate]]]): Identified candidates per mentions in document/SpanGroup.
+    """
+    return kb.get_candidates(mentions)

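The v2 generator works per document over SpanGroups of mentions rather than one Span at a time. A hedged sketch of consuming it, assuming a loaded pipeline nlp and a populated InMemoryLookupKB named kb are already in scope (get_candidates_v2 is the function defined above):

    from spacy.tokens import SpanGroup

    doc = nlp("Douglas Adams wrote the Guide.")
    mentions = SpanGroup(doc, spans=[doc[0:2]])  # one mention: "Douglas Adams"

    # One SpanGroup per doc goes in; per-doc iterables of per-mention
    # candidate iterables come out, mirroring the signature above.
    for cands_per_mention in next(get_candidates_v2(kb, iter([mentions]))):
        for candidate in cands_per_mention:
            print(candidate)
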
@@ -30,6 +30,7 @@ if TYPE_CHECKING:
     from ...vocab import Vocab  # noqa: F401


+@registry.architectures("spacy.PretrainVectors.v1")
 def create_pretrain_vectors(
     maxout_pieces: int, hidden_size: int, loss: str
 ) -> Callable[["Vocab", Model], Model]:

@@ -56,6 +57,7 @@ def create_pretrain_vectors(
     return create_vectors_objective


+@registry.architectures("spacy.PretrainCharacters.v1")
 def create_pretrain_characters(
     maxout_pieces: int, hidden_size: int, n_characters: int
 ) -> Callable[["Vocab", Model], Model]:

@@ -1,9 +1,8 @@
-from typing import List, Optional, cast
+from typing import List, Literal, Optional

 from thinc.api import Linear, Model, chain, list2array, use_ops, zero_init
 from thinc.types import Floats2d

-from ...compat import Literal
 from ...errors import Errors
 from ...tokens import Doc
 from ...util import registry

@@ -11,6 +10,7 @@ from .._precomputable_affine import PrecomputableAffine
 from ..tb_framework import TransitionModel


+@registry.architectures("spacy.TransitionBasedParser.v2")
 def build_tb_parser_model(
     tok2vec: Model[List[Doc], List[Floats2d]],
     state_type: Literal["parser", "ner"],

@@ -10,6 +10,7 @@ InT = List[Doc]
 OutT = Floats2d


+@registry.architectures("spacy.SpanFinder.v1")
 def build_finder_model(
     tok2vec: Model[InT, List[Floats2d]], scorer: Model[OutT, OutT]
 ) -> Model[InT, OutT]:

Some files were not shown because too many files have changed in this diff.