Mirror of https://github.com/explosion/spaCy.git, synced 2025-08-04 04:10:20 +03:00
Compare commits
267 Commits
.github/FUNDING.yml (vendored, new file, 1 line)
@@ -0,0 +1 @@
custom: [https://explosion.ai/merch, https://explosion.ai/tailored-solutions]
.github/workflows/cibuildwheel.yml (vendored, new file, 99 lines)
@@ -0,0 +1,99 @@
name: Build

on:
  push:
    tags:
      # ytf did they invent their own syntax that's almost regex?
      # ** matches 'zero or more of any character'
      - 'release-v[0-9]+.[0-9]+.[0-9]+**'
      - 'prerelease-v[0-9]+.[0-9]+.[0-9]+**'

jobs:
  build_wheels:
    name: Build wheels on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        # macos-13 is an intel runner, macos-14 is apple silicon
        os: [ubuntu-latest, windows-latest, macos-13, macos-14, ubuntu-24.04-arm]

    steps:
      - uses: actions/checkout@v4
      # aarch64 (arm) is built via qemu emulation
      # QEMU is sadly too slow. We need to wait for public ARM support
      #- name: Set up QEMU
      #  if: runner.os == 'Linux'
      #  uses: docker/setup-qemu-action@v3
      #  with:
      #    platforms: all
      - name: Build wheels
        uses: pypa/cibuildwheel@v2.21.3
        env:
          CIBW_ARCHS_LINUX: auto
        with:
          package-dir: .
          output-dir: wheelhouse
          config-file: "{package}/pyproject.toml"
      - uses: actions/upload-artifact@v4
        with:
          name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
          path: ./wheelhouse/*.whl

  build_sdist:
    name: Build source distribution
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Build sdist
        run: pipx run build --sdist
      - uses: actions/upload-artifact@v4
        with:
          name: cibw-sdist
          path: dist/*.tar.gz

  create_release:
    needs: [build_wheels, build_sdist]
    runs-on: ubuntu-latest
    permissions:
      contents: write
      checks: write
      actions: read
      issues: read
      packages: write
      pull-requests: read
      repository-projects: read
      statuses: read
    steps:
      - name: Get the tag name and determine if it's a prerelease
        id: get_tag_info
        run: |
          FULL_TAG=${GITHUB_REF#refs/tags/}
          if [[ $FULL_TAG == release-* ]]; then
            TAG_NAME=${FULL_TAG#release-}
            IS_PRERELEASE=false
          elif [[ $FULL_TAG == prerelease-* ]]; then
            TAG_NAME=${FULL_TAG#prerelease-}
            IS_PRERELEASE=true
          else
            echo "Tag does not match expected patterns" >&2
            exit 1
          fi
          echo "FULL_TAG=$TAG_NAME" >> $GITHUB_ENV
          echo "TAG_NAME=$TAG_NAME" >> $GITHUB_ENV
          echo "IS_PRERELEASE=$IS_PRERELEASE" >> $GITHUB_ENV
      - uses: actions/download-artifact@v4
        with:
          # unpacks all CIBW artifacts into dist/
          pattern: cibw-*
          path: dist
          merge-multiple: true
      - name: Create Draft Release
        id: create_release
        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          name: ${{ env.TAG_NAME }}
          draft: true
          prerelease: ${{ env.IS_PRERELEASE }}
          files: "./dist/*"
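The `get_tag_info` step above classifies the pushed tag with shell parameter expansion. As an illustration only, the same logic expressed in Python (`parse_release_tag` is a hypothetical helper, not part of the workflow):

```python
def parse_release_tag(github_ref: str) -> tuple[str, bool]:
    """Mirror the get_tag_info shell step: strip refs/tags/, then classify
    the tag as a release or prerelease and return (tag_name, is_prerelease)."""
    full_tag = github_ref.removeprefix("refs/tags/")
    if full_tag.startswith("release-"):
        return full_tag.removeprefix("release-"), False
    if full_tag.startswith("prerelease-"):
        return full_tag.removeprefix("prerelease-"), True
    raise ValueError("Tag does not match expected patterns")


# A push of refs/tags/release-v3.8.7 yields ("v3.8.7", False); the draft-release
# step then uses the name and the prerelease flag.
assert parse_release_tag("refs/tags/release-v3.8.7") == ("v3.8.7", False)
assert parse_release_tag("refs/tags/prerelease-v3.8.7") == ("v3.8.7", True)
```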
.github/workflows/explosionbot.yml (vendored, 2 changed lines)
@@ -15,7 +15,7 @@ jobs:
         env:
           GITHUB_CONTEXT: ${{ toJson(github) }}
         run: echo "$GITHUB_CONTEXT"
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
       - name: Install and run explosion-bot
         run: |
.github/workflows/lock.yml (vendored, 2 changed lines)
@@ -16,7 +16,7 @@ jobs:
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
-      - uses: dessant/lock-threads@v4
+      - uses: dessant/lock-threads@v5
         with:
           process-only: 'issues'
           issue-inactive-days: '30'
.github/workflows/publish_pypi.yml (vendored, new file, 29 lines)
@@ -0,0 +1,29 @@
# The cibuildwheel action triggers on creation of a release, this
# triggers on publication.
# The expected workflow is to create a draft release and let the wheels
# upload, and then hit 'publish', which uploads to PyPi.

on:
  release:
    types:
      - published

jobs:
  upload_pypi:
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/spacy
    permissions:
      id-token: write
      contents: read
    if: github.event_name == 'release' && github.event.action == 'published'
    # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this)
    # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
    steps:
      - uses: robinraju/release-downloader@v1
        with:
          tag: ${{ github.event.release.tag_name }}
          fileName: '*'
          out-file-path: 'dist'
      - uses: pypa/gh-action-pypi-publish@release/v1
@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           ref: ${{ matrix.branch }}
       - name: Get commits from past 24 hours
.github/workflows/spacy_universe_alert.yml (vendored, 2 changed lines)
@@ -18,7 +18,7 @@ jobs:
         run: |
           echo "$GITHUB_CONTEXT"

-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
         with:
           python-version: '3.10'
.github/workflows/tests.yml (vendored, 41 changed lines)
@@ -2,6 +2,8 @@ name: tests

 on:
   push:
+    tags-ignore:
+      - '**'
     branches-ignore:
       - "spacy.io"
       - "nightly.spacy.io"
@@ -10,7 +12,6 @@ on:
       - "*.md"
       - "*.mdx"
       - "website/**"
-      - ".github/workflows/**"
   pull_request:
     types: [opened, synchronize, reopened, edited]
     paths-ignore:
@@ -25,13 +26,12 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
-          python-version: "3.7"
-          architecture: x64
+          python-version: "3.10"

       - name: black
         run: |
@@ -45,11 +45,12 @@ jobs:
         run: |
           python -m pip install flake8==5.0.4
           python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
-      - name: cython-lint
-        run: |
-          python -m pip install cython-lint -c requirements.txt
-          # E501: line too log, W291: trailing whitespace, E266: too many leading '#' for block comment
-          cython-lint spacy --ignore E501,W291,E266
+      # Unfortunately cython-lint isn't working after the shift to Cython 3.
+      #- name: cython-lint
+      #  run: |
+      #    python -m pip install cython-lint -c requirements.txt
+      #    # E501: line too log, W291: trailing whitespace, E266: too many leading '#' for block comment
+      #    cython-lint spacy --ignore E501,W291,E266

   tests:
     name: Test
@@ -58,30 +59,18 @@ jobs:
       fail-fast: true
       matrix:
         os: [ubuntu-latest, windows-latest, macos-latest]
-        python_version: ["3.11"]
-        include:
-          - os: ubuntu-20.04
-            python_version: "3.6"
-          - os: windows-latest
-            python_version: "3.7"
-          - os: macos-latest
-            python_version: "3.8"
-          - os: ubuntu-latest
-            python_version: "3.9"
-          - os: windows-latest
-            python_version: "3.10"
+        python_version: ["3.9", "3.12", "3.13"]

     runs-on: ${{ matrix.os }}

     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python_version }}
-          architecture: x64

       - name: Install dependencies
         run: |
@@ -95,7 +84,7 @@ jobs:
       - name: Run mypy
         run: |
           python -m mypy spacy
-        if: matrix.python_version != '3.6'
+        if: matrix.python_version != '3.7'

       - name: Delete source directory and .egg-info
         run: |
@@ -159,7 +148,9 @@ jobs:
       - name: "Test assemble CLI"
         run: |
           python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+          python -m spacy assemble ner_source_sm.cfg output_dir
+        env:
+          PYTHONWARNINGS: "error,ignore::DeprecationWarning"
         if: matrix.python_version == '3.9'

       - name: "Test assemble CLI vectors warning"
.github/workflows/universe_validation.yml (vendored, 3 changed lines)
@@ -20,13 +20,12 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
           python-version: "3.7"
-          architecture: x64

       - name: Validate website/meta/universe.json
         run: |
@@ -35,7 +35,7 @@ so that more people can benefit from it.

 When opening an issue, use a **descriptive title** and include your
 **environment** (operating system, Python version, spaCy version). Our
-[issue template](https://github.com/explosion/spaCy/issues/new) helps you
+[issue templates](https://github.com/explosion/spaCy/issues/new/choose) help you
 remember the most important details to include. If you've discovered a bug, you
 can also submit a [regression test](#fixing-bugs) straight away. When you're
 opening an issue to report the bug, simply refer to your pull request in the
@@ -449,13 +449,12 @@ and plugins in spaCy v3.0, and we can't wait to see what you build with it!
   [`spacy`](https://github.com/topics/spacy?o=desc&s=stars) and
   [`spacy-extensions`](https://github.com/topics/spacy-extension?o=desc&s=stars)
   to make it easier to find. Those are also the topics we're linking to from the
-  spaCy website. If you're sharing your project on Twitter, feel free to tag
-  [@spacy_io](https://twitter.com/spacy_io) so we can check it out.
+  spaCy website. If you're sharing your project on X, feel free to tag
+  [@spacy_io](https://x.com/spacy_io) so we can check it out.

-- Once your extension is published, you can open an issue on the
-  [issue tracker](https://github.com/explosion/spacy/issues) to suggest it for the
-  [resources directory](https://spacy.io/usage/resources#extensions) on the
-  website.
+- Once your extension is published, you can open a
+  [PR](https://github.com/explosion/spaCy/pulls) to suggest it for the
+  [Universe](https://spacy.io/universe) page.

 📖 **For more tips and best practices, see the [checklist for developing spaCy extensions](https://spacy.io/usage/processing-pipelines#extensions).**
LICENSE (2 changed lines)
@@ -1,6 +1,6 @@
 The MIT License (MIT)

-Copyright (C) 2016-2022 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
+Copyright (C) 2016-2024 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -4,5 +4,6 @@ include README.md
 include pyproject.toml
 include spacy/py.typed
 recursive-include spacy/cli *.yml
+recursive-include spacy/tests *.json
 recursive-include licenses *
 recursive-exclude spacy *.cpp
README.md (23 changed lines)
@@ -16,7 +16,7 @@ model packaging, deployment and workflow management. spaCy is commercial
 open-source software, released under the
 [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).

-💫 **Version 3.6 out now!**
+💫 **Version 3.8 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)

 [](https://github.com/explosion/spaCy/actions/workflows/tests.yml)
@@ -28,7 +28,6 @@ open-source software, released under the
 <br />
 [](https://pypi.org/project/spacy/)
 [](https://anaconda.org/conda-forge/spacy)
-[](https://twitter.com/spacy_io)

 ## 📖 Documentation

@@ -39,28 +38,37 @@ open-source software, released under the
 | 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
 | 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
 | 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
+| ⏩ **[GPU Processing]** | Use spaCy with CUDA-compatible GPU processing. |
 | 📦 **[Models]** | Download trained pipelines for spaCy. |
+| 🦙 **[Large Language Models]** | Integrate LLMs into spaCy pipelines. |
 | 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
 | ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
 | 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
+| 📰 **[Blog]** | Read about current spaCy and Prodigy development, releases, talks and more from Explosion. |
 | 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
+| 🔴 **[Live Stream]** | Join Matt as he works on spaCy and chat about NLP, live every week. |
 | 🛠 **[Changelog]** | Changes and version history. |
 | 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
-| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
-| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |
+| 👕 **[Swag]** | Support us and our work with unique, custom-designed swag! |
+| <a href="https://explosion.ai/tailored-solutions"><img src="https://github.com/explosion/spaCy/assets/13643239/36d2a42e-98c0-4599-90e1-788ef75181be" width="150" alt="Tailored Solutions"/></a> | Custom NLP consulting, implementation and strategic advice by spaCy's core development team. Streamlined, production-ready, predictable and maintainable. Send us an email or take our 5-minute questionnaire, and we'll be in touch! **[Learn more →](https://explosion.ai/tailored-solutions)** |

 [spacy 101]: https://spacy.io/usage/spacy-101
 [new in v3.0]: https://spacy.io/usage/v3
 [usage guides]: https://spacy.io/usage/
 [api reference]: https://spacy.io/api/
+[gpu processing]: https://spacy.io/usage#gpu
 [models]: https://spacy.io/models
+[large language models]: https://spacy.io/usage/large-language-models
 [universe]: https://spacy.io/universe
 [spacy vs code extension]: https://github.com/explosion/spacy-vscode
 [videos]: https://www.youtube.com/c/ExplosionAI
+[live stream]: https://www.youtube.com/playlist?list=PLBmcuObd5An5_iAxNYLJa_xWmNzsYce8c
 [online course]: https://course.spacy.io
+[blog]: https://explosion.ai
 [project templates]: https://github.com/explosion/projects
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
+[swag]: https://explosion.ai/merch

 ## 💬 Where to ask questions

@@ -72,13 +80,14 @@ more people can benefit from it.
 | Type | Platforms |
 | ------------------------------- | --------------------------------------- |
 | 🚨 **Bug Reports** | [GitHub Issue Tracker] |
-| 🎁 **Feature Requests & Ideas** | [GitHub Discussions] |
+| 🎁 **Feature Requests & Ideas** | [GitHub Discussions] · [Live Stream] |
 | 👩‍💻 **Usage Questions** | [GitHub Discussions] · [Stack Overflow] |
-| 🗯 **General Discussion** | [GitHub Discussions] |
+| 🗯 **General Discussion** | [GitHub Discussions] · [Live Stream] |

 [github issue tracker]: https://github.com/explosion/spaCy/issues
 [github discussions]: https://github.com/explosion/spaCy/discussions
 [stack overflow]: https://stackoverflow.com/questions/tagged/spacy
+[live stream]: https://www.youtube.com/playlist?list=PLBmcuObd5An5_iAxNYLJa_xWmNzsYce8c

 ## Features

@@ -108,7 +117,7 @@ For detailed installation instructions, see the

 - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
   Studio)
-- **Python version**: Python 3.6+ (only 64 bit)
+- **Python version**: Python >=3.7, <3.13 (only 64 bit)
 - **Package managers**: [pip] · [conda] (via `conda-forge`)

 [pip]: https://pypi.org/project/spacy/
bin/release.sh (new executable file, 20 lines)
@@ -0,0 +1,20 @@
#!/usr/bin/env bash

set -e

# Insist repository is clean
git diff-index --quiet HEAD

version=$(grep "__version__ = " spacy/about.py)
version=${version/__version__ = }
version=${version/\'/}
version=${version/\'/}
version=${version/\"/}
version=${version/\"/}

echo "Pushing release-v"$version

git tag -d release-v$version || true
git push origin :release-v$version || true
git tag release-v$version
git push origin release-v$version
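The parameter expansions above strip the `__version__ = ` prefix and the surrounding quotes from the line grepped out of `spacy/about.py`. For illustration only, a rough Python equivalent of that extraction (`read_version` is a hypothetical helper, not something the repository ships):

```python
import re


def read_version(about_path: str = "spacy/about.py") -> str:
    """Return the bare version string from a line like __version__ = "3.8.7"."""
    with open(about_path, encoding="utf8") as f:
        for line in f:
            match = re.match(r"""__version__ = ["'](.+?)["']""", line.strip())
            if match:
                return match.group(1)
    raise ValueError("__version__ not found")


# With about.py declaring __version__ = "3.8.7", the script above would push
# the tag release-v3.8.7, which in turn triggers the cibuildwheel workflow.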
@@ -1,9 +1,2 @@
-# build version constraints for use with wheelwright + multibuild
-numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
-numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
-numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
-numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
-numpy==1.19.3; python_version=='3.9'
-numpy==1.21.3; python_version=='3.10'
-numpy==1.23.2; python_version=='3.11'
-numpy; python_version>='3.12'
+# build version constraints for use with wheelwright
+numpy>=2.0.0,<3.0.0
@@ -1,14 +1,17 @@
 # Listeners

-1. [Overview](#1-overview)
-2. [Initialization](#2-initialization)
-   - [A. Linking listeners to the embedding component](#2a-linking-listeners-to-the-embedding-component)
-   - [B. Shape inference](#2b-shape-inference)
-3. [Internal communication](#3-internal-communication)
-   - [A. During prediction](#3a-during-prediction)
-   - [B. During training](#3b-during-training)
-   - [C. Frozen components](#3c-frozen-components)
-4. [Replacing listener with standalone](#4-replacing-listener-with-standalone)
+- [1. Overview](#1-overview)
+- [2. Initialization](#2-initialization)
+  - [2A. Linking listeners to the embedding component](#2a-linking-listeners-to-the-embedding-component)
+  - [2B. Shape inference](#2b-shape-inference)
+- [3. Internal communication](#3-internal-communication)
+  - [3A. During prediction](#3a-during-prediction)
+  - [3B. During training](#3b-during-training)
+    - [Training with multiple listeners](#training-with-multiple-listeners)
+  - [3C. Frozen components](#3c-frozen-components)
+    - [The Tok2Vec or Transformer is frozen](#the-tok2vec-or-transformer-is-frozen)
+    - [The upstream component is frozen](#the-upstream-component-is-frozen)
+- [4. Replacing listener with standalone](#4-replacing-listener-with-standalone)

 ## 1. Overview

@@ -62,7 +65,7 @@ of this `find_listener()` method will specifically identify sublayers of a model

 If it's a Transformer-based pipeline, a
 [`transformer` component](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py)
 has a similar implementation but its `find_listener()` function will specifically look for `TransformerListener`
 sublayers of downstream components.

 ### 2B. Shape inference

@@ -154,7 +157,7 @@ as a tagger or a parser. This used to be impossible before 3.1, but has become s
 embedding component in the [`annotating_components`](https://spacy.io/usage/training#annotating-components)
 list of the config. This works like any other "annotating component" because it relies on the `Doc` attributes.

 However, if the `Tok2Vec` or `Transformer` is frozen, and not present in `annotating_components`, and a related
 listener isn't frozen, then a `W086` warning is shown and further training of the pipeline will likely end with `E954`.

 #### The upstream component is frozen

@@ -216,5 +219,17 @@ new_model = tok2vec_model.attrs["replace_listener"](new_model)
 ```

 The new config and model are then properly stored on the `nlp` object.
 Note that this functionality (running the replacement for a transformer listener) was broken prior to
 `spacy-transformers` 1.0.5.
+
+In spaCy 3.7, `Language.replace_listeners` was updated to pass the following additional arguments to the `replace_listener` callback:
+the listener to be replaced and the `tok2vec`/`transformer` pipe from which the new model was copied. To maintain backwards-compatibility,
+the method only passes these extra arguments for callbacks that support them:
+
+```
+def replace_listener_pre_37(copied_tok2vec_model):
+    ...
+
+def replace_listener_post_37(copied_tok2vec_model, replaced_listener, tok2vec_pipe):
+    ...
+```
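One way such signature-dependent dispatch can work is to inspect the callback before calling it. The sketch below is illustrative only and is not spaCy's actual implementation; it just demonstrates the backwards-compatible behaviour described in the added text:

```python
import inspect


def call_replace_listener(callback, copied_model, replaced_listener, tok2vec_pipe):
    """Call a replace_listener callback with either the old one-argument or the
    new three-argument signature, depending on what the callback accepts."""
    n_params = len(inspect.signature(callback).parameters)
    if n_params >= 3:
        # Post-3.7 style: callback also receives the replaced listener and the pipe.
        return callback(copied_model, replaced_listener, tok2vec_pipe)
    # Pre-3.7 style: callback only receives the copied tok2vec/transformer model.
    return callback(copied_model)
```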
@@ -158,3 +158,45 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
+
+
+SciPy
+-----
+
+* Files: scorer.py
+
+The implementation of trapezoid() is adapted from SciPy, which is distributed
+under the following license:
+
+New BSD License
+
+Copyright (c) 2001-2002 Enthought, Inc. 2003-2023, SciPy Developers.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following
+   disclaimer in the documentation and/or other materials provided
+   with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -1,14 +1,67 @@
 [build-system]
 requires = [
     "setuptools",
-    "cython>=0.25,<3.0",
+    "cython>=3.0,<4.0",
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.1.8,<8.2.0",
-    "numpy>=1.15.0",
+    "thinc>=8.3.4,<8.4.0",
+    "numpy>=2.0.0,<3.0.0"
 ]
 build-backend = "setuptools.build_meta"

+[tool.cibuildwheel]
+build = "*"
+skip = "pp* cp36* cp37* cp38* *-win32 *i686*"
+test-skip = ""
+free-threaded-support = false
+
+archs = ["native"]
+
+build-frontend = "default"
+config-settings = {}
+dependency-versions = "pinned"
+environment = { PIP_CONSTRAINT = "build-constraints.txt" }
+
+environment-pass = []
+build-verbosity = 0
+
+before-all = "curl https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal --default-toolchain stable"
+before-build = "pip install -r requirements.txt && python setup.py clean"
+repair-wheel-command = ""
+
+test-command = ""
+before-test = ""
+test-requires = []
+test-extras = []
+
+container-engine = "docker"
+
+manylinux-x86_64-image = "manylinux2014"
+manylinux-i686-image = "manylinux2014"
+manylinux-aarch64-image = "manylinux2014"
+manylinux-ppc64le-image = "manylinux2014"
+manylinux-s390x-image = "manylinux2014"
+manylinux-pypy_x86_64-image = "manylinux2014"
+manylinux-pypy_i686-image = "manylinux2014"
+manylinux-pypy_aarch64-image = "manylinux2014"
+
+musllinux-x86_64-image = "musllinux_1_2"
+musllinux-i686-image = "musllinux_1_2"
+musllinux-aarch64-image = "musllinux_1_2"
+musllinux-ppc64le-image = "musllinux_1_2"
+musllinux-s390x-image = "musllinux_1_2"
+
+[tool.cibuildwheel.linux]
+repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}"
+
+[tool.cibuildwheel.macos]
+repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}"
+
+[tool.cibuildwheel.windows]
+
+[tool.cibuildwheel.pyodide]
+
+
 [tool.isort]
 profile = "black"
@@ -3,40 +3,36 @@ spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.1.8,<8.2.0
+thinc>=8.3.4,<8.4.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
-typer>=0.3.0,<0.10.0
-pathy>=0.10.0
-smart-open>=5.2.1,<7.0.0
+typer-slim>=0.3.0,<1.0.0
+weasel>=0.1.0,<0.5.0
 # Third party dependencies
-numpy>=1.15.0
+numpy>=2.0.0,<3.0.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
 pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
 jinja2
-langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
-typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
 # Development dependencies
 pre-commit>=2.13.0
-cython>=0.25,<3.0
+cython>=3.0,<4.0
 pytest>=5.2.0,!=7.1.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7"
-types-dataclasses>=0.1.3; python_version < "3.7"
+mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests
 types-setuptools>=57.0.0
 black==22.3.0
-cython-lint>=0.15.0; python_version >= "3.7"
+cython-lint>=0.15.0
 isort>=5.0,<6.0
setup.cfg (34 changed lines)
@@ -17,12 +17,11 @@ classifiers =
     Operating System :: Microsoft :: Windows
     Programming Language :: Cython
     Programming Language :: Python :: 3
-    Programming Language :: Python :: 3.6
-    Programming Language :: Python :: 3.7
-    Programming Language :: Python :: 3.8
     Programming Language :: Python :: 3.9
     Programming Language :: Python :: 3.10
     Programming Language :: Python :: 3.11
+    Programming Language :: Python :: 3.12
+    Programming Language :: Python :: 3.13
     Topic :: Scientific/Engineering
 project_urls =
     Release notes = https://github.com/explosion/spaCy/releases
@@ -31,15 +30,18 @@ project_urls =
 [options]
 zip_safe = false
 include_package_data = true
-python_requires = >=3.6
+python_requires = >=3.9,<3.14
+# NOTE: This section is superseded by pyproject.toml and will be removed in
+# spaCy v4
 setup_requires =
-    cython>=0.25,<3.0
-    numpy>=1.15.0
+    cython>=3.0,<4.0
+    numpy>=2.0.0,<3.0.0; python_version < "3.9"
+    numpy>=2.0.0,<3.0.0; python_version >= "3.9"
     # We also need our Cython packages here to compile against
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.1.8,<8.2.0
+    thinc>=8.3.4,<8.4.0
 install_requires =
     # Our libraries
     spacy-legacy>=3.0.11,<3.1.0
@@ -47,24 +49,22 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.1.8,<8.2.0
+    thinc>=8.3.4,<8.4.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
+    weasel>=0.1.0,<0.5.0
     # Third-party dependencies
-    typer>=0.3.0,<0.10.0
-    pathy>=0.10.0
-    smart-open>=5.2.1,<7.0.0
+    typer-slim>=0.3.0,<1.0.0
     tqdm>=4.38.0,<5.0.0
-    numpy>=1.15.0
+    numpy>=1.15.0; python_version < "3.9"
+    numpy>=1.19.0; python_version >= "3.9"
     requests>=2.13.0,<3.0.0
     pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
     jinja2
     # Official Python utilities
     setuptools
     packaging>=20.0
-    typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
-    langcodes>=3.2.0,<4.0.0

 [options.entry_points]
 console_scripts =
@@ -74,9 +74,7 @@ console_scripts =
 lookups =
     spacy_lookups_data>=1.0.3,<1.1.0
 transformers =
-    spacy_transformers>=1.1.2,<1.3.0
+    spacy_transformers>=1.1.2,<1.4.0
-ray =
-    spacy_ray>=0.1.0,<1.0.0
 cuda =
     cupy>=5.0.0b4,<13.0.0
 cuda80 =
@@ -116,7 +114,7 @@ cuda12x =
 cuda-autodetect =
     cupy-wheel>=11.0.0,<13.0.0
 apple =
-    thinc-apple-ops>=0.1.0.dev0,<1.0.0
+    thinc-apple-ops>=1.0.0,<2.0.0
 # Language tokenizers with external dependencies
 ja =
     sudachipy>=0.5.2,!=0.6.1
setup.py (1 changed line)
@@ -78,6 +78,7 @@ COMPILER_DIRECTIVES = {
     "language_level": -3,
     "embedsignature": True,
     "annotation_typing": False,
+    "profile": sys.version_info < (3, 12),
 }
 # Files to copy into the package that are otherwise not included
 COPY_FILES = {
@@ -17,6 +17,7 @@ from .cli.info import info  # noqa: F401
 from .errors import Errors
 from .glossary import explain  # noqa: F401
 from .language import Language
+from .registrations import REGISTRY_POPULATED, populate_registry
 from .util import logger, registry  # noqa: F401
 from .vocab import Vocab
@@ -1,7 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.6.1"
+__version__ = "3.8.7"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
-__projects__ = "https://github.com/explosion/projects"
-__projects_branch__ = "v3"
@@ -1,3 +1,4 @@
+# cython: profile=False
 from .errors import Errors

 IOB_STRINGS = ("", "I", "O", "B")
@@ -1,5 +1,7 @@
 from wasabi import msg

+# Needed for testing
+from . import download as download_module  # noqa: F401
 from ._util import app, setup_cli  # noqa: F401
 from .apply import apply  # noqa: F401
 from .assemble import assemble_cli  # noqa: F401
@@ -22,15 +24,17 @@ from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .package import package  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .profile import profile  # noqa: F401
-from .project.assets import project_assets  # noqa: F401
-from .project.clone import project_clone  # noqa: F401
-from .project.document import project_document  # noqa: F401
-from .project.dvc import project_update_dvc  # noqa: F401
-from .project.pull import project_pull  # noqa: F401
-from .project.push import project_push  # noqa: F401
-from .project.run import project_run  # noqa: F401
-from .train import train_cli  # noqa: F401
-from .validate import validate  # noqa: F401
+from .project.assets import project_assets  # type: ignore[attr-defined] # noqa: F401
+from .project.clone import project_clone  # type: ignore[attr-defined] # noqa: F401
+from .project.document import (  # type: ignore[attr-defined] # noqa: F401
+    project_document,
+)
+from .project.dvc import project_update_dvc  # type: ignore[attr-defined] # noqa: F401
+from .project.pull import project_pull  # type: ignore[attr-defined] # noqa: F401
+from .project.push import project_push  # type: ignore[attr-defined] # noqa: F401
+from .project.run import project_run  # type: ignore[attr-defined] # noqa: F401
+from .train import train_cli  # type: ignore[attr-defined] # noqa: F401
+from .validate import validate  # type: ignore[attr-defined] # noqa: F401


 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
@ -25,10 +25,11 @@ from thinc.api import Config, ConfigValidationError, require_gpu
|
||||||
from thinc.util import gpu_is_available
|
from thinc.util import gpu_is_available
|
||||||
from typer.main import get_command
|
from typer.main import get_command
|
||||||
from wasabi import Printer, msg
|
from wasabi import Printer, msg
|
||||||
|
from weasel import app as project_cli
|
||||||
|
|
||||||
from .. import about
|
from .. import about
|
||||||
from ..compat import Literal
|
from ..compat import Literal
|
||||||
from ..schemas import ProjectConfigSchema, validate
|
from ..schemas import validate
|
||||||
from ..util import (
|
from ..util import (
|
||||||
ENV_VARS,
|
ENV_VARS,
|
||||||
SimpleFrozenDict,
|
SimpleFrozenDict,
|
||||||
|
@ -40,15 +41,10 @@ from ..util import (
|
||||||
run_command,
|
run_command,
|
||||||
)
|
)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from pathy import FluidPath # noqa: F401
|
|
||||||
|
|
||||||
|
|
||||||
SDIST_SUFFIX = ".tar.gz"
|
SDIST_SUFFIX = ".tar.gz"
|
||||||
WHEEL_SUFFIX = "-py3-none-any.whl"
|
WHEEL_SUFFIX = "-py3-none-any.whl"
|
||||||
|
|
||||||
PROJECT_FILE = "project.yml"
|
PROJECT_FILE = "project.yml"
|
||||||
PROJECT_LOCK = "project.lock"
|
|
||||||
COMMAND = "python -m spacy"
|
COMMAND = "python -m spacy"
|
||||||
NAME = "spacy"
|
NAME = "spacy"
|
||||||
HELP = """spaCy Command-line Interface
|
HELP = """spaCy Command-line Interface
|
||||||
|
@ -74,11 +70,10 @@ Opt = typer.Option
|
||||||
|
|
||||||
app = typer.Typer(name=NAME, help=HELP)
|
app = typer.Typer(name=NAME, help=HELP)
|
||||||
benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
|
benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
|
||||||
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
|
|
||||||
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
|
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
|
||||||
init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
|
init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
|
||||||
|
|
||||||
app.add_typer(project_cli)
|
app.add_typer(project_cli, name="project", help=PROJECT_HELP, no_args_is_help=True)
|
||||||
app.add_typer(debug_cli)
|
app.add_typer(debug_cli)
|
||||||
app.add_typer(benchmark_cli)
|
app.add_typer(benchmark_cli)
|
||||||
app.add_typer(init_cli)
|
app.add_typer(init_cli)
|
||||||
|
@@ -153,148 +148,6 @@ def _parse_override(value: Any) -> Any:
         return str(value)
 
 
-def load_project_config(
-    path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
-) -> Dict[str, Any]:
-    """Load the project.yml file from a directory and validate it. Also make
-    sure that all directories defined in the config exist.
-
-    path (Path): The path to the project directory.
-    interpolate (bool): Whether to substitute project variables.
-    overrides (Dict[str, Any]): Optional config overrides.
-    RETURNS (Dict[str, Any]): The loaded project.yml.
-    """
-    config_path = path / PROJECT_FILE
-    if not config_path.exists():
-        msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
-    invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
-    try:
-        config = srsly.read_yaml(config_path)
-    except ValueError as e:
-        msg.fail(invalid_err, e, exits=1)
-    errors = validate(ProjectConfigSchema, config)
-    if errors:
-        msg.fail(invalid_err)
-        print("\n".join(errors))
-        sys.exit(1)
-    validate_project_version(config)
-    validate_project_commands(config)
-    if interpolate:
-        err = f"{PROJECT_FILE} validation error"
-        with show_validation_error(title=err, hint_fill=False):
-            config = substitute_project_variables(config, overrides)
-    # Make sure directories defined in config exist
-    for subdir in config.get("directories", []):
-        dir_path = path / subdir
-        if not dir_path.exists():
-            dir_path.mkdir(parents=True)
-    return config
-
-
-def substitute_project_variables(
-    config: Dict[str, Any],
-    overrides: Dict[str, Any] = SimpleFrozenDict(),
-    key: str = "vars",
-    env_key: str = "env",
-) -> Dict[str, Any]:
-    """Interpolate variables in the project file using the config system.
-
-    config (Dict[str, Any]): The project config.
-    overrides (Dict[str, Any]): Optional config overrides.
-    key (str): Key containing variables in project config.
-    env_key (str): Key containing environment variable mapping in project config.
-    RETURNS (Dict[str, Any]): The interpolated project config.
-    """
-    config.setdefault(key, {})
-    config.setdefault(env_key, {})
-    # Substitute references to env vars with their values
-    for config_var, env_var in config[env_key].items():
-        config[env_key][config_var] = _parse_override(os.environ.get(env_var, ""))
-    # Need to put variables in the top scope again so we can have a top-level
-    # section "project" (otherwise, a list of commands in the top scope wouldn't)
-    # be allowed by Thinc's config system
-    cfg = Config({"project": config, key: config[key], env_key: config[env_key]})
-    cfg = Config().from_str(cfg.to_str(), overrides=overrides)
-    interpolated = cfg.interpolate()
-    return dict(interpolated["project"])
-
-
-def validate_project_version(config: Dict[str, Any]) -> None:
-    """If the project defines a compatible spaCy version range, chec that it's
-    compatible with the current version of spaCy.
-
-    config (Dict[str, Any]): The loaded config.
-    """
-    spacy_version = config.get("spacy_version", None)
-    if spacy_version and not is_compatible_version(about.__version__, spacy_version):
-        err = (
-            f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
-            f"that's not compatible with the version of spaCy you're running "
-            f"({about.__version__}). You can edit version requirement in the "
-            f"{PROJECT_FILE} to load it, but the project may not run as expected."
-        )
-        msg.fail(err, exits=1)
-
-
-def validate_project_commands(config: Dict[str, Any]) -> None:
-    """Check that project commands and workflows are valid, don't contain
-    duplicates, don't clash and only refer to commands that exist.
-
-    config (Dict[str, Any]): The loaded config.
-    """
-    command_names = [cmd["name"] for cmd in config.get("commands", [])]
-    workflows = config.get("workflows", {})
-    duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1])
-    if duplicates:
-        err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
-        msg.fail(err, exits=1)
-    for workflow_name, workflow_steps in workflows.items():
-        if workflow_name in command_names:
-            err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
-            msg.fail(err, exits=1)
-        for step in workflow_steps:
-            if step not in command_names:
-                msg.fail(
-                    f"Unknown command specified in workflow '{workflow_name}': {step}",
-                    f"Workflows can only refer to commands defined in the 'commands' "
-                    f"section of the {PROJECT_FILE}.",
-                    exits=1,
-                )
-
-
-def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
-    """Get the hash for a JSON-serializable object.
-
-    data: The data to hash.
-    exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
-    RETURNS (str): The hash.
-    """
-    if isinstance(data, dict):
-        data = {k: v for k, v in data.items() if k not in exclude}
-    data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
-    return hashlib.md5(data_str).hexdigest()
-
-
-def get_checksum(path: Union[Path, str]) -> str:
-    """Get the checksum for a file or directory given its file path. If a
-    directory path is provided, this uses all files in that directory.
-
-    path (Union[Path, str]): The file or directory path.
-    RETURNS (str): The checksum.
-    """
-    path = Path(path)
-    if not (path.is_file() or path.is_dir()):
-        msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
-    if path.is_file():
-        return hashlib.md5(Path(path).read_bytes()).hexdigest()
-    else:
-        # TODO: this is currently pretty slow
-        dir_checksum = hashlib.md5()
-        for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
-            dir_checksum.update(sub_file.read_bytes())
-        return dir_checksum.hexdigest()
-
-
 @contextmanager
 def show_validation_error(
     file_path: Optional[Union[str, Path]] = None,
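As an aside to the removed substitute_project_variables helper above: a minimal, hypothetical sketch of the Thinc Config interpolation it relied on. The section and key names here are made up for illustration.

from thinc.api import Config

# hypothetical project config with a "vars" section, mirroring the removed helper
cfg = Config({"vars": {"lang": "en"}, "project": {"lang": "${vars.lang}"}})
resolved = Config().from_str(cfg.to_str()).interpolate()
print(resolved["project"]["lang"])  # "en"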
@@ -352,166 +205,10 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
             msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
 
 
-def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None:
-    """Upload a file.
-
-    src (Path): The source path.
-    url (str): The destination URL to upload to.
-    """
-    import smart_open
-
-    # Create parent directories for local paths
-    if isinstance(dest, Path):
-        if not dest.parent.exists():
-            dest.parent.mkdir(parents=True)
-
-    dest = str(dest)
-    with smart_open.open(dest, mode="wb") as output_file:
-        with src.open(mode="rb") as input_file:
-            output_file.write(input_file.read())
-
-
-def download_file(
-    src: Union[str, "FluidPath"], dest: Path, *, force: bool = False
-) -> None:
-    """Download a file using smart_open.
-
-    url (str): The URL of the file.
-    dest (Path): The destination path.
-    force (bool): Whether to force download even if file exists.
-        If False, the download will be skipped.
-    """
-    import smart_open
-
-    if dest.exists() and not force:
-        return None
-    src = str(src)
-    with smart_open.open(src, mode="rb", compression="disable") as input_file:
-        with dest.open(mode="wb") as output_file:
-            shutil.copyfileobj(input_file, output_file)
-
-
-def ensure_pathy(path):
-    """Temporary helper to prevent importing Pathy globally (which can cause
-    slow and annoying Google Cloud warning)."""
-    from pathy import Pathy  # noqa: F811
-
-    return Pathy.fluid(path)
-
-
-def git_checkout(
-    repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False
-):
-    git_version = get_git_version()
-    if dest.exists():
-        msg.fail("Destination of checkout must not exist", exits=1)
-    if not dest.parent.exists():
-        msg.fail("Parent of destination of checkout must exist", exits=1)
-    if sparse and git_version >= (2, 22):
-        return git_sparse_checkout(repo, subpath, dest, branch)
-    elif sparse:
-        # Only show warnings if the user explicitly wants sparse checkout but
-        # the Git version doesn't support it
-        err_old = (
-            f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
-            f"that doesn't fully support sparse checkout yet."
-        )
-        err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
-        msg.warn(
-            f"{err_unk if git_version == (0, 0) else err_old} "
-            f"This means that more files than necessary may be downloaded "
-            f"temporarily. To only download the files needed, make sure "
-            f"you're using Git v2.22 or above."
-        )
-    with make_tempdir() as tmp_dir:
-        cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
-        run_command(cmd, capture=True)
-        # We need Path(name) to make sure we also support subdirectories
-        try:
-            source_path = tmp_dir / Path(subpath)
-            if not is_subpath_of(tmp_dir, source_path):
-                err = f"'{subpath}' is a path outside of the cloned repository."
-                msg.fail(err, repo, exits=1)
-            shutil.copytree(str(source_path), str(dest))
-        except FileNotFoundError:
-            err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')"
-            msg.fail(err, repo, exits=1)
-
-
-def git_sparse_checkout(repo, subpath, dest, branch):
-    # We're using Git, partial clone and sparse checkout to
-    # only clone the files we need
-    # This ends up being RIDICULOUS. omg.
-    # So, every tutorial and SO post talks about 'sparse checkout'...But they
-    # go and *clone* the whole repo. Worthless. And cloning part of a repo
-    # turns out to be completely broken. The only way to specify a "path" is..
-    # a path *on the server*? The contents of which, specifies the paths. Wat.
-    # Obviously this is hopelessly broken and insecure, because you can query
-    # arbitrary paths on the server! So nobody enables this.
-    # What we have to do is disable *all* files. We could then just checkout
-    # the path, and it'd "work", but be hopelessly slow...Because it goes and
-    # transfers every missing object one-by-one. So the final piece is that we
-    # need to use some weird git internals to fetch the missings in bulk, and
-    # *that* we can do by path.
-    # We're using Git and sparse checkout to only clone the files we need
-    with make_tempdir() as tmp_dir:
-        # This is the "clone, but don't download anything" part.
-        cmd = (
-            f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
-            f"-b {branch} --filter=blob:none"
-        )
-        run_command(cmd)
-        # Now we need to find the missing filenames for the subpath we want.
-        # Looking for this 'rev-list' command in the git --help? Hah.
-        cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
-        ret = run_command(cmd, capture=True)
-        git_repo = _http_to_git(repo)
-        # Now pass those missings into another bit of git internals
-        missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
-        if not missings:
-            err = (
-                f"Could not find any relevant files for '{subpath}'. "
-                f"Did you specify a correct and complete path within repo '{repo}' "
-                f"and branch {branch}?"
-            )
-            msg.fail(err, exits=1)
-        cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
-        run_command(cmd, capture=True)
-        # And finally, we can checkout our subpath
-        cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
-        run_command(cmd, capture=True)
-
-        # Get a subdirectory of the cloned path, if appropriate
-        source_path = tmp_dir / Path(subpath)
-        if not is_subpath_of(tmp_dir, source_path):
-            err = f"'{subpath}' is a path outside of the cloned repository."
-            msg.fail(err, repo, exits=1)
-
-        shutil.move(str(source_path), str(dest))
-
-
-def git_repo_branch_exists(repo: str, branch: str) -> bool:
-    """Uses 'git ls-remote' to check if a repository and branch exists
-
-    repo (str): URL to get repo.
-    branch (str): Branch on repo to check.
-    RETURNS (bool): True if repo:branch exists.
-    """
-    get_git_version()
-    cmd = f"git ls-remote {repo} {branch}"
-    # We might be tempted to use `--exit-code` with `git ls-remote`, but
-    # `run_command` handles the `returncode` for us, so we'll rely on
-    # the fact that stdout returns '' if the requested branch doesn't exist
-    ret = run_command(cmd, capture=True)
-    exists = ret.stdout != ""
-    return exists
-
-
 def get_git_version(
     error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
 ) -> Tuple[int, int]:
     """Get the version of git and raise an error if calling 'git --version' fails.
 
     error (str): The error message to show.
     RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
         (0, 0) if the version couldn't be determined.
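For readers skimming the removed git_sparse_checkout helper above, this is the command sequence it scripted, collected here as a sketch; the repo, branch and subpath values are placeholders.

# Placeholder values; the commands themselves are taken from the removed helper above.
repo, tmp_dir, branch, subpath = "<repo>", "<tmp_dir>", "<branch>", "<subpath>"
steps = [
    f"git clone {repo} {tmp_dir} --no-checkout --depth 1 -b {branch} --filter=blob:none",
    f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}",
    # the '?'-prefixed object ids printed by rev-list are then fetched in bulk:
    f"git -C {tmp_dir} fetch-pack <git-url> <missing-object-ids>",
    f"git -C {tmp_dir} checkout {branch} {subpath}",
]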
@@ -527,30 +224,6 @@ def get_git_version(
     return int(version[0]), int(version[1])
 
 
-def _http_to_git(repo: str) -> str:
-    if repo.startswith("http://"):
-        repo = repo.replace(r"http://", r"https://")
-    if repo.startswith(r"https://"):
-        repo = repo.replace("https://", "git@").replace("/", ":", 1)
-        if repo.endswith("/"):
-            repo = repo[:-1]
-        repo = f"{repo}.git"
-    return repo
-
-
-def is_subpath_of(parent, child):
-    """
-    Check whether `child` is a path contained within `parent`.
-    """
-    # Based on https://stackoverflow.com/a/37095733 .
-
-    # In Python 3.9, the `Path.is_relative_to()` method will supplant this, so
-    # we can stop using crusty old os.path functions.
-    parent_realpath = os.path.realpath(parent)
-    child_realpath = os.path.realpath(child)
-    return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath
-
-
 @overload
 def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]:
     ...
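A small illustrative comparison of the removed is_subpath_of helper's os.path approach with the Path.is_relative_to method its comment mentions; the paths are hypothetical, and unlike realpath, is_relative_to is purely lexical and does not resolve symlinks.

import os
from pathlib import Path

parent, child = "/repo", "/repo/configs/base.cfg"  # hypothetical paths
parent_real, child_real = os.path.realpath(parent), os.path.realpath(child)
legacy = os.path.commonpath([parent_real, child_real]) == parent_real
modern = Path(child).is_relative_to(parent)  # Python 3.9+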
@@ -133,7 +133,9 @@ def apply(
     if len(text_files) > 0:
         streams.append(_stream_texts(text_files))
     datagen = cast(DocOrStrStream, chain(*streams))
-    for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
+    for doc in tqdm.tqdm(
+        nlp.pipe(datagen, batch_size=batch_size, n_process=n_process), disable=None
+    ):
         docbin.add(doc)
     if output_file.suffix == "":
         output_file = output_file.with_suffix(".spacy")
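The disable=None argument added above is tqdm's documented switch for disabling the bar automatically on non-TTY output; a minimal sketch:

import tqdm

for _ in tqdm.tqdm(range(3), disable=None):
    pass  # bar is shown on a terminal, suppressed when output is piped or logged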
@@ -13,7 +13,7 @@ from .. import util
 from ..language import Language
 from ..tokens import Doc
 from ..training import Corpus
-from ._util import Arg, Opt, benchmark_cli, setup_gpu
+from ._util import Arg, Opt, benchmark_cli, import_code, setup_gpu
 
 
 @benchmark_cli.command(
@@ -30,12 +30,14 @@ def benchmark_speed_cli(
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
     warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     # fmt: on
 ):
     """
     Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
     data in the binary .spacy format.
     """
+    import_code(code_path)
     setup_gpu(use_gpu=use_gpu, silent=False)
 
     nlp = util.load_model(model)
@@ -89,7 +91,7 @@ class Quartiles:
 def annotate(
     nlp: Language, docs: List[Doc], batch_size: Optional[int]
 ) -> numpy.ndarray:
-    docs = nlp.pipe(tqdm(docs, unit="doc"), batch_size=batch_size)
+    docs = nlp.pipe(tqdm(docs, unit="doc", disable=None), batch_size=batch_size)
     wps = []
     while True:
         with time_context() as elapsed:
@@ -171,5 +173,5 @@ def print_outliers(sample: numpy.ndarray):
 def warmup(
     nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
 ) -> numpy.ndarray:
-    docs = warmup_epochs * docs
+    docs = [doc.copy() for doc in docs * warmup_epochs]
     return annotate(nlp, docs, batch_size)
@@ -170,7 +170,7 @@ def debug_model(
     msg.divider(f"STEP 3 - prediction")
     msg.info(str(prediction))
 
-    msg.good(f"Succesfully ended analysis - model looks good.")
+    msg.good(f"Successfully ended analysis - model looks good.")
 
 
 def _sentences():
@@ -1,5 +1,6 @@
 import sys
 from typing import Optional, Sequence
+from urllib.parse import urljoin
 
 import requests
 import typer
@@ -7,7 +8,14 @@ from wasabi import msg
 
 from .. import about
 from ..errors import OLD_MODEL_SHORTCUTS
-from ..util import get_minor_version, is_package, is_prerelease_version, run_command
+from ..util import (
+    get_minor_version,
+    is_in_interactive,
+    is_in_jupyter,
+    is_package,
+    is_prerelease_version,
+    run_command,
+)
 from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
 
 
@@ -56,6 +64,13 @@ def download(
         )
         pip_args = pip_args + ("--no-deps",)
     if direct:
+        # Reject model names with '/', in order to prevent shenanigans.
+        if "/" in model:
+            msg.fail(
+                title="Model download rejected",
+                text=f"Cannot download model '{model}'. Models are expected to be file names, not URLs or fragments",
+                exits=True,
+            )
         components = model.split("-")
         model_name = "".join(components[:-1])
         version = components[-1]
@@ -77,6 +92,27 @@ def download(
         "Download and installation successful",
         f"You can now load the package via spacy.load('{model_name}')",
     )
+    if is_in_jupyter():
+        reload_deps_msg = (
+            "If you are in a Jupyter or Colab notebook, you may need to "
+            "restart Python in order to load all the package's dependencies. "
+            "You can do this by selecting the 'Restart kernel' or 'Restart "
+            "runtime' option."
+        )
+        msg.warn(
+            "Restart to reload dependencies",
+            reload_deps_msg,
+        )
+    elif is_in_interactive():
+        reload_deps_msg = (
+            "If you are in an interactive Python session, you may need to "
+            "exit and restart Python to load all the package's dependencies. "
+            "You can exit with Ctrl-D (or Ctrl-Z and Enter on Windows)."
+        )
+        msg.warn(
+            "Restart to reload dependencies",
+            reload_deps_msg,
+        )
 
 
 def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
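The new is_in_jupyter/is_in_interactive branches above come from spacy.util; as a rough, hypothetical illustration only (not spaCy's implementation), an interactive session can be detected along these lines:

import sys

def looks_interactive() -> bool:  # illustrative only
    return hasattr(sys, "ps1") or bool(getattr(sys.flags, "interactive", 0))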
@@ -125,7 +161,16 @@ def get_latest_version(model: str) -> str:
 def download_model(
     filename: str, user_pip_args: Optional[Sequence[str]] = None
 ) -> None:
-    download_url = about.__download_url__ + "/" + filename
+    # Construct the download URL carefully. We need to make sure we don't
+    # allow relative paths or other shenanigans to trick us into download
+    # from outside our own repo.
+    base_url = about.__download_url__
+    # urljoin requires that the path ends with /, or the last path part will be dropped
+    if not base_url.endswith("/"):
+        base_url = about.__download_url__ + "/"
+    download_url = urljoin(base_url, filename)
+    if not download_url.startswith(about.__download_url__):
+        raise ValueError(f"Download from {filename} rejected. Was it a relative path?")
     pip_args = list(user_pip_args) if user_pip_args is not None else []
     cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
     run_command(cmd)
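To see why the trailing slash and the startswith() guard added above matter, a short urljoin sketch with a made-up base URL:

from urllib.parse import urljoin

urljoin("https://example.com/models", "en_model.tar.gz")
# 'https://example.com/en_model.tar.gz' - last path part dropped without the trailing slash
urljoin("https://example.com/models/", "en_model.tar.gz")
# 'https://example.com/models/en_model.tar.gz'
urljoin("https://example.com/models/", "../other.tar.gz")
# 'https://example.com/other.tar.gz' - escapes the base, which the startswith() check rejects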
@@ -28,6 +28,7 @@ def evaluate_cli(
     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
     displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
     per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
+    spans_key: str = Opt("sc", "--spans-key", "-sk", help="Spans key to use when evaluating Doc.spans"),
     # fmt: on
 ):
     """
@@ -53,6 +54,7 @@ def evaluate_cli(
         displacy_limit=displacy_limit,
         per_component=per_component,
         silent=False,
+        spans_key=spans_key,
     )
 
 
@@ -39,7 +39,7 @@ def find_threshold_cli(
     # fmt: on
 ):
     """
-    Runs prediction trials for a trained model with varying tresholds to maximize
+    Runs prediction trials for a trained model with varying thresholds to maximize
     the specified metric. The search space for the threshold is traversed linearly
     from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
     (the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
@@ -81,7 +81,7 @@ def find_threshold(
     silent: bool = True,
 ) -> Tuple[float, float, Dict[float, float]]:
     """
-    Runs prediction trials for models with varying tresholds to maximize the specified metric.
+    Runs prediction trials for models with varying thresholds to maximize the specified metric.
     model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory.
     data_path (Path): Path to file with DocBin with docs to use for threshold search.
     pipe_name (str): Name of pipe to examine thresholds for.
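The linear search space described in the docstring above, sketched with a hypothetical n_trials:

import numpy

n_trials = 11
thresholds = numpy.linspace(0, 1, n_trials)  # 0.0, 0.1, ..., 1.0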
@@ -1,5 +1,7 @@
+import os
 import re
 import shutil
+import subprocess
 import sys
 from collections import defaultdict
 from pathlib import Path
@@ -11,6 +13,7 @@ from thinc.api import Config
 from wasabi import MarkdownRenderer, Printer, get_raw_input
 
 from .. import about, util
+from ..compat import importlib_metadata
 from ..schemas import ModelMetaSchema, validate
 from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
 
@@ -27,6 +30,7 @@ def package_cli(
     version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
     build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."),
     force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"),
+    require_parent: bool = Opt(True, "--require-parent/--no-require-parent", "-R", "-R", help="Include the parent package (e.g. spacy) in the requirements"),
     # fmt: on
 ):
     """
@@ -35,7 +39,7 @@ def package_cli(
     specified output directory, and the data will be copied over. If
     --create-meta is set and a meta.json already exists in the output directory,
     the existing values will be used as the defaults in the command-line prompt.
-    After packaging, "python setup.py sdist" is run in the package directory,
+    After packaging, "python -m build --sdist" is run in the package directory,
     which will create a .tar.gz archive that can be installed via "pip install".
 
     If additional code files are provided (e.g. Python files containing custom
@@ -57,6 +61,7 @@ def package_cli(
         create_sdist=create_sdist,
         create_wheel=create_wheel,
         force=force,
+        require_parent=require_parent,
         silent=False,
     )
 
@@ -71,6 +76,7 @@ def package(
     create_meta: bool = False,
     create_sdist: bool = True,
     create_wheel: bool = False,
+    require_parent: bool = False,
     force: bool = False,
     silent: bool = True,
 ) -> None:
@@ -78,9 +84,17 @@ def package(
     input_path = util.ensure_path(input_dir)
     output_path = util.ensure_path(output_dir)
     meta_path = util.ensure_path(meta_path)
-    if create_wheel and not has_wheel():
-        err = "Generating a binary .whl file requires wheel to be installed"
-        msg.fail(err, "pip install wheel", exits=1)
+    if create_wheel and not has_wheel() and not has_build():
+        err = (
+            "Generating wheels requires 'build' or 'wheel' (deprecated) to be installed"
+        )
+        msg.fail(err, "pip install build", exits=1)
+    if not has_build():
+        msg.warn(
+            "Generating packages without the 'build' package is deprecated and "
+            "will not be supported in the future. To install 'build': pip "
+            "install build"
+        )
     if not input_path or not input_path.exists():
         msg.fail("Can't locate pipeline data", input_path, exits=1)
     if not output_path or not output_path.exists():
@@ -102,7 +116,7 @@ def package(
     if not meta_path.exists() or not meta_path.is_file():
         msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
     meta = srsly.read_json(meta_path)
-    meta = get_meta(input_dir, meta)
+    meta = get_meta(input_dir, meta, require_parent=require_parent)
     if meta["requirements"]:
         msg.good(
             f"Including {len(meta['requirements'])} package requirement(s) from "
@@ -175,6 +189,7 @@ def package(
         imports.append(code_path.stem)
         shutil.copy(str(code_path), str(package_path))
+    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
 
     create_file(main_path / "setup.py", TEMPLATE_SETUP)
     create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
     init_py = TEMPLATE_INIT.format(
@@ -184,12 +199,37 @@ def package(
     msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
     if create_sdist:
         with util.working_dir(main_path):
-            util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
+            # run directly, since util.run_command is not designed to continue
+            # after a command fails
+            ret = subprocess.run(
+                [sys.executable, "-m", "build", ".", "--sdist"],
+                env=os.environ.copy(),
+            )
+            if ret.returncode != 0:
+                msg.warn(
+                    "Creating sdist with 'python -m build' failed. Falling "
+                    "back to deprecated use of 'python setup.py sdist'"
+                )
+                util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
         zip_file = main_path / "dist" / f"{model_name_v}{SDIST_SUFFIX}"
         msg.good(f"Successfully created zipped Python package", zip_file)
     if create_wheel:
         with util.working_dir(main_path):
-            util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False)
+            # run directly, since util.run_command is not designed to continue
+            # after a command fails
+            ret = subprocess.run(
+                [sys.executable, "-m", "build", ".", "--wheel"],
+                env=os.environ.copy(),
+            )
+            if ret.returncode != 0:
+                msg.warn(
+                    "Creating wheel with 'python -m build' failed. Falling "
+                    "back to deprecated use of 'wheel' with "
+                    "'python setup.py bdist_wheel'"
+                )
+                util.run_command(
+                    [sys.executable, "setup.py", "bdist_wheel"], capture=False
+                )
         wheel_name_squashed = re.sub("_+", "_", model_name_v)
         wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
         msg.good(f"Successfully created binary wheel", wheel)
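The fallback logic above shells out to the 'build' frontend; a minimal sketch of the new invocation, run from a hypothetical package directory and assuming 'build' is installed:

import subprocess
import sys

# equivalent to the calls added in the diff
subprocess.run([sys.executable, "-m", "build", ".", "--sdist"], check=False)
subprocess.run([sys.executable, "-m", "build", ".", "--wheel"], check=False)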
@@ -209,6 +249,17 @@ def has_wheel() -> bool:
         return False
 
 
+def has_build() -> bool:
+    # it's very likely that there is a local directory named build/ (especially
+    # in an editable install), so an import check is not sufficient; instead
+    # check that there is a package version
+    try:
+        importlib_metadata.version("build")
+        return True
+    except importlib_metadata.PackageNotFoundError:  # type: ignore[attr-defined]
+        return False
+
+
 def get_third_party_dependencies(
     config: Config, exclude: List[str] = util.SimpleFrozenList()
 ) -> List[str]:
@@ -255,6 +306,8 @@ def get_third_party_dependencies(
             modules.add(func_info["module"].split(".")[0])  # type: ignore[union-attr]
     dependencies = []
     for module_name in modules:
+        if module_name == about.__title__:
+            continue
         if module_name in distributions:
             dist = distributions.get(module_name)
             if dist:
@@ -285,7 +338,9 @@ def create_file(file_path: Path, contents: str) -> None:
 
 
 def get_meta(
-    model_path: Union[str, Path], existing_meta: Dict[str, Any]
+    model_path: Union[str, Path],
+    existing_meta: Dict[str, Any],
+    require_parent: bool = False,
 ) -> Dict[str, Any]:
     meta: Dict[str, Any] = {
         "lang": "en",
@@ -314,6 +369,8 @@ def get_meta(
     existing_reqs = [util.split_requirement(req)[0] for req in meta["requirements"]]
     reqs = get_third_party_dependencies(nlp.config, exclude=existing_reqs)
     meta["requirements"].extend(reqs)
+    if require_parent and about.__title__ not in meta["requirements"]:
+        meta["requirements"].append(about.__title__ + meta["spacy_version"])
     return meta
 
 
@@ -488,8 +545,11 @@ def list_files(data_dir):
 
 
 def list_requirements(meta):
-    parent_package = meta.get('parent_package', 'spacy')
-    requirements = [parent_package + meta['spacy_version']]
+    # Up to version 3.7, we included the parent package
+    # in requirements by default. This behaviour is removed
+    # in 3.8, with a setting to include the parent package in
+    # the requirements list in the meta if desired.
+    requirements = []
    if 'setup_requires' in meta:
        requirements += meta['setup_requires']
    if 'requirements' in meta:
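The has_build() check added above looks up the installed distribution instead of importing it, since a local build/ directory would make a bare import succeed; the same pattern with the stdlib module (spaCy routes it through ..compat) as a sketch:

import importlib.metadata

def dist_installed(name: str) -> bool:
    try:
        importlib.metadata.version(name)
        return True
    except importlib.metadata.PackageNotFoundError:
        return False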
@@ -71,7 +71,7 @@ def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) ->
 
 
 def parse_texts(nlp: Language, texts: Sequence[str]) -> None:
-    for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
+    for doc in nlp.pipe(tqdm.tqdm(texts, disable=None), batch_size=16):
         pass
 
 
@@ -1,217 +1 @@
+from weasel.cli.assets import *
-import os
-import re
-import shutil
-from pathlib import Path
-from typing import Any, Dict, Optional
-
-import requests
-import typer
-from wasabi import msg
-
-from ...util import ensure_path, working_dir
-from .._util import (
-    PROJECT_FILE,
-    Arg,
-    Opt,
-    SimpleFrozenDict,
-    download_file,
-    get_checksum,
-    get_git_version,
-    git_checkout,
-    load_project_config,
-    parse_config_overrides,
-    project_cli,
-)
-
-# Whether assets are extra if `extra` is not set.
-EXTRA_DEFAULT = False
-
-
-@project_cli.command(
-    "assets",
-    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
-)
-def project_assets_cli(
-    # fmt: off
-    ctx: typer.Context,  # This is only used to read additional arguments
-    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
-    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+."),
-    extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
-    # fmt: on
-):
-    """Fetch project assets like datasets and pretrained weights. Assets are
-    defined in the "assets" section of the project.yml. If a checksum is
-    provided in the project.yml, the file is only downloaded if no local file
-    with the same checksum exists.
-
-    DOCS: https://spacy.io/api/cli#project-assets
-    """
-    overrides = parse_config_overrides(ctx.args)
-    project_assets(
-        project_dir,
-        overrides=overrides,
-        sparse_checkout=sparse_checkout,
-        extra=extra,
-    )
-
-
-def project_assets(
-    project_dir: Path,
-    *,
-    overrides: Dict[str, Any] = SimpleFrozenDict(),
-    sparse_checkout: bool = False,
-    extra: bool = False,
-) -> None:
-    """Fetch assets for a project using DVC if possible.
-
-    project_dir (Path): Path to project directory.
-    sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files
-        needed.
-    extra (bool): Whether to download all assets, including those marked as 'extra'.
-    """
-    project_path = ensure_path(project_dir)
-    config = load_project_config(project_path, overrides=overrides)
-    assets = [
-        asset
-        for asset in config.get("assets", [])
-        if extra or not asset.get("extra", EXTRA_DEFAULT)
-    ]
-    if not assets:
-        msg.warn(
-            f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
-            exits=0,
-        )
-    msg.info(f"Fetching {len(assets)} asset(s)")
-
-    for asset in assets:
-        dest = (project_dir / asset["dest"]).resolve()
-        checksum = asset.get("checksum")
-        if "git" in asset:
-            git_err = (
-                f"Cloning spaCy project templates requires Git and the 'git' command. "
-                f"Make sure it's installed and that the executable is available."
-            )
-            get_git_version(error=git_err)
-            if dest.exists():
-                # If there's already a file, check for checksum
-                if checksum and checksum == get_checksum(dest):
-                    msg.good(
-                        f"Skipping download with matching checksum: {asset['dest']}"
-                    )
-                    continue
-                else:
-                    if dest.is_dir():
-                        shutil.rmtree(dest)
-                    else:
-                        dest.unlink()
-            if "repo" not in asset["git"] or asset["git"]["repo"] is None:
-                msg.fail(
-                    "A git asset must include 'repo', the repository address.", exits=1
-                )
-            if "path" not in asset["git"] or asset["git"]["path"] is None:
-                msg.fail(
-                    "A git asset must include 'path' - use \"\" to get the entire repository.",
-                    exits=1,
-                )
-            git_checkout(
-                asset["git"]["repo"],
-                asset["git"]["path"],
-                dest,
-                branch=asset["git"].get("branch"),
-                sparse=sparse_checkout,
-            )
-            msg.good(f"Downloaded asset {dest}")
-        else:
-            url = asset.get("url")
-            if not url:
-                # project.yml defines asset without URL that the user has to place
-                check_private_asset(dest, checksum)
-                continue
-            fetch_asset(project_path, url, dest, checksum)
-
-
-def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
-    """Check and validate assets without a URL (private assets that the user
-    has to provide themselves) and give feedback about the checksum.
-
-    dest (Path): Destination path of the asset.
-    checksum (Optional[str]): Optional checksum of the expected file.
-    """
-    if not Path(dest).exists():
-        err = f"No URL provided for asset. You need to add this file yourself: {dest}"
-        msg.warn(err)
-    else:
-        if not checksum:
-            msg.good(f"Asset already exists: {dest}")
-        elif checksum == get_checksum(dest):
-            msg.good(f"Asset exists with matching checksum: {dest}")
-        else:
-            msg.fail(f"Asset available but with incorrect checksum: {dest}")
-
-
-def fetch_asset(
-    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
-) -> None:
-    """Fetch an asset from a given URL or path. If a checksum is provided and a
-    local file exists, it's only re-downloaded if the checksum doesn't match.
-
-    project_path (Path): Path to project directory.
-    url (str): URL or path to asset.
-    checksum (Optional[str]): Optional expected checksum of local file.
-    RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
-        the asset failed.
-    """
-    dest_path = (project_path / dest).resolve()
-    if dest_path.exists():
-        # If there's already a file, check for checksum
-        if checksum:
-            if checksum == get_checksum(dest_path):
-                msg.good(f"Skipping download with matching checksum: {dest}")
-                return
-        else:
-            # If there's not a checksum, make sure the file is a possibly valid size
-            if os.path.getsize(dest_path) == 0:
-                msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}")
-                os.remove(dest_path)
-    # We might as well support the user here and create parent directories in
-    # case the asset dir isn't listed as a dir to create in the project.yml
-    if not dest_path.parent.exists():
-        dest_path.parent.mkdir(parents=True)
-    with working_dir(project_path):
-        url = convert_asset_url(url)
-        try:
-            download_file(url, dest_path)
-            msg.good(f"Downloaded asset {dest}")
-        except requests.exceptions.RequestException as e:
-            if Path(url).exists() and Path(url).is_file():
-                # If it's a local file, copy to destination
-                shutil.copy(url, str(dest_path))
-                msg.good(f"Copied local asset {dest}")
-            else:
-                msg.fail(f"Download failed: {dest}", e)
-    if checksum and checksum != get_checksum(dest_path):
-        msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
-
-
-def convert_asset_url(url: str) -> str:
-    """Check and convert the asset URL if needed.
-
-    url (str): The asset URL.
-    RETURNS (str): The converted URL.
-    """
-    # If the asset URL is a regular GitHub URL it's likely a mistake
-    if (
-        re.match(r"(http(s?)):\/\/github.com", url)
-        and "releases/download" not in url
-        and "/raw/" not in url
-    ):
-        converted = url.replace("github.com", "raw.githubusercontent.com")
-        converted = re.sub(r"/(tree|blob)/", "/", converted)
-        msg.warn(
-            "Downloading from a regular GitHub URL. This will only download "
-            "the source of the page, not the actual file. Converting the URL "
-            "to a raw URL.",
-            converted,
-        )
-        return converted
-    return url
@@ -1,124 +1 @@
+from weasel.cli.clone import *
-import re
-import subprocess
-from pathlib import Path
-from typing import Optional
-
-from wasabi import msg
-
-from ... import about
-from ...util import ensure_path
-from .._util import (
-    COMMAND,
-    PROJECT_FILE,
-    Arg,
-    Opt,
-    get_git_version,
-    git_checkout,
-    git_repo_branch_exists,
-    project_cli,
-)
-
-DEFAULT_REPO = about.__projects__
-DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
-DEFAULT_BRANCHES = ["main", "master"]
-
-
-@project_cli.command("clone")
-def project_clone_cli(
-    # fmt: off
-    name: str = Arg(..., help="The name of the template to clone"),
-    dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
-    repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
-    branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
-    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.")
-    # fmt: on
-):
-    """Clone a project template from a repository. Calls into "git" and will
-    only download the files from the given subdirectory. The GitHub repo
-    defaults to the official spaCy template repo, but can be customized
-    (including using a private repo).
-
-    DOCS: https://spacy.io/api/cli#project-clone
-    """
-    if dest is None:
-        dest = Path.cwd() / Path(name).parts[-1]
-    if repo == DEFAULT_REPO and branch is None:
-        branch = DEFAULT_PROJECTS_BRANCH
-
-    if branch is None:
-        for default_branch in DEFAULT_BRANCHES:
-            if git_repo_branch_exists(repo, default_branch):
-                branch = default_branch
-                break
-        if branch is None:
-            default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES)
-            msg.fail(
-                "No branch provided and attempted default "
-                f"branches {default_branches_msg} do not exist.",
-                exits=1,
-            )
-    else:
-        if not git_repo_branch_exists(repo, branch):
-            msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1)
-    assert isinstance(branch, str)
-    project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout)
-
-
-def project_clone(
-    name: str,
-    dest: Path,
-    *,
-    repo: str = about.__projects__,
-    branch: str = about.__projects_branch__,
-    sparse_checkout: bool = False,
-) -> None:
-    """Clone a project template from a repository.
-
-    name (str): Name of subdirectory to clone.
-    dest (Path): Destination path of cloned project.
-    repo (str): URL of Git repo containing project templates.
-    branch (str): The branch to clone from
-    """
-    dest = ensure_path(dest)
-    check_clone(name, dest, repo)
-    project_dir = dest.resolve()
-    repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
-    try:
-        git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
-    except subprocess.CalledProcessError:
-        err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')"
-        msg.fail(err, exits=1)
-    msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir)
-    if not (project_dir / PROJECT_FILE).exists():
-        msg.warn(f"No {PROJECT_FILE} found in directory")
-    else:
-        msg.good(f"Your project is now ready!")
-        print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
-
-
-def check_clone(name: str, dest: Path, repo: str) -> None:
-    """Check and validate that the destination path can be used to clone. Will
-    check that Git is available and that the destination path is suitable.
-
-    name (str): Name of the directory to clone from the repo.
-    dest (Path): Local destination of cloned directory.
-    repo (str): URL of the repo to clone from.
-    """
-    git_err = (
-        f"Cloning spaCy project templates requires Git and the 'git' command. "
-        f"To clone a project without Git, copy the files from the '{name}' "
-        f"directory in the {repo} to {dest} manually."
-    )
-    get_git_version(error=git_err)
-    if not dest:
-        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
-    if dest.exists():
-        # Directory already exists (not allowed, clone needs to create it)
-        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
-    if not dest.parent.exists():
-        # We're not creating parents, parent dir should exist
-        msg.fail(
-            f"Can't clone project, parent directory doesn't exist: {dest.parent}. "
-            f"Create the necessary folder(s) first before continuing.",
-            exits=1,
-        )
@@ -1,115 +1 @@
+from weasel.cli.document import *
-from pathlib import Path
-
-from wasabi import MarkdownRenderer, msg
-
-from ...util import working_dir
-from .._util import PROJECT_FILE, Arg, Opt, load_project_config, project_cli
-
-DOCS_URL = "https://spacy.io"
-INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
-project, as well as the available commands and workflows. For details, see the
-[spaCy projects documentation]({DOCS_URL}/usage/projects)."""
-INTRO_COMMANDS = f"""The following commands are defined by the project. They
-can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run).
-Commands are only re-run if their inputs have changed."""
-INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They
-can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run)
-and will run the specified commands in order. Commands are only re-run if their
-inputs have changed."""
-INTRO_ASSETS = f"""The following assets are defined by the project. They can
-be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets)
-in the project directory."""
-# These markers are added to the Markdown and can be used to update the file in
-# place if it already exists. Only the auto-generated part will be replaced.
-MARKER_START = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS START (do not remove) -->"
-MARKER_END = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->"
-# If this marker is used in an existing README, it's ignored and not replaced
-MARKER_IGNORE = "<!-- SPACY PROJECT: IGNORE -->"
-
-
-@project_cli.command("document")
-def project_document_cli(
-    # fmt: off
-    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
-    output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"),
-    no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji")
-    # fmt: on
-):
-    """
-    Auto-generate a README.md for a project. If the content is saved to a file,
-    hidden markers are added so you can add custom content before or after the
-    auto-generated section and only the auto-generated docs will be replaced
-    when you re-run the command.
-
-    DOCS: https://spacy.io/api/cli#project-document
-    """
-    project_document(project_dir, output_file, no_emoji=no_emoji)
-
-
-def project_document(
-    project_dir: Path, output_file: Path, *, no_emoji: bool = False
-) -> None:
-    is_stdout = str(output_file) == "-"
-    config = load_project_config(project_dir)
-    md = MarkdownRenderer(no_emoji=no_emoji)
-    md.add(MARKER_START)
-    title = config.get("title")
-    description = config.get("description")
-    md.add(md.title(1, f"spaCy Project{f': {title}' if title else ''}", "🪐"))
-    if description:
-        md.add(description)
-    md.add(md.title(2, PROJECT_FILE, "📋"))
-    md.add(INTRO_PROJECT)
-    # Commands
-    cmds = config.get("commands", [])
-    data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds]
-    if data:
-        md.add(md.title(3, "Commands", "⏯"))
-        md.add(INTRO_COMMANDS)
-        md.add(md.table(data, ["Command", "Description"]))
-    # Workflows
-    wfs = config.get("workflows", {}).items()
-    data = [(md.code(n), " → ".join(md.code(w) for w in stp)) for n, stp in wfs]
-    if data:
-        md.add(md.title(3, "Workflows", "⏭"))
-        md.add(INTRO_WORKFLOWS)
-        md.add(md.table(data, ["Workflow", "Steps"]))
-    # Assets
-    assets = config.get("assets", [])
-    data = []
-    for a in assets:
-        source = "Git" if a.get("git") else "URL" if a.get("url") else "Local"
-        dest_path = a["dest"]
-        dest = md.code(dest_path)
-        if source == "Local":
-            # Only link assets if they're in the repo
-            with working_dir(project_dir) as p:
-                if (p / dest_path).exists():
-                    dest = md.link(dest, dest_path)
-        data.append((dest, source, a.get("description", "")))
-    if data:
-        md.add(md.title(3, "Assets", "🗂"))
-        md.add(INTRO_ASSETS)
-        md.add(md.table(data, ["File", "Source", "Description"]))
-    md.add(MARKER_END)
-    # Output result
-    if is_stdout:
-        print(md.text)
-    else:
-        content = md.text
-        if output_file.exists():
-            with output_file.open("r", encoding="utf8") as f:
-                existing = f.read()
-            if MARKER_IGNORE in existing:
-                msg.warn("Found ignore marker in existing file: skipping", output_file)
-                return
-            if MARKER_START in existing and MARKER_END in existing:
-                msg.info("Found existing file: only replacing auto-generated docs")
-                before = existing.split(MARKER_START)[0]
-                after = existing.split(MARKER_END)[1]
-                content = f"{before}{content}{after}"
-            else:
-                msg.warn("Replacing existing file")
-        with output_file.open("w", encoding="utf8") as f:
-            f.write(content)
-        msg.good("Saved project documentation", output_file)
@ -1,220 +1 @@
|
||||||
"""This module contains helpers and subcommands for integrating spaCy projects
|
from weasel.cli.dvc import *
|
||||||
with Data Version Control (DVC). https://dvc.org"""
|
|
||||||
import subprocess
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any, Dict, Iterable, List, Optional
|
|
||||||
|
|
||||||
from wasabi import msg
|
|
||||||
|
|
||||||
from ...util import (
|
|
||||||
SimpleFrozenList,
|
|
||||||
join_command,
|
|
||||||
run_command,
|
|
||||||
split_command,
|
|
||||||
working_dir,
|
|
||||||
)
|
|
||||||
from .._util import (
|
|
||||||
COMMAND,
|
|
||||||
NAME,
|
|
||||||
PROJECT_FILE,
|
|
||||||
Arg,
|
|
||||||
Opt,
|
|
||||||
get_hash,
|
|
||||||
load_project_config,
|
|
||||||
project_cli,
|
|
||||||
)
|
|
||||||
|
|
||||||
DVC_CONFIG = "dvc.yaml"
|
|
||||||
DVC_DIR = ".dvc"
|
|
||||||
UPDATE_COMMAND = "dvc"
|
|
||||||
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
|
|
||||||
# edited your {PROJECT_FILE}, you can regenerate this file by running:
|
|
||||||
# {COMMAND} project {UPDATE_COMMAND}"""
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command(UPDATE_COMMAND)
|
|
||||||
def project_update_dvc_cli(
|
|
||||||
# fmt: off
|
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
|
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
|
|
||||||
quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
|
|
||||||
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Auto-generate Data Version Control (DVC) config. A DVC
|
|
||||||
project can only define one pipeline, so you need to specify one workflow
|
|
||||||
defined in the project.yml. If no workflow is specified, the first defined
|
|
||||||
workflow is used. The DVC config will only be updated if the project.yml
|
|
||||||
changed.
|
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/cli#project-dvc
|
|
||||||
"""
|
|
||||||
project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
|
|
||||||
|
|
||||||
|
|
||||||
def project_update_dvc(
|
|
||||||
project_dir: Path,
|
|
||||||
workflow: Optional[str] = None,
|
|
||||||
*,
|
|
||||||
verbose: bool = False,
|
|
||||||
quiet: bool = False,
|
|
||||||
force: bool = False,
|
|
||||||
) -> None:
|
|
||||||
"""Update the auto-generated Data Version Control (DVC) config file. A DVC
|
|
||||||
project can only define one pipeline, so you need to specify one workflow
|
|
||||||
defined in the project.yml. Will only update the file if the checksum changed.
|
|
||||||
|
|
||||||
project_dir (Path): The project directory.
|
|
||||||
workflow (Optional[str]): Optional name of workflow defined in project.yml.
|
|
||||||
If not set, the first workflow will be used.
|
|
||||||
verbose (bool): Print more info.
|
|
||||||
quiet (bool): Print less info.
|
|
||||||
force (bool): Force update DVC config.
|
|
||||||
"""
|
|
||||||
config = load_project_config(project_dir)
|
|
||||||
updated = update_dvc_config(
|
|
||||||
project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
|
|
||||||
)
|
|
||||||
help_msg = "To execute the workflow with DVC, run: dvc repro"
|
|
||||||
if updated:
|
|
||||||
msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
|
|
||||||
else:
|
|
||||||
msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)
|
|
||||||
|
|
||||||
|
|
||||||
def update_dvc_config(
|
|
||||||
path: Path,
|
|
||||||
config: Dict[str, Any],
|
|
||||||
workflow: Optional[str] = None,
|
|
||||||
verbose: bool = False,
|
|
||||||
quiet: bool = False,
|
|
||||||
force: bool = False,
|
|
||||||
) -> bool:
|
|
||||||
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
|
|
||||||
project directory. The file is auto-generated based on the config. The
|
|
||||||
first line of the auto-generated file specifies the hash of the config
|
|
||||||
dict, so if any of the config values change, the DVC config is regenerated.
|
|
||||||
|
|
||||||
path (Path): The path to the project directory.
|
|
||||||
config (Dict[str, Any]): The loaded project.yml.
|
|
||||||
verbose (bool): Whether to print additional info (via DVC).
|
|
||||||
quiet (bool): Don't output anything (via DVC).
|
|
||||||
force (bool): Force update, even if hashes match.
|
|
||||||
RETURNS (bool): Whether the DVC config file was updated.
|
|
||||||
"""
|
|
||||||
ensure_dvc(path)
|
|
||||||
workflows = config.get("workflows", {})
|
|
||||||
workflow_names = list(workflows.keys())
|
|
||||||
check_workflows(workflow_names, workflow)
|
|
||||||
if not workflow:
|
|
||||||
workflow = workflow_names[0]
|
|
||||||
config_hash = get_hash(config)
|
|
||||||
path = path.resolve()
|
|
||||||
dvc_config_path = path / DVC_CONFIG
|
|
||||||
if dvc_config_path.exists():
|
|
||||||
# Check if the file was generated using the current config; if not, regenerate it
|
|
||||||
with dvc_config_path.open("r", encoding="utf8") as f:
|
|
||||||
ref_hash = f.readline().strip().replace("# ", "")
|
|
||||||
if ref_hash == config_hash and not force:
|
|
||||||
return False # Nothing has changed in project.yml, don't need to update
|
|
||||||
dvc_config_path.unlink()
|
|
||||||
dvc_commands = []
|
|
||||||
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
|
||||||
|
|
||||||
# some flags that apply to every command
|
|
||||||
flags = []
|
|
||||||
if verbose:
|
|
||||||
flags.append("--verbose")
|
|
||||||
if quiet:
|
|
||||||
flags.append("--quiet")
|
|
||||||
|
|
||||||
for name in workflows[workflow]:
|
|
||||||
command = config_commands[name]
|
|
||||||
deps = command.get("deps", [])
|
|
||||||
outputs = command.get("outputs", [])
|
|
||||||
outputs_no_cache = command.get("outputs_no_cache", [])
|
|
||||||
if not deps and not outputs and not outputs_no_cache:
|
|
||||||
continue
|
|
||||||
# Default to the working dir as the project path since dvc.yaml is auto-generated
|
|
||||||
# and we don't want arbitrary paths in there
|
|
||||||
project_cmd = ["python", "-m", NAME, "project", "run", name]
|
|
||||||
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
|
|
||||||
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
|
|
||||||
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
|
|
||||||
|
|
||||||
dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
|
|
||||||
if command.get("no_skip"):
|
|
||||||
dvc_cmd.append("--always-changed")
|
|
||||||
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
|
|
||||||
dvc_commands.append(join_command(full_cmd))
|
|
||||||
|
|
||||||
if not dvc_commands:
|
|
||||||
# If we don't check for this, then there will be an error when reading the
|
|
||||||
# config, since DVC wouldn't create it.
|
|
||||||
msg.fail(
|
|
||||||
"No usable commands for DVC found. This can happen if none of your "
|
|
||||||
"commands have dependencies or outputs.",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
with working_dir(path):
|
|
||||||
for c in dvc_commands:
|
|
||||||
dvc_command = "dvc " + c
|
|
||||||
run_command(dvc_command)
|
|
||||||
with dvc_config_path.open("r+", encoding="utf8") as f:
|
|
||||||
content = f.read()
|
|
||||||
f.seek(0, 0)
|
|
||||||
f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
|
|
||||||
return True
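For reference, a sketch of the command assembled above for a hypothetical workflow step named "train" with one dependency and one output (all names and paths are illustrative):
# dvc run -n train -w /path/to/project --no-exec \
#     -d corpus/train.spacy -o training/model-best \
#     python -m spacy project run train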
|
|
||||||
|
|
||||||
|
|
||||||
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
|
|
||||||
"""Validate workflows provided in project.yml and check that a given
|
|
||||||
workflow can be used to generate a DVC config.
|
|
||||||
|
|
||||||
workflows (List[str]): Names of the available workflows.
|
|
||||||
workflow (Optional[str]): The name of the workflow to convert.
|
|
||||||
"""
|
|
||||||
if not workflows:
|
|
||||||
msg.fail(
|
|
||||||
f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
|
|
||||||
f"define at least one list of commands.",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
if workflow is not None and workflow not in workflows:
|
|
||||||
msg.fail(
|
|
||||||
f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
|
|
||||||
f"Available workflows: {', '.join(workflows)}",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
if not workflow:
|
|
||||||
msg.warn(
|
|
||||||
f"No workflow specified for DVC pipeline. Using the first workflow "
|
|
||||||
f"defined in {PROJECT_FILE}: '{workflows[0]}'"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_dvc(project_dir: Path) -> None:
|
|
||||||
"""Ensure that the "dvc" command is available and that the current project
|
|
||||||
directory is an initialized DVC project.
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
|
|
||||||
except Exception:
|
|
||||||
msg.fail(
|
|
||||||
"To use spaCy projects with DVC (Data Version Control), DVC needs "
|
|
||||||
"to be installed and the 'dvc' command needs to be available",
|
|
||||||
"You can install the Python package from pip (pip install dvc) or "
|
|
||||||
"conda (conda install -c conda-forge dvc). For more details, see the "
|
|
||||||
"documentation: https://dvc.org/doc/install",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
if not (project_dir / ".dvc").exists():
|
|
||||||
msg.fail(
|
|
||||||
"Project not initialized as a DVC project",
|
|
||||||
"To initialize a DVC project, you can run 'dvc init' in the project "
|
|
||||||
"directory. For more details, see the documentation: "
|
|
||||||
"https://dvc.org/doc/command-reference/init",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
|
|
|
@ -1,67 +1 @@
|
||||||
from pathlib import Path
|
from weasel.cli.pull import *
|
||||||
|
|
||||||
from wasabi import msg
|
|
||||||
|
|
||||||
from .._util import Arg, load_project_config, logger, project_cli
|
|
||||||
from .remote_storage import RemoteStorage, get_command_hash
|
|
||||||
from .run import update_lockfile
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("pull")
|
|
||||||
def project_pull_cli(
|
|
||||||
# fmt: off
|
|
||||||
remote: str = Arg("default", help="Name or path of remote storage"),
|
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Retrieve available precomputed outputs from a remote storage.
|
|
||||||
You can alias remotes in your project.yml by mapping them to storage paths.
|
|
||||||
A storage can be anything that the smart-open library can upload to, e.g.
|
|
||||||
AWS, Google Cloud Storage, SSH, local directories etc.
|
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/cli#project-pull
|
|
||||||
"""
|
|
||||||
for url, output_path in project_pull(project_dir, remote):
|
|
||||||
if url is not None:
|
|
||||||
msg.good(f"Pulled {output_path} from {url}")
|
|
||||||
|
|
||||||
|
|
||||||
def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
|
|
||||||
# TODO: We don't have tests for this :(. It would take a bit of mockery to
|
|
||||||
# set up. I guess see if it breaks first?
|
|
||||||
config = load_project_config(project_dir)
|
|
||||||
if remote in config.get("remotes", {}):
|
|
||||||
remote = config["remotes"][remote]
|
|
||||||
storage = RemoteStorage(project_dir, remote)
|
|
||||||
commands = list(config.get("commands", []))
|
|
||||||
# We use a while loop here because we don't know how the commands
|
|
||||||
# will be ordered. A command might need dependencies from one that's later
|
|
||||||
# in the list.
|
|
||||||
while commands:
|
|
||||||
for i, cmd in enumerate(list(commands)):
|
|
||||||
logger.debug("CMD: %s.", cmd["name"])
|
|
||||||
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
|
||||||
if all(dep.exists() for dep in deps):
|
|
||||||
cmd_hash = get_command_hash("", "", deps, cmd["script"])
|
|
||||||
for output_path in cmd.get("outputs", []):
|
|
||||||
url = storage.pull(output_path, command_hash=cmd_hash)
|
|
||||||
logger.debug(
|
|
||||||
"URL: %s for %s with command hash %s",
|
|
||||||
url,
|
|
||||||
output_path,
|
|
||||||
cmd_hash,
|
|
||||||
)
|
|
||||||
yield url, output_path
|
|
||||||
|
|
||||||
out_locs = [project_dir / out for out in cmd.get("outputs", [])]
|
|
||||||
if all(loc.exists() for loc in out_locs):
|
|
||||||
update_lockfile(project_dir, cmd)
|
|
||||||
# We remove the command from the list here, and break, so that
|
|
||||||
# we iterate over the loop again.
|
|
||||||
commands.pop(i)
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
|
|
||||||
else:
|
|
||||||
# If we didn't break the for loop, break the while loop.
|
|
||||||
break
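To illustrate the ordering problem the while loop works around (command names and paths are made up):
# Given commands [evaluate, train] where "evaluate" depends on training/model-best
# and "train" produces it, the first pass skips "evaluate" (dependency missing),
# pulls "train"'s outputs, then the loop restarts so "evaluate" can be pulled too.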
|
|
||||||
|
|
|
@ -1,69 +1 @@
|
||||||
from pathlib import Path
|
from weasel.cli.push import *
|
||||||
|
|
||||||
from wasabi import msg
|
|
||||||
|
|
||||||
from .._util import Arg, load_project_config, logger, project_cli
|
|
||||||
from .remote_storage import RemoteStorage, get_command_hash, get_content_hash
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("push")
|
|
||||||
def project_push_cli(
|
|
||||||
# fmt: off
|
|
||||||
remote: str = Arg("default", help="Name or path of remote storage"),
|
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Persist outputs to a remote storage. You can alias remotes in your
|
|
||||||
project.yml by mapping them to storage paths. A storage can be anything that
|
|
||||||
the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
|
|
||||||
local directories etc.
|
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/cli#project-push
|
|
||||||
"""
|
|
||||||
for output_path, url in project_push(project_dir, remote):
|
|
||||||
if url is None:
|
|
||||||
msg.info(f"Skipping {output_path}")
|
|
||||||
else:
|
|
||||||
msg.good(f"Pushed {output_path} to {url}")
|
|
||||||
|
|
||||||
|
|
||||||
def project_push(project_dir: Path, remote: str):
|
|
||||||
"""Persist outputs to a remote storage. You can alias remotes in your project.yml
|
|
||||||
by mapping them to storage paths. A storage can be anything that the smart-open
|
|
||||||
library can upload to, e.g. AWS, Google Cloud Storage, SSH, local directories etc.
|
|
||||||
"""
|
|
||||||
config = load_project_config(project_dir)
|
|
||||||
if remote in config.get("remotes", {}):
|
|
||||||
remote = config["remotes"][remote]
|
|
||||||
storage = RemoteStorage(project_dir, remote)
|
|
||||||
for cmd in config.get("commands", []):
|
|
||||||
logger.debug("CMD: %s", cmd["name"])
|
|
||||||
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
|
||||||
if any(not dep.exists() for dep in deps):
|
|
||||||
logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
|
|
||||||
continue
|
|
||||||
cmd_hash = get_command_hash(
|
|
||||||
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
|
|
||||||
)
|
|
||||||
logger.debug("CMD_HASH: %s", cmd_hash)
|
|
||||||
for output_path in cmd.get("outputs", []):
|
|
||||||
output_loc = project_dir / output_path
|
|
||||||
if output_loc.exists() and _is_not_empty_dir(output_loc):
|
|
||||||
url = storage.push(
|
|
||||||
output_path,
|
|
||||||
command_hash=cmd_hash,
|
|
||||||
content_hash=get_content_hash(output_loc),
|
|
||||||
)
|
|
||||||
logger.debug(
|
|
||||||
"URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
|
|
||||||
)
|
|
||||||
yield output_path, url
|
|
||||||
|
|
||||||
|
|
||||||
def _is_not_empty_dir(loc: Path):
|
|
||||||
if not loc.is_dir():
|
|
||||||
return True
|
|
||||||
elif any(_is_not_empty_dir(child) for child in loc.iterdir()):
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
|
|
|
@ -1,212 +1 @@
|
||||||
import hashlib
|
from weasel.cli.remote_storage import *
|
||||||
import os
|
|
||||||
import site
|
|
||||||
import tarfile
|
|
||||||
import urllib.parse
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import TYPE_CHECKING, Dict, List, Optional
|
|
||||||
|
|
||||||
from wasabi import msg
|
|
||||||
|
|
||||||
from ... import about
|
|
||||||
from ...errors import Errors
|
|
||||||
from ...git_info import GIT_VERSION
|
|
||||||
from ...util import ENV_VARS, check_bool_env_var, get_minor_version
|
|
||||||
from .._util import (
|
|
||||||
download_file,
|
|
||||||
ensure_pathy,
|
|
||||||
get_checksum,
|
|
||||||
get_hash,
|
|
||||||
make_tempdir,
|
|
||||||
upload_file,
|
|
||||||
)
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from pathy import FluidPath # noqa: F401
|
|
||||||
|
|
||||||
|
|
||||||
class RemoteStorage:
|
|
||||||
"""Push and pull outputs to and from a remote file storage.
|
|
||||||
|
|
||||||
Remotes can be anything that `smart-open` can support: AWS, GCS, file system,
|
|
||||||
ssh, etc.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, project_root: Path, url: str, *, compression="gz"):
|
|
||||||
self.root = project_root
|
|
||||||
self.url = ensure_pathy(url)
|
|
||||||
self.compression = compression
|
|
||||||
|
|
||||||
def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
|
|
||||||
"""Compress a file or directory within a project and upload it to a remote
|
|
||||||
storage. If an object exists at the full URL, nothing is done.
|
|
||||||
|
|
||||||
Within the remote storage, files are addressed by their project path
|
|
||||||
(url encoded) and two user-supplied hashes, representing their creation
|
|
||||||
context and their file contents. If the URL already exists, the data is
|
|
||||||
not uploaded. Paths are archived and compressed prior to upload.
|
|
||||||
"""
|
|
||||||
loc = self.root / path
|
|
||||||
if not loc.exists():
|
|
||||||
raise IOError(f"Cannot push {loc}: does not exist.")
|
|
||||||
url = self.make_url(path, command_hash, content_hash)
|
|
||||||
if url.exists():
|
|
||||||
return url
|
|
||||||
tmp: Path
|
|
||||||
with make_tempdir() as tmp:
|
|
||||||
tar_loc = tmp / self.encode_name(str(path))
|
|
||||||
mode_string = f"w:{self.compression}" if self.compression else "w"
|
|
||||||
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
|
|
||||||
tar_file.add(str(loc), arcname=str(path))
|
|
||||||
upload_file(tar_loc, url)
|
|
||||||
return url
|
|
||||||
|
|
||||||
def pull(
|
|
||||||
self,
|
|
||||||
path: Path,
|
|
||||||
*,
|
|
||||||
command_hash: Optional[str] = None,
|
|
||||||
content_hash: Optional[str] = None,
|
|
||||||
) -> Optional["FluidPath"]:
|
|
||||||
"""Retrieve a file from the remote cache. If the file already exists,
|
|
||||||
nothing is done.
|
|
||||||
|
|
||||||
If the command_hash and/or content_hash are specified, only matching
|
|
||||||
results are returned. If no results are available, an error is raised.
|
|
||||||
"""
|
|
||||||
dest = self.root / path
|
|
||||||
if dest.exists():
|
|
||||||
return None
|
|
||||||
url = self.find(path, command_hash=command_hash, content_hash=content_hash)
|
|
||||||
if url is None:
|
|
||||||
return url
|
|
||||||
else:
|
|
||||||
# Make sure the destination exists
|
|
||||||
if not dest.parent.exists():
|
|
||||||
dest.parent.mkdir(parents=True)
|
|
||||||
tmp: Path
|
|
||||||
with make_tempdir() as tmp:
|
|
||||||
tar_loc = tmp / url.parts[-1]
|
|
||||||
download_file(url, tar_loc)
|
|
||||||
mode_string = f"r:{self.compression}" if self.compression else "r"
|
|
||||||
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
|
|
||||||
# This requires that the path is added correctly, relative
|
|
||||||
# to root. This is how we set things up in push()
|
|
||||||
|
|
||||||
# Disallow paths outside the current directory for the tar
|
|
||||||
# file (CVE-2007-4559, directory traversal vulnerability)
|
|
||||||
def is_within_directory(directory, target):
|
|
||||||
abs_directory = os.path.abspath(directory)
|
|
||||||
abs_target = os.path.abspath(target)
|
|
||||||
prefix = os.path.commonprefix([abs_directory, abs_target])
|
|
||||||
return prefix == abs_directory
|
|
||||||
|
|
||||||
def safe_extract(tar, path):
|
|
||||||
for member in tar.getmembers():
|
|
||||||
member_path = os.path.join(path, member.name)
|
|
||||||
if not is_within_directory(path, member_path):
|
|
||||||
raise ValueError(Errors.E852)
|
|
||||||
tar.extractall(path)
|
|
||||||
|
|
||||||
safe_extract(tar_file, self.root)
|
|
||||||
return url
|
|
||||||
|
|
||||||
def find(
|
|
||||||
self,
|
|
||||||
path: Path,
|
|
||||||
*,
|
|
||||||
command_hash: Optional[str] = None,
|
|
||||||
content_hash: Optional[str] = None,
|
|
||||||
) -> Optional["FluidPath"]:
|
|
||||||
"""Find the best matching version of a file within the storage,
|
|
||||||
or `None` if no match can be found. If both the creation and content hash
|
|
||||||
are specified, only exact matches will be returned. Otherwise, the most
|
|
||||||
recent matching file is preferred.
|
|
||||||
"""
|
|
||||||
name = self.encode_name(str(path))
|
|
||||||
urls = []
|
|
||||||
if command_hash is not None and content_hash is not None:
|
|
||||||
url = self.url / name / command_hash / content_hash
|
|
||||||
urls = [url] if url.exists() else []
|
|
||||||
elif command_hash is not None:
|
|
||||||
if (self.url / name / command_hash).exists():
|
|
||||||
urls = list((self.url / name / command_hash).iterdir())
|
|
||||||
else:
|
|
||||||
if (self.url / name).exists():
|
|
||||||
for sub_dir in (self.url / name).iterdir():
|
|
||||||
urls.extend(sub_dir.iterdir())
|
|
||||||
if content_hash is not None:
|
|
||||||
urls = [url for url in urls if url.parts[-1] == content_hash]
|
|
||||||
if len(urls) >= 2:
|
|
||||||
try:
|
|
||||||
urls.sort(key=lambda x: x.stat().last_modified) # type: ignore
|
|
||||||
except Exception:
|
|
||||||
msg.warn(
|
|
||||||
"Unable to sort remote files by last modified. The file(s) "
|
|
||||||
"pulled from the cache may not be the most recent."
|
|
||||||
)
|
|
||||||
return urls[-1] if urls else None
|
|
||||||
|
|
||||||
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
|
|
||||||
"""Construct a URL from a subpath, a creation hash and a content hash."""
|
|
||||||
return self.url / self.encode_name(str(path)) / command_hash / content_hash
|
|
||||||
|
|
||||||
def encode_name(self, name: str) -> str:
|
|
||||||
"""Encode a subpath into a URL-safe name."""
|
|
||||||
return urllib.parse.quote_plus(name)
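A short sketch of the resulting storage layout (the remote URL, subpath and hashes below are placeholders):
# For RemoteStorage(project_dir, "s3://my-bucket/spacy-cache"), a subpath
# "training/model-best", command hash "abc123" and content hash "def456",
# make_url() yields:
#
#   s3://my-bucket/spacy-cache/training%2Fmodel-best/abc123/def456
#
# i.e. <remote>/<quote_plus(subpath)>/<command_hash>/<content_hash>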
|
|
||||||
|
|
||||||
|
|
||||||
def get_content_hash(loc: Path) -> str:
|
|
||||||
return get_checksum(loc)
|
|
||||||
|
|
||||||
|
|
||||||
def get_command_hash(
|
|
||||||
site_hash: str, env_hash: str, deps: List[Path], cmd: List[str]
|
|
||||||
) -> str:
|
|
||||||
"""Create a hash representing the execution of a command. This includes the
|
|
||||||
currently installed packages, whatever environment variables have been marked
|
|
||||||
as relevant, and the command.
|
|
||||||
"""
|
|
||||||
if check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION):
|
|
||||||
spacy_v = GIT_VERSION
|
|
||||||
else:
|
|
||||||
spacy_v = str(get_minor_version(about.__version__) or "")
|
|
||||||
dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
|
|
||||||
hashes = [spacy_v, site_hash, env_hash] + dep_checksums
|
|
||||||
hashes.extend(cmd)
|
|
||||||
creation_bytes = "".join(hashes).encode("utf8")
|
|
||||||
return hashlib.md5(creation_bytes).hexdigest()
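A minimal usage sketch (the script string is hypothetical; real callers also pass the command's dependency paths, which must exist so they can be checksummed):
# Hash the execution context of a command, with no site/env hashes and no deps,
# matching how project_pull()/project_push() call this helper.
example_cmd_hash = get_command_hash("", "", [], ["python scripts/train.py"])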
|
|
||||||
|
|
||||||
|
|
||||||
def get_site_hash():
|
|
||||||
"""Hash the current Python environment's site-packages contents, including
|
|
||||||
the name and version of the libraries. The list we're hashing is what
|
|
||||||
`pip freeze` would output.
|
|
||||||
"""
|
|
||||||
site_dirs = site.getsitepackages()
|
|
||||||
if site.ENABLE_USER_SITE:
|
|
||||||
site_dirs.append(site.getusersitepackages())  # getusersitepackages() returns a single path string
|
|
||||||
packages = set()
|
|
||||||
for site_dir in site_dirs:
|
|
||||||
site_dir = Path(site_dir)
|
|
||||||
for subpath in site_dir.iterdir():
|
|
||||||
if subpath.parts[-1].endswith("dist-info"):
|
|
||||||
packages.add(subpath.parts[-1].replace(".dist-info", ""))
|
|
||||||
package_bytes = "".join(sorted(packages)).encode("utf8")
|
|
||||||
return hashlib.md5(package_bytes).hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
def get_env_hash(env: Dict[str, str]) -> str:
|
|
||||||
"""Construct a hash of the environment variables that will be passed into
|
|
||||||
the commands.
|
|
||||||
|
|
||||||
Values in the env dict may be references to the current os.environ, using
|
|
||||||
the syntax $ENV_VAR to mean os.environ[ENV_VAR]
|
|
||||||
"""
|
|
||||||
env_vars = {}
|
|
||||||
for key, value in env.items():
|
|
||||||
if value.startswith("$"):
|
|
||||||
env_vars[key] = os.environ.get(value[1:], "")
|
|
||||||
else:
|
|
||||||
env_vars[key] = value
|
|
||||||
return get_hash(env_vars)
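For example (a hypothetical env mapping):
# "$GPU_ID" is resolved from os.environ (falling back to ""), while plain
# values such as "1" are hashed as given.
example_env_hash = get_env_hash({"GPU_ID": "$GPU_ID", "OMP_NUM_THREADS": "1"})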
|
|
||||||
|
|
|
@ -1,379 +1 @@
|
||||||
import os.path
|
from weasel.cli.run import *
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
|
|
||||||
|
|
||||||
import srsly
|
|
||||||
import typer
|
|
||||||
from wasabi import msg
|
|
||||||
from wasabi.util import locale_escape
|
|
||||||
|
|
||||||
from ... import about
|
|
||||||
from ...git_info import GIT_VERSION
|
|
||||||
from ...util import (
|
|
||||||
ENV_VARS,
|
|
||||||
SimpleFrozenDict,
|
|
||||||
SimpleFrozenList,
|
|
||||||
check_bool_env_var,
|
|
||||||
is_cwd,
|
|
||||||
is_minor_version_match,
|
|
||||||
join_command,
|
|
||||||
run_command,
|
|
||||||
split_command,
|
|
||||||
working_dir,
|
|
||||||
)
|
|
||||||
from .._util import (
|
|
||||||
COMMAND,
|
|
||||||
PROJECT_FILE,
|
|
||||||
PROJECT_LOCK,
|
|
||||||
Arg,
|
|
||||||
Opt,
|
|
||||||
get_checksum,
|
|
||||||
get_hash,
|
|
||||||
load_project_config,
|
|
||||||
parse_config_overrides,
|
|
||||||
project_cli,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command(
|
|
||||||
"run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
|
|
||||||
)
|
|
||||||
def project_run_cli(
|
|
||||||
# fmt: off
|
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
|
||||||
subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
|
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
|
||||||
force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
|
|
||||||
dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
|
|
||||||
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
|
|
||||||
# fmt: on
|
|
||||||
):
|
|
||||||
"""Run a named command or workflow defined in the project.yml. If a workflow
|
|
||||||
name is specified, all commands in the workflow are run, in order. If
|
|
||||||
commands define dependencies and/or outputs, they will only be re-run if
|
|
||||||
state has changed.
|
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/cli#project-run
|
|
||||||
"""
|
|
||||||
if show_help or not subcommand:
|
|
||||||
print_run_help(project_dir, subcommand)
|
|
||||||
else:
|
|
||||||
overrides = parse_config_overrides(ctx.args)
|
|
||||||
project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry)
|
|
||||||
|
|
||||||
|
|
||||||
def project_run(
|
|
||||||
project_dir: Path,
|
|
||||||
subcommand: str,
|
|
||||||
*,
|
|
||||||
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
|
||||||
force: bool = False,
|
|
||||||
dry: bool = False,
|
|
||||||
capture: bool = False,
|
|
||||||
skip_requirements_check: bool = False,
|
|
||||||
) -> None:
|
|
||||||
"""Run a named script defined in the project.yml. If the script is part
|
|
||||||
of the default pipeline (defined in the "run" section), DVC is used to
|
|
||||||
execute the command, so it can determine whether to rerun it. It then
|
|
||||||
calls into "exec" to execute it.
|
|
||||||
|
|
||||||
project_dir (Path): Path to project directory.
|
|
||||||
subcommand (str): Name of command to run.
|
|
||||||
overrides (Dict[str, Any]): Optional config overrides.
|
|
||||||
force (bool): Force re-running, even if nothing changed.
|
|
||||||
dry (bool): Perform a dry run and don't execute commands.
|
|
||||||
capture (bool): Whether to capture the output and errors of individual commands.
|
|
||||||
If False, the stdout and stderr will not be redirected, and if there's an error,
|
|
||||||
sys.exit will be called with the return code. You should use capture=False
|
|
||||||
when you want to turn over execution to the command, and capture=True
|
|
||||||
when you want to run the command more like a function.
|
|
||||||
skip_requirements_check (bool): Whether to skip the requirements check.
|
|
||||||
"""
|
|
||||||
config = load_project_config(project_dir, overrides=overrides)
|
|
||||||
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
|
||||||
workflows = config.get("workflows", {})
|
|
||||||
validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
|
|
||||||
|
|
||||||
req_path = project_dir / "requirements.txt"
|
|
||||||
if not skip_requirements_check:
|
|
||||||
if config.get("check_requirements", True) and os.path.exists(req_path):
|
|
||||||
with req_path.open() as requirements_file:
|
|
||||||
_check_requirements([req.strip() for req in requirements_file])
|
|
||||||
|
|
||||||
if subcommand in workflows:
|
|
||||||
msg.info(f"Running workflow '{subcommand}'")
|
|
||||||
for cmd in workflows[subcommand]:
|
|
||||||
project_run(
|
|
||||||
project_dir,
|
|
||||||
cmd,
|
|
||||||
overrides=overrides,
|
|
||||||
force=force,
|
|
||||||
dry=dry,
|
|
||||||
capture=capture,
|
|
||||||
skip_requirements_check=True,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
cmd = commands[subcommand]
|
|
||||||
for dep in cmd.get("deps", []):
|
|
||||||
if not (project_dir / dep).exists():
|
|
||||||
err = f"Missing dependency specified by command '{subcommand}': {dep}"
|
|
||||||
err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
|
|
||||||
err_exits = 1 if not dry else None
|
|
||||||
msg.fail(err, err_help, exits=err_exits)
|
|
||||||
check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
|
|
||||||
with working_dir(project_dir) as current_dir:
|
|
||||||
msg.divider(subcommand)
|
|
||||||
rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
|
|
||||||
if not rerun and not force:
|
|
||||||
msg.info(f"Skipping '{cmd['name']}': nothing changed")
|
|
||||||
else:
|
|
||||||
run_commands(cmd["script"], dry=dry, capture=capture)
|
|
||||||
if not dry:
|
|
||||||
update_lockfile(current_dir, cmd)
|
|
||||||
|
|
||||||
|
|
||||||
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
|
|
||||||
"""Simulate a CLI help prompt using the info available in the project.yml.
|
|
||||||
|
|
||||||
project_dir (Path): The project directory.
|
|
||||||
subcommand (Optional[str]): The subcommand or None. If a subcommand is
|
|
||||||
provided, the subcommand help is shown. Otherwise, the top-level help
|
|
||||||
and a list of available commands is printed.
|
|
||||||
"""
|
|
||||||
config = load_project_config(project_dir)
|
|
||||||
config_commands = config.get("commands", [])
|
|
||||||
commands = {cmd["name"]: cmd for cmd in config_commands}
|
|
||||||
workflows = config.get("workflows", {})
|
|
||||||
project_loc = "" if is_cwd(project_dir) else project_dir
|
|
||||||
if subcommand:
|
|
||||||
validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
|
|
||||||
print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
|
|
||||||
if subcommand in commands:
|
|
||||||
help_text = commands[subcommand].get("help")
|
|
||||||
if help_text:
|
|
||||||
print(f"\n{help_text}\n")
|
|
||||||
elif subcommand in workflows:
|
|
||||||
steps = workflows[subcommand]
|
|
||||||
print(f"\nWorkflow consisting of {len(steps)} commands:")
|
|
||||||
steps_data = [
|
|
||||||
(f"{i + 1}. {step}", commands[step].get("help", ""))
|
|
||||||
for i, step in enumerate(steps)
|
|
||||||
]
|
|
||||||
msg.table(steps_data)
|
|
||||||
help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help"
|
|
||||||
print(f"For command details, run: {help_cmd}")
|
|
||||||
else:
|
|
||||||
print("")
|
|
||||||
title = config.get("title")
|
|
||||||
if title:
|
|
||||||
print(f"{locale_escape(title)}\n")
|
|
||||||
if config_commands:
|
|
||||||
print(f"Available commands in {PROJECT_FILE}")
|
|
||||||
print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
|
|
||||||
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
|
|
||||||
if workflows:
|
|
||||||
print(f"Available workflows in {PROJECT_FILE}")
|
|
||||||
print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
|
|
||||||
msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()])
|
|
||||||
|
|
||||||
|
|
||||||
def run_commands(
|
|
||||||
commands: Iterable[str] = SimpleFrozenList(),
|
|
||||||
silent: bool = False,
|
|
||||||
dry: bool = False,
|
|
||||||
capture: bool = False,
|
|
||||||
) -> None:
|
|
||||||
"""Run a sequence of commands in a subprocess, in order.
|
|
||||||
|
|
||||||
commands (List[str]): The string commands.
|
|
||||||
silent (bool): Don't print the commands.
|
|
||||||
dry (bool): Perform a dry run and don't execute anything.
|
|
||||||
capture (bool): Whether to capture the output and errors of individual commands.
|
|
||||||
If False, the stdout and stderr will not be redirected, and if there's an error,
|
|
||||||
sys.exit will be called with the return code. You should use capture=False
|
|
||||||
when you want to turn over execution to the command, and capture=True
|
|
||||||
when you want to run the command more like a function.
|
|
||||||
"""
|
|
||||||
for c in commands:
|
|
||||||
command = split_command(c)
|
|
||||||
# Not sure if this is needed or a good idea. Motivation: users may often
|
|
||||||
# use commands in their config that reference "python" and we want to
|
|
||||||
# make sure that it's always executing the same Python that spaCy is
|
|
||||||
# executed with and the pip in the same env, not some other Python/pip.
|
|
||||||
# Also ensures cross-compatibility if user 1 writes "python3" (because
|
|
||||||
# that's how it's set up on their system), and user 2 without the
|
|
||||||
# shortcut tries to re-run the command.
|
|
||||||
if len(command) and command[0] in ("python", "python3"):
|
|
||||||
command[0] = sys.executable
|
|
||||||
elif len(command) and command[0] in ("pip", "pip3"):
|
|
||||||
command = [sys.executable, "-m", "pip", *command[1:]]
|
|
||||||
if not silent:
|
|
||||||
print(f"Running command: {join_command(command)}")
|
|
||||||
if not dry:
|
|
||||||
run_command(command, capture=capture)
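A minimal usage sketch (the commands are illustrative):
# Dry run: prints the normalized commands without executing them. "python" and
# "pip" are rewritten to use sys.executable, as described in the comment above.
run_commands(["python -m spacy info", "pip list"], dry=True)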
|
|
||||||
|
|
||||||
|
|
||||||
def validate_subcommand(
|
|
||||||
commands: Sequence[str], workflows: Sequence[str], subcommand: str
|
|
||||||
) -> None:
|
|
||||||
"""Check that a subcommand is valid and defined. Raises an error otherwise.
|
|
||||||
|
|
||||||
commands (Sequence[str]): The available commands.
|
|
||||||
subcommand (str): The subcommand.
|
|
||||||
"""
|
|
||||||
if not commands and not workflows:
|
|
||||||
msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
|
|
||||||
if subcommand not in commands and subcommand not in workflows:
|
|
||||||
help_msg = []
|
|
||||||
if subcommand in ["assets", "asset"]:
|
|
||||||
help_msg.append("Did you mean to run: python -m spacy project assets?")
|
|
||||||
if commands:
|
|
||||||
help_msg.append(f"Available commands: {', '.join(commands)}")
|
|
||||||
if workflows:
|
|
||||||
help_msg.append(f"Available workflows: {', '.join(workflows)}")
|
|
||||||
msg.fail(
|
|
||||||
f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
|
|
||||||
". ".join(help_msg),
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def check_rerun(
|
|
||||||
project_dir: Path,
|
|
||||||
command: Dict[str, Any],
|
|
||||||
*,
|
|
||||||
check_spacy_version: bool = True,
|
|
||||||
check_spacy_commit: bool = False,
|
|
||||||
) -> bool:
|
|
||||||
"""Check if a command should be rerun because its settings or inputs/outputs
|
|
||||||
changed.
|
|
||||||
|
|
||||||
project_dir (Path): The current project directory.
|
|
||||||
command (Dict[str, Any]): The command, as defined in the project.yml.
|
|
||||||
check_spacy_version (bool): Whether to check the spaCy minor version against the lockfile.
check_spacy_commit (bool): Whether to check the spaCy commit hash against the lockfile.
|
|
||||||
RETURNS (bool): Whether to re-run the command.
|
|
||||||
"""
|
|
||||||
# Always rerun if no-skip is set
|
|
||||||
if command.get("no_skip", False):
|
|
||||||
return True
|
|
||||||
lock_path = project_dir / PROJECT_LOCK
|
|
||||||
if not lock_path.exists(): # We don't have a lockfile, run command
|
|
||||||
return True
|
|
||||||
data = srsly.read_yaml(lock_path)
|
|
||||||
if command["name"] not in data: # We don't have info about this command
|
|
||||||
return True
|
|
||||||
entry = data[command["name"]]
|
|
||||||
# Always run commands with no outputs (otherwise they'd always be skipped)
|
|
||||||
if not entry.get("outs", []):
|
|
||||||
return True
|
|
||||||
# Always rerun if spaCy version or commit hash changed
|
|
||||||
spacy_v = entry.get("spacy_version")
|
|
||||||
commit = entry.get("spacy_git_version")
|
|
||||||
if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__):
|
|
||||||
info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
|
|
||||||
msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}")
|
|
||||||
return True
|
|
||||||
if check_spacy_commit and commit != GIT_VERSION:
|
|
||||||
info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
|
|
||||||
msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
|
|
||||||
return True
|
|
||||||
# If the entry in the lockfile matches the lockfile entry that would be
|
|
||||||
# generated from the current command, we don't rerun because it means that
|
|
||||||
# all inputs/outputs, hashes and scripts are the same and nothing changed
|
|
||||||
lock_entry = get_lock_entry(project_dir, command)
|
|
||||||
exclude = ["spacy_version", "spacy_git_version"]
|
|
||||||
return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude)
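To make the comparison concrete (a hypothetical scenario):
# A command whose lockfile entry has the same script, deps and outputs with
# identical checksums hashes to the same value and is skipped; editing the
# script in project.yml or touching a dependency file changes get_lock_entry(),
# so the hashes differ and the command is re-run.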
|
|
||||||
|
|
||||||
|
|
||||||
def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
|
|
||||||
"""Update the lockfile after running a command. Will create a lockfile if
|
|
||||||
it doesn't yet exist and will add an entry for the current command, its
|
|
||||||
script and dependencies/outputs.
|
|
||||||
|
|
||||||
project_dir (Path): The current project directory.
|
|
||||||
command (Dict[str, Any]): The command, as defined in the project.yml.
|
|
||||||
"""
|
|
||||||
lock_path = project_dir / PROJECT_LOCK
|
|
||||||
if not lock_path.exists():
|
|
||||||
srsly.write_yaml(lock_path, {})
|
|
||||||
data = {}
|
|
||||||
else:
|
|
||||||
data = srsly.read_yaml(lock_path)
|
|
||||||
data[command["name"]] = get_lock_entry(project_dir, command)
|
|
||||||
srsly.write_yaml(lock_path, data)
|
|
||||||
|
|
||||||
|
|
||||||
def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]:
|
|
||||||
"""Get a lockfile entry for a given command. An entry includes the command,
|
|
||||||
the script (command steps) and a list of dependencies and outputs with
|
|
||||||
their paths and file hashes, if available. The format is based on the
|
|
||||||
dvc.lock files, to keep things consistent.
|
|
||||||
|
|
||||||
project_dir (Path): The current project directory.
|
|
||||||
command (Dict[str, Any]): The command, as defined in the project.yml.
|
|
||||||
RETURNS (Dict[str, Any]): The lockfile entry.
|
|
||||||
"""
|
|
||||||
deps = get_fileinfo(project_dir, command.get("deps", []))
|
|
||||||
outs = get_fileinfo(project_dir, command.get("outputs", []))
|
|
||||||
outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []))
|
|
||||||
return {
|
|
||||||
"cmd": f"{COMMAND} run {command['name']}",
|
|
||||||
"script": command["script"],
|
|
||||||
"deps": deps,
|
|
||||||
"outs": [*outs, *outs_nc],
|
|
||||||
"spacy_version": about.__version__,
|
|
||||||
"spacy_git_version": GIT_VERSION,
|
|
||||||
}
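The resulting entry, as serialized to the YAML lockfile, looks roughly like this (all values are hypothetical):
# train:
#   cmd: python -m spacy run train
#   script:
#     - python scripts/train.py
#   deps:
#     - path: corpus/train.spacy
#       md5: 1a2b3c...
#   outs:
#     - path: training/model-best
#       md5: 4d5e6f...
#   spacy_version: 3.7.0
#   spacy_git_version: abc1234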
|
|
||||||
|
|
||||||
|
|
||||||
def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]:
|
|
||||||
"""Generate the file information for a list of paths (dependencies, outputs).
|
|
||||||
Includes the file path and the file's checksum.
|
|
||||||
|
|
||||||
project_dir (Path): The current project directory.
|
|
||||||
paths (List[str]): The file paths.
|
|
||||||
RETURNS (List[Dict[str, Optional[str]]]): The path and checksum (or None if the file is missing) for each path.
|
|
||||||
"""
|
|
||||||
data = []
|
|
||||||
for path in paths:
|
|
||||||
file_path = project_dir / path
|
|
||||||
md5 = get_checksum(file_path) if file_path.exists() else None
|
|
||||||
data.append({"path": path, "md5": md5})
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
|
|
||||||
"""Checks whether requirements are installed and free of version conflicts.
|
|
||||||
requirements (List[str]): List of requirements.
|
|
||||||
RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
|
|
||||||
exist.
|
|
||||||
"""
|
|
||||||
import pkg_resources
|
|
||||||
|
|
||||||
failed_pkgs_msgs: List[str] = []
|
|
||||||
conflicting_pkgs_msgs: List[str] = []
|
|
||||||
|
|
||||||
for req in requirements:
|
|
||||||
try:
|
|
||||||
pkg_resources.require(req)
|
|
||||||
except pkg_resources.DistributionNotFound as dnf:
|
|
||||||
failed_pkgs_msgs.append(dnf.report())
|
|
||||||
except pkg_resources.VersionConflict as vc:
|
|
||||||
conflicting_pkgs_msgs.append(vc.report())
|
|
||||||
except Exception:
|
|
||||||
msg.warn(
|
|
||||||
f"Unable to check requirement: {req} "
|
|
||||||
"Checks are currently limited to requirement specifiers "
|
|
||||||
"(PEP 508)"
|
|
||||||
)
|
|
||||||
|
|
||||||
if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs):
|
|
||||||
msg.warn(
|
|
||||||
title="Missing requirements or requirement conflicts detected. Make sure your Python environment is set up "
|
|
||||||
"correctly and you installed all requirements specified in your project's requirements.txt: "
|
|
||||||
)
|
|
||||||
for pgk_msg in failed_pkgs_msgs + conflicting_pkgs_msgs:
|
|
||||||
msg.text(pgk_msg)
|
|
||||||
|
|
||||||
return len(failed_pkgs_msgs) > 0, len(conflicting_pkgs_msgs) > 0
|
|
||||||
|
|
|
@ -271,8 +271,9 @@ grad_factor = 1.0
|
||||||
@layers = "reduce_mean.v1"
|
@layers = "reduce_mean.v1"
|
||||||
|
|
||||||
[components.textcat.model.linear_model]
|
[components.textcat.model.linear_model]
|
||||||
@architectures = "spacy.TextCatBOW.v2"
|
@architectures = "spacy.TextCatBOW.v3"
|
||||||
exclusive_classes = true
|
exclusive_classes = true
|
||||||
|
length = 262144
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
|
|
||||||
|
@ -308,8 +309,9 @@ grad_factor = 1.0
|
||||||
@layers = "reduce_mean.v1"
|
@layers = "reduce_mean.v1"
|
||||||
|
|
||||||
[components.textcat_multilabel.model.linear_model]
|
[components.textcat_multilabel.model.linear_model]
|
||||||
@architectures = "spacy.TextCatBOW.v2"
|
@architectures = "spacy.TextCatBOW.v3"
|
||||||
exclusive_classes = false
|
exclusive_classes = false
|
||||||
|
length = 262144
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
|
|
||||||
|
@ -542,14 +544,15 @@ nO = null
|
||||||
width = ${components.tok2vec.model.encode.width}
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
|
||||||
[components.textcat.model.linear_model]
|
[components.textcat.model.linear_model]
|
||||||
@architectures = "spacy.TextCatBOW.v2"
|
@architectures = "spacy.TextCatBOW.v3"
|
||||||
exclusive_classes = true
|
exclusive_classes = true
|
||||||
|
length = 262144
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
|
|
||||||
{% else -%}
|
{% else -%}
|
||||||
[components.textcat.model]
|
[components.textcat.model]
|
||||||
@architectures = "spacy.TextCatBOW.v2"
|
@architectures = "spacy.TextCatBOW.v3"
|
||||||
exclusive_classes = true
|
exclusive_classes = true
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
|
@ -570,15 +573,17 @@ nO = null
|
||||||
width = ${components.tok2vec.model.encode.width}
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
|
||||||
[components.textcat_multilabel.model.linear_model]
|
[components.textcat_multilabel.model.linear_model]
|
||||||
@architectures = "spacy.TextCatBOW.v2"
|
@architectures = "spacy.TextCatBOW.v3"
|
||||||
exclusive_classes = false
|
exclusive_classes = false
|
||||||
|
length = 262144
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
|
|
||||||
{% else -%}
|
{% else -%}
|
||||||
[components.textcat_multilabel.model]
|
[components.textcat_multilabel.model]
|
||||||
@architectures = "spacy.TextCatBOW.v2"
|
@architectures = "spacy.TextCatBOW.v3"
|
||||||
exclusive_classes = false
|
exclusive_classes = false
|
||||||
|
length = 262144
|
||||||
ngram_size = 1
|
ngram_size = 1
|
||||||
no_output_layer = false
|
no_output_layer = false
|
||||||
{%- endif %}
|
{%- endif %}
|
||||||
|
|
|
@ -26,6 +26,9 @@ batch_size = 1000
|
||||||
[nlp.tokenizer]
|
[nlp.tokenizer]
|
||||||
@tokenizers = "spacy.Tokenizer.v1"
|
@tokenizers = "spacy.Tokenizer.v1"
|
||||||
|
|
||||||
|
[nlp.vectors]
|
||||||
|
@vectors = "spacy.Vectors.v1"
|
||||||
|
|
||||||
# The pipeline components and their models
|
# The pipeline components and their models
|
||||||
[components]
|
[components]
|
||||||
|
|
||||||
|
|
|
@ -142,7 +142,25 @@ class SpanRenderer:
|
||||||
spans (list): Individual entity spans and their start, end, label, kb_id and kb_url.
|
spans (list): Individual entity spans and their start, end, label, kb_id and kb_url.
|
||||||
title (str / None): Document title set in Doc.user_data['title'].
|
title (str / None): Document title set in Doc.user_data['title'].
|
||||||
"""
|
"""
|
||||||
per_token_info = []
|
per_token_info = self._assemble_per_token_info(tokens, spans)
|
||||||
|
markup = self._render_markup(per_token_info)
|
||||||
|
markup = TPL_SPANS.format(content=markup, dir=self.direction)
|
||||||
|
if title:
|
||||||
|
markup = TPL_TITLE.format(title=title) + markup
|
||||||
|
return markup
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _assemble_per_token_info(
|
||||||
|
tokens: List[str], spans: List[Dict[str, Any]]
|
||||||
|
) -> List[Dict[str, List[Dict[str, Any]]]]:
|
||||||
|
"""Assembles token info used to generate markup in render_spans().
|
||||||
|
tokens (List[str]): Tokens in text.
|
||||||
|
spans (List[Dict[str, Any]]): Spans in text.
|
||||||
|
RETURNS (List[Dict[str, List[Dict[str, Any]]]]): Per-token info needed to render HTML markup for the given tokens
|
||||||
|
and spans.
|
||||||
|
"""
|
||||||
|
per_token_info: List[Dict[str, List[Dict[str, Any]]]] = []
|
||||||
|
|
||||||
# we must sort so that we can correctly describe when spans need to "stack"
|
# we must sort so that we can correctly describe when spans need to "stack"
|
||||||
# which is determined by their start token, then span length (longer spans on top),
|
# which is determined by their start token, then span length (longer spans on top),
|
||||||
# then break any remaining ties with the span label
|
# then break any remaining ties with the span label
|
||||||
|
@ -154,21 +172,22 @@ class SpanRenderer:
|
||||||
s["label"],
|
s["label"],
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
for s in spans:
|
for s in spans:
|
||||||
# this is the vertical 'slot' that the span will be rendered in
|
# this is the vertical 'slot' that the span will be rendered in
|
||||||
# vertical_position = span_label_offset + (offset_step * (slot - 1))
|
# vertical_position = span_label_offset + (offset_step * (slot - 1))
|
||||||
s["render_slot"] = 0
|
s["render_slot"] = 0
|
||||||
|
|
||||||
for idx, token in enumerate(tokens):
|
for idx, token in enumerate(tokens):
|
||||||
# Identify if a token belongs to a Span (and which) and if it's a
|
# Identify if a token belongs to a Span (and which) and if it's a
|
||||||
# start token of said Span. We'll use this for the final HTML render
|
# start token of said Span. We'll use this for the final HTML render
|
||||||
token_markup: Dict[str, Any] = {}
|
token_markup: Dict[str, Any] = {}
|
||||||
token_markup["text"] = token
|
token_markup["text"] = token
|
||||||
concurrent_spans = 0
|
intersecting_spans: List[Dict[str, Any]] = []
|
||||||
entities = []
|
entities = []
|
||||||
for span in spans:
|
for span in spans:
|
||||||
ent = {}
|
ent = {}
|
||||||
if span["start_token"] <= idx < span["end_token"]:
|
if span["start_token"] <= idx < span["end_token"]:
|
||||||
concurrent_spans += 1
|
|
||||||
span_start = idx == span["start_token"]
|
span_start = idx == span["start_token"]
|
||||||
ent["label"] = span["label"]
|
ent["label"] = span["label"]
|
||||||
ent["is_start"] = span_start
|
ent["is_start"] = span_start
|
||||||
|
@ -176,7 +195,12 @@ class SpanRenderer:
|
||||||
# When the span starts, we need to know how many other
|
# When the span starts, we need to know how many other
|
||||||
# spans are on the 'span stack' and will be rendered.
|
# spans are on the 'span stack' and will be rendered.
|
||||||
# This value becomes the vertical render slot for this entire span
|
# This value becomes the vertical render slot for this entire span
|
||||||
span["render_slot"] = concurrent_spans
|
span["render_slot"] = (
|
||||||
|
intersecting_spans[-1]["render_slot"]
|
||||||
|
if len(intersecting_spans)
|
||||||
|
else 0
|
||||||
|
) + 1
|
||||||
|
intersecting_spans.append(span)
|
||||||
ent["render_slot"] = span["render_slot"]
|
ent["render_slot"] = span["render_slot"]
|
||||||
kb_id = span.get("kb_id", "")
|
kb_id = span.get("kb_id", "")
|
||||||
kb_url = span.get("kb_url", "#")
|
kb_url = span.get("kb_url", "#")
|
||||||
|
@ -193,11 +217,8 @@ class SpanRenderer:
|
||||||
span["render_slot"] = 0
|
span["render_slot"] = 0
|
||||||
token_markup["entities"] = entities
|
token_markup["entities"] = entities
|
||||||
per_token_info.append(token_markup)
|
per_token_info.append(token_markup)
|
||||||
markup = self._render_markup(per_token_info)
|
|
||||||
markup = TPL_SPANS.format(content=markup, dir=self.direction)
|
return per_token_info
|
||||||
if title:
|
|
||||||
markup = TPL_TITLE.format(title=title) + markup
|
|
||||||
return markup
|
|
||||||
|
|
||||||
def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str:
|
def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str:
|
||||||
"""Render the markup from per-token information"""
|
"""Render the markup from per-token information"""
|
||||||
|
|
|
@ -219,6 +219,8 @@ class Warnings(metaclass=ErrorsWithCodes):
|
||||||
W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
|
W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
|
||||||
"key attribute for vectors, configure it through Vectors(attr=) or "
|
"key attribute for vectors, configure it through Vectors(attr=) or "
|
||||||
"'spacy init vectors --attr'")
|
"'spacy init vectors --attr'")
|
||||||
|
W126 = ("These keys are unsupported: {unsupported}")
|
||||||
|
W127 = ("Not all `Language.pipe` worker processes completed successfully")
|
||||||
|
|
||||||
|
|
||||||
class Errors(metaclass=ErrorsWithCodes):
|
class Errors(metaclass=ErrorsWithCodes):
|
||||||
|
@ -226,7 +228,6 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
|
E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
|
||||||
"This usually happens when spaCy calls `nlp.{method}` with a custom "
|
"This usually happens when spaCy calls `nlp.{method}` with a custom "
|
||||||
"component name that's not registered on the current language class. "
|
"component name that's not registered on the current language class. "
|
||||||
"If you're using a Transformer, make sure to install 'spacy-transformers'. "
|
|
||||||
"If you're using a custom component, make sure you've added the "
|
"If you're using a custom component, make sure you've added the "
|
||||||
"decorator `@Language.component` (for function components) or "
|
"decorator `@Language.component` (for function components) or "
|
||||||
"`@Language.factory` (for class components).\n\nAvailable "
|
"`@Language.factory` (for class components).\n\nAvailable "
|
||||||
|
@@ -553,12 +554,12 @@ class Errors(metaclass=ErrorsWithCodes):
             "during training, make sure to include it in 'annotating components'")

     # New errors added in v3.x
+    E849 = ("The vocab only supports {method} for vectors of type "
+            "spacy.vectors.Vectors, not {vectors_type}.")
     E850 = ("The PretrainVectors objective currently only supports default or "
             "floret vectors, not {mode} vectors.")
     E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
             "but found value of '{val}'.")
-    E852 = ("The tar file pulled from the remote attempted an unsafe path "
-            "traversal.")
     E853 = ("Unsupported component factory name '{name}'. The character '.' is "
             "not permitted in factory names.")
     E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
@@ -981,6 +982,12 @@ class Errors(metaclass=ErrorsWithCodes):
             " 'min_length': {min_length}, 'max_length': {max_length}")
     E1054 = ("The text, including whitespace, must match between reference and "
              "predicted docs when training {component}.")
+    E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
+             "but only callbacks with one or three parameters are supported")
+    E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
+    E1057 = ("The `TextCatReduce` architecture must be used with at least one "
+             "reduction. Please enable one of `use_reduce_first`, "
+             "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")

     # Deprecated model shortcuts, only used in errors and warnings
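For readers unfamiliar with the new E1056/E1057 messages: they guard the TextCatBOW/TextCatReduce layer settings. Below is a rough, illustrative sketch of the kind of check E1057 describes; it is not the library's actual code, just a restatement of the rule.

    def check_reductions(use_reduce_first: bool, use_reduce_last: bool,
                         use_reduce_max: bool, use_reduce_mean: bool) -> None:
        # TextCatReduce needs at least one pooling/reduction enabled,
        # otherwise there is nothing to feed into the output layer.
        if not any([use_reduce_first, use_reduce_last, use_reduce_max, use_reduce_mean]):
            raise ValueError("enable at least one of the use_reduce_* options")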
@@ -1,3 +1,11 @@
 from .candidate import Candidate, get_candidates, get_candidates_batch
 from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
+
+__all__ = [
+    "Candidate",
+    "KnowledgeBase",
+    "InMemoryLookupKB",
+    "get_candidates",
+    "get_candidates_batch",
+]
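The explicit `__all__` above matches the public entity-linking KB API. A short usage sketch of `InMemoryLookupKB`; the entity ID, vector length and counts are made-up example values.

    import spacy
    from spacy.kb import InMemoryLookupKB

    nlp = spacy.blank("en")
    kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=64)
    kb.add_entity(entity="Q42", freq=12, entity_vector=[0.0] * 64)
    kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[1.0])
    print(kb.get_alias_candidates("Douglas Adams"))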
@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True
+# cython: infer_types=True

 from typing import Iterable
@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True
+# cython: infer_types=True

 from pathlib import Path
 from typing import Iterable, Tuple, Union
@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True
+# cython: infer_types=True
 from typing import Any, Callable, Dict, Iterable

 import srsly
spacy/lang/bo/__init__.py (new file, 16 lines)
@ -0,0 +1,16 @@
|
||||||
|
from ...language import BaseDefaults, Language
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
class TibetanDefaults(BaseDefaults):
|
||||||
|
lex_attr_getters = LEX_ATTRS
|
||||||
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
class Tibetan(Language):
|
||||||
|
lang = "bo"
|
||||||
|
Defaults = TibetanDefaults
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Tibetan"]
|
spacy/lang/bo/examples.py (new file, 16 lines)
@ -0,0 +1,16 @@
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.bo.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"དོན་དུ་རྒྱ་མཚོ་བླ་མ་ཞེས་བྱ་ཞིང༌།",
|
||||||
|
"ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག",
|
||||||
|
"སོག་པོ་ཨལ་ཐན་རྒྱལ་པོས་རྒྱལ་དབང་བསོད་ནམས་རྒྱ་མཚོར་ཆེ་བསྟོད་ཀྱི་མཚན་གསོལ་བ་ཞིག་ཡིན་ཞིང༌།",
|
||||||
|
"རྗེས་སུ་རྒྱལ་བ་དགེ་འདུན་གྲུབ་དང༌། དགེ་འདུན་རྒྱ་མཚོ་སོ་སོར་ཡང་ཏཱ་ལའི་བླ་མའི་སྐུ་ཕྲེང་དང་པོ་དང༌།",
|
||||||
|
"གཉིས་པའི་མཚན་དེ་གསོལ་ཞིང༌།༸རྒྱལ་དབང་སྐུ་ཕྲེང་ལྔ་པས་དགའ་ལྡན་ཕོ་བྲང་གི་སྲིད་དབང་བཙུགས་པ་ནས་ཏཱ་ལའི་བླ་མ་ནི་བོད་ཀྱི་ཆོས་སྲིད་གཉིས་ཀྱི་དབུ་ཁྲིད་དུ་གྱུར་ཞིང་།",
|
||||||
|
"ད་ལྟའི་བར་ཏཱ་ལའི་བླ་མ་སྐུ་ཕྲེང་བཅུ་བཞི་བྱོན་ཡོད།",
|
||||||
|
]
|
spacy/lang/bo/lex_attrs.py (new file, 65 lines)
@ -0,0 +1,65 @@
|
||||||
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
# reference 1: https://en.wikipedia.org/wiki/Tibetan_numerals
|
||||||
|
|
||||||
|
_num_words = [
|
||||||
|
"ཀླད་ཀོར་",
|
||||||
|
"གཅིག་",
|
||||||
|
"གཉིས་",
|
||||||
|
"གསུམ་",
|
||||||
|
"བཞི་",
|
||||||
|
"ལྔ་",
|
||||||
|
"དྲུག་",
|
||||||
|
"བདུན་",
|
||||||
|
"བརྒྱད་",
|
||||||
|
"དགུ་",
|
||||||
|
"བཅུ་",
|
||||||
|
"བཅུ་གཅིག་",
|
||||||
|
"བཅུ་གཉིས་",
|
||||||
|
"བཅུ་གསུམ་",
|
||||||
|
"བཅུ་བཞི་",
|
||||||
|
"བཅུ་ལྔ་",
|
||||||
|
"བཅུ་དྲུག་",
|
||||||
|
"བཅུ་བདུན་",
|
||||||
|
"བཅུ་པརྒྱད",
|
||||||
|
"བཅུ་དགུ་",
|
||||||
|
"ཉི་ཤུ་",
|
||||||
|
"སུམ་ཅུ",
|
||||||
|
"བཞི་བཅུ",
|
||||||
|
"ལྔ་བཅུ",
|
||||||
|
"དྲུག་ཅུ",
|
||||||
|
"བདུན་ཅུ",
|
||||||
|
"བརྒྱད་ཅུ",
|
||||||
|
"དགུ་བཅུ",
|
||||||
|
"བརྒྱ་",
|
||||||
|
"སྟོང་",
|
||||||
|
"ཁྲི་",
|
||||||
|
"ས་ཡ་",
|
||||||
|
" བྱེ་བ་",
|
||||||
|
"དུང་ཕྱུར་",
|
||||||
|
"ཐེར་འབུམ་",
|
||||||
|
"ཐེར་འབུམ་ཆེན་པོ་",
|
||||||
|
"ཁྲག་ཁྲིག་",
|
||||||
|
"ཁྲག་ཁྲིག་ཆེན་པོ་",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def like_num(text):
|
||||||
|
"""
|
||||||
|
Check if text resembles a number
|
||||||
|
"""
|
||||||
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
|
text = text[1:]
|
||||||
|
text = text.replace(",", "").replace(".", "")
|
||||||
|
if text.isdigit():
|
||||||
|
return True
|
||||||
|
if text.count("/") == 1:
|
||||||
|
num, denom = text.split("/")
|
||||||
|
if num.isdigit() and denom.isdigit():
|
||||||
|
return True
|
||||||
|
if text in _num_words:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
spacy/lang/bo/stop_words.py (new file, 198 lines)
@ -0,0 +1,198 @@
|
||||||
|
# Source: https://zenodo.org/records/10148636
|
||||||
|
|
||||||
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
|
འི་
|
||||||
|
།
|
||||||
|
དུ་
|
||||||
|
གིས་
|
||||||
|
སོགས་
|
||||||
|
ཏེ
|
||||||
|
གི་
|
||||||
|
རྣམས་
|
||||||
|
ནི
|
||||||
|
ཀུན་
|
||||||
|
ཡི་
|
||||||
|
འདི
|
||||||
|
ཀྱི་
|
||||||
|
སྙེད་
|
||||||
|
པས་
|
||||||
|
གཞན་
|
||||||
|
ཀྱིས་
|
||||||
|
ཡི
|
||||||
|
ལ
|
||||||
|
ནི་
|
||||||
|
དང་
|
||||||
|
སོགས
|
||||||
|
ཅིང་
|
||||||
|
ར
|
||||||
|
དུ
|
||||||
|
མི་
|
||||||
|
སུ་
|
||||||
|
བཅས་
|
||||||
|
ཡོངས་
|
||||||
|
ལས
|
||||||
|
ཙམ་
|
||||||
|
གྱིས་
|
||||||
|
དེ་
|
||||||
|
ཡང་
|
||||||
|
མཐའ་དག་
|
||||||
|
ཏུ་
|
||||||
|
ཉིད་
|
||||||
|
ས
|
||||||
|
ཏེ་
|
||||||
|
གྱི་
|
||||||
|
སྤྱི
|
||||||
|
དེ
|
||||||
|
ཀ་
|
||||||
|
ཡིན་
|
||||||
|
ཞིང་
|
||||||
|
འདི་
|
||||||
|
རུང་
|
||||||
|
རང་
|
||||||
|
ཞིག་
|
||||||
|
སྟེ
|
||||||
|
སྟེ་
|
||||||
|
ན་རེ
|
||||||
|
ངམ
|
||||||
|
ཤིང་
|
||||||
|
དག་
|
||||||
|
ཏོ
|
||||||
|
རེ་
|
||||||
|
འང་
|
||||||
|
ཀྱང་
|
||||||
|
ལགས་པ
|
||||||
|
ཚུ
|
||||||
|
དོ
|
||||||
|
ཡིན་པ
|
||||||
|
རེ
|
||||||
|
ན་རེ་
|
||||||
|
ཨེ་
|
||||||
|
ཚང་མ
|
||||||
|
ཐམས་ཅད་
|
||||||
|
དམ་
|
||||||
|
འོ་
|
||||||
|
ཅིག་
|
||||||
|
གྱིན་
|
||||||
|
ཡིན
|
||||||
|
ན
|
||||||
|
ཁོ་ན་
|
||||||
|
འམ་
|
||||||
|
ཀྱིན་
|
||||||
|
ལོ
|
||||||
|
ཀྱིས
|
||||||
|
བས་
|
||||||
|
ལགས་
|
||||||
|
ཤིག
|
||||||
|
གིས
|
||||||
|
ཀི་
|
||||||
|
སྣ་ཚོགས་
|
||||||
|
རྣམས
|
||||||
|
སྙེད་པ
|
||||||
|
ཡིས་
|
||||||
|
གྱི
|
||||||
|
གི
|
||||||
|
བམ་
|
||||||
|
ཤིག་
|
||||||
|
རེ་རེ་
|
||||||
|
ནམ
|
||||||
|
མིན་
|
||||||
|
ནམ་
|
||||||
|
ངམ་
|
||||||
|
རུ་
|
||||||
|
འགའ་
|
||||||
|
ཀུན
|
||||||
|
ཤས་
|
||||||
|
ཏུ
|
||||||
|
ཡིས
|
||||||
|
གིན་
|
||||||
|
གམ་
|
||||||
|
འོ
|
||||||
|
ཡིན་པ་
|
||||||
|
མིན
|
||||||
|
ལགས
|
||||||
|
གྱིས
|
||||||
|
ཅང་
|
||||||
|
འགའ
|
||||||
|
སམ་
|
||||||
|
ཞིག
|
||||||
|
འང
|
||||||
|
ལས་ཆེ་
|
||||||
|
འཕྲལ་
|
||||||
|
བར་
|
||||||
|
རུ
|
||||||
|
དང
|
||||||
|
ཡ
|
||||||
|
འག
|
||||||
|
སམ
|
||||||
|
ཀ
|
||||||
|
ཅུང་ཟད་
|
||||||
|
ཅིག
|
||||||
|
ཉིད
|
||||||
|
དུ་མ
|
||||||
|
མ
|
||||||
|
ཡིན་བ
|
||||||
|
འམ
|
||||||
|
མམ
|
||||||
|
དམ
|
||||||
|
དག
|
||||||
|
ཁོ་ན
|
||||||
|
ཀྱི
|
||||||
|
ལམ
|
||||||
|
ཕྱི་
|
||||||
|
ནང་
|
||||||
|
ཙམ
|
||||||
|
ནོ་
|
||||||
|
སོ་
|
||||||
|
རམ་
|
||||||
|
བོ་
|
||||||
|
ཨང་
|
||||||
|
ཕྱི
|
||||||
|
ཏོ་
|
||||||
|
ཚོ
|
||||||
|
ལ་ལ་
|
||||||
|
ཚོ་
|
||||||
|
ཅིང
|
||||||
|
མ་གི་
|
||||||
|
གེ
|
||||||
|
གོ
|
||||||
|
ཡིན་ལུགས་
|
||||||
|
རོ་
|
||||||
|
བོ
|
||||||
|
ལགས་པ་
|
||||||
|
པས
|
||||||
|
རབ་
|
||||||
|
འི
|
||||||
|
རམ
|
||||||
|
བས
|
||||||
|
གཞན
|
||||||
|
སྙེད་པ་
|
||||||
|
འབའ་
|
||||||
|
མཾ་
|
||||||
|
པོ
|
||||||
|
ག་
|
||||||
|
ག
|
||||||
|
གམ
|
||||||
|
སྤྱི་
|
||||||
|
བམ
|
||||||
|
མོ་
|
||||||
|
ཙམ་པ་
|
||||||
|
ཤ་སྟག་
|
||||||
|
མམ་
|
||||||
|
རེ་རེ
|
||||||
|
སྙེད
|
||||||
|
ཏམ་
|
||||||
|
ངོ
|
||||||
|
གྲང་
|
||||||
|
ཏ་རེ
|
||||||
|
ཏམ
|
||||||
|
ཁ་
|
||||||
|
ངེ་
|
||||||
|
ཅོག་
|
||||||
|
རིལ་
|
||||||
|
ཉུང་ཤས་
|
||||||
|
གིང་
|
||||||
|
ཚ་
|
||||||
|
ཀྱང
|
||||||
|
""".split()
|
||||||
|
)
|
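With the Tibetan files above in place, the new data can be exercised through a blank pipeline. A small sketch follows; it assumes a spaCy build that includes this branch, so that "bo" resolves to the Tibetan class above.

    import spacy

    nlp = spacy.blank("bo")                      # Tibetan, via TibetanDefaults
    doc = nlp("ད་ལྟའི་བར་ཏཱ་ལའི་བླ་མ་སྐུ་ཕྲེང་བཅུ་བཞི་བྱོན་ཡོད།")
    print([t.text for t in doc])
    print(nlp.vocab["གཉིས་"].like_num)           # number word from lex_attrs.py
    print(nlp.vocab["དུ་"].is_stop)              # entry from stop_words.py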
@@ -6,7 +6,8 @@ _num_words = [
     "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
     "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty",
     "fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand",
-    "million", "billion", "trillion", "quadrillion", "gajillion", "bazillion"
+    "million", "billion", "trillion", "quadrillion", "quintillion", "sextillion",
+    "septillion", "octillion", "nonillion", "decillion", "gajillion", "bazillion"
 ]
 _ordinal_words = [
     "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",
@@ -14,7 +15,8 @@ _ordinal_words = [
     "fifteenth", "sixteenth", "seventeenth", "eighteenth", "nineteenth",
     "twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth",
     "eightieth", "ninetieth", "hundredth", "thousandth", "millionth", "billionth",
-    "trillionth", "quadrillionth", "gajillionth", "bazillionth"
+    "trillionth", "quadrillionth", "quintillionth", "sextillionth", "septillionth",
+    "octillionth", "nonillionth", "decillionth", "gajillionth", "bazillionth"
 ]
 # fmt: on
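A quick way to see the effect of the extended English number lists is to call the module-level `like_num` helper directly; the checks below simply restate the additions.

    from spacy.lang.en.lex_attrs import like_num

    assert like_num("quintillion")      # newly added cardinal
    assert like_num("decillionth")      # newly added ordinal
    assert not like_num("zillion")      # still not in the list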
@@ -163,7 +163,7 @@ class SpanishLemmatizer(Lemmatizer):
         for old, new in self.lookups.get_table("lemma_rules").get("det", []):
             if word == old:
                 return [new]
-        # If none of the specfic rules apply, search in the common rules for
+        # If none of the specific rules apply, search in the common rules for
         # determiners and pronouns that follow a unique pattern for
         # lemmatization. If the word is in the list, return the corresponding
         # lemma.
@@ -291,7 +291,7 @@ class SpanishLemmatizer(Lemmatizer):
         for old, new in self.lookups.get_table("lemma_rules").get("pron", []):
             if word == old:
                 return [new]
-        # If none of the specfic rules apply, search in the common rules for
+        # If none of the specific rules apply, search in the common rules for
         # determiners and pronouns that follow a unique pattern for
         # lemmatization. If the word is in the list, return the corresponding
         # lemma.
spacy/lang/fo/__init__.py (new file, 18 lines)
@ -0,0 +1,18 @@
|
||||||
|
from ...language import BaseDefaults, Language
|
||||||
|
from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
|
||||||
|
|
||||||
|
class FaroeseDefaults(BaseDefaults):
|
||||||
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
|
infixes = TOKENIZER_INFIXES
|
||||||
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
prefixes = TOKENIZER_PREFIXES
|
||||||
|
|
||||||
|
|
||||||
|
class Faroese(Language):
|
||||||
|
lang = "fo"
|
||||||
|
Defaults = FaroeseDefaults
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Faroese"]
|
spacy/lang/fo/tokenizer_exceptions.py (new file, 90 lines)
@ -0,0 +1,90 @@
|
||||||
|
from ...symbols import ORTH
|
||||||
|
from ...util import update_exc
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
|
||||||
|
_exc = {}
|
||||||
|
|
||||||
|
for orth in [
|
||||||
|
"apr.",
|
||||||
|
"aug.",
|
||||||
|
"avgr.",
|
||||||
|
"árg.",
|
||||||
|
"ávís.",
|
||||||
|
"beinl.",
|
||||||
|
"blkv.",
|
||||||
|
"blaðkv.",
|
||||||
|
"blm.",
|
||||||
|
"blaðm.",
|
||||||
|
"bls.",
|
||||||
|
"blstj.",
|
||||||
|
"blaðstj.",
|
||||||
|
"des.",
|
||||||
|
"eint.",
|
||||||
|
"febr.",
|
||||||
|
"fyrrv.",
|
||||||
|
"góðk.",
|
||||||
|
"h.m.",
|
||||||
|
"innt.",
|
||||||
|
"jan.",
|
||||||
|
"kl.",
|
||||||
|
"m.a.",
|
||||||
|
"mðr.",
|
||||||
|
"mió.",
|
||||||
|
"nr.",
|
||||||
|
"nto.",
|
||||||
|
"nov.",
|
||||||
|
"nút.",
|
||||||
|
"o.a.",
|
||||||
|
"o.a.m.",
|
||||||
|
"o.a.tíl.",
|
||||||
|
"o.fl.",
|
||||||
|
"ff.",
|
||||||
|
"o.m.a.",
|
||||||
|
"o.o.",
|
||||||
|
"o.s.fr.",
|
||||||
|
"o.tíl.",
|
||||||
|
"o.ø.",
|
||||||
|
"okt.",
|
||||||
|
"omf.",
|
||||||
|
"pst.",
|
||||||
|
"ritstj.",
|
||||||
|
"sbr.",
|
||||||
|
"sms.",
|
||||||
|
"smst.",
|
||||||
|
"smb.",
|
||||||
|
"sb.",
|
||||||
|
"sbrt.",
|
||||||
|
"sp.",
|
||||||
|
"sept.",
|
||||||
|
"spf.",
|
||||||
|
"spsk.",
|
||||||
|
"t.e.",
|
||||||
|
"t.s.",
|
||||||
|
"t.s.s.",
|
||||||
|
"tlf.",
|
||||||
|
"tel.",
|
||||||
|
"tsk.",
|
||||||
|
"t.o.v.",
|
||||||
|
"t.d.",
|
||||||
|
"uml.",
|
||||||
|
"ums.",
|
||||||
|
"uppl.",
|
||||||
|
"upprfr.",
|
||||||
|
"uppr.",
|
||||||
|
"útg.",
|
||||||
|
"útl.",
|
||||||
|
"útr.",
|
||||||
|
"vanl.",
|
||||||
|
"v.",
|
||||||
|
"v.h.",
|
||||||
|
"v.ø.o.",
|
||||||
|
"viðm.",
|
||||||
|
"viðv.",
|
||||||
|
"vm.",
|
||||||
|
"v.m.",
|
||||||
|
]:
|
||||||
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
capitalized = orth.capitalize()
|
||||||
|
_exc[capitalized] = [{ORTH: capitalized}]
|
||||||
|
|
||||||
|
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
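A similar smoke test for the new Faroese data; the abbreviation below comes straight from the exception list above, while the example sentence itself and the availability of "fo" are assumptions (a build with this branch is required).

    import spacy

    nlp = spacy.blank("fo")
    doc = nlp("Hon kom t.d. ikki í gjár.")
    # "t.d." is listed in tokenizer_exceptions.py and should stay one token
    print([t.text for t in doc])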
spacy/lang/gd/__init__.py (new file, 18 lines)
@ -0,0 +1,18 @@
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from ...language import BaseDefaults, Language
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
|
||||||
|
|
||||||
|
class ScottishDefaults(BaseDefaults):
|
||||||
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
class Scottish(Language):
|
||||||
|
lang = "gd"
|
||||||
|
Defaults = ScottishDefaults
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Scottish"]
|
spacy/lang/gd/stop_words.py (new file, 388 lines)
@ -0,0 +1,388 @@
|
||||||
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
|
'ad
|
||||||
|
'ar
|
||||||
|
'd # iad
|
||||||
|
'g # ag
|
||||||
|
'ga
|
||||||
|
'gam
|
||||||
|
'gan
|
||||||
|
'gar
|
||||||
|
'gur
|
||||||
|
'm # am
|
||||||
|
'n # an
|
||||||
|
'n seo
|
||||||
|
'na
|
||||||
|
'nad
|
||||||
|
'nam
|
||||||
|
'nan
|
||||||
|
'nar
|
||||||
|
'nuair
|
||||||
|
'nur
|
||||||
|
's
|
||||||
|
'sa
|
||||||
|
'san
|
||||||
|
'sann
|
||||||
|
'se
|
||||||
|
'sna
|
||||||
|
a
|
||||||
|
a'
|
||||||
|
a'd # agad
|
||||||
|
a'm # agam
|
||||||
|
a-chèile
|
||||||
|
a-seo
|
||||||
|
a-sin
|
||||||
|
a-siud
|
||||||
|
a chionn
|
||||||
|
a chionn 's
|
||||||
|
a chèile
|
||||||
|
a chéile
|
||||||
|
a dh'
|
||||||
|
a h-uile
|
||||||
|
a seo
|
||||||
|
ac' # aca
|
||||||
|
aca
|
||||||
|
aca-san
|
||||||
|
acasan
|
||||||
|
ach
|
||||||
|
ag
|
||||||
|
agad
|
||||||
|
agad-sa
|
||||||
|
agads'
|
||||||
|
agadsa
|
||||||
|
agaibh
|
||||||
|
agaibhse
|
||||||
|
againn
|
||||||
|
againne
|
||||||
|
agam
|
||||||
|
agam-sa
|
||||||
|
agams'
|
||||||
|
agamsa
|
||||||
|
agus
|
||||||
|
aice
|
||||||
|
aice-se
|
||||||
|
aicese
|
||||||
|
aig
|
||||||
|
aig' # aige
|
||||||
|
aige
|
||||||
|
aige-san
|
||||||
|
aigesan
|
||||||
|
air
|
||||||
|
air-san
|
||||||
|
air neo
|
||||||
|
airsan
|
||||||
|
am
|
||||||
|
an
|
||||||
|
an seo
|
||||||
|
an sin
|
||||||
|
an siud
|
||||||
|
an uair
|
||||||
|
ann
|
||||||
|
ann a
|
||||||
|
ann a'
|
||||||
|
ann a shin
|
||||||
|
ann am
|
||||||
|
ann an
|
||||||
|
annad
|
||||||
|
annam
|
||||||
|
annam-s'
|
||||||
|
annamsa
|
||||||
|
anns
|
||||||
|
anns an
|
||||||
|
annta
|
||||||
|
aon
|
||||||
|
ar
|
||||||
|
as
|
||||||
|
asad
|
||||||
|
asda
|
||||||
|
asta
|
||||||
|
b'
|
||||||
|
bho
|
||||||
|
bhon
|
||||||
|
bhuaidhe # bhuaithe
|
||||||
|
bhuainn
|
||||||
|
bhuaipe
|
||||||
|
bhuaithe
|
||||||
|
bhuapa
|
||||||
|
bhur
|
||||||
|
brì
|
||||||
|
bu
|
||||||
|
c'à
|
||||||
|
car son
|
||||||
|
carson
|
||||||
|
cha
|
||||||
|
chan
|
||||||
|
chionn
|
||||||
|
choir
|
||||||
|
chon
|
||||||
|
chun
|
||||||
|
chèile
|
||||||
|
chéile
|
||||||
|
chòir
|
||||||
|
cia mheud
|
||||||
|
ciamar
|
||||||
|
co-dhiubh
|
||||||
|
cuide
|
||||||
|
cuin
|
||||||
|
cuin'
|
||||||
|
cuine
|
||||||
|
cà
|
||||||
|
cà'
|
||||||
|
càil
|
||||||
|
càit
|
||||||
|
càit'
|
||||||
|
càite
|
||||||
|
cò
|
||||||
|
cò mheud
|
||||||
|
có
|
||||||
|
d'
|
||||||
|
da
|
||||||
|
de
|
||||||
|
dh'
|
||||||
|
dha
|
||||||
|
dhaibh
|
||||||
|
dhaibh-san
|
||||||
|
dhaibhsan
|
||||||
|
dhan
|
||||||
|
dhasan
|
||||||
|
dhe
|
||||||
|
dhen
|
||||||
|
dheth
|
||||||
|
dhi
|
||||||
|
dhiom
|
||||||
|
dhiot
|
||||||
|
dhith
|
||||||
|
dhiubh
|
||||||
|
dhomh
|
||||||
|
dhomh-s'
|
||||||
|
dhomhsa
|
||||||
|
dhu'sa # dhut-sa
|
||||||
|
dhuibh
|
||||||
|
dhuibhse
|
||||||
|
dhuinn
|
||||||
|
dhuinne
|
||||||
|
dhuit
|
||||||
|
dhut
|
||||||
|
dhutsa
|
||||||
|
dhut-sa
|
||||||
|
dhà
|
||||||
|
dhà-san
|
||||||
|
dhàsan
|
||||||
|
dhòmhsa
|
||||||
|
diubh
|
||||||
|
do
|
||||||
|
docha
|
||||||
|
don
|
||||||
|
dà
|
||||||
|
dè
|
||||||
|
dè mar
|
||||||
|
dé
|
||||||
|
dé mar
|
||||||
|
dòch'
|
||||||
|
dòcha
|
||||||
|
e
|
||||||
|
eadar
|
||||||
|
eatarra
|
||||||
|
eatorra
|
||||||
|
eile
|
||||||
|
esan
|
||||||
|
fa
|
||||||
|
far
|
||||||
|
feud
|
||||||
|
fhad
|
||||||
|
fheudar
|
||||||
|
fhearr
|
||||||
|
fhein
|
||||||
|
fheudar
|
||||||
|
fheàrr
|
||||||
|
fhèin
|
||||||
|
fhéin
|
||||||
|
fhìn
|
||||||
|
fo
|
||||||
|
fodha
|
||||||
|
fodhainn
|
||||||
|
foipe
|
||||||
|
fon
|
||||||
|
fèin
|
||||||
|
ga
|
||||||
|
gach
|
||||||
|
gam
|
||||||
|
gan
|
||||||
|
ge brith
|
||||||
|
ged
|
||||||
|
gu
|
||||||
|
gu dè
|
||||||
|
gu ruige
|
||||||
|
gun
|
||||||
|
gur
|
||||||
|
gus
|
||||||
|
i
|
||||||
|
iad
|
||||||
|
iadsan
|
||||||
|
innte
|
||||||
|
is
|
||||||
|
ise
|
||||||
|
le
|
||||||
|
leam
|
||||||
|
leam-sa
|
||||||
|
leamsa
|
||||||
|
leat
|
||||||
|
leat-sa
|
||||||
|
leatha
|
||||||
|
leatsa
|
||||||
|
leibh
|
||||||
|
leis
|
||||||
|
leis-san
|
||||||
|
leoth'
|
||||||
|
leotha
|
||||||
|
leotha-san
|
||||||
|
linn
|
||||||
|
m'
|
||||||
|
m'a
|
||||||
|
ma
|
||||||
|
mac
|
||||||
|
man
|
||||||
|
mar
|
||||||
|
mas
|
||||||
|
mathaid
|
||||||
|
mi
|
||||||
|
mis'
|
||||||
|
mise
|
||||||
|
mo
|
||||||
|
mu
|
||||||
|
mu 'n
|
||||||
|
mun
|
||||||
|
mur
|
||||||
|
mura
|
||||||
|
mus
|
||||||
|
na
|
||||||
|
na b'
|
||||||
|
na bu
|
||||||
|
na iad
|
||||||
|
nach
|
||||||
|
nad
|
||||||
|
nam
|
||||||
|
nan
|
||||||
|
nar
|
||||||
|
nas
|
||||||
|
neo
|
||||||
|
no
|
||||||
|
nuair
|
||||||
|
o
|
||||||
|
o'n
|
||||||
|
oir
|
||||||
|
oirbh
|
||||||
|
oirbh-se
|
||||||
|
oirnn
|
||||||
|
oirnne
|
||||||
|
oirre
|
||||||
|
on
|
||||||
|
orm
|
||||||
|
orm-sa
|
||||||
|
ormsa
|
||||||
|
orra
|
||||||
|
orra-san
|
||||||
|
orrasan
|
||||||
|
ort
|
||||||
|
os
|
||||||
|
r'
|
||||||
|
ri
|
||||||
|
ribh
|
||||||
|
rinn
|
||||||
|
ris
|
||||||
|
rithe
|
||||||
|
rithe-se
|
||||||
|
rium
|
||||||
|
rium-sa
|
||||||
|
riums'
|
||||||
|
riumsa
|
||||||
|
riut
|
||||||
|
riuth'
|
||||||
|
riutha
|
||||||
|
riuthasan
|
||||||
|
ro
|
||||||
|
ro'n
|
||||||
|
roimh
|
||||||
|
roimhe
|
||||||
|
romhainn
|
||||||
|
romham
|
||||||
|
romhpa
|
||||||
|
ron
|
||||||
|
ruibh
|
||||||
|
ruinn
|
||||||
|
ruinne
|
||||||
|
sa
|
||||||
|
san
|
||||||
|
sann
|
||||||
|
se
|
||||||
|
seach
|
||||||
|
seo
|
||||||
|
seothach
|
||||||
|
shin
|
||||||
|
sibh
|
||||||
|
sibh-se
|
||||||
|
sibhse
|
||||||
|
sin
|
||||||
|
sineach
|
||||||
|
sinn
|
||||||
|
sinne
|
||||||
|
siod
|
||||||
|
siodach
|
||||||
|
siud
|
||||||
|
siudach
|
||||||
|
sna # ann an
|
||||||
|
sè
|
||||||
|
t'
|
||||||
|
tarsaing
|
||||||
|
tarsainn
|
||||||
|
tarsuinn
|
||||||
|
thar
|
||||||
|
thoigh
|
||||||
|
thro
|
||||||
|
thu
|
||||||
|
thuc'
|
||||||
|
thuca
|
||||||
|
thugad
|
||||||
|
thugaibh
|
||||||
|
thugainn
|
||||||
|
thugam
|
||||||
|
thugamsa
|
||||||
|
thuice
|
||||||
|
thuige
|
||||||
|
thus'
|
||||||
|
thusa
|
||||||
|
timcheall
|
||||||
|
toigh
|
||||||
|
toil
|
||||||
|
tro
|
||||||
|
tro' # troimh
|
||||||
|
troimh
|
||||||
|
troimhe
|
||||||
|
tron
|
||||||
|
tu
|
||||||
|
tusa
|
||||||
|
uair
|
||||||
|
ud
|
||||||
|
ugaibh
|
||||||
|
ugam-s'
|
||||||
|
ugam-sa
|
||||||
|
uice
|
||||||
|
uige
|
||||||
|
uige-san
|
||||||
|
umad
|
||||||
|
unnta # ann an
|
||||||
|
ur
|
||||||
|
urrainn
|
||||||
|
à
|
||||||
|
às
|
||||||
|
àsan
|
||||||
|
á
|
||||||
|
ás
|
||||||
|
è
|
||||||
|
ì
|
||||||
|
ò
|
||||||
|
ó
|
||||||
|
""".split(
|
||||||
|
"\n"
|
||||||
|
)
|
||||||
|
)
|
spacy/lang/gd/tokenizer_exceptions.py (new file, 1983 lines; diff not shown because the file is too large)
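Even though the Scottish Gaelic tokenizer-exception diff is collapsed here, the new language can still be smoke-tested. A minimal sketch; the sample sentence is illustrative only, and "gd" resolving to the Scottish class above assumes this branch is installed.

    import spacy

    nlp = spacy.blank("gd")
    doc = nlp("Tha mi a' dol dhan sgoil")
    print([t.text for t in doc])
    print(nlp.vocab["agus"].is_stop)    # entry from the new stop word list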
@@ -15,6 +15,7 @@ _prefixes = (
     [
         "†",
         "⸏",
+        "〈",
     ]
     + LIST_PUNCT
     + LIST_ELLIPSES
@@ -31,6 +32,7 @@ _suffixes = (
     + [
         "†",
         "⸎",
+        "〉",
         r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])[\-\.⸏]",
     ]
 )
@@ -1,5 +1,5 @@
 The list of Croatian lemmas was extracted from the reldi-tagger repository (https://github.com/clarinsi/reldi-tagger).
-Reldi-tagger is licesned under the Apache 2.0 licence.
+Reldi-tagger is licensed under the Apache 2.0 licence.

 @InProceedings{ljubesic16-new,
 author = {Nikola Ljubešić and Filip Klubička and Željko Agić and Ivo-Pavao Jazbec},
@@ -12,4 +12,4 @@ Reldi-tagger is licesned under the Apache 2.0 licence.
 publisher = {European Language Resources Association (ELRA)},
 address = {Paris, France},
 isbn = {978-2-9517408-9-1}
 }
spacy/lang/ht/__init__.py (new file, 52 lines)
@ -0,0 +1,52 @@
|
||||||
|
from typing import Callable, Optional
|
||||||
|
|
||||||
|
from thinc.api import Model
|
||||||
|
|
||||||
|
from ...language import BaseDefaults, Language
|
||||||
|
from .lemmatizer import HaitianCreoleLemmatizer
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
from .tag_map import TAG_MAP
|
||||||
|
|
||||||
|
|
||||||
|
class HaitianCreoleDefaults(BaseDefaults):
|
||||||
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
|
prefixes = TOKENIZER_PREFIXES
|
||||||
|
infixes = TOKENIZER_INFIXES
|
||||||
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
lex_attr_getters = LEX_ATTRS
|
||||||
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
|
stop_words = STOP_WORDS
|
||||||
|
tag_map = TAG_MAP
|
||||||
|
|
||||||
|
class HaitianCreole(Language):
|
||||||
|
lang = "ht"
|
||||||
|
Defaults = HaitianCreoleDefaults
|
||||||
|
|
||||||
|
@HaitianCreole.factory(
|
||||||
|
"lemmatizer",
|
||||||
|
assigns=["token.lemma"],
|
||||||
|
default_config={
|
||||||
|
"model": None,
|
||||||
|
"mode": "rule",
|
||||||
|
"overwrite": False,
|
||||||
|
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||||
|
},
|
||||||
|
default_score_weights={"lemma_acc": 1.0},
|
||||||
|
)
|
||||||
|
def make_lemmatizer(
|
||||||
|
nlp: Language,
|
||||||
|
model: Optional[Model],
|
||||||
|
name: str,
|
||||||
|
mode: str,
|
||||||
|
overwrite: bool,
|
||||||
|
scorer: Optional[Callable],
|
||||||
|
):
|
||||||
|
return HaitianCreoleLemmatizer(
|
||||||
|
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = ["HaitianCreole"]
|
spacy/lang/ht/examples.py (new file, 18 lines)
@ -0,0 +1,18 @@
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.ht.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Apple ap panse achte yon demaraj nan Wayòm Ini pou $1 milya dola",
|
||||||
|
"Machin otonòm fè responsablite asirans lan ale sou men fabrikan yo",
|
||||||
|
"San Francisco ap konsidere entèdi robo ki livre sou twotwa yo",
|
||||||
|
"Lond se yon gwo vil nan Wayòm Ini",
|
||||||
|
"Kote ou ye?",
|
||||||
|
"Kilès ki prezidan Lafrans?",
|
||||||
|
"Ki kapital Etazini?",
|
||||||
|
"Kile Barack Obama te fèt?",
|
||||||
|
]
|
spacy/lang/ht/lemmatizer.py (new file, 51 lines)
@ -0,0 +1,51 @@
|
||||||
|
from typing import List, Tuple
|
||||||
|
|
||||||
|
from ...pipeline import Lemmatizer
|
||||||
|
from ...tokens import Token
|
||||||
|
from ...lookups import Lookups
|
||||||
|
|
||||||
|
|
||||||
|
class HaitianCreoleLemmatizer(Lemmatizer):
|
||||||
|
"""
|
||||||
|
Minimal Haitian Creole lemmatizer.
|
||||||
|
Returns a word's base form based on rules and lookup,
|
||||||
|
or defaults to the original form.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def is_base_form(self, token: Token) -> bool:
|
||||||
|
morph = token.morph.to_dict()
|
||||||
|
upos = token.pos_.lower()
|
||||||
|
|
||||||
|
# Consider unmarked forms to be base
|
||||||
|
if upos in {"noun", "verb", "adj", "adv"}:
|
||||||
|
if not morph:
|
||||||
|
return True
|
||||||
|
if upos == "noun" and morph.get("Number") == "Sing":
|
||||||
|
return True
|
||||||
|
if upos == "verb" and morph.get("VerbForm") == "Inf":
|
||||||
|
return True
|
||||||
|
if upos == "adj" and morph.get("Degree") == "Pos":
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def rule_lemmatize(self, token: Token) -> List[str]:
|
||||||
|
string = token.text.lower()
|
||||||
|
pos = token.pos_.lower()
|
||||||
|
cache_key = (token.orth, token.pos)
|
||||||
|
if cache_key in self.cache:
|
||||||
|
return self.cache[cache_key]
|
||||||
|
|
||||||
|
forms = []
|
||||||
|
|
||||||
|
# fallback rule: just return lowercased form
|
||||||
|
forms.append(string)
|
||||||
|
|
||||||
|
self.cache[cache_key] = forms
|
||||||
|
return forms
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
|
||||||
|
if mode == "rule":
|
||||||
|
required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
|
||||||
|
return (required, [])
|
||||||
|
return super().get_lookups_config(mode)
|
spacy/lang/ht/lex_attrs.py (new file, 78 lines)
@ -0,0 +1,78 @@
|
||||||
|
from ...attrs import LIKE_NUM, NORM
|
||||||
|
|
||||||
|
# Cardinal numbers in Creole
|
||||||
|
_num_words = set(
|
||||||
|
"""
|
||||||
|
zewo youn en de twa kat senk sis sèt uit nèf dis
|
||||||
|
onz douz trèz katoz kenz sèz disèt dizwit diznèf
|
||||||
|
vent trant karant sinkant swasant swasann-dis
|
||||||
|
san mil milyon milya
|
||||||
|
""".split()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Ordinal numbers in Creole (some are French-influenced, some simplified)
|
||||||
|
_ordinal_words = set(
|
||||||
|
"""
|
||||||
|
premye dezyèm twazyèm katryèm senkyèm sizyèm sètvyèm uitvyèm nèvyèm dizyèm
|
||||||
|
onzèm douzyèm trèzyèm katozyèm kenzèm sèzyèm disetyèm dizwityèm diznèvyèm
|
||||||
|
ventyèm trantyèm karantyèm sinkantyèm swasantyèm
|
||||||
|
swasann-disyèm santyèm milyèm milyonnyèm milyadyèm
|
||||||
|
""".split()
|
||||||
|
)
|
||||||
|
|
||||||
|
NORM_MAP = {
|
||||||
|
"'m": "mwen",
|
||||||
|
"'w": "ou",
|
||||||
|
"'l": "li",
|
||||||
|
"'n": "nou",
|
||||||
|
"'y": "yo",
|
||||||
|
"’m": "mwen",
|
||||||
|
"’w": "ou",
|
||||||
|
"’l": "li",
|
||||||
|
"’n": "nou",
|
||||||
|
"’y": "yo",
|
||||||
|
"m": "mwen",
|
||||||
|
"n": "nou",
|
||||||
|
"l": "li",
|
||||||
|
"y": "yo",
|
||||||
|
"w": "ou",
|
||||||
|
"t": "te",
|
||||||
|
"k": "ki",
|
||||||
|
"p": "pa",
|
||||||
|
"M": "Mwen",
|
||||||
|
"N": "Nou",
|
||||||
|
"L": "Li",
|
||||||
|
"Y": "Yo",
|
||||||
|
"W": "Ou",
|
||||||
|
"T": "Te",
|
||||||
|
"K": "Ki",
|
||||||
|
"P": "Pa",
|
||||||
|
}
|
||||||
|
|
||||||
|
def like_num(text):
|
||||||
|
text = text.strip().lower()
|
||||||
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
|
text = text[1:]
|
||||||
|
text = text.replace(",", "").replace(".", "")
|
||||||
|
if text.isdigit():
|
||||||
|
return True
|
||||||
|
if text.count("/") == 1:
|
||||||
|
num, denom = text.split("/")
|
||||||
|
if num.isdigit() and denom.isdigit():
|
||||||
|
return True
|
||||||
|
if text in _num_words:
|
||||||
|
return True
|
||||||
|
if text in _ordinal_words:
|
||||||
|
return True
|
||||||
|
# Handle things like "3yèm", "10yèm", "25yèm", etc.
|
||||||
|
if text.endswith("yèm") and text[:-3].isdigit():
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def norm_custom(text):
|
||||||
|
return NORM_MAP.get(text, text.lower())
|
||||||
|
|
||||||
|
LEX_ATTRS = {
|
||||||
|
LIKE_NUM: like_num,
|
||||||
|
NORM: norm_custom,
|
||||||
|
}
|
spacy/lang/ht/punctuation.py (new file, 43 lines)
@ -0,0 +1,43 @@
|
||||||
|
from ..char_classes import (
|
||||||
|
ALPHA,
|
||||||
|
ALPHA_LOWER,
|
||||||
|
ALPHA_UPPER,
|
||||||
|
CONCAT_QUOTES,
|
||||||
|
HYPHENS,
|
||||||
|
LIST_PUNCT,
|
||||||
|
LIST_QUOTES,
|
||||||
|
LIST_ELLIPSES,
|
||||||
|
LIST_ICONS,
|
||||||
|
merge_chars,
|
||||||
|
)
|
||||||
|
|
||||||
|
ELISION = "'’".replace(" ", "")
|
||||||
|
|
||||||
|
_prefixes_elision = "m n l y t k w"
|
||||||
|
_prefixes_elision += " " + _prefixes_elision.upper()
|
||||||
|
|
||||||
|
TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
|
||||||
|
r"(?:({pe})[{el}])(?=[{a}])".format(
|
||||||
|
a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
|
||||||
|
r"(?<=[0-9])%", # numbers like 10%
|
||||||
|
r"(?<=[0-9])(?:{h})".format(h=HYPHENS), # hyphens after numbers
|
||||||
|
r"(?<=[{a}])['’]".format(a=ALPHA), # apostrophes after letters
|
||||||
|
r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA), # contractions
|
||||||
|
r"(?<=[{a}0-9])\)", # right parenthesis after letter/number
|
||||||
|
r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA), # period after letter if space or end of string
|
||||||
|
r"(?<=\))[\.\?!]", # punctuation immediately after right parenthesis
|
||||||
|
]
|
||||||
|
|
||||||
|
TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
|
||||||
|
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
|
||||||
|
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
|
||||||
|
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
|
||||||
|
),
|
||||||
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
|
||||||
|
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
|
||||||
|
]
|
spacy/lang/ht/stop_words.py (new file, 50 lines)
@ -0,0 +1,50 @@
|
||||||
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
|
a ak an ankò ant apre ap atò avan avanlè
|
||||||
|
byen bò byenke
|
||||||
|
|
||||||
|
chak
|
||||||
|
|
||||||
|
de depi deja deja
|
||||||
|
|
||||||
|
e en epi èske
|
||||||
|
|
||||||
|
fò fòk
|
||||||
|
|
||||||
|
gen genyen
|
||||||
|
|
||||||
|
ki kisa kilès kote koukou konsa konbyen konn konnen kounye kouman
|
||||||
|
|
||||||
|
la l laa le lè li lye lò
|
||||||
|
|
||||||
|
m m' mwen
|
||||||
|
|
||||||
|
nan nap nou n'
|
||||||
|
|
||||||
|
ou oumenm
|
||||||
|
|
||||||
|
pa paske pami pandan pito pou pral preske pwiske
|
||||||
|
|
||||||
|
se selman si sou sòt
|
||||||
|
|
||||||
|
ta tap tankou te toujou tou tan tout toutotan twòp tèl
|
||||||
|
|
||||||
|
w w' wi wè
|
||||||
|
|
||||||
|
y y' yo yon yonn
|
||||||
|
|
||||||
|
non o oh eh
|
||||||
|
|
||||||
|
sa san si swa si
|
||||||
|
|
||||||
|
men mèsi oswa osinon
|
||||||
|
|
||||||
|
"""
|
||||||
|
.split()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add common contractions, with and without apostrophe variants
|
||||||
|
contractions = ["m'", "n'", "w'", "y'", "l'", "t'", "k'"]
|
||||||
|
for apostrophe in ["'", "’", "‘"]:
|
||||||
|
for word in contractions:
|
||||||
|
STOP_WORDS.add(word.replace("'", apostrophe))
|
spacy/lang/ht/syntax_iterators.py (new file, 74 lines)
@ -0,0 +1,74 @@
|
||||||
|
from typing import Iterator, Tuple, Union
|
||||||
|
|
||||||
|
from ...errors import Errors
|
||||||
|
from ...symbols import NOUN, PRON, PROPN
|
||||||
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
|
"""
|
||||||
|
Detect base noun phrases from a dependency parse for Haitian Creole.
|
||||||
|
Works on both Doc and Span objects.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Core nominal dependencies common in Haitian Creole
|
||||||
|
labels = [
|
||||||
|
"nsubj",
|
||||||
|
"obj",
|
||||||
|
"obl",
|
||||||
|
"nmod",
|
||||||
|
"appos",
|
||||||
|
"ROOT",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Modifiers to optionally include in chunk (to the right)
|
||||||
|
post_modifiers = ["compound", "flat", "flat:name", "fixed"]
|
||||||
|
|
||||||
|
doc = doclike.doc
|
||||||
|
if not doc.has_annotation("DEP"):
|
||||||
|
raise ValueError(Errors.E029)
|
||||||
|
|
||||||
|
np_deps = {doc.vocab.strings.add(label) for label in labels}
|
||||||
|
np_mods = {doc.vocab.strings.add(mod) for mod in post_modifiers}
|
||||||
|
conj_label = doc.vocab.strings.add("conj")
|
||||||
|
np_label = doc.vocab.strings.add("NP")
|
||||||
|
adp_pos = doc.vocab.strings.add("ADP")
|
||||||
|
cc_pos = doc.vocab.strings.add("CCONJ")
|
||||||
|
|
||||||
|
prev_end = -1
|
||||||
|
for i, word in enumerate(doclike):
|
||||||
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
|
continue
|
||||||
|
if word.left_edge.i <= prev_end:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if word.dep in np_deps:
|
||||||
|
right_end = word
|
||||||
|
# expand to include known modifiers to the right
|
||||||
|
for child in word.rights:
|
||||||
|
if child.dep in np_mods:
|
||||||
|
right_end = child.right_edge
|
||||||
|
elif child.pos == NOUN:
|
||||||
|
right_end = child.right_edge
|
||||||
|
|
||||||
|
left_index = word.left_edge.i
|
||||||
|
# Skip prepositions at the start
|
||||||
|
if word.left_edge.pos == adp_pos:
|
||||||
|
left_index += 1
|
||||||
|
|
||||||
|
prev_end = right_end.i
|
||||||
|
yield left_index, right_end.i + 1, np_label
|
||||||
|
|
||||||
|
elif word.dep == conj_label:
|
||||||
|
head = word.head
|
||||||
|
while head.dep == conj_label and head.head.i < head.i:
|
||||||
|
head = head.head
|
||||||
|
if head.dep in np_deps:
|
||||||
|
left_index = word.left_edge.i
|
||||||
|
if word.left_edge.pos == cc_pos:
|
||||||
|
left_index += 1
|
||||||
|
prev_end = word.i
|
||||||
|
yield left_index, word.i + 1, np_label
|
||||||
|
|
||||||
|
|
||||||
|
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
spacy/lang/ht/tag_map.py (new file, 21 lines)
@ -0,0 +1,21 @@
|
||||||
|
from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
|
||||||
|
|
||||||
|
TAG_MAP = {
|
||||||
|
"NOUN": {"pos": NOUN},
|
||||||
|
"VERB": {"pos": VERB},
|
||||||
|
"AUX": {"pos": AUX},
|
||||||
|
"ADJ": {"pos": ADJ},
|
||||||
|
"ADV": {"pos": ADV},
|
||||||
|
"PRON": {"pos": PRON},
|
||||||
|
"DET": {"pos": DET},
|
||||||
|
"ADP": {"pos": ADP},
|
||||||
|
"SCONJ": {"pos": SCONJ},
|
||||||
|
"CCONJ": {"pos": CCONJ},
|
||||||
|
"PART": {"pos": PART},
|
||||||
|
"INTJ": {"pos": INTJ},
|
||||||
|
"NUM": {"pos": NUM},
|
||||||
|
"PROPN": {"pos": PROPN},
|
||||||
|
"PUNCT": {"pos": PUNCT},
|
||||||
|
"SYM": {"pos": SYM},
|
||||||
|
"X": {"pos": X},
|
||||||
|
}
|
spacy/lang/ht/tokenizer_exceptions.py (new file, 121 lines)
@ -0,0 +1,121 @@
|
||||||
|
from spacy.symbols import ORTH, NORM
|
||||||
|
|
||||||
|
def make_variants(base, first_norm, second_orth, second_norm):
|
||||||
|
return {
|
||||||
|
base: [
|
||||||
|
{ORTH: base.split("'")[0] + "'", NORM: first_norm},
|
||||||
|
{ORTH: second_orth, NORM: second_norm},
|
||||||
|
],
|
||||||
|
base.capitalize(): [
|
||||||
|
{ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()},
|
||||||
|
{ORTH: second_orth, NORM: second_norm},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
TOKENIZER_EXCEPTIONS = {
|
||||||
|
"Dr.": [{ORTH: "Dr."}]
|
||||||
|
}
|
||||||
|
|
||||||
|
# Apostrophe forms
|
||||||
|
TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
|
||||||
|
TOKENIZER_EXCEPTIONS.update(make_variants("n'ap", "nou", "ap", "ap"))
|
||||||
|
TOKENIZER_EXCEPTIONS.update(make_variants("l'ap", "li", "ap", "ap"))
|
||||||
|
TOKENIZER_EXCEPTIONS.update(make_variants("y'ap", "yo", "ap", "ap"))
|
||||||
|
TOKENIZER_EXCEPTIONS.update(make_variants("m'te", "mwen", "te", "te"))
|
||||||
|
TOKENIZER_EXCEPTIONS.update(make_variants("m'pral", "mwen", "pral", "pral"))
|
||||||
|
TOKENIZER_EXCEPTIONS.update(make_variants("w'ap", "ou", "ap", "ap"))
|
||||||
|
TOKENIZER_EXCEPTIONS.update(make_variants("k'ap", "ki", "ap", "ap"))
|
||||||
|
TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
|
||||||
|
TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))
|
||||||
|
|
||||||
|
# Non-apostrophe contractions (with capitalized variants)
|
||||||
|
TOKENIZER_EXCEPTIONS.update({
|
||||||
|
"map": [
|
||||||
|
{ORTH: "m", NORM: "mwen"},
|
||||||
|
{ORTH: "ap", NORM: "ap"},
|
||||||
|
],
|
||||||
|
"Map": [
|
||||||
|
{ORTH: "M", NORM: "Mwen"},
|
||||||
|
{ORTH: "ap", NORM: "ap"},
|
||||||
|
],
|
||||||
|
"lem": [
|
||||||
|
{ORTH: "le", NORM: "le"},
|
||||||
|
{ORTH: "m", NORM: "mwen"},
|
||||||
|
],
|
||||||
|
"Lem": [
|
||||||
|
{ORTH: "Le", NORM: "Le"},
|
||||||
|
{ORTH: "m", NORM: "mwen"},
|
||||||
|
],
|
||||||
|
"lew": [
|
||||||
|
{ORTH: "le", NORM: "le"},
|
||||||
|
{ORTH: "w", NORM: "ou"},
|
||||||
|
],
|
||||||
|
"Lew": [
|
||||||
|
{ORTH: "Le", NORM: "Le"},
|
||||||
|
{ORTH: "w", NORM: "ou"},
|
||||||
|
],
|
||||||
|
"nap": [
|
||||||
|
{ORTH: "n", NORM: "nou"},
|
||||||
|
{ORTH: "ap", NORM: "ap"},
|
||||||
|
],
|
||||||
|
"Nap": [
|
||||||
|
{ORTH: "N", NORM: "Nou"},
|
||||||
|
{ORTH: "ap", NORM: "ap"},
|
||||||
|
],
|
||||||
|
"lap": [
|
||||||
|
{ORTH: "l", NORM: "li"},
|
||||||
|
{ORTH: "ap", NORM: "ap"},
|
||||||
|
],
|
||||||
|
"Lap": [
|
||||||
|
{ORTH: "L", NORM: "Li"},
|
||||||
|
{ORTH: "ap", NORM: "ap"},
|
||||||
|
],
|
||||||
|
"yap": [
|
||||||
|
{ORTH: "y", NORM: "yo"},
|
||||||
|
{ORTH: "ap", NORM: "ap"},
|
||||||
|
],
|
||||||
|
"Yap": [
|
||||||
|
{ORTH: "Y", NORM: "Yo"},
|
||||||
|
{ORTH: "ap", NORM: "ap"},
|
||||||
|
],
|
||||||
|
"mte": [
|
||||||
|
{ORTH: "m", NORM: "mwen"},
|
||||||
|
{ORTH: "te", NORM: "te"},
|
||||||
|
],
|
||||||
|
"Mte": [
|
||||||
|
{ORTH: "M", NORM: "Mwen"},
|
||||||
|
{ORTH: "te", NORM: "te"},
|
||||||
|
],
|
||||||
|
"mpral": [
|
||||||
|
{ORTH: "m", NORM: "mwen"},
|
||||||
|
{ORTH: "pral", NORM: "pral"},
|
||||||
|
],
|
||||||
|
"Mpral": [
|
||||||
|
{ORTH: "M", NORM: "Mwen"},
|
||||||
|
{ORTH: "pral", NORM: "pral"},
|
||||||
|
],
|
||||||
|
"wap": [
|
||||||
|
{ORTH: "w", NORM: "ou"},
|
||||||
|
{ORTH: "ap", NORM: "ap"},
|
||||||
|
],
|
||||||
|
"Wap": [
|
||||||
|
{ORTH: "W", NORM: "Ou"},
|
||||||
|
{ORTH: "ap", NORM: "ap"},
|
||||||
|
],
|
||||||
|
"kap": [
|
||||||
|
{ORTH: "k", NORM: "ki"},
|
||||||
|
{ORTH: "ap", NORM: "ap"},
|
||||||
|
],
|
||||||
|
"Kap": [
|
||||||
|
{ORTH: "K", NORM: "Ki"},
|
||||||
|
{ORTH: "ap", NORM: "ap"},
|
||||||
|
],
|
||||||
|
"tap": [
|
||||||
|
{ORTH: "t", NORM: "te"},
|
||||||
|
{ORTH: "ap", NORM: "ap"},
|
||||||
|
],
|
||||||
|
"Tap": [
|
||||||
|
{ORTH: "T", NORM: "Te"},
|
||||||
|
{ORTH: "ap", NORM: "ap"},
|
||||||
|
],
|
||||||
|
})
|
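The Haitian Creole exceptions above split contracted pronoun + marker forms and attach NORM values; a minimal sketch of what that looks like at runtime (assuming "ht" is registered by this branch):

    import spacy

    nlp = spacy.blank("ht")
    doc = nlp("m'ap travay jodi a")
    print([(t.text, t.norm_) for t in doc])
    # per the exceptions above, "m'ap" is expected to come out as
    # ("m'", "mwen") followed by ("ap", "ap")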
@@ -32,7 +32,6 @@ split_mode = null
 """


-@registry.tokenizers("spacy.ja.JapaneseTokenizer")
 def create_tokenizer(split_mode: Optional[str] = None):
     def japanese_tokenizer_factory(nlp):
         return JapaneseTokenizer(nlp.vocab, split_mode=split_mode)
spacy/lang/kmr/__init__.py (new file, 16 lines)
@ -0,0 +1,16 @@
|
||||||
|
from ...language import BaseDefaults, Language
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
class KurmanjiDefaults(BaseDefaults):
|
||||||
|
stop_words = STOP_WORDS
|
||||||
|
lex_attr_getters = LEX_ATTRS
|
||||||
|
|
||||||
|
|
||||||
|
class Kurmanji(Language):
|
||||||
|
lang = "kmr"
|
||||||
|
Defaults = KurmanjiDefaults
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Kurmanji"]
|
spacy/lang/kmr/examples.py (new file, 17 lines)
@ -0,0 +1,17 @@
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.kmr.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
sentences = [
|
||||||
|
"Berê mirovan her tim li geşedana pêşerojê ye", # People's gaze is always on the development of the future
|
||||||
|
"Kawa Nemir di 14 salan de Ulysses wergerand Kurmancî.", # Kawa Nemir translated Ulysses into Kurmanji in 14 years.
|
||||||
|
"Mem Ararat hunermendekî Kurd yê bi nav û deng e.", # Mem Ararat is a famous Kurdish artist
|
||||||
|
"Firat Cewerî 40 sal e pirtûkên Kurdî dinivîsîne.", # Firat Ceweri has been writing Kurdish books for 40 years
|
||||||
|
"Rojnamegerê ciwan nûçeyeke balkêş li ser rewşa aborî nivîsand", # The young journalist wrote an interesting news article about the economic situation
|
||||||
|
"Sektora çandiniyê beşeke giring a belavkirina gaza serayê li seranserê cîhanê pêk tîne", # The agricultural sector constitutes an important part of greenhouse gas emissions worldwide
|
||||||
|
"Xwendekarên jêhatî di pêşbaziya matematîkê de serkeftî bûn", # Talented students succeeded in the mathematics competition
|
||||||
|
"Ji ber ji tunebûnê bavê min xwişkeke min nedan xwendin ew ji min re bû derd û kulek.", # Because of poverty, my father didn't send my sister to school, which became a pain and sorrow for me
|
||||||
|
]
|
spacy/lang/kmr/lex_attrs.py (new file, 138 lines)
@ -0,0 +1,138 @@
|
||||||
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
_num_words = [
|
||||||
|
"sifir",
|
||||||
|
"yek",
|
||||||
|
"du",
|
||||||
|
"sê",
|
||||||
|
"çar",
|
||||||
|
"pênc",
|
||||||
|
"şeş",
|
||||||
|
"heft",
|
||||||
|
"heşt",
|
||||||
|
"neh",
|
||||||
|
"deh",
|
||||||
|
"yazde",
|
||||||
|
"dazde",
|
||||||
|
"sêzde",
|
||||||
|
"çarde",
|
||||||
|
"pazde",
|
||||||
|
"şazde",
|
||||||
|
"hevde",
|
||||||
|
"hejde",
|
||||||
|
"nozde",
|
||||||
|
"bîst",
|
||||||
|
"sî",
|
||||||
|
"çil",
|
||||||
|
"pêncî",
|
||||||
|
"şêst",
|
||||||
|
"heftê",
|
||||||
|
"heştê",
|
||||||
|
"nod",
|
||||||
|
"sed",
|
||||||
|
"hezar",
|
||||||
|
"milyon",
|
||||||
|
"milyar",
|
||||||
|
]
|
||||||
|
|
||||||
|
_ordinal_words = [
|
||||||
|
"yekem",
|
||||||
|
"yekemîn",
|
||||||
|
"duyem",
|
||||||
|
"duyemîn",
|
||||||
|
"sêyem",
|
||||||
|
"sêyemîn",
|
||||||
|
"çarem",
|
||||||
|
"çaremîn",
|
||||||
|
"pêncem",
|
||||||
|
"pêncemîn",
|
||||||
|
"şeşem",
|
||||||
|
"şeşemîn",
|
||||||
|
"heftem",
|
||||||
|
"heftemîn",
|
||||||
|
"heştem",
|
||||||
|
"heştemîn",
|
||||||
|
"nehem",
|
||||||
|
"nehemîn",
|
||||||
|
"dehem",
|
||||||
|
"dehemîn",
|
||||||
|
"yazdehem",
|
||||||
|
"yazdehemîn",
|
||||||
|
"dazdehem",
|
||||||
|
"dazdehemîn",
|
||||||
|
"sêzdehem",
|
||||||
|
"sêzdehemîn",
|
||||||
|
"çardehem",
|
||||||
|
"çardehemîn",
|
||||||
|
"pazdehem",
|
||||||
|
"pazdehemîn",
|
||||||
|
"şanzdehem",
|
||||||
|
"şanzdehemîn",
|
||||||
|
"hevdehem",
|
||||||
|
"hevdehemîn",
|
||||||
|
"hejdehem",
|
||||||
|
"hejdehemîn",
|
||||||
|
"nozdehem",
|
||||||
|
"nozdehemîn",
|
||||||
|
"bîstem",
|
||||||
|
"bîstemîn",
|
||||||
|
"sîyem",
|
||||||
|
"sîyemîn",
|
||||||
|
"çilem",
|
||||||
|
"çilemîn",
|
||||||
|
"pêncîyem",
|
||||||
|
"pênciyemîn",
|
||||||
|
"şêstem",
|
||||||
|
"şêstemîn",
|
||||||
|
"heftêyem",
|
||||||
|
"heftêyemîn",
|
||||||
|
"heştêyem",
|
||||||
|
"heştêyemîn",
|
||||||
|
"notem",
|
||||||
|
"notemîn",
|
||||||
|
"sedem",
|
||||||
|
"sedemîn",
|
||||||
|
"hezarem",
|
||||||
|
"hezaremîn",
|
||||||
|
"milyonem",
|
||||||
|
"milyonemîn",
|
||||||
|
"milyarem",
|
||||||
|
"milyaremîn",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def like_num(text):
|
||||||
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
|
text = text[1:]
|
||||||
|
text = text.replace(",", "").replace(".", "")
|
||||||
|
if text.isdigit():
|
||||||
|
return True
|
||||||
|
if text.count("/") == 1:
|
||||||
|
num, denom = text.split("/")
|
||||||
|
if num.isdigit() and denom.isdigit():
|
||||||
|
return True
|
||||||
|
text_lower = text.lower()
|
||||||
|
if text_lower in _num_words:
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Check ordinal number
|
||||||
|
if text_lower in _ordinal_words:
|
||||||
|
return True
|
||||||
|
|
||||||
|
if is_digit(text_lower):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def is_digit(text):
|
||||||
|
endings = ("em", "yem", "emîn", "yemîn")
|
||||||
|
for ending in endings:
|
||||||
|
to = len(ending)
|
||||||
|
if text.endswith(ending) and text[:-to].isdigit():
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
LEX_ATTRS = {LIKE_NUM: like_num}
|
spacy/lang/kmr/stop_words.py (new file, 44 lines)
@ -0,0 +1,44 @@
|
||||||
|
STOP_WORDS = set(
|
||||||
|
"""
|
||||||
|
û
|
||||||
|
li
|
||||||
|
bi
|
||||||
|
di
|
||||||
|
da
|
||||||
|
de
|
||||||
|
ji
|
||||||
|
ku
|
||||||
|
ew
|
||||||
|
ez
|
||||||
|
tu
|
||||||
|
em
|
||||||
|
hûn
|
||||||
|
ew
|
||||||
|
ev
|
||||||
|
min
|
||||||
|
te
|
||||||
|
wî
|
||||||
|
wê
|
||||||
|
me
|
||||||
|
we
|
||||||
|
wan
|
||||||
|
vê
|
||||||
|
vî
|
||||||
|
va
|
||||||
|
çi
|
||||||
|
kî
|
||||||
|
kê
|
||||||
|
çawa
|
||||||
|
çima
|
||||||
|
kengî
|
||||||
|
li ku
|
||||||
|
çend
|
||||||
|
çiqas
|
||||||
|
her
|
||||||
|
hin
|
||||||
|
gelek
|
||||||
|
hemû
|
||||||
|
kes
|
||||||
|
tişt
|
||||||
|
""".split()
|
||||||
|
)
|
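The Kurmanji `like_num` above recognises list words as well as digit-plus-suffix ordinals; two direct checks that simply restate the data already shown (importing from this branch's new module):

    from spacy.lang.kmr.lex_attrs import like_num

    assert like_num("bîst")     # cardinal from _num_words
    assert like_num("sêyemîn")  # ordinal from _ordinal_words
    assert like_num("19em")     # digits + ordinal ending, via is_digit()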
@@ -20,7 +20,6 @@ DEFAULT_CONFIG = """
 """


-@registry.tokenizers("spacy.ko.KoreanTokenizer")
 def create_tokenizer():
     def korean_tokenizer_factory(nlp):
         return KoreanTokenizer(nlp.vocab)
@@ -24,12 +24,6 @@ class MacedonianDefaults(BaseDefaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS

-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return MacedonianLemmatizer(lookups)
-

 class Macedonian(Language):
     lang = "mk"
spacy/lang/nn/__init__.py (new file, 20 lines)
@ -0,0 +1,20 @@
|
||||||
|
from ...language import BaseDefaults, Language
|
||||||
|
from ..nb import SYNTAX_ITERATORS
|
||||||
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
|
||||||
|
|
||||||
|
class NorwegianNynorskDefaults(BaseDefaults):
|
||||||
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
|
prefixes = TOKENIZER_PREFIXES
|
||||||
|
infixes = TOKENIZER_INFIXES
|
||||||
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
|
|
||||||
|
|
||||||
|
class NorwegianNynorsk(Language):
|
||||||
|
lang = "nn"
|
||||||
|
Defaults = NorwegianNynorskDefaults
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["NorwegianNynorsk"]
|
spacy/lang/nn/examples.py (new file, 15 lines)
@ -0,0 +1,15 @@
|
||||||
|
"""
|
||||||
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
>>> from spacy.lang.nn.examples import sentences
|
||||||
|
>>> docs = nlp.pipe(sentences)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# sentences taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/)
|
||||||
|
sentences = [
|
||||||
|
"Konseptet går ut på at alle tre omgangar tel, alle hopparar må stille i kvalifiseringa og poengsummen skal telje.",
|
||||||
|
"Det er ein meir enn i same periode i fjor.",
|
||||||
|
"Det har lava ned enorme snømengder i store delar av Europa den siste tida.",
|
||||||
|
"Akhtar Chaudhry er ikkje innstilt på Oslo-lista til SV, men utfordrar Heikki Holmås om førsteplassen.",
|
||||||
|
]
|
spacy/lang/nn/punctuation.py (new file, 74 lines)
@ -0,0 +1,74 @@
|
||||||
|
from ..char_classes import (
|
||||||
|
ALPHA,
|
||||||
|
ALPHA_LOWER,
|
||||||
|
ALPHA_UPPER,
|
||||||
|
CONCAT_QUOTES,
|
||||||
|
CURRENCY,
|
||||||
|
LIST_CURRENCY,
|
||||||
|
LIST_ELLIPSES,
|
||||||
|
LIST_ICONS,
|
||||||
|
LIST_PUNCT,
|
||||||
|
LIST_QUOTES,
|
||||||
|
PUNCT,
|
||||||
|
UNITS,
|
||||||
|
)
|
||||||
|
from ..punctuation import TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
|
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||||
|
_list_punct = [x for x in LIST_PUNCT if x != "#"]
|
||||||
|
_list_icons = [x for x in LIST_ICONS if x != "°"]
|
||||||
|
_list_icons = [x.replace("\\u00B0", "") for x in _list_icons]
|
||||||
|
_list_quotes = [x for x in LIST_QUOTES if x != "\\'"]
|
||||||
|
|
||||||
|
|
||||||
|
_prefixes = (
|
||||||
|
["§", "%", "=", "—", "–", r"\+(?![0-9])"]
|
||||||
|
+ _list_punct
|
||||||
|
+ LIST_ELLIPSES
|
||||||
|
+ LIST_QUOTES
|
||||||
|
+ LIST_CURRENCY
|
||||||
|
+ LIST_ICONS
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
_infixes = (
|
||||||
|
LIST_ELLIPSES
|
||||||
|
+ _list_icons
|
||||||
|
+ [
|
||||||
|
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
||||||
|
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
|
||||||
|
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
_suffixes = (
|
||||||
|
LIST_PUNCT
|
||||||
|
+ LIST_ELLIPSES
|
||||||
|
+ _list_quotes
|
||||||
|
+ _list_icons
|
||||||
|
+ ["—", "–"]
|
||||||
|
+ [
|
||||||
|
r"(?<=[0-9])\+",
|
||||||
|
r"(?<=°[FfCcKk])\.",
|
||||||
|
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
|
||||||
|
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
||||||
|
r"(?<=[{al}{e}{p}(?:{q})])\.".format(
|
||||||
|
al=ALPHA_LOWER, e=r"%²\-\+", q=_quotes, p=PUNCT
|
||||||
|
),
|
||||||
|
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
|
||||||
|
]
|
||||||
|
+ [r"(?<=[^sSxXzZ])'"]
|
||||||
|
)
|
||||||
|
_suffixes += [
|
||||||
|
suffix
|
||||||
|
for suffix in TOKENIZER_SUFFIXES
|
||||||
|
if suffix not in ["'s", "'S", "’s", "’S", r"\'"]
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
TOKENIZER_PREFIXES = _prefixes
|
||||||
|
TOKENIZER_INFIXES = _infixes
|
||||||
|
TOKENIZER_SUFFIXES = _suffixes
|
spacy/lang/nn/tokenizer_exceptions.py (new file, 228 lines)
@ -0,0 +1,228 @@
|
||||||
|
from ...symbols import NORM, ORTH
|
||||||
|
from ...util import update_exc
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
|
||||||
|
_exc = {}
|
||||||
|
|
||||||
|
|
||||||
|
for exc_data in [
|
||||||
|
{ORTH: "jan.", NORM: "januar"},
|
||||||
|
{ORTH: "feb.", NORM: "februar"},
|
||||||
|
{ORTH: "mar.", NORM: "mars"},
|
||||||
|
{ORTH: "apr.", NORM: "april"},
|
||||||
|
{ORTH: "jun.", NORM: "juni"},
|
||||||
|
# note: "jul." is in the simple list below without a NORM exception
|
||||||
|
{ORTH: "aug.", NORM: "august"},
|
||||||
|
{ORTH: "sep.", NORM: "september"},
|
||||||
|
{ORTH: "okt.", NORM: "oktober"},
|
||||||
|
{ORTH: "nov.", NORM: "november"},
|
||||||
|
{ORTH: "des.", NORM: "desember"},
|
||||||
|
]:
|
||||||
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
|
|
||||||
|
for orth in [
|
||||||
|
"Ap.",
|
||||||
|
"Aq.",
|
||||||
|
"Ca.",
|
||||||
|
"Chr.",
|
||||||
|
"Co.",
|
||||||
|
"Dr.",
|
||||||
|
"F.eks.",
|
||||||
|
"Fr.p.",
|
||||||
|
"Frp.",
|
||||||
|
"Grl.",
|
||||||
|
"Kr.",
|
||||||
|
"Kr.F.",
|
||||||
|
"Kr.F.s",
|
||||||
|
"Mr.",
|
||||||
|
"Mrs.",
|
||||||
|
"Pb.",
|
||||||
|
"Pr.",
|
||||||
|
"Sp.",
|
||||||
|
"St.",
|
||||||
|
"a.m.",
|
||||||
|
"ad.",
|
||||||
|
"adm.dir.",
|
||||||
|
"adr.",
|
||||||
|
"b.c.",
|
||||||
|
"bl.a.",
|
||||||
|
"bla.",
|
||||||
|
"bm.",
|
||||||
|
"bnr.",
|
||||||
|
"bto.",
|
||||||
|
"c.c.",
|
||||||
|
"ca.",
|
||||||
|
"cand.mag.",
|
||||||
|
"co.",
|
||||||
|
"d.d.",
|
||||||
|
"d.m.",
|
||||||
|
"d.y.",
|
||||||
|
"dept.",
|
||||||
|
"dr.",
|
||||||
|
"dr.med.",
|
||||||
|
"dr.philos.",
|
||||||
|
"dr.psychol.",
|
||||||
|
"dss.",
|
||||||
|
"dvs.",
|
||||||
|
"e.Kr.",
|
||||||
|
"e.l.",
|
||||||
|
"eg.",
|
||||||
|
"eig.",
|
||||||
|
"ekskl.",
|
||||||
|
"el.",
|
||||||
|
"et.",
|
||||||
|
"etc.",
|
||||||
|
"etg.",
|
||||||
|
"ev.",
|
||||||
|
"evt.",
|
||||||
|
"f.",
|
||||||
|
"f.Kr.",
|
||||||
|
"f.eks.",
|
||||||
|
"f.o.m.",
|
||||||
|
"fhv.",
|
||||||
|
"fk.",
|
||||||
|
"foreg.",
|
||||||
|
"fork.",
|
||||||
|
"fv.",
|
||||||
|
"fvt.",
|
||||||
|
"g.",
|
||||||
|
"gl.",
|
||||||
|
"gno.",
|
||||||
|
"gnr.",
|
||||||
|
"grl.",
|
||||||
|
"gt.",
|
||||||
|
"h.r.adv.",
|
||||||
|
"hhv.",
|
||||||
|
"hoh.",
|
||||||
|
"hr.",
|
||||||
|
"ifb.",
|
||||||
|
"ifm.",
|
||||||
|
"iht.",
|
||||||
|
"inkl.",
|
||||||
|
"istf.",
|
||||||
|
"jf.",
|
||||||
|
"jr.",
|
||||||
|
"jul.",
|
||||||
|
"juris.",
|
||||||
|
"kfr.",
|
||||||
|
"kgl.",
|
||||||
|
"kgl.res.",
|
||||||
|
"kl.",
|
||||||
|
"komm.",
|
||||||
|
"kr.",
|
||||||
|
"kst.",
|
||||||
|
"lat.",
|
||||||
|
"lø.",
|
||||||
|
"m.a.",
|
||||||
|
"m.a.o.",
|
||||||
|
"m.fl.",
|
||||||
|
"m.m.",
|
||||||
|
"m.v.",
|
||||||
|
"ma.",
|
||||||
|
"mag.art.",
|
||||||
|
"md.",
|
||||||
|
"mfl.",
|
||||||
|
"mht.",
|
||||||
|
"mill.",
|
||||||
|
"min.",
|
||||||
|
"mnd.",
|
||||||
|
"moh.",
|
||||||
|
"mrd.",
|
||||||
|
"muh.",
|
||||||
|
"mv.",
|
||||||
|
"mva.",
|
||||||
|
"n.å.",
|
||||||
|
"ndf.",
|
||||||
|
"nr.",
|
||||||
|
"nto.",
|
||||||
|
"nyno.",
|
||||||
|
"o.a.",
|
||||||
|
"o.l.",
|
||||||
|
"obl.",
|
||||||
|
"off.",
|
||||||
|
"ofl.",
|
||||||
|
"on.",
|
||||||
|
"op.",
|
||||||
|
"org.",
|
||||||
|
"osv.",
|
||||||
|
"ovf.",
|
||||||
|
"p.",
|
||||||
|
"p.a.",
|
||||||
|
"p.g.a.",
|
||||||
|
"p.m.",
|
||||||
|
"p.t.",
|
||||||
|
"pga.",
|
||||||
|
"ph.d.",
|
||||||
|
"pkt.",
|
||||||
|
"pr.",
|
||||||
|
"pst.",
|
||||||
|
"pt.",
|
||||||
|
"red.anm.",
|
||||||
|
"ref.",
|
||||||
|
"res.",
|
||||||
|
"res.kap.",
|
||||||
|
"resp.",
|
||||||
|
"rv.",
|
||||||
|
"s.",
|
||||||
|
"s.d.",
|
||||||
|
"s.k.",
|
||||||
|
"s.u.",
|
||||||
|
"s.å.",
|
||||||
|
"sen.",
|
||||||
|
"sep.",
|
||||||
|
"siviling.",
|
||||||
|
"sms.",
|
||||||
|
"snr.",
|
||||||
|
"spm.",
|
||||||
|
"sr.",
|
||||||
|
"sst.",
|
||||||
|
"st.",
|
||||||
|
"st.meld.",
|
||||||
|
"st.prp.",
|
||||||
|
"stip.",
|
||||||
|
"stk.",
|
||||||
|
"stud.",
|
||||||
|
"sv.",
|
||||||
|
"såk.",
|
||||||
|
"sø.",
|
||||||
|
"t.d.",
|
||||||
|
"t.h.",
|
||||||
|
"t.o.m.",
|
||||||
|
"t.v.",
|
||||||
|
"temp.",
|
||||||
|
"ti.",
|
||||||
|
"tils.",
|
||||||
|
"tilsv.",
|
||||||
|
"tl;dr",
|
||||||
|
"tlf.",
|
||||||
|
"to.",
|
||||||
|
"ult.",
|
||||||
|
"utg.",
|
||||||
|
"v.",
|
||||||
|
"vedk.",
|
||||||
|
"vedr.",
|
||||||
|
"vg.",
|
||||||
|
"vgs.",
|
||||||
|
"vha.",
|
||||||
|
"vit.ass.",
|
||||||
|
"vn.",
|
||||||
|
"vol.",
|
||||||
|
"vs.",
|
||||||
|
"vsa.",
|
||||||
|
"§§",
|
||||||
|
"©NTB",
|
||||||
|
"årg.",
|
||||||
|
"årh.",
|
||||||
|
]:
|
||||||
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
# Dates
|
||||||
|
for h in range(1, 31 + 1):
|
||||||
|
for period in ["."]:
|
||||||
|
_exc[f"{h}{period}"] = [{ORTH: f"{h}."}]
|
||||||
|
|
||||||
|
_custom_base_exc = {"i.": [{ORTH: "i", NORM: "i"}, {ORTH: "."}]}
|
||||||
|
_exc.update(_custom_base_exc)
|
||||||
|
|
||||||
|
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
|
@ -13,7 +13,6 @@ DEFAULT_CONFIG = """
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
@registry.tokenizers("spacy.th.ThaiTokenizer")
|
|
||||||
def create_thai_tokenizer():
|
def create_thai_tokenizer():
|
||||||
def thai_tokenizer_factory(nlp):
|
def thai_tokenizer_factory(nlp):
|
||||||
return ThaiTokenizer(nlp.vocab)
|
return ThaiTokenizer(nlp.vocab)
|
||||||
|
|
|
@ -15,4 +15,7 @@ sentences = [
|
||||||
"Türkiye'nin başkenti neresi?",
|
"Türkiye'nin başkenti neresi?",
|
||||||
"Bakanlar Kurulu 180 günlük eylem planını açıkladı.",
|
"Bakanlar Kurulu 180 günlük eylem planını açıkladı.",
|
||||||
"Merkez Bankası, beklentiler doğrultusunda faizlerde değişikliğe gitmedi.",
|
"Merkez Bankası, beklentiler doğrultusunda faizlerde değişikliğe gitmedi.",
|
||||||
|
"Cemal Sureya kimdir?",
|
||||||
|
"Bunlari Biliyor muydunuz?",
|
||||||
|
"Altinoluk Turkiye haritasinin neresinde yer alir?",
|
||||||
]
|
]
|
||||||
|
|
|
@ -22,7 +22,6 @@ use_pyvi = true
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
@registry.tokenizers("spacy.vi.VietnameseTokenizer")
|
|
||||||
def create_vietnamese_tokenizer(use_pyvi: bool = True):
|
def create_vietnamese_tokenizer(use_pyvi: bool = True):
|
||||||
def vietnamese_tokenizer_factory(nlp):
|
def vietnamese_tokenizer_factory(nlp):
|
||||||
return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi)
|
return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi)
|
||||||
|
|
|
@ -46,7 +46,6 @@ class Segmenter(str, Enum):
|
||||||
return list(cls.__members__.keys())
|
return list(cls.__members__.keys())
|
||||||
|
|
||||||
|
|
||||||
@registry.tokenizers("spacy.zh.ChineseTokenizer")
|
|
||||||
def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
|
def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
|
||||||
def chinese_tokenizer_factory(nlp):
|
def chinese_tokenizer_factory(nlp):
|
||||||
return ChineseTokenizer(nlp.vocab, segmenter=segmenter)
|
return ChineseTokenizer(nlp.vocab, segmenter=segmenter)
|
||||||
|
|
|
@ -1,10 +1,11 @@
|
||||||
import functools
|
import functools
|
||||||
|
import inspect
|
||||||
import itertools
|
import itertools
|
||||||
import multiprocessing as mp
|
import multiprocessing as mp
|
||||||
import random
|
import random
|
||||||
import traceback
|
import traceback
|
||||||
import warnings
|
import warnings
|
||||||
from contextlib import contextmanager
|
from contextlib import ExitStack, contextmanager
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from itertools import chain, cycle
|
from itertools import chain, cycle
|
||||||
|
@ -29,8 +30,11 @@ from typing import (
|
||||||
overload,
|
overload,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
import numpy
|
||||||
import srsly
|
import srsly
|
||||||
|
from cymem.cymem import Pool
|
||||||
from thinc.api import Config, CupyOps, Optimizer, get_current_ops
|
from thinc.api import Config, CupyOps, Optimizer, get_current_ops
|
||||||
|
from thinc.util import convert_recursive
|
||||||
|
|
||||||
from . import about, ty, util
|
from . import about, ty, util
|
||||||
from .compat import Literal
|
from .compat import Literal
|
||||||
|
@ -64,6 +68,7 @@ from .util import (
|
||||||
registry,
|
registry,
|
||||||
warn_if_jupyter_cupy,
|
warn_if_jupyter_cupy,
|
||||||
)
|
)
|
||||||
|
from .vectors import BaseVectors
|
||||||
from .vocab import Vocab, create_vocab
|
from .vocab import Vocab, create_vocab
|
||||||
|
|
||||||
PipeCallable = Callable[[Doc], Doc]
|
PipeCallable = Callable[[Doc], Doc]
|
||||||
|
@ -99,7 +104,6 @@ class BaseDefaults:
|
||||||
writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
|
writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
|
||||||
|
|
||||||
|
|
||||||
@registry.tokenizers("spacy.Tokenizer.v1")
|
|
||||||
def create_tokenizer() -> Callable[["Language"], Tokenizer]:
|
def create_tokenizer() -> Callable[["Language"], Tokenizer]:
|
||||||
"""Registered function to create a tokenizer. Returns a factory that takes
|
"""Registered function to create a tokenizer. Returns a factory that takes
|
||||||
the nlp object and returns a Tokenizer instance using the language detaults.
|
the nlp object and returns a Tokenizer instance using the language detaults.
|
||||||
|
@ -125,7 +129,6 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
|
||||||
return tokenizer_factory
|
return tokenizer_factory
|
||||||
|
|
||||||
|
|
||||||
@registry.misc("spacy.LookupsDataLoader.v1")
|
|
||||||
def load_lookups_data(lang, tables):
|
def load_lookups_data(lang, tables):
|
||||||
util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
|
util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
|
||||||
lookups = load_lookups(lang=lang, tables=tables)
|
lookups = load_lookups(lang=lang, tables=tables)
|
||||||
|
@ -138,7 +141,7 @@ class Language:
|
||||||
|
|
||||||
Defaults (class): Settings, data and factory methods for creating the `nlp`
|
Defaults (class): Settings, data and factory methods for creating the `nlp`
|
||||||
object and processing pipeline.
|
object and processing pipeline.
|
||||||
lang (str): IETF language code, such as 'en'.
|
lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language codes, such as 'en' and 'eng'.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language
|
DOCS: https://spacy.io/api/language
|
||||||
"""
|
"""
|
||||||
|
@ -157,6 +160,7 @@ class Language:
|
||||||
max_length: int = 10**6,
|
max_length: int = 10**6,
|
||||||
meta: Dict[str, Any] = {},
|
meta: Dict[str, Any] = {},
|
||||||
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
|
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
|
||||||
|
create_vectors: Optional[Callable[["Vocab"], BaseVectors]] = None,
|
||||||
batch_size: int = 1000,
|
batch_size: int = 1000,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
@ -179,6 +183,9 @@ class Language:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/language#init
|
DOCS: https://spacy.io/api/language#init
|
||||||
"""
|
"""
|
||||||
|
from .pipeline.factories import register_factories
|
||||||
|
|
||||||
|
register_factories()
|
||||||
# We're only calling this to import all factories provided via entry
|
# We're only calling this to import all factories provided via entry
|
||||||
# points. The factory decorator applied to these functions takes care
|
# points. The factory decorator applied to these functions takes care
|
||||||
# of the rest.
|
# of the rest.
|
||||||
|
@ -197,6 +204,10 @@ class Language:
|
||||||
if vocab is True:
|
if vocab is True:
|
||||||
vectors_name = meta.get("vectors", {}).get("name")
|
vectors_name = meta.get("vectors", {}).get("name")
|
||||||
vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
|
vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
|
||||||
|
if not create_vectors:
|
||||||
|
vectors_cfg = {"vectors": self._config["nlp"]["vectors"]}
|
||||||
|
create_vectors = registry.resolve(vectors_cfg)["vectors"]
|
||||||
|
vocab.vectors = create_vectors(vocab)
|
||||||
else:
|
else:
|
||||||
if (self.lang and vocab.lang) and (self.lang != vocab.lang):
|
if (self.lang and vocab.lang) and (self.lang != vocab.lang):
|
||||||
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
|
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
|
||||||
|
@ -1204,7 +1215,7 @@ class Language:
|
||||||
examples,
|
examples,
|
||||||
):
|
):
|
||||||
eg.predicted = doc
|
eg.predicted = doc
|
||||||
return losses
|
return _replace_numpy_floats(losses)
|
||||||
|
|
||||||
def rehearse(
|
def rehearse(
|
||||||
self,
|
self,
|
||||||
|
@ -1455,7 +1466,7 @@ class Language:
|
||||||
results = scorer.score(examples, per_component=per_component)
|
results = scorer.score(examples, per_component=per_component)
|
||||||
n_words = sum(len(eg.predicted) for eg in examples)
|
n_words = sum(len(eg.predicted) for eg in examples)
|
||||||
results["speed"] = n_words / (end_time - start_time)
|
results["speed"] = n_words / (end_time - start_time)
|
||||||
return results
|
return _replace_numpy_floats(results)
|
||||||
|
|
||||||
def create_optimizer(self):
|
def create_optimizer(self):
|
||||||
"""Create an optimizer, usually using the [training.optimizer] config."""
|
"""Create an optimizer, usually using the [training.optimizer] config."""
|
||||||
|
@ -1676,6 +1687,12 @@ class Language:
|
||||||
for proc in procs:
|
for proc in procs:
|
||||||
proc.start()
|
proc.start()
|
||||||
|
|
||||||
|
# Close writing-end of channels. This is needed to avoid that reading
|
||||||
|
# from the channel blocks indefinitely when the worker closes the
|
||||||
|
# channel.
|
||||||
|
for tx in bytedocs_send_ch:
|
||||||
|
tx.close()
|
||||||
|
|
||||||
# Cycle channels not to break the order of docs.
|
# Cycle channels not to break the order of docs.
|
||||||
# The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
|
# The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
|
||||||
byte_tuples = chain.from_iterable(
|
byte_tuples = chain.from_iterable(
|
||||||
|
@ -1698,8 +1715,27 @@ class Language:
|
||||||
# tell `sender` that one batch was consumed.
|
# tell `sender` that one batch was consumed.
|
||||||
sender.step()
|
sender.step()
|
||||||
finally:
|
finally:
|
||||||
|
# If we are stopping in an orderly fashion, the workers' queues
|
||||||
|
# are empty. Put the sentinel in their queues to signal that work
|
||||||
|
# is done, so that they can exit gracefully.
|
||||||
|
for q in texts_q:
|
||||||
|
q.put(_WORK_DONE_SENTINEL)
|
||||||
|
q.close()
|
||||||
|
|
||||||
|
# Otherwise, we are stopping because the error handler raised an
|
||||||
|
# exception. The sentinel will be last to go out of the queue.
|
||||||
|
# To avoid doing unnecessary work or hanging on platforms that
|
||||||
|
# block on sending (Windows), we'll close our end of the channel.
|
||||||
|
# This signals to the worker that it can exit the next time it
|
||||||
|
# attempts to send data down the channel.
|
||||||
|
for r in bytedocs_recv_ch:
|
||||||
|
r.close()
|
||||||
|
|
||||||
for proc in procs:
|
for proc in procs:
|
||||||
proc.terminate()
|
proc.join()
|
||||||
|
|
||||||
|
if not all(proc.exitcode == 0 for proc in procs):
|
||||||
|
warnings.warn(Warnings.W127)
|
||||||
|
|
||||||
def _link_components(self) -> None:
|
def _link_components(self) -> None:
|
||||||
"""Register 'listeners' within pipeline components, to allow them to
|
"""Register 'listeners' within pipeline components, to allow them to
|
||||||
|
@ -1764,6 +1800,10 @@ class Language:
|
||||||
).merge(config)
|
).merge(config)
|
||||||
if "nlp" not in config:
|
if "nlp" not in config:
|
||||||
raise ValueError(Errors.E985.format(config=config))
|
raise ValueError(Errors.E985.format(config=config))
|
||||||
|
# fill in [nlp.vectors] if not present (as a narrower alternative to
|
||||||
|
# auto-filling [nlp] from the default config)
|
||||||
|
if "vectors" not in config["nlp"]:
|
||||||
|
config["nlp"]["vectors"] = {"@vectors": "spacy.Vectors.v1"}
|
||||||
config_lang = config["nlp"].get("lang")
|
config_lang = config["nlp"].get("lang")
|
||||||
if config_lang is not None and config_lang != cls.lang:
|
if config_lang is not None and config_lang != cls.lang:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
@ -1795,6 +1835,7 @@ class Language:
|
||||||
filled["nlp"], validate=validate, schema=ConfigSchemaNlp
|
filled["nlp"], validate=validate, schema=ConfigSchemaNlp
|
||||||
)
|
)
|
||||||
create_tokenizer = resolved_nlp["tokenizer"]
|
create_tokenizer = resolved_nlp["tokenizer"]
|
||||||
|
create_vectors = resolved_nlp["vectors"]
|
||||||
before_creation = resolved_nlp["before_creation"]
|
before_creation = resolved_nlp["before_creation"]
|
||||||
after_creation = resolved_nlp["after_creation"]
|
after_creation = resolved_nlp["after_creation"]
|
||||||
after_pipeline_creation = resolved_nlp["after_pipeline_creation"]
|
after_pipeline_creation = resolved_nlp["after_pipeline_creation"]
|
||||||
|
@ -1815,7 +1856,12 @@ class Language:
|
||||||
# inside stuff like the spacy train function. If we loaded them here,
|
# inside stuff like the spacy train function. If we loaded them here,
|
||||||
# then we would load them twice at runtime: once when we make from config,
|
# then we would load them twice at runtime: once when we make from config,
|
||||||
# and then again when we load from disk.
|
# and then again when we load from disk.
|
||||||
nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta)
|
nlp = lang_cls(
|
||||||
|
vocab=vocab,
|
||||||
|
create_tokenizer=create_tokenizer,
|
||||||
|
create_vectors=create_vectors,
|
||||||
|
meta=meta,
|
||||||
|
)
|
||||||
if after_creation is not None:
|
if after_creation is not None:
|
||||||
nlp = after_creation(nlp)
|
nlp = after_creation(nlp)
|
||||||
if not isinstance(nlp, cls):
|
if not isinstance(nlp, cls):
|
||||||
|
@ -2032,11 +2078,55 @@ class Language:
|
||||||
# Go over the listener layers and replace them
|
# Go over the listener layers and replace them
|
||||||
for listener in pipe_listeners:
|
for listener in pipe_listeners:
|
||||||
new_model = tok2vec_model.copy()
|
new_model = tok2vec_model.copy()
|
||||||
if "replace_listener" in tok2vec_model.attrs:
|
replace_listener_func = tok2vec_model.attrs.get("replace_listener")
|
||||||
new_model = tok2vec_model.attrs["replace_listener"](new_model)
|
if replace_listener_func is not None:
|
||||||
|
# Pass the extra args to the callback without breaking compatibility with
|
||||||
|
# old library versions that only expect a single parameter.
|
||||||
|
num_params = len(
|
||||||
|
inspect.signature(replace_listener_func).parameters
|
||||||
|
)
|
||||||
|
if num_params == 1:
|
||||||
|
new_model = replace_listener_func(new_model)
|
||||||
|
elif num_params == 3:
|
||||||
|
new_model = replace_listener_func(new_model, listener, tok2vec)
|
||||||
|
else:
|
||||||
|
raise ValueError(Errors.E1055.format(num_params=num_params))
|
||||||
|
|
||||||
util.replace_model_node(pipe.model, listener, new_model) # type: ignore[attr-defined]
|
util.replace_model_node(pipe.model, listener, new_model) # type: ignore[attr-defined]
|
||||||
tok2vec.remove_listener(listener, pipe_name)
|
tok2vec.remove_listener(listener, pipe_name)
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
|
||||||
|
"""Begin a block where all resources allocated during the block will
|
||||||
|
be freed at the end of it. If a resources was created within the
|
||||||
|
memory zone block, accessing it outside the block is invalid.
|
||||||
|
Behaviour of this invalid access is undefined. Memory zones should
|
||||||
|
not be nested.
|
||||||
|
|
||||||
|
The memory zone is helpful for services that need to process large
|
||||||
|
volumes of text with a defined memory budget.
|
||||||
|
|
||||||
|
Example
|
||||||
|
-------
|
||||||
|
>>> with nlp.memory_zone():
|
||||||
|
... for doc in nlp.pipe(texts):
|
||||||
|
... process_my_doc(doc)
|
||||||
|
>>> # use_doc(doc) <-- Invalid: doc was allocated in the memory zone
|
||||||
|
"""
|
||||||
|
if mem is None:
|
||||||
|
mem = Pool()
|
||||||
|
# The ExitStack allows programmatic nested context managers.
|
||||||
|
# We don't know how many we need, so it would be awkward to have
|
||||||
|
# them as nested blocks.
|
||||||
|
with ExitStack() as stack:
|
||||||
|
contexts = [stack.enter_context(self.vocab.memory_zone(mem))]
|
||||||
|
if hasattr(self.tokenizer, "memory_zone"):
|
||||||
|
contexts.append(stack.enter_context(self.tokenizer.memory_zone(mem)))
|
||||||
|
for _, pipe in self.pipeline:
|
||||||
|
if hasattr(pipe, "memory_zone"):
|
||||||
|
contexts.append(stack.enter_context(pipe.memory_zone(mem)))
|
||||||
|
yield mem
|
||||||
|
|
||||||
def to_disk(
|
def to_disk(
|
||||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||||
) -> None:
|
) -> None:
|
||||||
|
@ -2054,7 +2144,9 @@ class Language:
|
||||||
serializers["tokenizer"] = lambda p: self.tokenizer.to_disk( # type: ignore[union-attr]
|
serializers["tokenizer"] = lambda p: self.tokenizer.to_disk( # type: ignore[union-attr]
|
||||||
p, exclude=["vocab"]
|
p, exclude=["vocab"]
|
||||||
)
|
)
|
||||||
serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta)
|
serializers["meta.json"] = lambda p: srsly.write_json(
|
||||||
|
p, _replace_numpy_floats(self.meta)
|
||||||
|
)
|
||||||
serializers["config.cfg"] = lambda p: self.config.to_disk(p)
|
serializers["config.cfg"] = lambda p: self.config.to_disk(p)
|
||||||
for name, proc in self._components:
|
for name, proc in self._components:
|
||||||
if name in exclude:
|
if name in exclude:
|
||||||
|
@ -2168,7 +2260,9 @@ class Language:
|
||||||
serializers: Dict[str, Callable[[], bytes]] = {}
|
serializers: Dict[str, Callable[[], bytes]] = {}
|
||||||
serializers["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
|
serializers["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
|
||||||
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"]) # type: ignore[union-attr]
|
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"]) # type: ignore[union-attr]
|
||||||
serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
|
serializers["meta.json"] = lambda: srsly.json_dumps(
|
||||||
|
_replace_numpy_floats(self.meta)
|
||||||
|
)
|
||||||
serializers["config.cfg"] = lambda: self.config.to_bytes()
|
serializers["config.cfg"] = lambda: self.config.to_bytes()
|
||||||
for name, proc in self._components:
|
for name, proc in self._components:
|
||||||
if name in exclude:
|
if name in exclude:
|
||||||
|
@ -2219,6 +2313,12 @@ class Language:
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
def _replace_numpy_floats(meta_dict: dict) -> dict:
|
||||||
|
return convert_recursive(
|
||||||
|
lambda v: isinstance(v, numpy.floating), lambda v: float(v), dict(meta_dict)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class FactoryMeta:
|
class FactoryMeta:
|
||||||
"""Dataclass containing information about a component and its defaults
|
"""Dataclass containing information about a component and its defaults
|
||||||
|
@ -2294,6 +2394,13 @@ def _apply_pipes(
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
texts_with_ctx = receiver.get()
|
texts_with_ctx = receiver.get()
|
||||||
|
|
||||||
|
# Stop working if we encounter the end-of-work sentinel.
|
||||||
|
if isinstance(texts_with_ctx, _WorkDoneSentinel):
|
||||||
|
sender.close()
|
||||||
|
receiver.close()
|
||||||
|
return
|
||||||
|
|
||||||
docs = (
|
docs = (
|
||||||
ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx
|
ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx
|
||||||
)
|
)
|
||||||
|
@ -2302,11 +2409,23 @@ def _apply_pipes(
|
||||||
# Connection does not accept unpickable objects, so send list.
|
# Connection does not accept unpickable objects, so send list.
|
||||||
byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs]
|
byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs]
|
||||||
padding = [(None, None, None)] * (len(texts_with_ctx) - len(byte_docs))
|
padding = [(None, None, None)] * (len(texts_with_ctx) - len(byte_docs))
|
||||||
sender.send(byte_docs + padding) # type: ignore[operator]
|
data: Sequence[Tuple[Optional[bytes], Optional[Any], Optional[bytes]]] = (
|
||||||
|
byte_docs + padding # type: ignore[operator]
|
||||||
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))]
|
error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))]
|
||||||
padding = [(None, None, None)] * (len(texts_with_ctx) - 1)
|
padding = [(None, None, None)] * (len(texts_with_ctx) - 1)
|
||||||
sender.send(error_msg + padding)
|
data = error_msg + padding
|
||||||
|
|
||||||
|
try:
|
||||||
|
sender.send(data)
|
||||||
|
except BrokenPipeError:
|
||||||
|
# Parent has closed the pipe prematurely. This happens when a
|
||||||
|
# worker encounters an error and the error handler is set to
|
||||||
|
# stop processing.
|
||||||
|
sender.close()
|
||||||
|
receiver.close()
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
class _Sender:
|
class _Sender:
|
||||||
|
@ -2336,3 +2455,10 @@ class _Sender:
|
||||||
if self.count >= self.chunk_size:
|
if self.count >= self.chunk_size:
|
||||||
self.count = 0
|
self.count = 0
|
||||||
self.send()
|
self.send()
|
||||||
|
|
||||||
|
|
||||||
|
class _WorkDoneSentinel:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
_WORK_DONE_SENTINEL = _WorkDoneSentinel()
|
||||||
|
|
|
@ -35,7 +35,7 @@ cdef class Lexeme:
|
||||||
return self
|
return self
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
|
cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) noexcept nogil:
|
||||||
if name < (sizeof(flags_t) * 8):
|
if name < (sizeof(flags_t) * 8):
|
||||||
Lexeme.c_set_flag(lex, name, value)
|
Lexeme.c_set_flag(lex, name, value)
|
||||||
elif name == ID:
|
elif name == ID:
|
||||||
|
@ -54,7 +54,7 @@ cdef class Lexeme:
|
||||||
lex.lang = value
|
lex.lang = value
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) noexcept nogil:
|
||||||
if feat_name < (sizeof(flags_t) * 8):
|
if feat_name < (sizeof(flags_t) * 8):
|
||||||
if Lexeme.c_check_flag(lex, feat_name):
|
if Lexeme.c_check_flag(lex, feat_name):
|
||||||
return 1
|
return 1
|
||||||
|
@ -82,7 +82,7 @@ cdef class Lexeme:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline bint c_check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
|
cdef inline bint c_check_flag(const LexemeC* lexeme, attr_id_t flag_id) noexcept nogil:
|
||||||
cdef flags_t one = 1
|
cdef flags_t one = 1
|
||||||
if lexeme.flags & (one << flag_id):
|
if lexeme.flags & (one << flag_id):
|
||||||
return True
|
return True
|
||||||
|
@ -90,7 +90,7 @@ cdef class Lexeme:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline bint c_set_flag(LexemeC* lex, attr_id_t flag_id, bint value) nogil:
|
cdef inline bint c_set_flag(LexemeC* lex, attr_id_t flag_id, bint value) noexcept nogil:
|
||||||
cdef flags_t one = 1
|
cdef flags_t one = 1
|
||||||
if value:
|
if value:
|
||||||
lex.flags |= one << flag_id
|
lex.flags |= one << flag_id
|
||||||
|
|
430
spacy/lexeme.pyx
430
spacy/lexeme.pyx
|
@ -1,4 +1,5 @@
|
||||||
# cython: embedsignature=True
|
# cython: embedsignature=True
|
||||||
|
# cython: profile=False
|
||||||
# Compiler crashes on memory view coercion without this. Should report bug.
|
# Compiler crashes on memory view coercion without this. Should report bug.
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
from libc.string cimport memset
|
from libc.string cimport memset
|
||||||
|
@ -69,7 +70,7 @@ cdef class Lexeme:
|
||||||
if isinstance(other, Lexeme):
|
if isinstance(other, Lexeme):
|
||||||
a = self.orth
|
a = self.orth
|
||||||
b = other.orth
|
b = other.orth
|
||||||
elif isinstance(other, long):
|
elif isinstance(other, int):
|
||||||
a = self.orth
|
a = self.orth
|
||||||
b = other
|
b = other
|
||||||
elif isinstance(other, str):
|
elif isinstance(other, str):
|
||||||
|
@ -103,7 +104,7 @@ cdef class Lexeme:
|
||||||
# skip PROB, e.g. from lexemes.jsonl
|
# skip PROB, e.g. from lexemes.jsonl
|
||||||
if isinstance(value, float):
|
if isinstance(value, float):
|
||||||
continue
|
continue
|
||||||
elif isinstance(value, (int, long)):
|
elif isinstance(value, int):
|
||||||
Lexeme.set_struct_attr(self.c, attr, value)
|
Lexeme.set_struct_attr(self.c, attr, value)
|
||||||
else:
|
else:
|
||||||
Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))
|
Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))
|
||||||
|
@ -163,45 +164,48 @@ cdef class Lexeme:
|
||||||
vector = self.vector
|
vector = self.vector
|
||||||
return numpy.sqrt((vector**2).sum())
|
return numpy.sqrt((vector**2).sum())
|
||||||
|
|
||||||
property vector:
|
@property
|
||||||
|
def vector(self):
|
||||||
"""A real-valued meaning representation.
|
"""A real-valued meaning representation.
|
||||||
|
|
||||||
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
|
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
|
||||||
representing the lexeme's semantics.
|
representing the lexeme's semantics.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
cdef int length = self.vocab.vectors_length
|
||||||
cdef int length = self.vocab.vectors_length
|
if length == 0:
|
||||||
if length == 0:
|
raise ValueError(Errors.E010)
|
||||||
raise ValueError(Errors.E010)
|
return self.vocab.get_vector(self.c.orth)
|
||||||
return self.vocab.get_vector(self.c.orth)
|
|
||||||
|
|
||||||
def __set__(self, vector):
|
@vector.setter
|
||||||
if len(vector) != self.vocab.vectors_length:
|
def vector(self, vector):
|
||||||
raise ValueError(Errors.E073.format(new_length=len(vector),
|
if len(vector) != self.vocab.vectors_length:
|
||||||
length=self.vocab.vectors_length))
|
raise ValueError(Errors.E073.format(new_length=len(vector),
|
||||||
self.vocab.set_vector(self.c.orth, vector)
|
length=self.vocab.vectors_length))
|
||||||
|
self.vocab.set_vector(self.c.orth, vector)
|
||||||
|
|
||||||
property rank:
|
@property
|
||||||
|
def rank(self):
|
||||||
"""RETURNS (str): Sequential ID of the lexeme's lexical type, used
|
"""RETURNS (str): Sequential ID of the lexeme's lexical type, used
|
||||||
to index into tables, e.g. for word vectors."""
|
to index into tables, e.g. for word vectors."""
|
||||||
def __get__(self):
|
return self.c.id
|
||||||
return self.c.id
|
|
||||||
|
|
||||||
def __set__(self, value):
|
@rank.setter
|
||||||
self.c.id = value
|
def rank(self, value):
|
||||||
|
self.c.id = value
|
||||||
|
|
||||||
property sentiment:
|
@property
|
||||||
|
def sentiment(self):
|
||||||
"""RETURNS (float): A scalar value indicating the positivity or
|
"""RETURNS (float): A scalar value indicating the positivity or
|
||||||
negativity of the lexeme."""
|
negativity of the lexeme."""
|
||||||
def __get__(self):
|
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
|
||||||
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
|
return sentiment_table.get(self.c.orth, 0.0)
|
||||||
return sentiment_table.get(self.c.orth, 0.0)
|
|
||||||
|
|
||||||
def __set__(self, float x):
|
@sentiment.setter
|
||||||
if "lexeme_sentiment" not in self.vocab.lookups:
|
def sentiment(self, float x):
|
||||||
self.vocab.lookups.add_table("lexeme_sentiment")
|
if "lexeme_sentiment" not in self.vocab.lookups:
|
||||||
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
|
self.vocab.lookups.add_table("lexeme_sentiment")
|
||||||
sentiment_table[self.c.orth] = x
|
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
|
||||||
|
sentiment_table[self.c.orth] = x
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def orth_(self):
|
def orth_(self):
|
||||||
|
@ -215,306 +219,338 @@ cdef class Lexeme:
|
||||||
"""RETURNS (str): The original verbatim text of the lexeme."""
|
"""RETURNS (str): The original verbatim text of the lexeme."""
|
||||||
return self.orth_
|
return self.orth_
|
||||||
|
|
||||||
property lower:
|
@property
|
||||||
|
def lower(self):
|
||||||
"""RETURNS (uint64): Lowercase form of the lexeme."""
|
"""RETURNS (uint64): Lowercase form of the lexeme."""
|
||||||
def __get__(self):
|
return self.c.lower
|
||||||
return self.c.lower
|
|
||||||
|
|
||||||
def __set__(self, attr_t x):
|
@lower.setter
|
||||||
self.c.lower = x
|
def lower(self, attr_t x):
|
||||||
|
self.c.lower = x
|
||||||
|
|
||||||
property norm:
|
@property
|
||||||
|
def norm(self):
|
||||||
"""RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
|
"""RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
|
||||||
lexeme text.
|
lexeme text.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
return self.c.norm
|
||||||
return self.c.norm
|
|
||||||
|
|
||||||
def __set__(self, attr_t x):
|
@norm.setter
|
||||||
if "lexeme_norm" not in self.vocab.lookups:
|
def norm(self, attr_t x):
|
||||||
self.vocab.lookups.add_table("lexeme_norm")
|
if "lexeme_norm" not in self.vocab.lookups:
|
||||||
norm_table = self.vocab.lookups.get_table("lexeme_norm")
|
self.vocab.lookups.add_table("lexeme_norm")
|
||||||
norm_table[self.c.orth] = self.vocab.strings[x]
|
norm_table = self.vocab.lookups.get_table("lexeme_norm")
|
||||||
self.c.norm = x
|
norm_table[self.c.orth] = self.vocab.strings[x]
|
||||||
|
self.c.norm = x
|
||||||
|
|
||||||
property shape:
|
@property
|
||||||
|
def shape(self):
|
||||||
"""RETURNS (uint64): Transform of the word's string, to show
|
"""RETURNS (uint64): Transform of the word's string, to show
|
||||||
orthographic features.
|
orthographic features.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
return self.c.shape
|
||||||
return self.c.shape
|
|
||||||
|
|
||||||
def __set__(self, attr_t x):
|
@shape.setter
|
||||||
self.c.shape = x
|
def shape(self, attr_t x):
|
||||||
|
self.c.shape = x
|
||||||
|
|
||||||
property prefix:
|
@property
|
||||||
|
def prefix(self):
|
||||||
"""RETURNS (uint64): Length-N substring from the start of the word.
|
"""RETURNS (uint64): Length-N substring from the start of the word.
|
||||||
Defaults to `N=1`.
|
Defaults to `N=1`.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
return self.c.prefix
|
||||||
return self.c.prefix
|
|
||||||
|
|
||||||
def __set__(self, attr_t x):
|
@prefix.setter
|
||||||
self.c.prefix = x
|
def prefix(self, attr_t x):
|
||||||
|
self.c.prefix = x
|
||||||
|
|
||||||
property suffix:
|
@property
|
||||||
|
def suffix(self):
|
||||||
"""RETURNS (uint64): Length-N substring from the end of the word.
|
"""RETURNS (uint64): Length-N substring from the end of the word.
|
||||||
Defaults to `N=3`.
|
Defaults to `N=3`.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
return self.c.suffix
|
||||||
return self.c.suffix
|
|
||||||
|
|
||||||
def __set__(self, attr_t x):
|
@suffix.setter
|
||||||
self.c.suffix = x
|
def suffix(self, attr_t x):
|
||||||
|
self.c.suffix = x
|
||||||
|
|
||||||
property cluster:
|
@property
|
||||||
|
def cluster(self):
|
||||||
"""RETURNS (int): Brown cluster ID."""
|
"""RETURNS (int): Brown cluster ID."""
|
||||||
def __get__(self):
|
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
|
||||||
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
|
return cluster_table.get(self.c.orth, 0)
|
||||||
return cluster_table.get(self.c.orth, 0)
|
|
||||||
|
|
||||||
def __set__(self, int x):
|
@cluster.setter
|
||||||
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
|
def cluster(self, int x):
|
||||||
cluster_table[self.c.orth] = x
|
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
|
||||||
|
cluster_table[self.c.orth] = x
|
||||||
|
|
||||||
property lang:
|
@property
|
||||||
|
def lang(self):
|
||||||
"""RETURNS (uint64): Language of the parent vocabulary."""
|
"""RETURNS (uint64): Language of the parent vocabulary."""
|
||||||
def __get__(self):
|
return self.c.lang
|
||||||
return self.c.lang
|
|
||||||
|
|
||||||
def __set__(self, attr_t x):
|
@lang.setter
|
||||||
self.c.lang = x
|
def lang(self, attr_t x):
|
||||||
|
self.c.lang = x
|
||||||
|
|
||||||
property prob:
|
@property
|
||||||
|
def prob(self):
|
||||||
"""RETURNS (float): Smoothed log probability estimate of the lexeme's
|
"""RETURNS (float): Smoothed log probability estimate of the lexeme's
|
||||||
type."""
|
type."""
|
||||||
def __get__(self):
|
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
|
||||||
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
|
settings_table = self.vocab.lookups.get_table("lexeme_settings", {})
|
||||||
settings_table = self.vocab.lookups.get_table("lexeme_settings", {})
|
default_oov_prob = settings_table.get("oov_prob", -20.0)
|
||||||
default_oov_prob = settings_table.get("oov_prob", -20.0)
|
return prob_table.get(self.c.orth, default_oov_prob)
|
||||||
return prob_table.get(self.c.orth, default_oov_prob)
|
|
||||||
|
|
||||||
def __set__(self, float x):
|
@prob.setter
|
||||||
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
|
def prob(self, float x):
|
||||||
prob_table[self.c.orth] = x
|
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
|
||||||
|
prob_table[self.c.orth] = x
|
||||||
|
|
||||||
property lower_:
|
@property
|
||||||
|
def lower_(self):
|
||||||
"""RETURNS (str): Lowercase form of the word."""
|
"""RETURNS (str): Lowercase form of the word."""
|
||||||
def __get__(self):
|
return self.vocab.strings[self.c.lower]
|
||||||
return self.vocab.strings[self.c.lower]
|
|
||||||
|
|
||||||
def __set__(self, str x):
|
@lower_.setter
|
||||||
self.c.lower = self.vocab.strings.add(x)
|
def lower_(self, str x):
|
||||||
|
self.c.lower = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property norm_:
|
@property
|
||||||
|
def norm_(self):
|
||||||
"""RETURNS (str): The lexeme's norm, i.e. a normalised form of the
|
"""RETURNS (str): The lexeme's norm, i.e. a normalised form of the
|
||||||
lexeme text.
|
lexeme text.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
return self.vocab.strings[self.c.norm]
|
||||||
return self.vocab.strings[self.c.norm]
|
|
||||||
|
|
||||||
def __set__(self, str x):
|
@norm_.setter
|
||||||
self.norm = self.vocab.strings.add(x)
|
def norm_(self, str x):
|
||||||
|
self.norm = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property shape_:
|
@property
|
||||||
|
def shape_(self):
|
||||||
"""RETURNS (str): Transform of the word's string, to show
|
"""RETURNS (str): Transform of the word's string, to show
|
||||||
orthographic features.
|
orthographic features.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
return self.vocab.strings[self.c.shape]
|
||||||
return self.vocab.strings[self.c.shape]
|
|
||||||
|
|
||||||
def __set__(self, str x):
|
@shape_.setter
|
||||||
self.c.shape = self.vocab.strings.add(x)
|
def shape_(self, str x):
|
||||||
|
self.c.shape = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property prefix_:
|
@property
|
||||||
|
def prefix_(self):
|
||||||
"""RETURNS (str): Length-N substring from the start of the word.
|
"""RETURNS (str): Length-N substring from the start of the word.
|
||||||
Defaults to `N=1`.
|
Defaults to `N=1`.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
return self.vocab.strings[self.c.prefix]
|
||||||
return self.vocab.strings[self.c.prefix]
|
|
||||||
|
|
||||||
def __set__(self, str x):
|
@prefix_.setter
|
||||||
self.c.prefix = self.vocab.strings.add(x)
|
def prefix_(self, str x):
|
||||||
|
self.c.prefix = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property suffix_:
|
@property
|
||||||
|
def suffix_(self):
|
||||||
"""RETURNS (str): Length-N substring from the end of the word.
|
"""RETURNS (str): Length-N substring from the end of the word.
|
||||||
Defaults to `N=3`.
|
Defaults to `N=3`.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
return self.vocab.strings[self.c.suffix]
|
||||||
return self.vocab.strings[self.c.suffix]
|
|
||||||
|
|
||||||
def __set__(self, str x):
|
@suffix_.setter
|
||||||
self.c.suffix = self.vocab.strings.add(x)
|
def suffix_(self, str x):
|
||||||
|
self.c.suffix = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property lang_:
|
@property
|
||||||
|
def lang_(self):
|
||||||
"""RETURNS (str): Language of the parent vocabulary."""
|
"""RETURNS (str): Language of the parent vocabulary."""
|
||||||
def __get__(self):
|
return self.vocab.strings[self.c.lang]
|
||||||
return self.vocab.strings[self.c.lang]
|
|
||||||
|
|
||||||
def __set__(self, str x):
|
@lang_.setter
|
||||||
self.c.lang = self.vocab.strings.add(x)
|
def lang_(self, str x):
|
||||||
|
self.c.lang = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property flags:
|
@property
|
||||||
|
def flags(self):
|
||||||
"""RETURNS (uint64): Container of the lexeme's binary flags."""
|
"""RETURNS (uint64): Container of the lexeme's binary flags."""
|
||||||
def __get__(self):
|
return self.c.flags
|
||||||
return self.c.flags
|
|
||||||
|
|
||||||
def __set__(self, flags_t x):
|
@flags.setter
|
||||||
self.c.flags = x
|
def flags(self, flags_t x):
|
||||||
|
self.c.flags = x
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def is_oov(self):
|
def is_oov(self):
|
||||||
"""RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
|
"""RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
|
||||||
return self.orth not in self.vocab.vectors
|
return self.orth not in self.vocab.vectors
|
||||||
|
|
||||||
property is_stop:
|
@property
|
||||||
|
def is_stop(self):
|
||||||
"""RETURNS (bool): Whether the lexeme is a stop word."""
|
"""RETURNS (bool): Whether the lexeme is a stop word."""
|
||||||
def __get__(self):
|
return Lexeme.c_check_flag(self.c, IS_STOP)
|
||||||
return Lexeme.c_check_flag(self.c, IS_STOP)
|
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@is_stop.setter
|
||||||
Lexeme.c_set_flag(self.c, IS_STOP, x)
|
def is_stop(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_STOP, x)
|
||||||
|
|
||||||
property is_alpha:
|
@property
|
||||||
|
def is_alpha(self):
|
||||||
"""RETURNS (bool): Whether the lexeme consists of alphabetic
|
"""RETURNS (bool): Whether the lexeme consists of alphabetic
|
||||||
characters. Equivalent to `lexeme.text.isalpha()`.
|
characters. Equivalent to `lexeme.text.isalpha()`.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
return Lexeme.c_check_flag(self.c, IS_ALPHA)
|
||||||
return Lexeme.c_check_flag(self.c, IS_ALPHA)
|
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@is_alpha.setter
|
||||||
Lexeme.c_set_flag(self.c, IS_ALPHA, x)
|
def is_alpha(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_ALPHA, x)
|
||||||
|
|
||||||
property is_ascii:
|
@property
|
||||||
|
def is_ascii(self):
|
||||||
"""RETURNS (bool): Whether the lexeme consists of ASCII characters.
|
"""RETURNS (bool): Whether the lexeme consists of ASCII characters.
|
||||||
Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`.
|
Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
return Lexeme.c_check_flag(self.c, IS_ASCII)
|
||||||
return Lexeme.c_check_flag(self.c, IS_ASCII)
|
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@is_ascii.setter
|
||||||
Lexeme.c_set_flag(self.c, IS_ASCII, x)
|
def is_ascii(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_ASCII, x)
|
||||||
|
|
||||||
property is_digit:
|
@property
|
||||||
|
def is_digit(self):
|
||||||
"""RETURNS (bool): Whether the lexeme consists of digits. Equivalent
|
"""RETURNS (bool): Whether the lexeme consists of digits. Equivalent
|
||||||
to `lexeme.text.isdigit()`.
|
to `lexeme.text.isdigit()`.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
return Lexeme.c_check_flag(self.c, IS_DIGIT)
|
||||||
return Lexeme.c_check_flag(self.c, IS_DIGIT)
|
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@is_digit.setter
|
||||||
Lexeme.c_set_flag(self.c, IS_DIGIT, x)
|
def is_digit(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_DIGIT, x)
|
||||||
|
|
||||||
property is_lower:
|
@property
|
||||||
|
def is_lower(self):
|
||||||
"""RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
|
"""RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
|
||||||
`lexeme.text.islower()`.
|
`lexeme.text.islower()`.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
return Lexeme.c_check_flag(self.c, IS_LOWER)
|
||||||
return Lexeme.c_check_flag(self.c, IS_LOWER)
|
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@is_lower.setter
|
||||||
Lexeme.c_set_flag(self.c, IS_LOWER, x)
|
def is_lower(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_LOWER, x)
|
||||||
|
|
||||||
property is_upper:
|
@property
|
||||||
|
def is_upper(self):
|
||||||
"""RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
|
"""RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
|
||||||
`lexeme.text.isupper()`.
|
`lexeme.text.isupper()`.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
return Lexeme.c_check_flag(self.c, IS_UPPER)
|
||||||
return Lexeme.c_check_flag(self.c, IS_UPPER)
|
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@is_upper.setter
|
||||||
Lexeme.c_set_flag(self.c, IS_UPPER, x)
|
def is_upper(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_UPPER, x)
|
||||||
|
|
||||||
property is_title:
|
@property
|
||||||
|
def is_title(self):
|
||||||
"""RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
|
"""RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
|
||||||
`lexeme.text.istitle()`.
|
`lexeme.text.istitle()`.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
return Lexeme.c_check_flag(self.c, IS_TITLE)
|
||||||
return Lexeme.c_check_flag(self.c, IS_TITLE)
|
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@is_title.setter
|
||||||
Lexeme.c_set_flag(self.c, IS_TITLE, x)
|
def is_title(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_TITLE, x)
|
||||||
|
|
||||||
property is_punct:
|
@property
|
||||||
|
def is_punct(self):
|
||||||
"""RETURNS (bool): Whether the lexeme is punctuation."""
|
"""RETURNS (bool): Whether the lexeme is punctuation."""
|
||||||
def __get__(self):
|
return Lexeme.c_check_flag(self.c, IS_PUNCT)
|
||||||
return Lexeme.c_check_flag(self.c, IS_PUNCT)
|
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@is_punct.setter
|
||||||
Lexeme.c_set_flag(self.c, IS_PUNCT, x)
|
def is_punct(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_PUNCT, x)
|
||||||
|
|
||||||
property is_space:
|
@property
|
||||||
|
def is_space(self):
|
||||||
"""RETURNS (bool): Whether the lexeme consist of whitespace characters.
|
"""RETURNS (bool): Whether the lexeme consist of whitespace characters.
|
||||||
Equivalent to `lexeme.text.isspace()`.
|
Equivalent to `lexeme.text.isspace()`.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
return Lexeme.c_check_flag(self.c, IS_SPACE)
|
||||||
return Lexeme.c_check_flag(self.c, IS_SPACE)
|
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@is_space.setter
|
||||||
Lexeme.c_set_flag(self.c, IS_SPACE, x)
|
def is_space(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_SPACE, x)
|
||||||
|
|
||||||
property is_bracket:
|
@property
|
||||||
|
def is_bracket(self):
|
||||||
"""RETURNS (bool): Whether the lexeme is a bracket."""
|
"""RETURNS (bool): Whether the lexeme is a bracket."""
|
||||||
def __get__(self):
|
return Lexeme.c_check_flag(self.c, IS_BRACKET)
|
||||||
return Lexeme.c_check_flag(self.c, IS_BRACKET)
|
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@is_bracket.setter
|
||||||
Lexeme.c_set_flag(self.c, IS_BRACKET, x)
|
def is_bracket(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_BRACKET, x)
|
||||||
|
|
||||||
property is_quote:
|
@property
|
||||||
|
def is_quote(self):
|
||||||
"""RETURNS (bool): Whether the lexeme is a quotation mark."""
|
"""RETURNS (bool): Whether the lexeme is a quotation mark."""
|
||||||
def __get__(self):
|
return Lexeme.c_check_flag(self.c, IS_QUOTE)
|
||||||
return Lexeme.c_check_flag(self.c, IS_QUOTE)
|
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@is_quote.setter
|
||||||
Lexeme.c_set_flag(self.c, IS_QUOTE, x)
|
def is_quote(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_QUOTE, x)
|
||||||
|
|
||||||
property is_left_punct:
|
@property
|
||||||
|
def is_left_punct(self):
|
||||||
"""RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
|
"""RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
|
||||||
def __get__(self):
|
return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
|
||||||
return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
|
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@is_left_punct.setter
|
||||||
Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
|
def is_left_punct(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
|
||||||
|
|
||||||
property is_right_punct:
|
@property
|
||||||
|
def is_right_punct(self):
|
||||||
"""RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
|
"""RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
|
||||||
def __get__(self):
|
return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
|
||||||
return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
|
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@is_right_punct.setter
|
||||||
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
|
def is_right_punct(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
|
||||||
|
|
||||||
property is_currency:
|
@property
|
||||||
|
def is_currency(self):
|
||||||
"""RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
|
"""RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
|
||||||
def __get__(self):
|
return Lexeme.c_check_flag(self.c, IS_CURRENCY)
|
||||||
return Lexeme.c_check_flag(self.c, IS_CURRENCY)
|
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@is_currency.setter
|
||||||
Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
|
def is_currency(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
|
||||||
|
|
||||||
property like_url:
|
@property
|
||||||
|
def like_url(self):
|
||||||
"""RETURNS (bool): Whether the lexeme resembles a URL."""
|
"""RETURNS (bool): Whether the lexeme resembles a URL."""
|
||||||
def __get__(self):
|
return Lexeme.c_check_flag(self.c, LIKE_URL)
|
||||||
return Lexeme.c_check_flag(self.c, LIKE_URL)
|
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@like_url.setter
|
||||||
Lexeme.c_set_flag(self.c, LIKE_URL, x)
|
def like_url(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, LIKE_URL, x)
|
||||||
|
|
||||||
property like_num:
|
@property
|
||||||
|
def like_num(self):
|
||||||
"""RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
|
"""RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
|
||||||
"10", "ten", etc.
|
"10", "ten", etc.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
return Lexeme.c_check_flag(self.c, LIKE_NUM)
|
||||||
return Lexeme.c_check_flag(self.c, LIKE_NUM)
|
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@like_num.setter
|
||||||
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
|
def like_num(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
|
||||||
|
|
||||||
property like_email:
|
@property
|
||||||
|
def like_email(self):
|
||||||
"""RETURNS (bool): Whether the lexeme resembles an email address."""
|
"""RETURNS (bool): Whether the lexeme resembles an email address."""
|
||||||
def __get__(self):
|
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
|
||||||
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
|
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@like_email.setter
|
||||||
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
|
def like_email(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
|
||||||
|
|
|
@ -3,4 +3,4 @@ from .levenshtein import levenshtein
|
||||||
from .matcher import Matcher
|
from .matcher import Matcher
|
||||||
from .phrasematcher import PhraseMatcher
|
from .phrasematcher import PhraseMatcher
|
||||||
|
|
||||||
__all__ = ["Matcher", "PhraseMatcher", "DependencyMatcher", "levenshtein"]
|
__all__ = ["DependencyMatcher", "Matcher", "PhraseMatcher", "levenshtein"]
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# cython: infer_types=True, profile=True
|
# cython: infer_types=True
|
||||||
import warnings
|
import warnings
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from itertools import product
|
from itertools import product
|
||||||
|
@ -129,6 +129,7 @@ cdef class DependencyMatcher:
|
||||||
else:
|
else:
|
||||||
required_keys = {"RIGHT_ID", "RIGHT_ATTRS", "REL_OP", "LEFT_ID"}
|
required_keys = {"RIGHT_ID", "RIGHT_ATTRS", "REL_OP", "LEFT_ID"}
|
||||||
relation_keys = set(relation.keys())
|
relation_keys = set(relation.keys())
|
||||||
|
# Identify required keys that have not been specified
|
||||||
missing = required_keys - relation_keys
|
missing = required_keys - relation_keys
|
||||||
if missing:
|
if missing:
|
||||||
missing_txt = ", ".join(list(missing))
|
missing_txt = ", ".join(list(missing))
|
||||||
|
@ -136,6 +137,13 @@ cdef class DependencyMatcher:
|
||||||
required=required_keys,
|
required=required_keys,
|
||||||
missing=missing_txt
|
missing=missing_txt
|
||||||
))
|
))
|
||||||
|
# Identify additional, unsupported keys
|
||||||
|
unsupported = relation_keys - required_keys
|
||||||
|
if unsupported:
|
||||||
|
unsupported_txt = ", ".join(list(unsupported))
|
||||||
|
warnings.warn(Warnings.W126.format(
|
||||||
|
unsupported=unsupported_txt
|
||||||
|
))
|
||||||
if (
|
if (
|
||||||
relation["RIGHT_ID"] in visited_nodes
|
relation["RIGHT_ID"] in visited_nodes
|
||||||
or relation["LEFT_ID"] not in visited_nodes
|
or relation["LEFT_ID"] not in visited_nodes
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# cython: profile=True, binding=True, infer_types=True
|
# cython: binding=True, infer_types=True, language_level=3
|
||||||
from cpython.object cimport PyObject
|
from cpython.object cimport PyObject
|
||||||
from libc.stdint cimport int64_t
|
from libc.stdint cimport int64_t
|
||||||
|
|
||||||
|
@ -27,6 +27,5 @@ cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int =
|
||||||
return levenshtein(input_text, pattern_text, max_edits) <= max_edits
|
return levenshtein(input_text, pattern_text, max_edits) <= max_edits
|
||||||
|
|
||||||
|
|
||||||
@registry.misc("spacy.levenshtein_compare.v1")
|
|
||||||
def make_levenshtein_compare():
|
def make_levenshtein_compare():
|
||||||
return levenshtein_compare
|
return levenshtein_compare
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# cython: binding=True, infer_types=True, profile=True
|
# cython: binding=True, infer_types=True
|
||||||
from typing import Iterable, List
|
from typing import Iterable, List
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
@ -625,7 +625,7 @@ cdef action_t get_action(
|
||||||
const TokenC * token,
|
const TokenC * token,
|
||||||
const attr_t * extra_attrs,
|
const attr_t * extra_attrs,
|
||||||
const int8_t * predicate_matches
|
const int8_t * predicate_matches
|
||||||
) nogil:
|
) noexcept nogil:
|
||||||
"""We need to consider:
|
"""We need to consider:
|
||||||
a) Does the token match the specification? [Yes, No]
|
a) Does the token match the specification? [Yes, No]
|
||||||
b) What's the quantifier? [1, 0+, ?]
|
b) What's the quantifier? [1, 0+, ?]
|
||||||
|
@ -740,7 +740,7 @@ cdef int8_t get_is_match(
|
||||||
const TokenC* token,
|
const TokenC* token,
|
||||||
const attr_t* extra_attrs,
|
const attr_t* extra_attrs,
|
||||||
const int8_t* predicate_matches
|
const int8_t* predicate_matches
|
||||||
) nogil:
|
) noexcept nogil:
|
||||||
for i in range(state.pattern.nr_py):
|
for i in range(state.pattern.nr_py):
|
||||||
if predicate_matches[state.pattern.py_predicates[i]] == -1:
|
if predicate_matches[state.pattern.py_predicates[i]] == -1:
|
||||||
return 0
|
return 0
|
||||||
|
@ -755,14 +755,14 @@ cdef int8_t get_is_match(
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
cdef inline int8_t get_is_final(PatternStateC state) nogil:
|
cdef inline int8_t get_is_final(PatternStateC state) noexcept nogil:
|
||||||
if state.pattern[1].quantifier == FINAL_ID:
|
if state.pattern[1].quantifier == FINAL_ID:
|
||||||
return 1
|
return 1
|
||||||
else:
|
else:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
cdef inline int8_t get_quantifier(PatternStateC state) nogil:
|
cdef inline int8_t get_quantifier(PatternStateC state) noexcept nogil:
|
||||||
return state.pattern.quantifier
|
return state.pattern.quantifier
|
||||||
|
|
||||||
|
|
||||||
|
@ -805,7 +805,7 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
|
||||||
return pattern
|
return pattern
|
||||||
|
|
||||||
|
|
||||||
cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
|
cdef attr_t get_ent_id(const TokenPatternC* pattern) noexcept nogil:
|
||||||
while pattern.quantifier != FINAL_ID:
|
while pattern.quantifier != FINAL_ID:
|
||||||
pattern += 1
|
pattern += 1
|
||||||
id_attr = pattern[0].attrs[0]
|
id_attr = pattern[0].attrs[0]
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# cython: infer_types=True, profile=True
|
# cython: infer_types=True
|
||||||
from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
|
from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
|
||||||
|
|
||||||
import warnings
|
import warnings
|
||||||
|
@ -47,7 +47,7 @@ cdef class PhraseMatcher:
|
||||||
self._terminal_hash = 826361138722620965
|
self._terminal_hash = 826361138722620965
|
||||||
map_init(self.mem, self.c_map, 8)
|
map_init(self.mem, self.c_map, 8)
|
||||||
|
|
||||||
if isinstance(attr, (int, long)):
|
if isinstance(attr, int):
|
||||||
self.attr = attr
|
self.attr = attr
|
||||||
else:
|
else:
|
||||||
if attr is None:
|
if attr is None:
|
||||||
|
|
|
@ -7,7 +7,6 @@ from ..tokens import Doc
|
||||||
from ..util import registry
|
from ..util import registry
|
||||||
|
|
||||||
|
|
||||||
@registry.layers("spacy.CharEmbed.v1")
|
|
||||||
def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]:
|
def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]:
|
||||||
# nM: Number of dimensions per character. nC: Number of characters.
|
# nM: Number of dimensions per character. nC: Number of characters.
|
||||||
return Model(
|
return Model(
|
||||||
|
|
|
@ -3,7 +3,6 @@ from thinc.api import Model, normal_init
|
||||||
from ..util import registry
|
from ..util import registry
|
||||||
|
|
||||||
|
|
||||||
@registry.layers("spacy.PrecomputableAffine.v1")
|
|
||||||
def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
|
def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
|
||||||
model = Model(
|
model = Model(
|
||||||
"precomputable_affine",
|
"precomputable_affine",
|
||||||
|
|
|
@ -50,7 +50,6 @@ def models_with_nvtx_range(nlp, forward_color: int, backprop_color: int):
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
@registry.callbacks("spacy.models_with_nvtx_range.v1")
|
|
||||||
def create_models_with_nvtx_range(
|
def create_models_with_nvtx_range(
|
||||||
forward_color: int = -1, backprop_color: int = -1
|
forward_color: int = -1, backprop_color: int = -1
|
||||||
) -> Callable[["Language"], "Language"]:
|
) -> Callable[["Language"], "Language"]:
|
||||||
|
@ -110,7 +109,6 @@ def pipes_with_nvtx_range(
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
@registry.callbacks("spacy.models_and_pipes_with_nvtx_range.v1")
|
|
||||||
def create_models_and_pipes_with_nvtx_range(
|
def create_models_and_pipes_with_nvtx_range(
|
||||||
forward_color: int = -1,
|
forward_color: int = -1,
|
||||||
backprop_color: int = -1,
|
backprop_color: int = -1,
|
||||||
|
|
|
@ -4,7 +4,6 @@ from ..attrs import LOWER
|
||||||
from ..util import registry
|
from ..util import registry
|
||||||
|
|
||||||
|
|
||||||
@registry.layers("spacy.extract_ngrams.v1")
|
|
||||||
def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model:
|
def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model:
|
||||||
model: Model = Model("extract_ngrams", forward)
|
model: Model = Model("extract_ngrams", forward)
|
||||||
model.attrs["ngram_size"] = ngram_size
|
model.attrs["ngram_size"] = ngram_size
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user