Mirror of https://github.com/explosion/spaCy.git (synced 2025-08-04 04:10:20 +03:00)

Compare commits: 46 commits
| SHA1 |
| --- |
| fa9d24e9fd |
| f645747553 |
| b4117be7d0 |
| f82950657e |
| 515c2394c4 |
| 2f2c23384c |
| 15466c9c1d |
| 4e032da3b9 |
| 5b3b18d626 |
| b0fb316ca9 |
| 4297340d8c |
| 9c794644ab |
| 4e050330a8 |
| 279749a9d9 |
| 289ed4fa43 |
| cc0c7dab29 |
| 809887a925 |
| 062bd27f22 |
| 1e701c8bee |
| 2201459603 |
| 07026337d2 |
| 76449e07a0 |
| 7fefb39e58 |
| 7d12262145 |
| c900f8573d |
| 9b9b743e8b |
| 4c56dd5fb8 |
| 1cb197e8d9 |
| 838d2ff5e7 |
| 5fb597f778 |
| f2219b1fd8 |
| 0926a0993a |
| 0bf367dfc2 |
| 7c0d582224 |
| 7fe0594898 |
| 1ec2ff41e2 |
| e61c8d2975 |
| 7d30e6620e |
| 34261f628f |
| 1bf18b85e4 |
| f6b39fce2c |
| ecd4343990 |
| 985cf8eb64 |
| de6607fc9b |
| 31a00ad7e0 |
| 4619a99185 |
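A minimal sketch of how the listed commits could be inspected locally, assuming the clone URL from the header above and that the abbreviated SHA1s are unambiguous in the repository:

```bash
# Clone the mirrored repository (URL from the header above)
git clone https://github.com/explosion/spaCy.git
cd spaCy

# Show the full metadata and patch for one of the listed commits
git show fa9d24e9fd

# List the commits in the compared range, assuming 4619a99185 is the
# oldest and fa9d24e9fd the newest commit in the listing
git log --oneline 4619a99185..fa9d24e9fd
```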
.github/FUNDING.yml (vendored) | 1

@@ -1 +0,0 @@
custom: [https://explosion.ai/merch, https://explosion.ai/tailored-solutions]
.github/ISSUE_TEMPLATE/01_bugs.md (vendored) | 2

@@ -10,7 +10,7 @@ about: Use this template if you came across a bug or unexpected behaviour differ
<!-- Include a code example or the steps that led to the problem. Please try to be as specific as possible. -->

## Your Environment
<!-- Include details of your environment. You can also type `python -m spacy info --markdown` and copy-paste the result here.-->
<!-- Include details of your environment. If you're using spaCy 1.7+, you can also type `python -m spacy info --markdown` and copy-paste the result here.-->
* Operating System:
* Python Version Used:
* spaCy Version Used:
.github/contributors/Lucaterre.md (vendored) | 106

@@ -1,106 +0,0 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

    * you hereby assign to us joint ownership, and to the extent that such
    assignment is or becomes invalid, ineffective or unenforceable, you hereby
    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
    royalty-free, unrestricted license to exercise all rights under those
    copyrights. This includes, at our option, the right to sublicense these same
    rights to third parties through multiple levels of sublicensees or other
    licensing arrangements;

    * you agree that each of us can do all things in relation to your
    contribution as if each of us were the sole owners, and if one of us makes
    a derivative work of your contribution, the one who makes the derivative
    work (or has it made will be the sole owner of that derivative work;

    * you agree that you will not assert any moral rights in your contribution
    against us, our licensees or transferees;

    * you agree that we may register a copyright in your contribution and
    exercise all ownership rights associated with it; and

    * you agree that neither of us has any duty to consult with, obtain the
    consent of, pay or render an accounting to the other for any use or
    distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

    * make, have made, use, sell, offer to sell, import, and otherwise transfer
    your contribution in whole or in part, alone or in combination with or
    included in any product, work or materials arising out of the project to
    which your contribution was submitted, and

    * at our option, to sublicense these same rights to third parties through
    multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

    * Each contribution that you submit is and shall be an original work of
    authorship and you can legally grant the rights set out in this SCA;

    * to the best of your knowledge, each contribution will not violate any
    third party's copyrights, trademarks, patents, or other intellectual
    property rights; and

    * each contribution shall be in compliance with U.S. export control laws and
    other applicable export and import laws. You agree to notify us if you
    become aware of any circumstance which would make any of the foregoing
    representations inaccurate in any respect. We may publicly disclose your
    participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statement below. Please do NOT
mark both statements:

    * [x] I am signing on behalf of myself as an individual and no other person
    or entity, including my employer, has or will have rights with respect to my
    contributions.

    * [ ] I am signing on behalf of my employer or a legal entity and I have the
    actual authority to contractually bind that entity.

## Contributor Details

| Field                         | Entry         |
|------------------------------ |---------------|
| Name                          | Lucas Terriel |
| Company name (if applicable)  |               |
| Title or role (if applicable) |               |
| Date                          | 2022-06-20    |
| GitHub username               | Lucaterre     |
| Website (optional)            |               |
.github/no-response.yml (vendored, new file) | 13

@@ -0,0 +1,13 @@
# Configuration for probot-no-response - https://github.com/probot/no-response

# Number of days of inactivity before an Issue is closed for lack of response
daysUntilClose: 14
# Label requiring a response
responseRequiredLabel: more-info-needed
# Comment to post when closing an Issue for lack of response. Set to `false` to disable
closeComment: >
  This issue has been automatically closed because there has been no response
  to a request for more information from the original author. With only the
  information that is currently in the issue, there's not enough information
  to take action. If you're the original author, feel free to reopen the issue
  if you have or find the answers needed to investigate further.
.github/spacy_universe_alert.py (vendored) | 67

@@ -1,67 +0,0 @@
import os
import sys
import json
from datetime import datetime

from slack_sdk.web.client import WebClient

CHANNEL = "#alerts-universe"
SLACK_TOKEN = os.environ.get("SLACK_BOT_TOKEN", "ENV VAR not available!")
DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

client = WebClient(SLACK_TOKEN)
github_context = json.loads(sys.argv[1])

event = github_context['event']
pr_title = event['pull_request']["title"]
pr_link = event['pull_request']["patch_url"].replace(".patch", "")
pr_author_url = event['sender']["html_url"]
pr_author_name = pr_author_url.rsplit('/')[-1]
pr_created_at_dt = datetime.strptime(
    event['pull_request']["created_at"],
    DATETIME_FORMAT
)
pr_created_at = pr_created_at_dt.strftime("%c")
pr_updated_at_dt = datetime.strptime(
    event['pull_request']["updated_at"],
    DATETIME_FORMAT
)
pr_updated_at = pr_updated_at_dt.strftime("%c")

blocks = [
    {
        "type": "section",
        "text": {
            "type": "mrkdwn",
            "text": "📣 New spaCy Universe Project Alert ✨"
        }
    },
    {
        "type": "section",
        "fields": [
            {
                "type": "mrkdwn",
                "text": f"*Pull Request:*\n<{pr_link}|{pr_title}>"
            },
            {
                "type": "mrkdwn",
                "text": f"*Author:*\n<{pr_author_url}|{pr_author_name}>"
            },
            {
                "type": "mrkdwn",
                "text": f"*Created at:*\n {pr_created_at}"
            },
            {
                "type": "mrkdwn",
                "text": f"*Last Updated:*\n {pr_updated_at}"
            }
        ]
    }
]


client.chat_postMessage(
    channel=CHANNEL,
    text="spaCy universe project PR alert",
    blocks=blocks
)
.github/workflows/autoblack.yml (vendored, new file) | 44

@@ -0,0 +1,44 @@
# GitHub Action that uses Black to reformat all Python code and submits a PR
# in regular intervals. Inspired by: https://github.com/cclauss/autoblack

name: autoblack
on:
  workflow_dispatch: # allow manual trigger
  schedule:
    - cron: '0 8 * * 5' # every Friday at 8am UTC

jobs:
  autoblack:
    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
        with:
          ref: ${{ github.head_ref }}
      - uses: actions/setup-python@v2
      - run: pip install black
      - name: Auto-format code if needed
        run: black spacy
      # We can't run black --check here because that returns a non-zero excit
      # code and makes GitHub think the action failed
      - name: Check for modified files
        id: git-check
        run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi)
      - name: Create Pull Request
        if: steps.git-check.outputs.modified == 'true'
        uses: peter-evans/create-pull-request@v3
        with:
          title: Auto-format code with black
          labels: meta
          commit-message: Auto-format code with black
          committer: GitHub <noreply@github.com>
          author: explosion-bot <explosion-bot@users.noreply.github.com>
          body: _This PR is auto-generated._
          branch: autoblack
          delete-branch: true
          draft: false
      - name: Check outputs
        if: steps.git-check.outputs.modified == 'true'
        run: |
          echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
          echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"
.github/workflows/cibuildwheel.yml (vendored) | 99

@@ -1,99 +0,0 @@
name: Build

on:
  push:
    tags:
      # ytf did they invent their own syntax that's almost regex?
      # ** matches 'zero or more of any character'
      - 'release-v[0-9]+.[0-9]+.[0-9]+**'
      - 'prerelease-v[0-9]+.[0-9]+.[0-9]+**'
jobs:
  build_wheels:
    name: Build wheels on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        # macos-13 is an intel runner, macos-14 is apple silicon
        os: [ubuntu-latest, windows-latest, macos-13, macos-14, ubuntu-24.04-arm]

    steps:
      - uses: actions/checkout@v4
      # aarch64 (arm) is built via qemu emulation
      # QEMU is sadly too slow. We need to wait for public ARM support
      #- name: Set up QEMU
      #  if: runner.os == 'Linux'
      #  uses: docker/setup-qemu-action@v3
      #  with:
      #    platforms: all
      - name: Build wheels
        uses: pypa/cibuildwheel@v2.21.3
        env:
          CIBW_ARCHS_LINUX: auto
        with:
          package-dir: .
          output-dir: wheelhouse
          config-file: "{package}/pyproject.toml"
      - uses: actions/upload-artifact@v4
        with:
          name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
          path: ./wheelhouse/*.whl

  build_sdist:
    name: Build source distribution
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Build sdist
        run: pipx run build --sdist
      - uses: actions/upload-artifact@v4
        with:
          name: cibw-sdist
          path: dist/*.tar.gz
  create_release:
    needs: [build_wheels, build_sdist]
    runs-on: ubuntu-latest
    permissions:
      contents: write
      checks: write
      actions: read
      issues: read
      packages: write
      pull-requests: read
      repository-projects: read
      statuses: read
    steps:
      - name: Get the tag name and determine if it's a prerelease
        id: get_tag_info
        run: |
          FULL_TAG=${GITHUB_REF#refs/tags/}
          if [[ $FULL_TAG == release-* ]]; then
            TAG_NAME=${FULL_TAG#release-}
            IS_PRERELEASE=false
          elif [[ $FULL_TAG == prerelease-* ]]; then
            TAG_NAME=${FULL_TAG#prerelease-}
            IS_PRERELEASE=true
          else
            echo "Tag does not match expected patterns" >&2
            exit 1
          fi
          echo "FULL_TAG=$TAG_NAME" >> $GITHUB_ENV
          echo "TAG_NAME=$TAG_NAME" >> $GITHUB_ENV
          echo "IS_PRERELEASE=$IS_PRERELEASE" >> $GITHUB_ENV
      - uses: actions/download-artifact@v4
        with:
          # unpacks all CIBW artifacts into dist/
          pattern: cibw-*
          path: dist
          merge-multiple: true
      - name: Create Draft Release
        id: create_release
        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          name: ${{ env.TAG_NAME }}
          draft: true
          prerelease: ${{ env.IS_PRERELEASE }}
          files: "./dist/*"
.github/workflows/explosionbot.yml (vendored) | 7

@@ -8,15 +8,14 @@ on:

jobs:
  explosion-bot:
    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    runs-on: ubuntu-18.04
    steps:
      - name: Dump GitHub context
        env:
          GITHUB_CONTEXT: ${{ toJson(github) }}
        run: echo "$GITHUB_CONTEXT"
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
      - uses: actions/checkout@v1
      - uses: actions/setup-python@v1
      - name: Install and run explosion-bot
        run: |
          pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot
.github/workflows/issue-manager.yml (vendored) | 9

@@ -13,10 +13,9 @@ on:

jobs:
  issue-manager:
    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    steps:
      - uses: tiangolo/issue-manager@0.4.0
      - uses: tiangolo/issue-manager@0.2.1
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          config: >
@@ -26,11 +25,5 @@ jobs:
              "message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.",
              "remove_label_on_comment": true,
              "remove_label_on_close": true
            },
            "more-info-needed": {
              "delay": "P7D",
              "message": "This issue has been automatically closed because there has been no response to a request for more information from the original author. With only the information that is currently in the issue, there's not enough information to take action. If you're the original author, feel free to reopen the issue if you have or find the answers needed to investigate further.",
              "remove_label_on_comment": true,
              "remove_label_on_close": true
            }
            }
.github/workflows/lock.yml (vendored) | 3

@@ -13,10 +13,9 @@ concurrency:

jobs:
  action:
    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    steps:
      - uses: dessant/lock-threads@v5
      - uses: dessant/lock-threads@v3
        with:
          process-only: 'issues'
          issue-inactive-days: '30'
.github/workflows/publish_pypi.yml (vendored) | 29

@@ -1,29 +0,0 @@
# The cibuildwheel action triggers on creation of a release, this
# triggers on publication.
# The expected workflow is to create a draft release and let the wheels
# upload, and then hit 'publish', which uploads to PyPi.

on:
  release:
    types:
      - published

jobs:
  upload_pypi:
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/spacy
    permissions:
      id-token: write
      contents: read
    if: github.event_name == 'release' && github.event.action == 'published'
    # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this)
    # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
    steps:
      - uses: robinraju/release-downloader@v1
        with:
          tag: ${{ github.event.release.tag_name }}
          fileName: '*'
          out-file-path: 'dist'
      - uses: pypa/gh-action-pypi-publish@release/v1
@@ -14,7 +14,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        uses: actions/checkout@v1
        with:
          ref: ${{ matrix.branch }}
      - name: Get commits from past 24 hours
@@ -23,9 +23,9 @@ jobs:
          today=$(date '+%Y-%m-%d %H:%M:%S')
          yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
          if git log --after="$yesterday" --before="$today" | grep commit ; then
            echo run_tests=true >> $GITHUB_OUTPUT
            echo "::set-output name=run_tests::true"
          else
            echo run_tests=false >> $GITHUB_OUTPUT
            echo "::set-output name=run_tests::false"
          fi

      - name: Trigger buildkite build
.github/workflows/spacy_universe_alert.yml (vendored) | 33

@@ -1,33 +0,0 @@
name: spaCy universe project alert

on:
  pull_request_target:
    paths:
      - "website/meta/universe.json"

jobs:
  build:
    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest

    steps:
      - name: Dump GitHub context
        env:
          GITHUB_CONTEXT: ${{ toJson(github) }}
          PR_NUMBER: ${{github.event.number}}
        run: |
          echo "$GITHUB_CONTEXT"

      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        with:
          python-version: '3.10'
      - name: Install Bernadette app dependency and send an alert
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
          GITHUB_CONTEXT: ${{ toJson(github) }}
          CHANNEL: "#alerts-universe"
        run: |
          pip install slack-sdk==3.17.2 aiohttp==3.8.1
          echo "$CHANNEL"
          python .github/spacy_universe_alert.py "$GITHUB_CONTEXT"
.github/workflows/tests.yml (vendored) | 53

@@ -2,8 +2,6 @@ name: tests

on:
  push:
    tags-ignore:
      - '**'
    branches-ignore:
      - "spacy.io"
      - "nightly.spacy.io"
@@ -12,6 +10,7 @@ on:
      - "*.md"
      - "*.mdx"
      - "website/**"
      - ".github/workflows/**"
  pull_request:
    types: [opened, synchronize, reopened, edited]
    paths-ignore:
@@ -26,51 +25,51 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Check out repo
        uses: actions/checkout@v4
        uses: actions/checkout@v3

      - name: Configure Python version
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"
          python-version: "3.7"
          architecture: x64

      - name: black
        run: |
          python -m pip install black -c requirements.txt
          python -m black spacy --check
      - name: isort
        run: |
          python -m pip install isort -c requirements.txt
          python -m isort spacy --check
      - name: flake8
        run: |
          python -m pip install flake8==5.0.4
          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
      # Unfortunately cython-lint isn't working after the shift to Cython 3.
      #- name: cython-lint
      #  run: |
      #    python -m pip install cython-lint -c requirements.txt
      #    # E501: line too log, W291: trailing whitespace, E266: too many leading '#' for block comment
      #    cython-lint spacy --ignore E501,W291,E266

  tests:
    name: Test
    needs: Validate
    strategy:
      fail-fast: true
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]
        python_version: ["3.9", "3.12", "3.13"]
        python_version: ["3.10"]
        include:
          - os: ubuntu-20.04
            python_version: "3.6"
          - os: windows-latest
            python_version: "3.7"
          - os: macos-latest
            python_version: "3.8"
          - os: ubuntu-latest
            python_version: "3.9"

    runs-on: ${{ matrix.os }}

    steps:
      - name: Check out repo
        uses: actions/checkout@v4
        uses: actions/checkout@v3

      - name: Configure Python version
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python_version }}
          architecture: x64

      - name: Install dependencies
        run: |
@@ -83,8 +82,11 @@ jobs:

      - name: Run mypy
        run: |
          # Install older numpy for mypy (bug with newer numpy+mypy not fixed
          # until mypy 0.981)
          python -m pip install "numpy<1.22"
          python -m mypy spacy
        if: matrix.python_version != '3.7'
        if: matrix.python_version != '3.6'

      - name: Delete source directory and .egg-info
        run: |
@@ -113,11 +115,6 @@ jobs:
          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
        if: matrix.python_version == '3.9'

      - name: "Test download_url in info CLI"
        run: |
          python -W error -m spacy info ca_core_news_sm | grep -q download_url
        if: matrix.python_version == '3.9'

      - name: "Test no warnings on load (#11713)"
        run: |
          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
@@ -148,9 +145,7 @@ jobs:
      - name: "Test assemble CLI"
        run: |
          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
          python -m spacy assemble ner_source_sm.cfg output_dir
        env:
          PYTHONWARNINGS: "error,ignore::DeprecationWarning"
          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
        if: matrix.python_version == '3.9'

      - name: "Test assemble CLI vectors warning"
@@ -166,10 +161,10 @@ jobs:
      - name: "Run CPU tests"
        run: |
          python -m pytest --pyargs spacy -W error
        if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.11')"
        if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.10')"

      - name: "Run CPU tests with thinc-apple-ops"
        run: |
          python -m pip install 'spacy[apple]'
          python -m pytest --pyargs spacy
        if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11'
        if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.10'
.github/workflows/universe_validation.yml (vendored) | 32

@@ -1,32 +0,0 @@
name: universe validation

on:
  push:
    branches-ignore:
      - "spacy.io"
      - "nightly.spacy.io"
      - "v2.spacy.io"
    paths:
      - "website/meta/universe.json"
  pull_request:
    types: [opened, synchronize, reopened, edited]
    paths:
      - "website/meta/universe.json"

jobs:
  validate:
    name: Validate
    if: github.repository_owner == 'explosion'
    runs-on: ubuntu-latest
    steps:
      - name: Check out repo
        uses: actions/checkout@v4

      - name: Configure Python version
        uses: actions/setup-python@v4
        with:
          python-version: "3.7"

      - name: Validate website/meta/universe.json
        run: |
          python .github/validate_universe_json.py website/meta/universe.json
.gitignore (vendored) | 11

@@ -10,11 +10,20 @@ spacy/tests/package/setup.cfg
spacy/tests/package/pyproject.toml
spacy/tests/package/requirements.txt

# Website
website/.cache/
website/public/
website/node_modules
website/.npm
website/logs
*.log
npm-debug.log*
quickstart-training-generator.js

# Cython / C extensions
cythonize.json
spacy/*.html
*.cpp
*.c
*.so

# Vim / VSCode / editors
@@ -5,7 +5,7 @@ repos:
  - id: black
    language_version: python3.7
    additional_dependencies: ['click==8.0.4']
- repo: https://github.com/pycqa/flake8
- repo: https://gitlab.com/pycqa/flake8
  rev: 5.0.4
  hooks:
    - id: flake8
@@ -35,7 +35,7 @@ so that more people can benefit from it.

When opening an issue, use a **descriptive title** and include your
**environment** (operating system, Python version, spaCy version). Our
[issue templates](https://github.com/explosion/spaCy/issues/new/choose) help you
[issue template](https://github.com/explosion/spaCy/issues/new) helps you
remember the most important details to include. If you've discovered a bug, you
can also submit a [regression test](#fixing-bugs) straight away. When you're
opening an issue to report the bug, simply refer to your pull request in the
@@ -173,11 +173,6 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
Python modules. If you've built spaCy from source, you'll already have both
tools installed.

As a general rule of thumb, we use f-strings for any formatting of strings.
One exception are calls to Python's `logging` functionality.
To avoid unnecessary string conversions in these cases, we use string formatting
templates with `%s` and `%d` etc.

**⚠️ Note that formatting and linting is currently only possible for Python
modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**

@@ -276,8 +271,7 @@ except: # noqa: E722

### Python conventions

All Python code must be written **compatible with Python 3.6+**. More detailed
code conventions can be found in the [developer docs](https://github.com/explosion/spaCy/blob/master/extra/DEVELOPER_DOCS/Code%20Conventions.md).
All Python code must be written **compatible with Python 3.6+**.

#### I/O and handling paths

@@ -449,12 +443,13 @@ and plugins in spaCy v3.0, and we can't wait to see what you build with it!
  [`spacy`](https://github.com/topics/spacy?o=desc&s=stars) and
  [`spacy-extensions`](https://github.com/topics/spacy-extension?o=desc&s=stars)
  to make it easier to find. Those are also the topics we're linking to from the
  spaCy website. If you're sharing your project on X, feel free to tag
  [@spacy_io](https://x.com/spacy_io) so we can check it out.
  spaCy website. If you're sharing your project on Twitter, feel free to tag
  [@spacy_io](https://twitter.com/spacy_io) so we can check it out.
- Once your extension is published, you can open a
  [PR](https://github.com/explosion/spaCy/pulls) to suggest it for the
  [Universe](https://spacy.io/universe) page.
- Once your extension is published, you can open an issue on the
  [issue tracker](https://github.com/explosion/spacy/issues) to suggest it for the
  [resources directory](https://spacy.io/usage/resources#extensions) on the
  website.

📖 **For more tips and best practices, see the [checklist for developing spaCy extensions](https://spacy.io/usage/processing-pipelines#extensions).**
LICENSE | 2

@@ -1,6 +1,6 @@
The MIT License (MIT)

Copyright (C) 2016-2024 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
Copyright (C) 2016-2022 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -1,9 +1,8 @@
recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml *.hh
recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml
include LICENSE
include README.md
include pyproject.toml
include spacy/py.typed
recursive-include spacy/cli *.yml
recursive-include spacy/tests *.json
recursive-include licenses *
recursive-exclude spacy *.cpp
Makefile | 4

@@ -1,11 +1,11 @@
SHELL := /bin/bash

ifndef SPACY_EXTRAS
override SPACY_EXTRAS = spacy-lookups-data==1.0.3
override SPACY_EXTRAS = spacy-lookups-data==1.0.2 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2
endif

ifndef PYVER
override PYVER = 3.8
override PYVER = 3.6
endif

VENV := ./env$(PYVER)
README.md | 85

@@ -6,20 +6,20 @@ spaCy is a library for **advanced Natural Language Processing** in Python and
Cython. It's built on the very latest research, and was designed from day one to
be used in real products.

spaCy comes with [pretrained pipelines](https://spacy.io/models) and currently
supports tokenization and training for **70+ languages**. It features
state-of-the-art speed and **neural network models** for tagging, parsing,
**named entity recognition**, **text classification** and more, multi-task
learning with pretrained **transformers** like BERT, as well as a
spaCy comes with
[pretrained pipelines](https://spacy.io/models) and
currently supports tokenization and training for **60+ languages**. It features
state-of-the-art speed and **neural network models** for tagging,
parsing, **named entity recognition**, **text classification** and more,
multi-task learning with pretrained **transformers** like BERT, as well as a
production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial
open-source software, released under the
[MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
open-source software, released under the MIT license.

💫 **Version 3.8 out now!**
💫 **Version 3.2 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)

[](https://github.com/explosion/spaCy/actions/workflows/tests.yml)
[](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
[](https://github.com/explosion/spaCy/releases)
[](https://pypi.org/project/spacy/)
[](https://anaconda.org/conda-forge/spacy)
@@ -28,47 +28,36 @@ open-source software, released under the
<br />
[](https://pypi.org/project/spacy/)
[](https://anaconda.org/conda-forge/spacy)
[](https://twitter.com/spacy_io)

## 📖 Documentation

| Documentation | |
| --- | --- |
| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
| 📚 **[Usage Guides]** | How to use spaCy and its features. |
| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
| 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
| ⏩ **[GPU Processing]** | Use spaCy with CUDA-compatible GPU processing. |
| 📦 **[Models]** | Download trained pipelines for spaCy. |
| 🦙 **[Large Language Models]** | Integrate LLMs into spaCy pipelines. |
| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
| ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
| 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
| 📰 **[Blog]** | Read about current spaCy and Prodigy development, releases, talks and more from Explosion. |
| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
| 🔴 **[Live Stream]** | Join Matt as he works on spaCy and chat about NLP, live every week. |
| 🛠 **[Changelog]** | Changes and version history. |
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
| 👕 **[Swag]** | Support us and our work with unique, custom-designed swag! |
| <a href="https://explosion.ai/tailored-solutions"><img src="https://github.com/explosion/spaCy/assets/13643239/36d2a42e-98c0-4599-90e1-788ef75181be" width="150" alt="Tailored Solutions"/></a> | Custom NLP consulting, implementation and strategic advice by spaCy’s core development team. Streamlined, production-ready, predictable and maintainable. Send us an email or take our 5-minute questionnaire, and we'll be in touch! **[Learn more →](https://explosion.ai/tailored-solutions)** |
| Documentation | |
| --- | --- |
| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
| 📚 **[Usage Guides]** | How to use spaCy and its features. |
| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
| 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
| 📦 **[Models]** | Download trained pipelines for spaCy. |
| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
| 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
| 🛠 **[Changelog]** | Changes and version history. |
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |

[spacy 101]: https://spacy.io/usage/spacy-101
[new in v3.0]: https://spacy.io/usage/v3
[usage guides]: https://spacy.io/usage/
[api reference]: https://spacy.io/api/
[gpu processing]: https://spacy.io/usage#gpu
[models]: https://spacy.io/models
[large language models]: https://spacy.io/usage/large-language-models
[universe]: https://spacy.io/universe
[spacy vs code extension]: https://github.com/explosion/spacy-vscode
[videos]: https://www.youtube.com/c/ExplosionAI
[live stream]: https://www.youtube.com/playlist?list=PLBmcuObd5An5_iAxNYLJa_xWmNzsYce8c
[online course]: https://course.spacy.io
[blog]: https://explosion.ai
[project templates]: https://github.com/explosion/projects
[changelog]: https://spacy.io/usage#changelog
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
[swag]: https://explosion.ai/merch

## 💬 Where to ask questions

@@ -80,27 +69,24 @@ more people can benefit from it.
| Type | Platforms |
| ------------------------------- | --------------------------------------- |
| 🚨 **Bug Reports** | [GitHub Issue Tracker] |
| 🎁 **Feature Requests & Ideas** | [GitHub Discussions] · [Live Stream] |
| 🎁 **Feature Requests & Ideas** | [GitHub Discussions] |
| 👩‍💻 **Usage Questions** | [GitHub Discussions] · [Stack Overflow] |
| 🗯 **General Discussion** | [GitHub Discussions] · [Live Stream] |
| 🗯 **General Discussion** | [GitHub Discussions] |

[github issue tracker]: https://github.com/explosion/spaCy/issues
[github discussions]: https://github.com/explosion/spaCy/discussions
[stack overflow]: https://stackoverflow.com/questions/tagged/spacy
[live stream]: https://www.youtube.com/playlist?list=PLBmcuObd5An5_iAxNYLJa_xWmNzsYce8c

## Features

- Support for **70+ languages**
- Support for **60+ languages**
- **Trained pipelines** for different languages and tasks
- Multi-task learning with pretrained **transformers** like BERT
- Support for pretrained **word vectors** and embeddings
- State-of-the-art speed
- Production-ready **training system**
- Linguistically-motivated **tokenization**
- Components for named **entity recognition**, part-of-speech-tagging,
  dependency parsing, sentence segmentation, **text classification**,
  lemmatization, morphological analysis, entity linking and more
- Components for named **entity recognition**, part-of-speech-tagging, dependency parsing, sentence segmentation, **text classification**, lemmatization, morphological analysis, entity linking and more
- Easily extensible with **custom components** and attributes
- Support for custom models in **PyTorch**, **TensorFlow** and other frameworks
- Built in **visualizers** for syntax and NER
@@ -117,7 +103,7 @@ For detailed installation instructions, see the

- **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
  Studio)
- **Python version**: Python >=3.7, <3.13 (only 64 bit)
- **Python version**: Python 3.6+ (only 64 bit)
- **Package managers**: [pip] · [conda] (via `conda-forge`)

[pip]: https://pypi.org/project/spacy/
@@ -126,8 +112,8 @@ For detailed installation instructions, see the
### pip

Using pip, spaCy releases are available as source packages and binary wheels.
Before you install spaCy and its dependencies, make sure that your `pip`,
`setuptools` and `wheel` are up to date.
Before you install spaCy and its dependencies, make sure that
your `pip`, `setuptools` and `wheel` are up to date.

```bash
pip install -U pip setuptools wheel
@@ -182,9 +168,9 @@ with the new version.

## 📦 Download model packages

Trained pipelines for spaCy can be installed as **Python packages**. This means
that they're a component of your application, just like any other module. Models
can be installed using spaCy's [`download`](https://spacy.io/api/cli#download)
Trained pipelines for spaCy can be installed as **Python packages**. This
means that they're a component of your application, just like any other module.
Models can be installed using spaCy's [`download`](https://spacy.io/api/cli#download)
command, or manually by pointing pip to a path or URL.

| Documentation | |
@@ -250,7 +236,8 @@ do that depends on your system.
| **Mac** | Install a recent version of [XCode](https://developer.apple.com/xcode/), including the so-called "Command Line Tools". macOS and OS X ship with Python and git preinstalled. |
| **Windows** | Install a version of the [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/) or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that matches the version that was used to compile your Python interpreter. |

For more details and instructions, see the documentation on
For more details
and instructions, see the documentation on
[compiling spaCy from source](https://spacy.io/usage#source) and the
[quickstart widget](https://spacy.io/usage#section-quickstart) to get the right
commands for your platform and Python version.
@@ -1,20 +0,0 @@
#!/usr/bin/env bash

set -e

# Insist repository is clean
git diff-index --quiet HEAD

version=$(grep "__version__ = " spacy/about.py)
version=${version/__version__ = }
version=${version/\'/}
version=${version/\'/}
version=${version/\"/}
version=${version/\"/}

echo "Pushing release-v"$version

git tag -d release-v$version || true
git push origin :release-v$version || true
git tag release-v$version
git push origin release-v$version
@@ -1,2 +1,8 @@
# build version constraints for use with wheelwright
numpy>=2.0.0,<3.0.0
# build version constraints for use with wheelwright + multibuild
numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
numpy==1.19.3; python_version=='3.9'
numpy==1.21.3; python_version=='3.10'
numpy; python_version>='3.11'
@@ -191,8 +191,6 @@ def load_model(name: str) -> "Language":
    ...
```

Note that we typically put the `from typing` import statements on the first line(s) of the Python module.

## Structuring logic

### Positional and keyword arguments

@@ -277,27 +275,6 @@ If you have to use `try`/`except`, make sure to only include what's **absolutely
+ return [v.strip() for v in value.split(",")]
```

### Numeric comparisons

For numeric comparisons, as a general rule we always use `<` and `>=` and avoid the usage of `<=` and `>`. This is to ensure we consistently
apply inclusive lower bounds and exclusive upper bounds, helping to prevent off-by-one errors.

One exception to this rule is the ternary case. With a chain like

```python
if value >= 0 and value < max:
    ...
```

it's fine to rewrite this to the shorter form

```python
if 0 <= value < max:
    ...
```

even though this requires the usage of the `<=` operator.

### Iteration and comprehensions

We generally avoid using built-in functions like `filter` or `map` in favor of list or generator comprehensions.

@@ -478,10 +455,6 @@ Regression tests are tests that refer to bugs reported in specific issues. They

The test suite also provides [fixtures](https://github.com/explosion/spaCy/blob/master/spacy/tests/conftest.py) for different language tokenizers that can be used as function arguments of the same name and will be passed in automatically. Those should only be used for tests related to those specific languages. We also have [test utility functions](https://github.com/explosion/spaCy/blob/master/spacy/tests/util.py) for common operations, like creating a temporary file.

### Testing Cython Code

If you're developing Cython code (`.pyx` files), those extensions will need to be built before the test runner can test that code - otherwise it's going to run the tests with stale code from the last time the extension was built. You can build the extensions locally with `python setup.py build_ext -i`.

### Constructing objects and state

Test functions usually follow the same simple structure: they set up some state, perform the operation you want to test and `assert` conditions that you expect to be true, usually before and after the operation.
@@ -16,38 +16,18 @@ To summon the robot, write a github comment on the issue/PR you wish to test. Th

Some things to note:

- The `@explosion-bot please` must be the beginning of the command - you cannot add anything in front of this or else the robot won't know how to parse it. Adding anything at the end aside from the test name will also confuse the robot, so keep it simple!
- The command name (such as `test_gpu`) must be one of the tests that the bot knows how to run. The available commands are documented in the bot's [workflow config](https://github.com/explosion/spaCy/blob/master/.github/workflows/explosionbot.yml#L26) and must match exactly one of the commands listed there.
- The robot can't do multiple things at once, so if you want it to run multiple tests, you'll have to summon it with one comment per test.
* The `@explosion-bot please` must be the beginning of the command - you cannot add anything in front of this or else the robot won't know how to parse it. Adding anything at the end aside from the test name will also confuse the robot, so keep it simple!
* The command name (such as `test_gpu`) must be one of the tests that the bot knows how to run. The available commands are documented in the bot's [workflow config](https://github.com/explosion/spaCy/blob/master/.github/workflows/explosionbot.yml#L26) and must match exactly one of the commands listed there.
* The robot can't do multiple things at once, so if you want it to run multiple tests, you'll have to summon it with one comment per test.
* For the `test_gpu` command, you can specify an optional thinc branch (from the spaCy repo) or a spaCy branch (from the thinc repo) with either the `--thinc-branch` or `--spacy-branch` flags. By default, the bot will pull in the PR branch from the repo where the command was issued, and the main branch of the other repository. However, if you need to run against another branch, you can say (for example):

### Examples

- Execute spaCy slow GPU tests with a custom thinc branch from a spaCy PR:

  ```
  @explosion-bot please test_slow_gpu --thinc-branch <branch_name>
  ```

  `branch_name` can either be a named branch, e.g: `develop`, or an unmerged PR, e.g: `refs/pull/<pr_number>/head`.

- Execute spaCy Transformers GPU tests from a spaCy PR:

  ```
  @explosion-bot please test_gpu --run-on spacy-transformers --run-on-branch master --spacy-branch current_pr
  ```

  This will launch the GPU pipeline for the `spacy-transformers` repo on its `master` branch, using the current spaCy PR's branch to build spaCy. The name of the repository passed to `--run-on` is case-sensitive, e.g: use `spaCy` instead of `spacy`.

- General info about supported commands.

  ```
  @explosion-bot please info
  ```

- Help text for a specific command
  ```
  @explosion-bot please <command> --help
  ```
```
@explosion-bot please test_gpu --thinc-branch develop
```
You can also specify a branch from an unmerged PR:
```
@explosion-bot please test_gpu --thinc-branch refs/pull/633/head
```

## Troubleshooting
@@ -1,17 +1,14 @@
# Listeners

- [1. Overview](#1-overview)
- [2. Initialization](#2-initialization)
  - [2A. Linking listeners to the embedding component](#2a-linking-listeners-to-the-embedding-component)
  - [2B. Shape inference](#2b-shape-inference)
- [3. Internal communication](#3-internal-communication)
  - [3A. During prediction](#3a-during-prediction)
  - [3B. During training](#3b-during-training)
    - [Training with multiple listeners](#training-with-multiple-listeners)
  - [3C. Frozen components](#3c-frozen-components)
    - [The Tok2Vec or Transformer is frozen](#the-tok2vec-or-transformer-is-frozen)
    - [The upstream component is frozen](#the-upstream-component-is-frozen)
- [4. Replacing listener with standalone](#4-replacing-listener-with-standalone)
1. [Overview](#1-overview)
2. [Initialization](#2-initialization)
   - [A. Linking listeners to the embedding component](#2a-linking-listeners-to-the-embedding-component)
   - [B. Shape inference](#2b-shape-inference)
3. [Internal communication](#3-internal-communication)
   - [A. During prediction](#3a-during-prediction)
   - [B. During training](#3b-during-training)
   - [C. Frozen components](#3c-frozen-components)
4. [Replacing listener with standalone](#4-replacing-listener-with-standalone)

## 1. Overview

@@ -221,15 +218,3 @@ new_model = tok2vec_model.attrs["replace_listener"](new_model)
The new config and model are then properly stored on the `nlp` object.
Note that this functionality (running the replacement for a transformer listener) was broken prior to
`spacy-transformers` 1.0.5.

In spaCy 3.7, `Language.replace_listeners` was updated to pass the following additional arguments to the `replace_listener` callback:
the listener to be replaced and the `tok2vec`/`transformer` pipe from which the new model was copied. To maintain backwards-compatiblity,
the method only passes these extra arguments for callbacks that support them:

```
def replace_listener_pre_37(copied_tok2vec_model):
    ...

def replace_listener_post_37(copied_tok2vec_model, replaced_listener, tok2vec_pipe):
    ...
```
@ -1,82 +0,0 @@
|
|||
# spaCy Satellite Packages
|
||||
|
||||
This is a list of all the active repos relevant to spaCy besides the main one, with short descriptions, history, and current status. Archived repos will not be covered.
|
||||
|
||||
## Always Included in spaCy
|
||||
|
||||
These packages are always pulled in when you install spaCy. Most of them are direct dependencies, but some are transitive dependencies through other packages.
|
||||
|
||||
- [spacy-legacy](https://github.com/explosion/spacy-legacy): When an architecture in spaCy changes enough to get a new version, the old version is frozen and moved to spacy-legacy. This allows us to keep the core library slim while also preserving backwards compatability.
|
||||
- [thinc](https://github.com/explosion/thinc): Thinc is the machine learning library that powers trainable components in spaCy. It wraps backends like Numpy, PyTorch, and Tensorflow to provide a functional interface for specifying architectures.
|
||||
- [catalogue](https://github.com/explosion/catalogue): Small library for adding function registries, like those used for model architectures in spaCy.
|
||||
- [confection](https://github.com/explosion/confection): This library contains the functionality for config parsing that was formerly contained directly in Thinc.
|
||||
- [spacy-loggers](https://github.com/explosion/spacy-loggers): Contains loggers beyond the default logger available in spaCy's core code base. This includes loggers integrated with third-party services, which may differ in release cadence from spaCy itself.
|
||||
- [wasabi](https://github.com/explosion/wasabi): A command line formatting library, used for terminal output in spaCy.
|
||||
- [srsly](https://github.com/explosion/srsly): A wrapper that vendors several serialization libraries for spaCy. Includes parsers for JSON, JSONL, MessagePack, (extended) Pickle, and YAML.
|
||||
- [preshed](https://github.com/explosion/preshed): A Cython library for low-level data structures like hash maps, used for memory efficient data storage.
|
||||
- [cython-blis](https://github.com/explosion/cython-blis): Fast matrix multiplication using BLIS without depending on system libraries. Required by Thinc, rather than spaCy directly.
|
||||
- [murmurhash](https://github.com/explosion/murmurhash): A wrapper library for a C++ murmurhash implementation, used for string IDs in spaCy and preshed.
|
||||
- [cymem](https://github.com/explosion/cymem): A small library for RAII-style memory management in Cython.
|
||||
|
||||
## Optional Extensions for spaCy
|
||||
|
||||
These are repos that can be used by spaCy but aren't part of a default installation. Many of these are wrappers to integrate various kinds of third-party libraries.
|
||||
|
||||
- [spacy-transformers](https://github.com/explosion/spacy-transformers): A wrapper for the [HuggingFace Transformers](https://huggingface.co/docs/transformers/index) library, this handles the extensive conversion necessary to coordinate spaCy's powerful `Doc` representation, training pipeline, and the Transformer embeddings. When released, this was known as `spacy-pytorch-transformers`, but it changed to the current name when HuggingFace update the name of their library as well.
|
||||
- [spacy-huggingface-hub](https://github.com/explosion/spacy-huggingface-hub): This package has a CLI script for uploading a packaged spaCy pipeline (created with `spacy package`) to the [Hugging Face Hub](https://huggingface.co/models).

- [spacy-alignments](https://github.com/explosion/spacy-alignments): A wrapper for the tokenizations library (mentioned below) with a modified build system to simplify cross-platform wheel creation. Used in spacy-transformers for aligning spaCy and HuggingFace tokenizations.

- [spacy-experimental](https://github.com/explosion/spacy-experimental): Experimental components that are not quite ready for inclusion in the main spaCy library. Usually there are unresolved questions around their APIs, so the experimental library allows us to expose them to the community for feedback before fully integrating them.

- [spacy-lookups-data](https://github.com/explosion/spacy-lookups-data): A repository of linguistic data, such as lemmas, that takes up a lot of disk space. Originally created to reduce the size of the spaCy core library. This is mainly useful if you want the data included but aren't using a pretrained pipeline; for the affected languages, the relevant data is included in pretrained pipelines directly.

- [coreferee](https://github.com/explosion/coreferee): Coreference resolution for English, French, German and Polish, optimised for limited training data and easily extensible for further languages. Used as a spaCy pipeline component.

- [spacy-stanza](https://github.com/explosion/spacy-stanza): This is a wrapper that allows the use of Stanford's Stanza library in spaCy.
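A rough sketch of the wrapper's entry point, assuming both stanza and spacy-stanza are installed (the Stanza model is downloaded separately):

```python
import stanza
import spacy_stanza

stanza.download("en")                   # fetch the Stanza English model
nlp = spacy_stanza.load_pipeline("en")  # wrap it as a spaCy pipeline
doc = nlp("Barack Obama was born in Hawaii.")
print([(token.text, token.pos_, token.ent_type_) for token in doc])
```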
- [spacy-streamlit](https://github.com/explosion/spacy-streamlit): A wrapper for the Streamlit dashboard building library to help with integrating [displaCy](https://spacy.io/api/top-level/#displacy).

- [spacymoji](https://github.com/explosion/spacymoji): A library to add extra support for emoji to spaCy, such as including character names.
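A short sketch, assuming spacymoji is installed (it provides an `emoji` component factory):

```python
import spacy
from spacymoji import Emoji  # noqa: F401  # makes the component available

nlp = spacy.blank("en")
nlp.add_pipe("emoji", first=True)
doc = nlp("This is a test 😻")
print(doc._.has_emoji, doc._.emoji)
```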
- [thinc-apple-ops](https://github.com/explosion/thinc-apple-ops): A special backend for OSX that uses Apple's native libraries for improved performance.

- [os-signpost](https://github.com/explosion/os-signpost): A Python package that allows you to use the `OSSignposter` API in OSX for performance analysis.

- [spacy-ray](https://github.com/explosion/spacy-ray): A wrapper to integrate spaCy with Ray, a distributed training framework. Currently a work in progress.

## Prodigy

[Prodigy](https://prodi.gy) is Explosion's easy-to-use and highly customizable tool for annotating data. Prodigy itself requires a license, but the repos below contain documentation, examples, and editor or notebook integrations.

- [prodigy-recipes](https://github.com/explosion/prodigy-recipes): Sample recipes for Prodigy, along with notebooks and other examples of usage.

- [vscode-prodigy](https://github.com/explosion/vscode-prodigy): A VS Code extension that lets you run Prodigy inside VS Code.

- [jupyterlab-prodigy](https://github.com/explosion/jupyterlab-prodigy): An extension for JupyterLab that lets you run Prodigy inside JupyterLab.

## Independent Tools or Projects

These are tools that may be related to or use spaCy, but are fully functional, independent projects in their own right as well.

- [floret](https://github.com/explosion/floret): A modification of fastText to use Bloom Embeddings. Can be used to add vectors with subword features to spaCy, and also works independently in the same manner as fastText.

- [sense2vec](https://github.com/explosion/sense2vec): A library to make embeddings of noun phrases or words coupled with their part of speech. This library uses spaCy.
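A standalone usage sketch, assuming a pretrained sense2vec vectors package has been downloaded and extracted to `./s2v_old` (the path and query key are illustrative):

```python
from sense2vec import Sense2Vec

s2v = Sense2Vec().from_disk("./s2v_old")
query = "natural_language_processing|NOUN"
if query in s2v:
    print(s2v.most_similar(query, n=3))
```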
- [spacy-vectors-builder](https://github.com/explosion/spacy-vectors-builder): This is a spaCy project that builds vectors using floret and a lot of input text. It handles downloading the input data as well as the actual building of vectors.

- [holmes-extractor](https://github.com/explosion/holmes-extractor): Information extraction from English and German texts based on predicate logic. Uses spaCy.

- [healthsea](https://github.com/explosion/healthsea): Healthsea is a project to extract information from comments about health supplements. Structurally, it's a self-contained, large spaCy project.

- [spacy-pkuseg](https://github.com/explosion/spacy-pkuseg): A fork of the pkuseg Chinese tokenizer. Used for Chinese support in spaCy, but also works independently.

- [ml-datasets](https://github.com/explosion/ml-datasets): This repo includes loaders for several standard machine learning datasets, like MNIST or WikiNER, and has historically been used in spaCy example code and documentation.

## Documentation and Informational Repos

These repos are used to support the spaCy docs or otherwise present information about spaCy or other Explosion projects.

- [projects](https://github.com/explosion/projects): The projects repo is used to show detailed examples of spaCy usage. Individual projects can be checked out using the spaCy command line tool, rather than checking out the projects repo directly.
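For example, a single template can be fetched with the CLI via `python -m spacy project clone pipelines/tagger_parser_ud` (the project name here is just one example from the repo).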
- [spacy-course](https://github.com/explosion/spacy-course): Home to the interactive spaCy course for learning about how to use the library and some basic NLP principles.

- [spacy-io-binder](https://github.com/explosion/spacy-io-binder): Home to the notebooks used for interactive examples in the documentation.

## Organizational / Meta

These repos are used for organizing data around spaCy, but are not something an end user would need to install as part of using the library.

- [spacy-models](https://github.com/explosion/spacy-models): This repo contains metadata (but not training data) for all the spaCy models. This includes information about where their training data came from, version compatibility, and performance information. It also includes tests for the model packages, and the built models are hosted as releases of this repo.
- [wheelwright](https://github.com/explosion/wheelwright): A tool for automating our PyPI builds and releases.

- [ec2buildwheel](https://github.com/explosion/ec2buildwheel): A small project that allows you to build Python packages in the manner of cibuildwheel, but on any EC2 image. Used by wheelwright.

## Other

Repos that don't fit in any of the above categories.

- [blis](https://github.com/explosion/blis): A fork of the official BLIS library. The main branch is not updated, but work continues in various branches. This is used for cython-blis.

- [tokenizations](https://github.com/explosion/tokenizations): A library originally by Yohei Tamura to align strings with tolerance to some variations in features like case and diacritics, used for aligning tokens and wordpieces. Adopted and maintained by Explosion, but usually spacy-alignments is used instead.

- [conll-2012](https://github.com/explosion/conll-2012): A repo to hold some slightly cleaned up versions of the official scripts for the CoNLL 2012 shared task involving coreference resolution. Used in the coref project.

- [fastapi-explosion-extras](https://github.com/explosion/fastapi-explosion-extras): Some small tweaks to FastAPI used at Explosion.

@@ -127,76 +127,3 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

polyleven
---------

* Files: spacy/matcher/polyleven.c

MIT License

Copyright (c) 2021 Fujimoto Seiji <fujimoto@ceptord.net>
Copyright (c) 2021 Max Bachmann <kontakt@maxbachmann.de>
Copyright (c) 2022 Nick Mazuk
Copyright (c) 2022 Michael Weiss <code@mweiss.ch>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

SciPy
-----

* Files: scorer.py

The implementation of trapezoid() is adapted from SciPy, which is distributed
under the following license:

New BSD License

Copyright (c) 2001-2002 Enthought, Inc. 2003-2023, SciPy Developers.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above
   copyright notice, this list of conditions and the following
   disclaimer in the documentation and/or other materials provided
   with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived
   from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

@@ -1,67 +1,13 @@
[build-system]
requires = [
"setuptools",
"cython>=3.0,<4.0",
"cython>=0.25,<3.0",
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.3.4,<8.4.0",
"numpy>=2.0.0,<3.0.0"
"thinc>=8.0.14,<8.1.0",
"blis>=0.4.0,<0.8.0",
"pathy",
"numpy>=1.15.0",
]
build-backend = "setuptools.build_meta"

[tool.cibuildwheel]
build = "*"
skip = "pp* cp36* cp37* cp38* *-win32 *i686*"
test-skip = ""
free-threaded-support = false

archs = ["native"]

build-frontend = "default"
config-settings = {}
dependency-versions = "pinned"
environment = { PIP_CONSTRAINT = "build-constraints.txt" }

environment-pass = []
build-verbosity = 0

before-all = "curl https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal --default-toolchain stable"
before-build = "pip install -r requirements.txt && python setup.py clean"
repair-wheel-command = ""

test-command = ""
before-test = ""
test-requires = []
test-extras = []

container-engine = "docker"

manylinux-x86_64-image = "manylinux2014"
manylinux-i686-image = "manylinux2014"
manylinux-aarch64-image = "manylinux2014"
manylinux-ppc64le-image = "manylinux2014"
manylinux-s390x-image = "manylinux2014"
manylinux-pypy_x86_64-image = "manylinux2014"
manylinux-pypy_i686-image = "manylinux2014"
manylinux-pypy_aarch64-image = "manylinux2014"

musllinux-x86_64-image = "musllinux_1_2"
musllinux-i686-image = "musllinux_1_2"
musllinux-aarch64-image = "musllinux_1_2"
musllinux-ppc64le-image = "musllinux_1_2"
musllinux-s390x-image = "musllinux_1_2"

[tool.cibuildwheel.linux]
repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}"

[tool.cibuildwheel.macos]
repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}"

[tool.cibuildwheel.windows]

[tool.cibuildwheel.pyodide]

[tool.isort]
profile = "black"

@@ -1,38 +1,41 @@
# Our libraries
spacy-legacy>=3.0.11,<3.1.0
spacy-legacy>=3.0.9,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.3.4,<8.4.0
thinc>=8.0.14,<8.1.0
blis>=0.4.0,<0.8.0
ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.2.0
wasabi>=0.9.1,<1.1.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
typer-slim>=0.3.0,<1.0.0
weasel>=0.1.0,<0.5.0
typer>=0.3.0,<0.5.0
pathy>=0.3.5
smart-open>=5.2.1,<7.0.0
# Third party dependencies
numpy>=2.0.0,<3.0.0
numpy>=1.15.0
requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
jinja2
langcodes>=3.2.0,<4.0.0
# Official Python utilities
setuptools
packaging>=20.0
# Require and pin typing_extensions for all python versions as a workaround
# for pydantic incompatibility with typing_extensions>=4.6.0
typing_extensions>=3.7.4.1,<4.6.0
# Development dependencies
pre-commit>=2.13.0
cython>=3.0,<4.0
cython>=0.25,<3.0
pytest>=5.2.0,!=7.1.0
pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.8.0,<6.0.0
flake8>=3.8.0,<3.10.0
hypothesis>=3.27.0,<7.0.0
mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8"
mypy==0.910
types-dataclasses>=0.1.3; python_version < "3.7"
types-mock>=0.1.1
types-setuptools>=57.0.0
types-requests
types-setuptools>=57.0.0
black==22.3.0
cython-lint>=0.15.0
isort>=5.0,<6.0
black>=22.0,<23.0

setup.cfg
|
@ -17,11 +17,11 @@ classifiers =
|
|||
Operating System :: Microsoft :: Windows
|
||||
Programming Language :: Cython
|
||||
Programming Language :: Python :: 3
|
||||
Programming Language :: Python :: 3.6
|
||||
Programming Language :: Python :: 3.7
|
||||
Programming Language :: Python :: 3.8
|
||||
Programming Language :: Python :: 3.9
|
||||
Programming Language :: Python :: 3.10
|
||||
Programming Language :: Python :: 3.11
|
||||
Programming Language :: Python :: 3.12
|
||||
Programming Language :: Python :: 3.13
|
||||
Topic :: Scientific/Engineering
|
||||
project_urls =
|
||||
Release notes = https://github.com/explosion/spaCy/releases
|
||||
|
@ -30,41 +30,43 @@ project_urls =
|
|||
[options]
|
||||
zip_safe = false
|
||||
include_package_data = true
|
||||
python_requires = >=3.9,<3.14
|
||||
# NOTE: This section is superseded by pyproject.toml and will be removed in
|
||||
# spaCy v4
|
||||
python_requires = >=3.6
|
||||
setup_requires =
|
||||
cython>=3.0,<4.0
|
||||
numpy>=2.0.0,<3.0.0; python_version < "3.9"
|
||||
numpy>=2.0.0,<3.0.0; python_version >= "3.9"
|
||||
cython>=0.25,<3.0
|
||||
numpy>=1.15.0
|
||||
# We also need our Cython packages here to compile against
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
thinc>=8.3.4,<8.4.0
|
||||
thinc>=8.0.14,<8.1.0
|
||||
install_requires =
|
||||
# Our libraries
|
||||
spacy-legacy>=3.0.11,<3.1.0
|
||||
spacy-legacy>=3.0.9,<3.1.0
|
||||
spacy-loggers>=1.0.0,<2.0.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.3.4,<8.4.0
|
||||
wasabi>=0.9.1,<1.2.0
|
||||
thinc>=8.0.14,<8.1.0
|
||||
blis>=0.4.0,<0.8.0
|
||||
wasabi>=0.9.1,<1.1.0
|
||||
srsly>=2.4.3,<3.0.0
|
||||
catalogue>=2.0.6,<2.1.0
|
||||
weasel>=0.1.0,<0.5.0
|
||||
# Third-party dependencies
|
||||
typer-slim>=0.3.0,<1.0.0
|
||||
typer>=0.3.0,<0.5.0
|
||||
pathy>=0.3.5
|
||||
smart-open>=5.2.1,<7.0.0
|
||||
tqdm>=4.38.0,<5.0.0
|
||||
numpy>=1.15.0; python_version < "3.9"
|
||||
numpy>=1.19.0; python_version >= "3.9"
|
||||
numpy>=1.15.0
|
||||
requests>=2.13.0,<3.0.0
|
||||
pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
|
||||
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
|
||||
jinja2
|
||||
# Official Python utilities
|
||||
setuptools
|
||||
packaging>=20.0
|
||||
# Require and pin typing_extensions for all python versions as a workaround
|
||||
# for pydantic incompatibility with typing_extensions>=4.6.0
|
||||
typing_extensions>=3.7.4.1,<4.6.0
|
||||
langcodes>=3.2.0,<4.0.0
|
||||
|
||||
[options.entry_points]
|
||||
console_scripts =
|
||||
|
@ -74,53 +76,45 @@ console_scripts =
|
|||
lookups =
|
||||
spacy_lookups_data>=1.0.3,<1.1.0
|
||||
transformers =
|
||||
spacy_transformers>=1.1.2,<1.4.0
|
||||
spacy_transformers>=1.1.2,<1.2.0
|
||||
ray =
|
||||
spacy_ray>=0.1.0,<1.0.0
|
||||
cuda =
|
||||
cupy>=5.0.0b4,<13.0.0
|
||||
cupy>=5.0.0b4,<11.0.0
|
||||
cuda80 =
|
||||
cupy-cuda80>=5.0.0b4,<13.0.0
|
||||
cupy-cuda80>=5.0.0b4,<11.0.0
|
||||
cuda90 =
|
||||
cupy-cuda90>=5.0.0b4,<13.0.0
|
||||
cupy-cuda90>=5.0.0b4,<11.0.0
|
||||
cuda91 =
|
||||
cupy-cuda91>=5.0.0b4,<13.0.0
|
||||
cupy-cuda91>=5.0.0b4,<11.0.0
|
||||
cuda92 =
|
||||
cupy-cuda92>=5.0.0b4,<13.0.0
|
||||
cupy-cuda92>=5.0.0b4,<11.0.0
|
||||
cuda100 =
|
||||
cupy-cuda100>=5.0.0b4,<13.0.0
|
||||
cupy-cuda100>=5.0.0b4,<11.0.0
|
||||
cuda101 =
|
||||
cupy-cuda101>=5.0.0b4,<13.0.0
|
||||
cupy-cuda101>=5.0.0b4,<11.0.0
|
||||
cuda102 =
|
||||
cupy-cuda102>=5.0.0b4,<13.0.0
|
||||
cupy-cuda102>=5.0.0b4,<11.0.0
|
||||
cuda110 =
|
||||
cupy-cuda110>=5.0.0b4,<13.0.0
|
||||
cupy-cuda110>=5.0.0b4,<11.0.0
|
||||
cuda111 =
|
||||
cupy-cuda111>=5.0.0b4,<13.0.0
|
||||
cupy-cuda111>=5.0.0b4,<11.0.0
|
||||
cuda112 =
|
||||
cupy-cuda112>=5.0.0b4,<13.0.0
|
||||
cupy-cuda112>=5.0.0b4,<11.0.0
|
||||
cuda113 =
|
||||
cupy-cuda113>=5.0.0b4,<13.0.0
|
||||
cupy-cuda113>=5.0.0b4,<11.0.0
|
||||
cuda114 =
|
||||
cupy-cuda114>=5.0.0b4,<13.0.0
|
||||
cupy-cuda114>=5.0.0b4,<11.0.0
|
||||
cuda115 =
|
||||
cupy-cuda115>=5.0.0b4,<13.0.0
|
||||
cuda116 =
|
||||
cupy-cuda116>=5.0.0b4,<13.0.0
|
||||
cuda117 =
|
||||
cupy-cuda117>=5.0.0b4,<13.0.0
|
||||
cuda11x =
|
||||
cupy-cuda11x>=11.0.0,<13.0.0
|
||||
cuda12x =
|
||||
cupy-cuda12x>=11.5.0,<13.0.0
|
||||
cuda-autodetect =
|
||||
cupy-wheel>=11.0.0,<13.0.0
|
||||
cupy-cuda115>=5.0.0b4,<11.0.0
|
||||
apple =
|
||||
thinc-apple-ops>=1.0.0,<2.0.0
|
||||
thinc-apple-ops>=0.0.4,<1.0.0
|
||||
# Language tokenizers with external dependencies
|
||||
ja =
|
||||
sudachipy>=0.5.2,!=0.6.1
|
||||
sudachidict_core>=20211220
|
||||
ko =
|
||||
natto-py>=0.9.0
|
||||
natto-py==0.9.0
|
||||
th =
|
||||
pythainlp>=2.0
|
||||
|
||||
|
|
setup.py
|
@ -1,9 +1,10 @@
|
|||
#!/usr/bin/env python
|
||||
from setuptools import Extension, setup, find_packages
|
||||
import sys
|
||||
import platform
|
||||
import numpy
|
||||
from setuptools.command.build_ext import build_ext
|
||||
from sysconfig import get_path
|
||||
from distutils.command.build_ext import build_ext
|
||||
from distutils.sysconfig import get_python_inc
|
||||
from pathlib import Path
|
||||
import shutil
|
||||
from Cython.Build import cythonize
|
||||
|
@ -29,9 +30,7 @@ MOD_NAMES = [
|
|||
"spacy.lexeme",
|
||||
"spacy.vocab",
|
||||
"spacy.attrs",
|
||||
"spacy.kb.candidate",
|
||||
"spacy.kb.kb",
|
||||
"spacy.kb.kb_in_memory",
|
||||
"spacy.kb",
|
||||
"spacy.ml.parser_model",
|
||||
"spacy.morphology",
|
||||
"spacy.pipeline.dep_parser",
|
||||
|
@ -78,7 +77,6 @@ COMPILER_DIRECTIVES = {
|
|||
"language_level": -3,
|
||||
"embedsignature": True,
|
||||
"annotation_typing": False,
|
||||
"profile": sys.version_info < (3, 12),
|
||||
}
|
||||
# Files to copy into the package that are otherwise not included
|
||||
COPY_FILES = {
|
||||
|
@ -88,6 +86,30 @@ COPY_FILES = {
|
|||
}
|
||||
|
||||
|
||||
def is_new_osx():
|
||||
"""Check whether we're on OSX >= 10.7"""
|
||||
if sys.platform != "darwin":
|
||||
return False
|
||||
mac_ver = platform.mac_ver()[0]
|
||||
if mac_ver.startswith("10"):
|
||||
minor_version = int(mac_ver.split(".")[1])
|
||||
if minor_version >= 7:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
return False
|
||||
|
||||
|
||||
if is_new_osx():
|
||||
# On Mac, use libc++ because Apple deprecated use of
|
||||
# libstdc
|
||||
COMPILE_OPTIONS["other"].append("-stdlib=libc++")
|
||||
LINK_OPTIONS["other"].append("-lc++")
|
||||
# g++ (used by unix compiler on mac) links to libstdc++ as a default lib.
|
||||
# See: https://stackoverflow.com/questions/1653047/avoid-linking-to-libstdc
|
||||
LINK_OPTIONS["other"].append("-nodefaultlibs")
|
||||
|
||||
|
||||
# By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options
|
||||
# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
|
||||
class build_ext_options:
|
||||
|
@ -104,8 +126,8 @@ class build_ext_options:
|
|||
|
||||
class build_ext_subclass(build_ext, build_ext_options):
|
||||
def build_extensions(self):
|
||||
if self.parallel is None and os.environ.get("SPACY_NUM_BUILD_JOBS") is not None:
|
||||
self.parallel = int(os.environ.get("SPACY_NUM_BUILD_JOBS"))
|
||||
if not self.parallel:
|
||||
self.parallel = int(os.environ.get("SPACY_NUM_BUILD_JOBS", 1))
|
||||
build_ext_options.build_options(self)
|
||||
build_ext.build_extensions(self)
|
||||
|
||||
|
@ -180,28 +202,13 @@ def setup_package():
|
|||
|
||||
include_dirs = [
|
||||
numpy.get_include(),
|
||||
get_path("include"),
|
||||
get_python_inc(plat_specific=True),
|
||||
]
|
||||
ext_modules = []
|
||||
ext_modules.append(
|
||||
Extension(
|
||||
"spacy.matcher.levenshtein",
|
||||
[
|
||||
"spacy/matcher/levenshtein.pyx",
|
||||
"spacy/matcher/polyleven.c",
|
||||
],
|
||||
language="c",
|
||||
include_dirs=include_dirs,
|
||||
)
|
||||
)
|
||||
for name in MOD_NAMES:
|
||||
mod_path = name.replace(".", "/") + ".pyx"
|
||||
ext = Extension(
|
||||
name,
|
||||
[mod_path],
|
||||
language="c++",
|
||||
include_dirs=include_dirs,
|
||||
extra_compile_args=["-std=c++11"],
|
||||
name, [mod_path], language="c++", include_dirs=include_dirs, extra_compile_args=["-std=c++11"]
|
||||
)
|
||||
ext_modules.append(ext)
|
||||
print("Cythonizing sources")
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import sys
|
||||
from typing import Union, Iterable, Dict, Any
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, Union
|
||||
import sys
|
||||
|
||||
# set library-specific custom warning handling before doing anything else
|
||||
from .errors import setup_default_warnings
|
||||
|
@ -8,18 +8,20 @@ from .errors import setup_default_warnings
|
|||
setup_default_warnings() # noqa: E402
|
||||
|
||||
# These are imported as part of the API
|
||||
from thinc.api import Config, prefer_gpu, require_cpu, require_gpu # noqa: F401
|
||||
from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401
|
||||
from thinc.api import Config
|
||||
|
||||
from . import pipeline # noqa: F401
|
||||
from . import util
|
||||
from .about import __version__ # noqa: F401
|
||||
from .cli.info import info # noqa: F401
|
||||
from .errors import Errors
|
||||
from .glossary import explain # noqa: F401
|
||||
from .about import __version__ # noqa: F401
|
||||
from .util import registry, logger # noqa: F401
|
||||
|
||||
from .errors import Errors
|
||||
from .language import Language
|
||||
from .registrations import REGISTRY_POPULATED, populate_registry
|
||||
from .util import logger, registry # noqa: F401
|
||||
from .vocab import Vocab
|
||||
from . import util
|
||||
|
||||
|
||||
if sys.maxunicode == 65535:
|
||||
raise SystemError(Errors.E130)
|
||||
|
@ -29,33 +31,25 @@ def load(
|
|||
name: Union[str, Path],
|
||||
*,
|
||||
vocab: Union[Vocab, bool] = True,
|
||||
disable: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
|
||||
enable: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
|
||||
exclude: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
|
||||
disable: Iterable[str] = util.SimpleFrozenList(),
|
||||
exclude: Iterable[str] = util.SimpleFrozenList(),
|
||||
config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
|
||||
) -> Language:
|
||||
"""Load a spaCy model from an installed package or a local path.
|
||||
|
||||
name (str): Package name or model path.
|
||||
vocab (Vocab): A Vocab object. If True, a vocab is created.
|
||||
disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled
|
||||
disable (Iterable[str]): Names of pipeline components to disable. Disabled
|
||||
pipes will be loaded but they won't be run unless you explicitly
|
||||
enable them by calling nlp.enable_pipe.
|
||||
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
|
||||
pipes will be disabled (but can be enabled later using nlp.enable_pipe).
|
||||
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
|
||||
components won't be loaded.
|
||||
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
|
||||
keyed by section values in dot notation.
|
||||
RETURNS (Language): The loaded nlp object.
|
||||
"""
|
||||
return util.load_model(
|
||||
name,
|
||||
vocab=vocab,
|
||||
disable=disable,
|
||||
enable=enable,
|
||||
exclude=exclude,
|
||||
config=config,
|
||||
name, vocab=vocab, disable=disable, exclude=exclude, config=config
|
||||
)
|
||||
|
||||
|
||||
|
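# Usage sketch for the load() API shown above ("en_core_web_sm" is an
# example of an installed pipeline package, not something pinned by this
# diff). A disabled component is loaded but not run until re-enabled.
import spacy

nlp = spacy.load("en_core_web_sm", disable=["ner"])
doc = nlp("Apple is looking at buying a U.K. startup.")
nlp.enable_pipe("ner")  # switch the disabled component back on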
|
|
@ -1,5 +1,7 @@
|
|||
# fmt: off
|
||||
__title__ = "spacy"
|
||||
__version__ = "3.8.7"
|
||||
__version__ = "3.3.3"
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
__projects__ = "https://github.com/explosion/projects"
|
||||
__projects_branch__ = "v3"
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
# Reserve 64 values for flag features
|
||||
from . cimport symbols
|
||||
|
||||
|
||||
cdef enum attr_id_t:
|
||||
NULL_ATTR
|
||||
IS_ALPHA
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
# cython: profile=False
|
||||
from .errors import Errors
|
||||
|
||||
IOB_STRINGS = ("", "I", "O", "B")
|
||||
|
@ -118,7 +117,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
|||
if "pos" in stringy_attrs:
|
||||
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
|
||||
if "morph" in stringy_attrs:
|
||||
morphs = stringy_attrs.pop("morph") # no-cython-lint
|
||||
morphs = stringy_attrs.pop("morph")
|
||||
if "number" in stringy_attrs:
|
||||
stringy_attrs.pop("number")
|
||||
if "tenspect" in stringy_attrs:
|
||||
|
|
|
@ -1,40 +1,32 @@
|
|||
from wasabi import msg
|
||||
|
||||
# Needed for testing
|
||||
from . import download as download_module # noqa: F401
|
||||
from ._util import app, setup_cli # noqa: F401
|
||||
from .apply import apply # noqa: F401
|
||||
from .assemble import assemble_cli # noqa: F401
|
||||
|
||||
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
|
||||
# are registered automatically and won't have to be imported here.
|
||||
from .benchmark_speed import benchmark_speed_cli # noqa: F401
|
||||
from .convert import convert # noqa: F401
|
||||
from .debug_config import debug_config # noqa: F401
|
||||
from .debug_data import debug_data # noqa: F401
|
||||
from .debug_diff import debug_diff # noqa: F401
|
||||
from .debug_model import debug_model # noqa: F401
|
||||
from .download import download # noqa: F401
|
||||
from .evaluate import evaluate # noqa: F401
|
||||
from .find_function import find_function # noqa: F401
|
||||
from .find_threshold import find_threshold # noqa: F401
|
||||
from .info import info # noqa: F401
|
||||
from .init_config import fill_config, init_config # noqa: F401
|
||||
from .init_pipeline import init_pipeline_cli # noqa: F401
|
||||
from .package import package # noqa: F401
|
||||
from .pretrain import pretrain # noqa: F401
|
||||
from .profile import profile # noqa: F401
|
||||
from .project.assets import project_assets # type: ignore[attr-defined] # noqa: F401
|
||||
from .project.clone import project_clone # type: ignore[attr-defined] # noqa: F401
|
||||
from .project.document import ( # type: ignore[attr-defined] # noqa: F401
|
||||
project_document,
|
||||
)
|
||||
from .project.dvc import project_update_dvc # type: ignore[attr-defined] # noqa: F401
|
||||
from .project.pull import project_pull # type: ignore[attr-defined] # noqa: F401
|
||||
from .project.push import project_push # type: ignore[attr-defined] # noqa: F401
|
||||
from .project.run import project_run # type: ignore[attr-defined] # noqa: F401
|
||||
from .train import train_cli # type: ignore[attr-defined] # noqa: F401
|
||||
from .validate import validate # type: ignore[attr-defined] # noqa: F401
|
||||
from .train import train_cli # noqa: F401
|
||||
from .assemble import assemble_cli # noqa: F401
|
||||
from .pretrain import pretrain # noqa: F401
|
||||
from .debug_data import debug_data # noqa: F401
|
||||
from .debug_config import debug_config # noqa: F401
|
||||
from .debug_model import debug_model # noqa: F401
|
||||
from .debug_diff import debug_diff # noqa: F401
|
||||
from .evaluate import evaluate # noqa: F401
|
||||
from .convert import convert # noqa: F401
|
||||
from .init_pipeline import init_pipeline_cli # noqa: F401
|
||||
from .init_config import init_config, fill_config # noqa: F401
|
||||
from .validate import validate # noqa: F401
|
||||
from .project.clone import project_clone # noqa: F401
|
||||
from .project.assets import project_assets # noqa: F401
|
||||
from .project.run import project_run # noqa: F401
|
||||
from .project.dvc import project_update_dvc # noqa: F401
|
||||
from .project.push import project_push # noqa: F401
|
||||
from .project.pull import project_pull # noqa: F401
|
||||
from .project.document import project_document # noqa: F401
|
||||
|
||||
|
||||
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
||||
|
|
|
@ -1,50 +1,36 @@
|
|||
import hashlib
|
||||
import os
|
||||
import shutil
|
||||
from typing import Dict, Any, Union, List, Optional, Tuple, Iterable
|
||||
from typing import TYPE_CHECKING, overload
|
||||
import sys
|
||||
from configparser import InterpolationError
|
||||
from contextlib import contextmanager
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
Union,
|
||||
overload,
|
||||
)
|
||||
|
||||
from wasabi import msg, Printer
|
||||
import srsly
|
||||
import hashlib
|
||||
import typer
|
||||
from click import NoSuchOption
|
||||
from click.parser import split_arg_string
|
||||
from thinc.api import Config, ConfigValidationError, require_gpu
|
||||
from thinc.util import gpu_is_available
|
||||
from typer.main import get_command
|
||||
from wasabi import Printer, msg
|
||||
from weasel import app as project_cli
|
||||
from contextlib import contextmanager
|
||||
from thinc.api import Config, ConfigValidationError, require_gpu
|
||||
from thinc.util import has_cupy, gpu_is_available
|
||||
from configparser import InterpolationError
|
||||
import os
|
||||
|
||||
from .. import about
|
||||
from ..compat import Literal
|
||||
from ..schemas import validate
|
||||
from ..util import (
|
||||
ENV_VARS,
|
||||
SimpleFrozenDict,
|
||||
import_file,
|
||||
is_compatible_version,
|
||||
logger,
|
||||
make_tempdir,
|
||||
registry,
|
||||
run_command,
|
||||
)
|
||||
from ..schemas import ProjectConfigSchema, validate
|
||||
from ..util import import_file, run_command, make_tempdir, registry, logger
|
||||
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
|
||||
from .. import about
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathy import Pathy # noqa: F401
|
||||
|
||||
|
||||
SDIST_SUFFIX = ".tar.gz"
|
||||
WHEEL_SUFFIX = "-py3-none-any.whl"
|
||||
|
||||
PROJECT_FILE = "project.yml"
|
||||
PROJECT_LOCK = "project.lock"
|
||||
COMMAND = "python -m spacy"
|
||||
NAME = "spacy"
|
||||
HELP = """spaCy Command-line Interface
|
||||
|
@ -60,7 +46,6 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
|
|||
commands to check and validate your config files, training and evaluation data,
|
||||
and custom model implementations.
|
||||
"""
|
||||
BENCHMARK_HELP = """Commands for benchmarking pipelines."""
|
||||
INIT_HELP = """Commands for initializing configs and pipeline packages."""
|
||||
|
||||
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
||||
|
@ -69,13 +54,12 @@ Arg = typer.Argument
|
|||
Opt = typer.Option
|
||||
|
||||
app = typer.Typer(name=NAME, help=HELP)
|
||||
benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
|
||||
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
|
||||
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
|
||||
init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
|
||||
|
||||
app.add_typer(project_cli, name="project", help=PROJECT_HELP, no_args_is_help=True)
|
||||
app.add_typer(project_cli)
|
||||
app.add_typer(debug_cli)
|
||||
app.add_typer(benchmark_cli)
|
||||
app.add_typer(init_cli)
|
||||
|
||||
|
||||
|
@ -103,9 +87,9 @@ def parse_config_overrides(
|
|||
cli_overrides = _parse_overrides(args, is_cli=True)
|
||||
if cli_overrides:
|
||||
keys = [k for k in cli_overrides if k not in env_overrides]
|
||||
logger.debug("Config overrides from CLI: %s", keys)
|
||||
logger.debug(f"Config overrides from CLI: {keys}")
|
||||
if env_overrides:
|
||||
logger.debug("Config overrides from env variables: %s", list(env_overrides))
|
||||
logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
|
||||
return {**cli_overrides, **env_overrides}
|
||||
|
||||
|
||||
|
@ -148,6 +132,148 @@ def _parse_override(value: Any) -> Any:
|
|||
return str(value)
|
||||
|
||||
|
||||
def load_project_config(
|
||||
path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
|
||||
) -> Dict[str, Any]:
|
||||
"""Load the project.yml file from a directory and validate it. Also make
|
||||
sure that all directories defined in the config exist.
|
||||
|
||||
path (Path): The path to the project directory.
|
||||
interpolate (bool): Whether to substitute project variables.
|
||||
overrides (Dict[str, Any]): Optional config overrides.
|
||||
RETURNS (Dict[str, Any]): The loaded project.yml.
|
||||
"""
|
||||
config_path = path / PROJECT_FILE
|
||||
if not config_path.exists():
|
||||
msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
|
||||
invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
|
||||
try:
|
||||
config = srsly.read_yaml(config_path)
|
||||
except ValueError as e:
|
||||
msg.fail(invalid_err, e, exits=1)
|
||||
errors = validate(ProjectConfigSchema, config)
|
||||
if errors:
|
||||
msg.fail(invalid_err)
|
||||
print("\n".join(errors))
|
||||
sys.exit(1)
|
||||
validate_project_version(config)
|
||||
validate_project_commands(config)
|
||||
# Make sure directories defined in config exist
|
||||
for subdir in config.get("directories", []):
|
||||
dir_path = path / subdir
|
||||
if not dir_path.exists():
|
||||
dir_path.mkdir(parents=True)
|
||||
if interpolate:
|
||||
err = f"{PROJECT_FILE} validation error"
|
||||
with show_validation_error(title=err, hint_fill=False):
|
||||
config = substitute_project_variables(config, overrides)
|
||||
return config
|
||||
|
||||
|
||||
def substitute_project_variables(
|
||||
config: Dict[str, Any],
|
||||
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
||||
key: str = "vars",
|
||||
env_key: str = "env",
|
||||
) -> Dict[str, Any]:
|
||||
"""Interpolate variables in the project file using the config system.
|
||||
|
||||
config (Dict[str, Any]): The project config.
|
||||
overrides (Dict[str, Any]): Optional config overrides.
|
||||
key (str): Key containing variables in project config.
|
||||
env_key (str): Key containing environment variable mapping in project config.
|
||||
RETURNS (Dict[str, Any]): The interpolated project config.
|
||||
"""
|
||||
config.setdefault(key, {})
|
||||
config.setdefault(env_key, {})
|
||||
# Substitute references to env vars with their values
|
||||
for config_var, env_var in config[env_key].items():
|
||||
config[env_key][config_var] = _parse_override(os.environ.get(env_var, ""))
|
||||
# Need to put variables in the top scope again so we can have a top-level
|
||||
# section "project" (otherwise, a list of commands in the top scope wouldn't)
|
||||
# be allowed by Thinc's config system
|
||||
cfg = Config({"project": config, key: config[key], env_key: config[env_key]})
|
||||
cfg = Config().from_str(cfg.to_str(), overrides=overrides)
|
||||
interpolated = cfg.interpolate()
|
||||
return dict(interpolated["project"])
|
||||
|
||||
|
||||
def validate_project_version(config: Dict[str, Any]) -> None:
|
||||
"""If the project defines a compatible spaCy version range, chec that it's
|
||||
compatible with the current version of spaCy.
|
||||
|
||||
config (Dict[str, Any]): The loaded config.
|
||||
"""
|
||||
spacy_version = config.get("spacy_version", None)
|
||||
if spacy_version and not is_compatible_version(about.__version__, spacy_version):
|
||||
err = (
|
||||
f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
|
||||
f"that's not compatible with the version of spaCy you're running "
|
||||
f"({about.__version__}). You can edit version requirement in the "
|
||||
f"{PROJECT_FILE} to load it, but the project may not run as expected."
|
||||
)
|
||||
msg.fail(err, exits=1)
|
||||
|
||||
|
||||
def validate_project_commands(config: Dict[str, Any]) -> None:
|
||||
"""Check that project commands and workflows are valid, don't contain
|
||||
duplicates, don't clash and only refer to commands that exist.
|
||||
|
||||
config (Dict[str, Any]): The loaded config.
|
||||
"""
|
||||
command_names = [cmd["name"] for cmd in config.get("commands", [])]
|
||||
workflows = config.get("workflows", {})
|
||||
duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1])
|
||||
if duplicates:
|
||||
err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
|
||||
msg.fail(err, exits=1)
|
||||
for workflow_name, workflow_steps in workflows.items():
|
||||
if workflow_name in command_names:
|
||||
err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
|
||||
msg.fail(err, exits=1)
|
||||
for step in workflow_steps:
|
||||
if step not in command_names:
|
||||
msg.fail(
|
||||
f"Unknown command specified in workflow '{workflow_name}': {step}",
|
||||
f"Workflows can only refer to commands defined in the 'commands' "
|
||||
f"section of the {PROJECT_FILE}.",
|
||||
exits=1,
|
||||
)
|
||||
|
||||
|
||||
def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
|
||||
"""Get the hash for a JSON-serializable object.
|
||||
|
||||
data: The data to hash.
|
||||
exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
|
||||
RETURNS (str): The hash.
|
||||
"""
|
||||
if isinstance(data, dict):
|
||||
data = {k: v for k, v in data.items() if k not in exclude}
|
||||
data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
|
||||
return hashlib.md5(data_str).hexdigest()
|
||||
|
||||
|
||||
def get_checksum(path: Union[Path, str]) -> str:
|
||||
"""Get the checksum for a file or directory given its file path. If a
|
||||
directory path is provided, this uses all files in that directory.
|
||||
|
||||
path (Union[Path, str]): The file or directory path.
|
||||
RETURNS (str): The checksum.
|
||||
"""
|
||||
path = Path(path)
|
||||
if not (path.is_file() or path.is_dir()):
|
||||
msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
|
||||
if path.is_file():
|
||||
return hashlib.md5(Path(path).read_bytes()).hexdigest()
|
||||
else:
|
||||
# TODO: this is currently pretty slow
|
||||
dir_checksum = hashlib.md5()
|
||||
for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
|
||||
dir_checksum.update(sub_file.read_bytes())
|
||||
return dir_checksum.hexdigest()
|
||||
|
||||
|
||||
@contextmanager
|
||||
def show_validation_error(
|
||||
file_path: Optional[Union[str, Path]] = None,
|
||||
|
@ -205,10 +331,142 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
|
|||
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
|
||||
|
||||
|
||||
def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
|
||||
"""Upload a file.
|
||||
|
||||
src (Path): The source path.
|
||||
url (str): The destination URL to upload to.
|
||||
"""
|
||||
import smart_open
|
||||
|
||||
dest = str(dest)
|
||||
with smart_open.open(dest, mode="wb") as output_file:
|
||||
with src.open(mode="rb") as input_file:
|
||||
output_file.write(input_file.read())
|
||||
|
||||
|
||||
def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None:
|
||||
"""Download a file using smart_open.
|
||||
|
||||
url (str): The URL of the file.
|
||||
dest (Path): The destination path.
|
||||
force (bool): Whether to force download even if file exists.
|
||||
If False, the download will be skipped.
|
||||
"""
|
||||
import smart_open
|
||||
|
||||
if dest.exists() and not force:
|
||||
return None
|
||||
src = str(src)
|
||||
with smart_open.open(src, mode="rb", compression="disable") as input_file:
|
||||
with dest.open(mode="wb") as output_file:
|
||||
shutil.copyfileobj(input_file, output_file)
|
||||
|
||||
|
||||
def ensure_pathy(path):
|
||||
"""Temporary helper to prevent importing Pathy globally (which can cause
|
||||
slow and annoying Google Cloud warning)."""
|
||||
from pathy import Pathy # noqa: F811
|
||||
|
||||
return Pathy(path)
|
||||
|
||||
|
||||
def git_checkout(
|
||||
repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False
|
||||
):
|
||||
git_version = get_git_version()
|
||||
if dest.exists():
|
||||
msg.fail("Destination of checkout must not exist", exits=1)
|
||||
if not dest.parent.exists():
|
||||
msg.fail("Parent of destination of checkout must exist", exits=1)
|
||||
if sparse and git_version >= (2, 22):
|
||||
return git_sparse_checkout(repo, subpath, dest, branch)
|
||||
elif sparse:
|
||||
# Only show warnings if the user explicitly wants sparse checkout but
|
||||
# the Git version doesn't support it
|
||||
err_old = (
|
||||
f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
|
||||
f"that doesn't fully support sparse checkout yet."
|
||||
)
|
||||
err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
|
||||
msg.warn(
|
||||
f"{err_unk if git_version == (0, 0) else err_old} "
|
||||
f"This means that more files than necessary may be downloaded "
|
||||
f"temporarily. To only download the files needed, make sure "
|
||||
f"you're using Git v2.22 or above."
|
||||
)
|
||||
with make_tempdir() as tmp_dir:
|
||||
cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
|
||||
run_command(cmd, capture=True)
|
||||
# We need Path(name) to make sure we also support subdirectories
|
||||
try:
|
||||
source_path = tmp_dir / Path(subpath)
|
||||
if not is_subpath_of(tmp_dir, source_path):
|
||||
err = f"'{subpath}' is a path outside of the cloned repository."
|
||||
msg.fail(err, repo, exits=1)
|
||||
shutil.copytree(str(source_path), str(dest))
|
||||
except FileNotFoundError:
|
||||
err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')"
|
||||
msg.fail(err, repo, exits=1)
|
||||
|
||||
|
||||
def git_sparse_checkout(repo, subpath, dest, branch):
|
||||
# We're using Git, partial clone and sparse checkout to
|
||||
# only clone the files we need
|
||||
# This ends up being RIDICULOUS. omg.
|
||||
# So, every tutorial and SO post talks about 'sparse checkout'...But they
|
||||
# go and *clone* the whole repo. Worthless. And cloning part of a repo
|
||||
# turns out to be completely broken. The only way to specify a "path" is..
|
||||
# a path *on the server*? The contents of which, specifies the paths. Wat.
|
||||
# Obviously this is hopelessly broken and insecure, because you can query
|
||||
# arbitrary paths on the server! So nobody enables this.
|
||||
# What we have to do is disable *all* files. We could then just checkout
|
||||
# the path, and it'd "work", but be hopelessly slow...Because it goes and
|
||||
# transfers every missing object one-by-one. So the final piece is that we
|
||||
# need to use some weird git internals to fetch the missings in bulk, and
|
||||
# *that* we can do by path.
|
||||
# We're using Git and sparse checkout to only clone the files we need
|
||||
with make_tempdir() as tmp_dir:
|
||||
# This is the "clone, but don't download anything" part.
|
||||
cmd = (
|
||||
f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
|
||||
f"-b {branch} --filter=blob:none"
|
||||
)
|
||||
run_command(cmd)
|
||||
# Now we need to find the missing filenames for the subpath we want.
|
||||
# Looking for this 'rev-list' command in the git --help? Hah.
|
||||
cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
|
||||
ret = run_command(cmd, capture=True)
|
||||
git_repo = _http_to_git(repo)
|
||||
# Now pass those missings into another bit of git internals
|
||||
missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
|
||||
if not missings:
|
||||
err = (
|
||||
f"Could not find any relevant files for '{subpath}'. "
|
||||
f"Did you specify a correct and complete path within repo '{repo}' "
|
||||
f"and branch {branch}?"
|
||||
)
|
||||
msg.fail(err, exits=1)
|
||||
cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
|
||||
run_command(cmd, capture=True)
|
||||
# And finally, we can checkout our subpath
|
||||
cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
|
||||
run_command(cmd, capture=True)
|
||||
|
||||
# Get a subdirectory of the cloned path, if appropriate
|
||||
source_path = tmp_dir / Path(subpath)
|
||||
if not is_subpath_of(tmp_dir, source_path):
|
||||
err = f"'{subpath}' is a path outside of the cloned repository."
|
||||
msg.fail(err, repo, exits=1)
|
||||
|
||||
shutil.move(str(source_path), str(dest))
|
||||
|
||||
|
||||
def get_git_version(
|
||||
error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
|
||||
) -> Tuple[int, int]:
|
||||
"""Get the version of git and raise an error if calling 'git --version' fails.
|
||||
|
||||
error (str): The error message to show.
|
||||
RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
|
||||
(0, 0) if the version couldn't be determined.
|
||||
|
@ -224,6 +482,30 @@ def get_git_version(
|
|||
return int(version[0]), int(version[1])
|
||||
|
||||
|
||||
def _http_to_git(repo: str) -> str:
|
||||
if repo.startswith("http://"):
|
||||
repo = repo.replace(r"http://", r"https://")
|
||||
if repo.startswith(r"https://"):
|
||||
repo = repo.replace("https://", "git@").replace("/", ":", 1)
|
||||
if repo.endswith("/"):
|
||||
repo = repo[:-1]
|
||||
repo = f"{repo}.git"
|
||||
return repo
|
||||
|
||||
|
||||
def is_subpath_of(parent, child):
|
||||
"""
|
||||
Check whether `child` is a path contained within `parent`.
|
||||
"""
|
||||
# Based on https://stackoverflow.com/a/37095733 .
|
||||
|
||||
# In Python 3.9, the `Path.is_relative_to()` method will supplant this, so
|
||||
# we can stop using crusty old os.path functions.
|
||||
parent_realpath = os.path.realpath(parent)
|
||||
child_realpath = os.path.realpath(child)
|
||||
return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath
|
||||
|
||||
|
||||
@overload
|
||||
def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]:
|
||||
...
|
||||
|
@ -272,41 +554,5 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
|
|||
require_gpu(use_gpu)
|
||||
else:
|
||||
local_msg.info("Using CPU")
|
||||
if gpu_is_available():
|
||||
if has_cupy and gpu_is_available():
|
||||
local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
|
||||
|
||||
|
||||
def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
|
||||
"""Given a directory and a suffix, recursively find all files matching the suffix.
|
||||
Directories or files with names beginning with a . are ignored, but hidden flags on
|
||||
filesystems are not checked.
|
||||
When provided with a suffix `None`, there is no suffix-based filtering."""
|
||||
if not path.is_dir():
|
||||
return [path]
|
||||
paths = [path]
|
||||
locs = []
|
||||
seen = set()
|
||||
for path in paths:
|
||||
if str(path) in seen:
|
||||
continue
|
||||
seen.add(str(path))
|
||||
if path.parts[-1].startswith("."):
|
||||
continue
|
||||
elif path.is_dir():
|
||||
paths.extend(path.iterdir())
|
||||
elif suffix is not None and not path.parts[-1].endswith(suffix):
|
||||
continue
|
||||
else:
|
||||
locs.append(path)
|
||||
# It's good to sort these, in case the ordering messes up cache.
|
||||
locs.sort()
|
||||
return locs
|
||||
|
||||
|
||||
def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
|
||||
"""Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s,
|
||||
as happens with `round(number, ndigits)`"""
|
||||
if isinstance(number, float):
|
||||
return f"{number:.{ndigits}f}"
|
||||
else:
|
||||
return str(number)
|
||||
|
|
|
@ -1,142 +0,0 @@
|
|||
from itertools import chain
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional, Union, cast
|
||||
|
||||
import srsly
|
||||
import tqdm
|
||||
from wasabi import msg
|
||||
|
||||
from ..tokens import Doc, DocBin
|
||||
from ..util import ensure_path, load_model
|
||||
from ..vocab import Vocab
|
||||
from ._util import Arg, Opt, app, import_code, setup_gpu, walk_directory
|
||||
|
||||
path_help = """Location of the documents to predict on.
|
||||
Can be a single file in .spacy format or a .jsonl file.
|
||||
Files with other extensions are treated as single plain text documents.
|
||||
If a directory is provided it is traversed recursively to grab
|
||||
all files to be processed.
|
||||
The files can be a mixture of .spacy, .jsonl and text files.
|
||||
If .jsonl is provided the specified field is going
|
||||
to be grabbed ("text" by default)."""
|
||||
|
||||
out_help = "Path to save the resulting .spacy file"
|
||||
code_help = (
|
||||
"Path to Python file with additional " "code (registered functions) to be imported"
|
||||
)
|
||||
gold_help = "Use gold preprocessing provided in the .spacy files"
|
||||
force_msg = (
|
||||
"The provided output file already exists. "
|
||||
"To force overwriting the output file, set the --force or -F flag."
|
||||
)
|
||||
|
||||
|
||||
DocOrStrStream = Union[Iterable[str], Iterable[Doc]]
|
||||
|
||||
|
||||
def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
|
||||
"""
|
||||
Stream Doc objects from DocBin.
|
||||
"""
|
||||
docbin = DocBin().from_disk(path)
|
||||
for doc in docbin.get_docs(vocab):
|
||||
yield doc
|
||||
|
||||
|
||||
def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
|
||||
"""
|
||||
Stream "text" field from JSONL. If the field "text" is
|
||||
not found it raises error.
|
||||
"""
|
||||
for entry in srsly.read_jsonl(path):
|
||||
if field not in entry:
|
||||
msg.fail(f"{path} does not contain the required '{field}' field.", exits=1)
|
||||
else:
|
||||
yield entry[field]
|
||||
|
||||
|
||||
def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
|
||||
"""
|
||||
Yields strings from text files in paths.
|
||||
"""
|
||||
for path in paths:
|
||||
with open(path, "r") as fin:
|
||||
text = fin.read()
|
||||
yield text
|
||||
|
||||
|
||||
@app.command("apply")
|
||||
def apply_cli(
|
||||
# fmt: off
|
||||
model: str = Arg(..., help="Model name or path"),
|
||||
data_path: Path = Arg(..., help=path_help, exists=True),
|
||||
output_file: Path = Arg(..., help=out_help, dir_okay=False),
|
||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
|
||||
text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"),
|
||||
force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
|
||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
|
||||
batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
|
||||
n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use.")
|
||||
):
|
||||
"""
|
||||
Apply a trained pipeline to documents to get predictions.
|
||||
Expects a loadable spaCy pipeline and path to the data, which
|
||||
can be a directory or a file.
|
||||
The data files can be provided in multiple formats:
|
||||
1. .spacy files
|
||||
2. .jsonl files with a specified "field" to read the text from.
|
||||
3. Files with any other extension are assumed to contain
a single document.
|
||||
DOCS: https://spacy.io/api/cli#apply
|
||||
"""
|
||||
data_path = ensure_path(data_path)
|
||||
output_file = ensure_path(output_file)
|
||||
code_path = ensure_path(code_path)
|
||||
if output_file.exists() and not force_overwrite:
|
||||
msg.fail(force_msg, exits=1)
|
||||
if not data_path.exists():
|
||||
msg.fail(f"Couldn't find data path: {data_path}", exits=1)
|
||||
import_code(code_path)
|
||||
setup_gpu(use_gpu)
|
||||
apply(data_path, output_file, model, text_key, batch_size, n_process)
|
||||
|
||||
|
||||
def apply(
|
||||
data_path: Path,
|
||||
output_file: Path,
|
||||
model: str,
|
||||
json_field: str,
|
||||
batch_size: int,
|
||||
n_process: int,
|
||||
):
|
||||
docbin = DocBin(store_user_data=True)
|
||||
paths = walk_directory(data_path)
|
||||
if len(paths) == 0:
|
||||
docbin.to_disk(output_file)
|
||||
msg.warn(
|
||||
"Did not find data to process,"
|
||||
f" {data_path} seems to be an empty directory."
|
||||
)
|
||||
return
|
||||
nlp = load_model(model)
|
||||
msg.good(f"Loaded model {model}")
|
||||
vocab = nlp.vocab
|
||||
streams: List[DocOrStrStream] = []
|
||||
text_files = []
|
||||
for path in paths:
|
||||
if path.suffix == ".spacy":
|
||||
streams.append(_stream_docbin(path, vocab))
|
||||
elif path.suffix == ".jsonl":
|
||||
streams.append(_stream_jsonl(path, json_field))
|
||||
else:
|
||||
text_files.append(path)
|
||||
if len(text_files) > 0:
|
||||
streams.append(_stream_texts(text_files))
|
||||
datagen = cast(DocOrStrStream, chain(*streams))
|
||||
for doc in tqdm.tqdm(
|
||||
nlp.pipe(datagen, batch_size=batch_size, n_process=n_process), disable=None
|
||||
):
|
||||
docbin.add(doc)
|
||||
if output_file.suffix == "":
|
||||
output_file = output_file.with_suffix(".spacy")
|
||||
docbin.to_disk(output_file)
|
|
@@ -1,20 +1,13 @@
import logging
from pathlib import Path
from typing import Optional

import typer
from pathlib import Path
from wasabi import msg
import typer
import logging

from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code
from .. import util
from ..util import get_sourced_components, load_model_from_config
from ._util import (
    Arg,
    Opt,
    app,
    import_code,
    parse_config_overrides,
    show_validation_error,
)


@app.command(

@@ -40,8 +33,7 @@ def assemble_cli(

    DOCS: https://spacy.io/api/cli#assemble
    """
    if verbose:
        util.logger.setLevel(logging.DEBUG)
    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    # Make sure all files and paths exists if they are needed
    if not config_path or (str(config_path) != "-" and not config_path.exists()):
        msg.fail("Config file not found", config_path, exits=1)
@ -1,177 +0,0 @@
|
|||
import random
|
||||
import time
|
||||
from itertools import islice
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional
|
||||
|
||||
import numpy
|
||||
import typer
|
||||
from tqdm import tqdm
|
||||
from wasabi import msg
|
||||
|
||||
from .. import util
|
||||
from ..language import Language
|
||||
from ..tokens import Doc
|
||||
from ..training import Corpus
|
||||
from ._util import Arg, Opt, benchmark_cli, import_code, setup_gpu
|
||||
|
||||
|
||||
@benchmark_cli.command(
|
||||
"speed",
|
||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||
)
|
||||
def benchmark_speed_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context,
|
||||
model: str = Arg(..., help="Model name or path"),
|
||||
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
|
||||
batch_size: Optional[int] = Opt(None, "--batch-size", "-b", min=1, help="Override the pipeline batch size"),
|
||||
no_shuffle: bool = Opt(False, "--no-shuffle", help="Do not shuffle benchmark data"),
|
||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||
n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
|
||||
warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
|
||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
|
||||
data in the binary .spacy format.
|
||||
"""
|
||||
import_code(code_path)
|
||||
setup_gpu(use_gpu=use_gpu, silent=False)
|
||||
|
||||
nlp = util.load_model(model)
|
||||
batch_size = batch_size if batch_size is not None else nlp.batch_size
|
||||
corpus = Corpus(data_path)
|
||||
docs = [eg.predicted for eg in corpus(nlp)]
|
||||
|
||||
if len(docs) == 0:
|
||||
msg.fail("Cannot benchmark speed using an empty corpus.", exits=1)
|
||||
|
||||
print(f"Warming up for {warmup_epochs} epochs...")
|
||||
warmup(nlp, docs, warmup_epochs, batch_size)
|
||||
|
||||
print()
|
||||
print(f"Benchmarking {n_batches} batches...")
|
||||
wps = benchmark(nlp, docs, n_batches, batch_size, not no_shuffle)
|
||||
|
||||
print()
|
||||
print_outliers(wps)
|
||||
print_mean_with_ci(wps)
|
||||
|
||||
|
||||
# Lowercased, behaves as a context manager function.
|
||||
class time_context:
|
||||
"""Register the running time of a context."""
|
||||
|
||||
def __enter__(self):
|
||||
self.start = time.perf_counter()
|
||||
return self
|
||||
|
||||
def __exit__(self, type, value, traceback):
|
||||
self.elapsed = time.perf_counter() - self.start
|
||||
|
||||
|
||||
class Quartiles:
|
||||
"""Calculate the q1, q2, q3 quartiles and the inter-quartile range (iqr)
|
||||
of a sample."""
|
||||
|
||||
q1: float
|
||||
q2: float
|
||||
q3: float
|
||||
iqr: float
|
||||
|
||||
def __init__(self, sample: numpy.ndarray) -> None:
|
||||
self.q1 = numpy.quantile(sample, 0.25)
|
||||
self.q2 = numpy.quantile(sample, 0.5)
|
||||
self.q3 = numpy.quantile(sample, 0.75)
|
||||
self.iqr = self.q3 - self.q1
|
||||
|
||||
|
||||
def annotate(
|
||||
nlp: Language, docs: List[Doc], batch_size: Optional[int]
|
||||
) -> numpy.ndarray:
|
||||
docs = nlp.pipe(tqdm(docs, unit="doc", disable=None), batch_size=batch_size)
|
||||
wps = []
|
||||
while True:
|
||||
with time_context() as elapsed:
|
||||
batch_docs = list(
|
||||
islice(docs, batch_size if batch_size else nlp.batch_size)
|
||||
)
|
||||
if len(batch_docs) == 0:
|
||||
break
|
||||
n_tokens = count_tokens(batch_docs)
|
||||
wps.append(n_tokens / elapsed.elapsed)
|
||||
|
||||
return numpy.array(wps)
|
||||
|
||||
|
||||
def benchmark(
|
||||
nlp: Language,
|
||||
docs: List[Doc],
|
||||
n_batches: int,
|
||||
batch_size: int,
|
||||
shuffle: bool,
|
||||
) -> numpy.ndarray:
|
||||
if shuffle:
|
||||
bench_docs = [
|
||||
nlp.make_doc(random.choice(docs).text)
|
||||
for _ in range(n_batches * batch_size)
|
||||
]
|
||||
else:
|
||||
bench_docs = [
|
||||
nlp.make_doc(docs[i % len(docs)].text)
|
||||
for i in range(n_batches * batch_size)
|
||||
]
|
||||
|
||||
return annotate(nlp, bench_docs, batch_size)
|
||||
|
||||
|
||||
def bootstrap(x, statistic=numpy.mean, iterations=10000) -> numpy.ndarray:
|
||||
"""Apply a statistic to repeated random samples of an array."""
|
||||
return numpy.fromiter(
|
||||
(
|
||||
statistic(numpy.random.choice(x, len(x), replace=True))
|
||||
for _ in range(iterations)
|
||||
),
|
||||
numpy.float64,
|
||||
)
|
||||
|
||||
|
||||
def count_tokens(docs: Iterable[Doc]) -> int:
|
||||
return sum(len(doc) for doc in docs)
|
||||
|
||||
|
||||
def print_mean_with_ci(sample: numpy.ndarray):
|
||||
mean = numpy.mean(sample)
|
||||
bootstrap_means = bootstrap(sample)
|
||||
bootstrap_means.sort()
|
||||
|
||||
# 95% confidence interval
|
||||
low = bootstrap_means[int(len(bootstrap_means) * 0.025)]
|
||||
high = bootstrap_means[int(len(bootstrap_means) * 0.975)]
|
||||
|
||||
print(f"Mean: {mean:.1f} words/s (95% CI: {low-mean:.1f} +{high-mean:.1f})")
|
||||
|
||||
|
||||
def print_outliers(sample: numpy.ndarray):
|
||||
quartiles = Quartiles(sample)
|
||||
|
||||
n_outliers = numpy.sum(
|
||||
(sample < (quartiles.q1 - 1.5 * quartiles.iqr))
|
||||
| (sample > (quartiles.q3 + 1.5 * quartiles.iqr))
|
||||
)
|
||||
n_extreme_outliers = numpy.sum(
|
||||
(sample < (quartiles.q1 - 3.0 * quartiles.iqr))
|
||||
| (sample > (quartiles.q3 + 3.0 * quartiles.iqr))
|
||||
)
|
||||
print(
|
||||
f"Outliers: {(100 * n_outliers) / len(sample):.1f}%, extreme outliers: {(100 * n_extreme_outliers) / len(sample)}%"
|
||||
)
|
||||
|
||||
|
||||
def warmup(
|
||||
nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
|
||||
) -> numpy.ndarray:
|
||||
docs = [doc.copy() for doc in docs * warmup_epochs]
|
||||
return annotate(nlp, docs, batch_size)
|
|
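The benchmark_speed.py code in the section above estimates a 95% confidence interval for the mean words-per-second by bootstrapping the per-batch measurements. A self-contained sketch of that resampling idea follows; the sample values are invented purely for illustration and are not spaCy output.

# Standalone sketch of the bootstrap confidence interval used above;
# the words-per-second sample is made up for demonstration.
import numpy

def bootstrap(x, statistic=numpy.mean, iterations=10000):
    # Apply a statistic to repeated random samples drawn with replacement.
    return numpy.fromiter(
        (statistic(numpy.random.choice(x, len(x), replace=True)) for _ in range(iterations)),
        numpy.float64,
    )

wps = numpy.array([9800.0, 10100.0, 9950.0, 10200.0, 9900.0])
means = numpy.sort(bootstrap(wps))
low = means[int(len(means) * 0.025)]
high = means[int(len(means) * 0.975)]
print(f"Mean: {wps.mean():.1f} words/s (95% CI: {low - wps.mean():.1f} +{high - wps.mean():.1f})")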
@ -1,22 +1,18 @@
|
|||
import itertools
|
||||
import re
|
||||
import sys
|
||||
from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Iterable, Mapping, Optional, Union
|
||||
|
||||
import srsly
|
||||
from wasabi import Printer
|
||||
import srsly
|
||||
import re
|
||||
import sys
|
||||
import itertools
|
||||
|
||||
from ..tokens import Doc, DocBin
|
||||
from ._util import app, Arg, Opt
|
||||
from ..training import docs_to_json
|
||||
from ..training.converters import (
|
||||
conll_ner_to_docs,
|
||||
conllu_to_docs,
|
||||
iob_to_docs,
|
||||
json_to_docs,
|
||||
)
|
||||
from ._util import Arg, Opt, app, walk_directory
|
||||
from ..tokens import Doc, DocBin
|
||||
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
|
||||
from ..training.converters import conllu_to_docs
|
||||
|
||||
|
||||
# Converters are matched by file extension except for ner/iob, which are
|
||||
# matched by file extension and content. To add a converter, add a new
|
||||
|
@ -32,8 +28,6 @@ CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = {
|
|||
"json": json_to_docs,
|
||||
}
|
||||
|
||||
AUTO = "auto"
|
||||
|
||||
|
||||
# File types that can be written to stdout
|
||||
FILE_TYPES_STDOUT = ("json",)
|
||||
|
@ -55,7 +49,7 @@ def convert_cli(
|
|||
model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
|
||||
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
|
||||
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
|
||||
converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
|
||||
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
|
||||
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
|
||||
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
|
||||
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
|
||||
|
@ -76,8 +70,8 @@ def convert_cli(
|
|||
output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
|
||||
silent = output_dir == "-"
|
||||
msg = Printer(no_print=silent)
|
||||
converter = _get_converter(msg, converter, input_path)
|
||||
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
|
||||
converter = _get_converter(msg, converter, input_path)
|
||||
convert(
|
||||
input_path,
|
||||
output_dir,
|
||||
|
@ -106,7 +100,7 @@ def convert(
|
|||
model: Optional[str] = None,
|
||||
morphology: bool = False,
|
||||
merge_subtokens: bool = False,
|
||||
converter: str,
|
||||
converter: str = "auto",
|
||||
ner_map: Optional[Path] = None,
|
||||
lang: Optional[str] = None,
|
||||
concatenate: bool = False,
|
||||
|
@ -195,6 +189,33 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
|
|||
return None
|
||||
|
||||
|
||||
def walk_directory(path: Path, converter: str) -> List[Path]:
|
||||
if not path.is_dir():
|
||||
return [path]
|
||||
paths = [path]
|
||||
locs = []
|
||||
seen = set()
|
||||
for path in paths:
|
||||
if str(path) in seen:
|
||||
continue
|
||||
seen.add(str(path))
|
||||
if path.parts[-1].startswith("."):
|
||||
continue
|
||||
elif path.is_dir():
|
||||
paths.extend(path.iterdir())
|
||||
elif converter == "json" and not path.parts[-1].endswith("json"):
|
||||
continue
|
||||
elif converter == "conll" and not path.parts[-1].endswith("conll"):
|
||||
continue
|
||||
elif converter == "iob" and not path.parts[-1].endswith("iob"):
|
||||
continue
|
||||
else:
|
||||
locs.append(path)
|
||||
# It's good to sort these, in case the ordering messes up cache.
|
||||
locs.sort()
|
||||
return locs
|
||||
|
||||
|
||||
def verify_cli_args(
|
||||
msg: Printer,
|
||||
input_path: Path,
|
||||
|
@ -218,22 +239,18 @@ def verify_cli_args(
|
|||
input_locs = walk_directory(input_path, converter)
|
||||
if len(input_locs) == 0:
|
||||
msg.fail("No input files in directory", input_path, exits=1)
|
||||
if converter not in CONVERTERS:
|
||||
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
|
||||
if converter == "auto" and len(file_types) >= 2:
|
||||
file_types_str = ",".join(file_types)
|
||||
msg.fail("All input files must be same type", file_types_str, exits=1)
|
||||
if converter != "auto" and converter not in CONVERTERS:
|
||||
msg.fail(f"Can't find converter for {converter}", exits=1)
|
||||
|
||||
|
||||
def _get_converter(msg, converter, input_path: Path):
|
||||
if input_path.is_dir():
|
||||
if converter == AUTO:
|
||||
input_locs = walk_directory(input_path, suffix=None)
|
||||
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
|
||||
if len(file_types) >= 2:
|
||||
file_types_str = ",".join(file_types)
|
||||
msg.fail("All input files must be same type", file_types_str, exits=1)
|
||||
input_path = input_locs[0]
|
||||
else:
|
||||
input_path = walk_directory(input_path, suffix=converter)[0]
|
||||
if converter == AUTO:
|
||||
input_path = walk_directory(input_path, converter)[0]
|
||||
if converter == "auto":
|
||||
converter = input_path.suffix[1:]
|
||||
if converter == "ner" or converter == "iob":
|
||||
with input_path.open(encoding="utf8") as file_:
|
||||
|
|
|
@@ -1,22 +1,15 @@
from typing import Optional, Dict, Any, Union, List
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import typer
from wasabi import msg, table
from thinc.api import Config
from thinc.config import VARIABLE_RE
from wasabi import msg, table
import typer

from .. import util
from ._util import Arg, Opt, show_validation_error, parse_config_overrides
from ._util import import_code, debug_cli
from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
from ..util import registry
from ._util import (
    Arg,
    Opt,
    debug_cli,
    import_code,
    parse_config_overrides,
    show_validation_error,
)
from .. import util


@debug_cli.command(
@ -1,49 +1,28 @@
|
|||
import math
|
||||
import sys
|
||||
from collections import Counter
|
||||
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
|
||||
from typing import cast, overload
|
||||
from pathlib import Path
|
||||
from typing import (
|
||||
Any,
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Set,
|
||||
Tuple,
|
||||
Union,
|
||||
cast,
|
||||
overload,
|
||||
)
|
||||
|
||||
import numpy
|
||||
from collections import Counter
|
||||
import sys
|
||||
import srsly
|
||||
from wasabi import Printer, MESSAGES, msg
|
||||
import typer
|
||||
from wasabi import MESSAGES, Printer, msg
|
||||
import math
|
||||
|
||||
from .. import util
|
||||
from ..compat import Literal
|
||||
from ..language import Language
|
||||
from ..morphology import Morphology
|
||||
from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe
|
||||
from ..pipeline._edit_tree_internals.edit_trees import EditTrees
|
||||
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
|
||||
from ._util import import_code, debug_cli
|
||||
from ..training import Example
|
||||
from ..training.initialize import get_sourced_components
|
||||
from ..schemas import ConfigSchemaTraining
|
||||
from ..pipeline._parser_internals import nonproj
|
||||
from ..pipeline._parser_internals.nonproj import DELIMITER
|
||||
from ..schemas import ConfigSchemaTraining
|
||||
from ..training import Example, remove_bilu_prefix
|
||||
from ..training.initialize import get_sourced_components
|
||||
from ..pipeline import Morphologizer, SpanCategorizer
|
||||
from ..morphology import Morphology
|
||||
from ..language import Language
|
||||
from ..util import registry, resolve_dot_names
|
||||
from ..compat import Literal
|
||||
from ..vectors import Mode as VectorsMode
|
||||
from ._util import (
|
||||
Arg,
|
||||
Opt,
|
||||
_format_number,
|
||||
app,
|
||||
debug_cli,
|
||||
import_code,
|
||||
parse_config_overrides,
|
||||
show_validation_error,
|
||||
)
|
||||
from .. import util
|
||||
|
||||
|
||||
# Minimum number of expected occurrences of NER label in data to train new label
|
||||
NEW_LABEL_THRESHOLD = 50
|
||||
|
@ -230,7 +209,7 @@ def debug_data(
|
|||
else:
|
||||
msg.info("No word vectors present in the package")
|
||||
|
||||
if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
|
||||
if "spancat" in factory_names:
|
||||
model_labels_spancat = _get_labels_from_spancat(nlp)
|
||||
has_low_data_warning = False
|
||||
has_no_neg_warning = False
|
||||
|
@ -355,7 +334,7 @@ def debug_data(
|
|||
show=verbose,
|
||||
)
|
||||
else:
|
||||
msg.good("Examples without occurrences available for all labels")
|
||||
msg.good("Examples without ocurrences available for all labels")
|
||||
|
||||
if "ner" in factory_names:
|
||||
# Get all unique NER labels present in the data
|
||||
|
@ -382,7 +361,7 @@ def debug_data(
|
|||
if label != "-"
|
||||
]
|
||||
labels_with_counts = _format_labels(labels_with_counts, counts=True)
|
||||
msg.text(f"Labels in train data: {labels_with_counts}", show=verbose)
|
||||
msg.text(f"Labels in train data: {_format_labels(labels)}", show=verbose)
|
||||
missing_labels = model_labels - labels
|
||||
if missing_labels:
|
||||
msg.warn(
|
||||
|
@ -540,13 +519,9 @@ def debug_data(
|
|||
|
||||
if "tagger" in factory_names:
|
||||
msg.divider("Part-of-speech Tagging")
|
||||
label_list, counts = zip(*gold_train_data["tags"].items())
|
||||
msg.info(f"{len(label_list)} label(s) in train data")
|
||||
p = numpy.array(counts)
|
||||
p = p / p.sum()
|
||||
norm_entropy = (-p * numpy.log2(p)).sum() / numpy.log2(len(label_list))
|
||||
msg.info(f"{norm_entropy} is the normalised label entropy")
|
||||
label_list = [label for label in gold_train_data["tags"]]
|
||||
model_labels = _get_labels_from_model(nlp, "tagger")
|
||||
msg.info(f"{len(label_list)} label(s) in train data")
|
||||
labels = set(label_list)
|
||||
missing_labels = model_labels - labels
|
||||
if missing_labels:
|
||||
|
@ -695,59 +670,6 @@ def debug_data(
|
|||
f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
|
||||
)
|
||||
|
||||
if "trainable_lemmatizer" in factory_names:
|
||||
msg.divider("Trainable Lemmatizer")
|
||||
trees_train: Set[str] = gold_train_data["lemmatizer_trees"]
|
||||
trees_dev: Set[str] = gold_dev_data["lemmatizer_trees"]
|
||||
# This is necessary context when someone is attempting to interpret whether the
|
||||
# number of trees exclusively in the dev set is meaningful.
|
||||
msg.info(f"{len(trees_train)} lemmatizer trees generated from training data")
|
||||
msg.info(f"{len(trees_dev)} lemmatizer trees generated from dev data")
|
||||
dev_not_train = trees_dev - trees_train
|
||||
|
||||
if len(dev_not_train) != 0:
|
||||
pct = len(dev_not_train) / len(trees_dev)
|
||||
msg.info(
|
||||
f"{len(dev_not_train)} lemmatizer trees ({pct*100:.1f}% of dev trees)"
|
||||
" were found exclusively in the dev data."
|
||||
)
|
||||
else:
|
||||
# Would we ever expect this case? It seems like it would be pretty rare,
|
||||
# and we might actually want a warning?
|
||||
msg.info("All trees in dev data present in training data.")
|
||||
|
||||
if gold_train_data["n_low_cardinality_lemmas"] > 0:
|
||||
n = gold_train_data["n_low_cardinality_lemmas"]
|
||||
msg.warn(f"{n} training docs with 0 or 1 unique lemmas.")
|
||||
|
||||
if gold_dev_data["n_low_cardinality_lemmas"] > 0:
|
||||
n = gold_dev_data["n_low_cardinality_lemmas"]
|
||||
msg.warn(f"{n} dev docs with 0 or 1 unique lemmas.")
|
||||
|
||||
if gold_train_data["no_lemma_annotations"] > 0:
|
||||
n = gold_train_data["no_lemma_annotations"]
|
||||
msg.warn(f"{n} training docs with no lemma annotations.")
|
||||
else:
|
||||
msg.good("All training docs have lemma annotations.")
|
||||
|
||||
if gold_dev_data["no_lemma_annotations"] > 0:
|
||||
n = gold_dev_data["no_lemma_annotations"]
|
||||
msg.warn(f"{n} dev docs with no lemma annotations.")
|
||||
else:
|
||||
msg.good("All dev docs have lemma annotations.")
|
||||
|
||||
if gold_train_data["partial_lemma_annotations"] > 0:
|
||||
n = gold_train_data["partial_lemma_annotations"]
|
||||
msg.info(f"{n} training docs with partial lemma annotations.")
|
||||
else:
|
||||
msg.good("All training docs have complete lemma annotations.")
|
||||
|
||||
if gold_dev_data["partial_lemma_annotations"] > 0:
|
||||
n = gold_dev_data["partial_lemma_annotations"]
|
||||
msg.info(f"{n} dev docs with partial lemma annotations.")
|
||||
else:
|
||||
msg.good("All dev docs have complete lemma annotations.")
|
||||
|
||||
msg.divider("Summary")
|
||||
good_counts = msg.counts[MESSAGES.GOOD]
|
||||
warn_counts = msg.counts[MESSAGES.WARN]
|
||||
|
@ -809,13 +731,7 @@ def _compile_gold(
|
|||
"n_cats_multilabel": 0,
|
||||
"n_cats_bad_values": 0,
|
||||
"texts": set(),
|
||||
"lemmatizer_trees": set(),
|
||||
"no_lemma_annotations": 0,
|
||||
"partial_lemma_annotations": 0,
|
||||
"n_low_cardinality_lemmas": 0,
|
||||
}
|
||||
if "trainable_lemmatizer" in factory_names:
|
||||
trees = EditTrees(nlp.vocab.strings)
|
||||
for eg in examples:
|
||||
gold = eg.reference
|
||||
doc = eg.predicted
|
||||
|
@ -842,13 +758,13 @@ def _compile_gold(
|
|||
# "Illegal" whitespace entity
|
||||
data["ws_ents"] += 1
|
||||
if label.startswith(("B-", "U-")):
|
||||
combined_label = remove_bilu_prefix(label)
|
||||
combined_label = label.split("-")[1]
|
||||
data["ner"][combined_label] += 1
|
||||
if sent_starts[i] and label.startswith(("I-", "L-")):
|
||||
if sent_starts[i] == True and label.startswith(("I-", "L-")):
|
||||
data["boundary_cross_ents"] += 1
|
||||
elif label == "-":
|
||||
data["ner"]["-"] += 1
|
||||
if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
|
||||
if "spancat" in factory_names:
|
||||
for spans_key in list(eg.reference.spans.keys()):
|
||||
# Obtain the span frequency
|
||||
if spans_key not in data["spancat"]:
|
||||
|
@ -945,25 +861,6 @@ def _compile_gold(
|
|||
data["n_nonproj"] += 1
|
||||
if nonproj.contains_cycle(aligned_heads):
|
||||
data["n_cycles"] += 1
|
||||
if "trainable_lemmatizer" in factory_names:
|
||||
# from EditTreeLemmatizer._labels_from_data
|
||||
if all(token.lemma == 0 for token in gold):
|
||||
data["no_lemma_annotations"] += 1
|
||||
continue
|
||||
if any(token.lemma == 0 for token in gold):
|
||||
data["partial_lemma_annotations"] += 1
|
||||
lemma_set = set()
|
||||
for token in gold:
|
||||
if token.lemma != 0:
|
||||
lemma_set.add(token.lemma)
|
||||
tree_id = trees.add(token.text, token.lemma_)
|
||||
tree_str = trees.tree_to_str(tree_id)
|
||||
data["lemmatizer_trees"].add(tree_str)
|
||||
# We want to identify cases where lemmas aren't assigned
|
||||
# or are all assigned the same value, as this would indicate
|
||||
# an issue since we're expecting a large set of lemmas
|
||||
if len(lemma_set) < 2 and len(gold) > 1:
|
||||
data["n_low_cardinality_lemmas"] += 1
|
||||
return data
|
||||
|
||||
|
||||
|
@ -1011,7 +908,7 @@ def _get_examples_without_label(
|
|||
for eg in data:
|
||||
if component == "ner":
|
||||
labels = [
|
||||
remove_bilu_prefix(label)
|
||||
label.split("-")[1]
|
||||
for label in eg.get_aligned_ner()
|
||||
if label not in ("O", "-", None)
|
||||
]
|
||||
|
@ -1037,7 +934,6 @@ def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]:
|
|||
labels: Set[str] = set()
|
||||
for pipe_name in pipe_names:
|
||||
pipe = nlp.get_pipe(pipe_name)
|
||||
assert isinstance(pipe, TrainablePipe)
|
||||
labels.update(pipe.labels)
|
||||
return labels
|
||||
|
||||
|
@ -1046,7 +942,7 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
|
|||
pipe_names = [
|
||||
pipe_name
|
||||
for pipe_name in nlp.pipe_names
|
||||
if nlp.get_pipe_meta(pipe_name).factory in ("spancat", "spancat_singlelabel")
|
||||
if nlp.get_pipe_meta(pipe_name).factory == "spancat"
|
||||
]
|
||||
labels: Dict[str, Set[str]] = {}
|
||||
for pipe_name in pipe_names:
|
||||
|
@ -1093,8 +989,7 @@ def _get_kl_divergence(p: Counter, q: Counter) -> float:
|
|||
def _format_span_row(span_data: List[Dict], labels: List[str]) -> List[Any]:
|
||||
"""Compile into one list for easier reporting"""
|
||||
d = {
|
||||
label: [label] + list(_format_number(d[label]) for d in span_data)
|
||||
for label in labels
|
||||
label: [label] + list(round(d[label], 2) for d in span_data) for label in labels
|
||||
}
|
||||
return list(d.values())
|
||||
|
||||
|
@ -1109,10 +1004,6 @@ def _get_span_characteristics(
|
|||
label: _gmean(l)
|
||||
for label, l in compiled_gold["spans_length"][spans_key].items()
|
||||
}
|
||||
spans_per_type = {
|
||||
label: len(spans)
|
||||
for label, spans in compiled_gold["spans_per_type"][spans_key].items()
|
||||
}
|
||||
min_lengths = [min(l) for l in compiled_gold["spans_length"][spans_key].values()]
|
||||
max_lengths = [max(l) for l in compiled_gold["spans_length"][spans_key].values()]
|
||||
|
||||
|
@ -1140,7 +1031,6 @@ def _get_span_characteristics(
|
|||
return {
|
||||
"sd": span_distinctiveness,
|
||||
"bd": sb_distinctiveness,
|
||||
"spans_per_type": spans_per_type,
|
||||
"lengths": span_length,
|
||||
"min_length": min(min_lengths),
|
||||
"max_length": max(max_lengths),
|
||||
|
@ -1155,15 +1045,12 @@ def _get_span_characteristics(
|
|||
|
||||
def _print_span_characteristics(span_characteristics: Dict[str, Any]):
|
||||
"""Print all span characteristics into a table"""
|
||||
headers = ("Span Type", "Length", "SD", "BD", "N")
|
||||
# Wasabi has this at 30 by default, but we might have some long labels
|
||||
max_col = max(30, max(len(label) for label in span_characteristics["labels"]))
|
||||
headers = ("Span Type", "Length", "SD", "BD")
|
||||
# Prepare table data with all span characteristics
|
||||
table_data = [
|
||||
span_characteristics["lengths"],
|
||||
span_characteristics["sd"],
|
||||
span_characteristics["bd"],
|
||||
span_characteristics["spans_per_type"],
|
||||
]
|
||||
table = _format_span_row(
|
||||
span_data=table_data, labels=span_characteristics["labels"]
|
||||
|
@ -1174,18 +1061,8 @@ def _print_span_characteristics(span_characteristics: Dict[str, Any]):
|
|||
span_characteristics["avg_sd"],
|
||||
span_characteristics["avg_bd"],
|
||||
]
|
||||
|
||||
footer = (
|
||||
["Wgt. Average"] + ["{:.2f}".format(round(f, 2)) for f in footer_data] + ["-"]
|
||||
)
|
||||
msg.table(
|
||||
table,
|
||||
footer=footer,
|
||||
header=headers,
|
||||
divider=True,
|
||||
aligns=["l"] + ["r"] * (len(footer_data) + 1),
|
||||
max_col=max_col,
|
||||
)
|
||||
footer = ["Wgt. Average"] + [str(round(f, 2)) for f in footer_data]
|
||||
msg.table(table, footer=footer, header=headers, divider=True)
|
||||
|
||||
|
||||
def _get_spans_length_freq_dist(
|
||||
|
|
|
@@ -1,13 +1,13 @@
from pathlib import Path
from typing import Optional

import typer
from wasabi import Printer, diff_strings, MarkdownRenderer
from pathlib import Path
from thinc.api import Config
from wasabi import MarkdownRenderer, Printer, diff_strings

from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
from ..util import load_config
from ._util import Arg, Opt, debug_cli, parse_config_overrides, show_validation_error
from .init_config import Optimizations, init_config
from .init_config import init_config, Optimizations


@debug_cli.command(
@@ -1,32 +1,19 @@
import itertools
from typing import Dict, Any, Optional
from pathlib import Path
from typing import Any, Dict, Optional

import typer
from thinc.api import (
    Model,
    data_validation,
    fix_random_seed,
    set_dropout_rate,
    set_gpu_allocator,
)
from wasabi import msg
import itertools

from spacy.training import Example
from spacy.util import resolve_dot_names
from wasabi import msg
from thinc.api import fix_random_seed, set_dropout_rate
from thinc.api import Model, data_validation, set_gpu_allocator
import typer

from .. import util
from ._util import Arg, Opt, debug_cli, show_validation_error
from ._util import parse_config_overrides, string_to_list, setup_gpu
from ..schemas import ConfigSchemaTraining
from ..util import registry
from ._util import (
    Arg,
    Opt,
    debug_cli,
    parse_config_overrides,
    setup_gpu,
    show_validation_error,
    string_to_list,
)
from .. import util


@debug_cli.command(

@@ -170,7 +157,7 @@ def debug_model(
    msg.divider(f"STEP 3 - prediction")
    msg.info(str(prediction))

    msg.good(f"Successfully ended analysis - model looks good.")
    msg.good(f"Succesfully ended analysis - model looks good.")


def _sentences():
@ -1,22 +1,13 @@
|
|||
import sys
|
||||
from typing import Optional, Sequence
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import requests
|
||||
import typer
|
||||
import sys
|
||||
from wasabi import msg
|
||||
import typer
|
||||
|
||||
from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
|
||||
from .. import about
|
||||
from ..util import is_package, get_minor_version, run_command
|
||||
from ..errors import OLD_MODEL_SHORTCUTS
|
||||
from ..util import (
|
||||
get_minor_version,
|
||||
is_in_interactive,
|
||||
is_in_jupyter,
|
||||
is_package,
|
||||
is_prerelease_version,
|
||||
run_command,
|
||||
)
|
||||
from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
|
||||
|
||||
|
||||
@app.command(
|
||||
|
@ -28,7 +19,7 @@ def download_cli(
|
|||
ctx: typer.Context,
|
||||
model: str = Arg(..., help="Name of pipeline package to download"),
|
||||
direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"),
|
||||
sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel"),
|
||||
sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel")
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
@ -44,12 +35,7 @@ def download_cli(
|
|||
download(model, direct, sdist, *ctx.args)
|
||||
|
||||
|
||||
def download(
|
||||
model: str,
|
||||
direct: bool = False,
|
||||
sdist: bool = False,
|
||||
*pip_args,
|
||||
) -> None:
|
||||
def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -> None:
|
||||
if (
|
||||
not (is_package("spacy") or is_package("spacy-nightly"))
|
||||
and "--no-deps" not in pip_args
|
||||
|
@ -63,17 +49,13 @@ def download(
|
|||
"dependencies, you'll have to install them manually."
|
||||
)
|
||||
pip_args = pip_args + ("--no-deps",)
|
||||
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
|
||||
dl_tpl = "{m}-{v}/{m}-{v}{s}"
|
||||
if direct:
|
||||
# Reject model names with '/', in order to prevent shenanigans.
|
||||
if "/" in model:
|
||||
msg.fail(
|
||||
title="Model download rejected",
|
||||
text=f"Cannot download model '{model}'. Models are expected to be file names, not URLs or fragments",
|
||||
exits=True,
|
||||
)
|
||||
components = model.split("-")
|
||||
model_name = "".join(components[:-1])
|
||||
version = components[-1]
|
||||
download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args)
|
||||
else:
|
||||
model_name = model
|
||||
if model in OLD_MODEL_SHORTCUTS:
|
||||
|
@ -84,49 +66,15 @@ def download(
|
|||
model_name = OLD_MODEL_SHORTCUTS[model]
|
||||
compatibility = get_compatibility()
|
||||
version = get_version(model_name, compatibility)
|
||||
|
||||
filename = get_model_filename(model_name, version, sdist)
|
||||
|
||||
download_model(filename, pip_args)
|
||||
download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args)
|
||||
msg.good(
|
||||
"Download and installation successful",
|
||||
f"You can now load the package via spacy.load('{model_name}')",
|
||||
)
|
||||
if is_in_jupyter():
|
||||
reload_deps_msg = (
|
||||
"If you are in a Jupyter or Colab notebook, you may need to "
|
||||
"restart Python in order to load all the package's dependencies. "
|
||||
"You can do this by selecting the 'Restart kernel' or 'Restart "
|
||||
"runtime' option."
|
||||
)
|
||||
msg.warn(
|
||||
"Restart to reload dependencies",
|
||||
reload_deps_msg,
|
||||
)
|
||||
elif is_in_interactive():
|
||||
reload_deps_msg = (
|
||||
"If you are in an interactive Python session, you may need to "
|
||||
"exit and restart Python to load all the package's dependencies. "
|
||||
"You can exit with Ctrl-D (or Ctrl-Z and Enter on Windows)."
|
||||
)
|
||||
msg.warn(
|
||||
"Restart to reload dependencies",
|
||||
reload_deps_msg,
|
||||
)
|
||||
|
||||
|
||||
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
|
||||
dl_tpl = "{m}-{v}/{m}-{v}{s}"
|
||||
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
|
||||
filename = dl_tpl.format(m=model_name, v=version, s=suffix)
|
||||
return filename
|
||||
|
||||
|
||||
def get_compatibility() -> dict:
|
||||
if is_prerelease_version(about.__version__):
|
||||
version: Optional[str] = about.__version__
|
||||
else:
|
||||
version = get_minor_version(about.__version__)
|
||||
version = get_minor_version(about.__version__)
|
||||
r = requests.get(about.__compatibility__)
|
||||
if r.status_code != 200:
|
||||
msg.fail(
|
||||
|
@ -153,24 +101,10 @@ def get_version(model: str, comp: dict) -> str:
|
|||
return comp[model][0]
|
||||
|
||||
|
||||
def get_latest_version(model: str) -> str:
|
||||
comp = get_compatibility()
|
||||
return get_version(model, comp)
|
||||
|
||||
|
||||
def download_model(
|
||||
filename: str, user_pip_args: Optional[Sequence[str]] = None
|
||||
) -> None:
|
||||
# Construct the download URL carefully. We need to make sure we don't
|
||||
# allow relative paths or other shenanigans to trick us into download
|
||||
# from outside our own repo.
|
||||
base_url = about.__download_url__
|
||||
# urljoin requires that the path ends with /, or the last path part will be dropped
|
||||
if not base_url.endswith("/"):
|
||||
base_url = about.__download_url__ + "/"
|
||||
download_url = urljoin(base_url, filename)
|
||||
if not download_url.startswith(about.__download_url__):
|
||||
raise ValueError(f"Download from {filename} rejected. Was it a relative path?")
|
||||
download_url = about.__download_url__ + "/" + filename
|
||||
pip_args = list(user_pip_args) if user_pip_args is not None else []
|
||||
cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
|
||||
run_command(cmd)
|
||||
|
|
|
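The download_model change in the section above builds the download URL with urljoin and rejects any filename that would resolve outside the configured release URL. A small standalone illustration of that check follows; the base URL is hard-coded here purely as an assumption for the example and is not read from spaCy's about module.

# Standalone illustration of the urljoin-based sanity check in download_model;
# the base URL below is an assumed example value.
from urllib.parse import urljoin

base_url = "https://github.com/explosion/spacy-models/releases/download/"
for filename in (
    "en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl",
    "../../somewhere/else.whl",
):
    url = urljoin(base_url, filename)
    verdict = "accepted" if url.startswith(base_url) else "rejected"
    print(verdict, url)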
@ -1,21 +1,18 @@
|
|||
import re
|
||||
from typing import Optional, List, Dict, Any, Union
|
||||
from wasabi import Printer
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
import re
|
||||
import srsly
|
||||
from thinc.api import fix_random_seed
|
||||
from wasabi import Printer
|
||||
|
||||
from .. import displacy, util
|
||||
from ..scorer import Scorer
|
||||
from ..tokens import Doc
|
||||
from ..training import Corpus
|
||||
from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu
|
||||
from ..tokens import Doc
|
||||
from ._util import app, Arg, Opt, setup_gpu, import_code
|
||||
from ..scorer import Scorer
|
||||
from .. import util
|
||||
from .. import displacy
|
||||
|
||||
|
||||
@benchmark_cli.command(
|
||||
"accuracy",
|
||||
)
|
||||
@app.command("evaluate")
|
||||
def evaluate_cli(
|
||||
# fmt: off
|
||||
|
@ -27,8 +24,6 @@ def evaluate_cli(
|
|||
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
|
||||
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
|
||||
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
|
||||
per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
|
||||
spans_key: str = Opt("sc", "--spans-key", "-sk", help="Spans key to use when evaluating Doc.spans"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
@ -41,7 +36,7 @@ def evaluate_cli(
|
|||
dependency parses in a HTML file, set as output directory as the
|
||||
displacy_path argument.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#benchmark-accuracy
|
||||
DOCS: https://spacy.io/api/cli#evaluate
|
||||
"""
|
||||
import_code(code_path)
|
||||
evaluate(
|
||||
|
@ -52,9 +47,7 @@ def evaluate_cli(
|
|||
gold_preproc=gold_preproc,
|
||||
displacy_path=displacy_path,
|
||||
displacy_limit=displacy_limit,
|
||||
per_component=per_component,
|
||||
silent=False,
|
||||
spans_key=spans_key,
|
||||
)
|
||||
|
||||
|
||||
|
@ -68,7 +61,6 @@ def evaluate(
|
|||
displacy_limit: int = 25,
|
||||
silent: bool = True,
|
||||
spans_key: str = "sc",
|
||||
per_component: bool = False,
|
||||
) -> Dict[str, Any]:
|
||||
msg = Printer(no_print=silent, pretty=not silent)
|
||||
fix_random_seed()
|
||||
|
@ -83,61 +75,50 @@ def evaluate(
|
|||
corpus = Corpus(data_path, gold_preproc=gold_preproc)
|
||||
nlp = util.load_model(model)
|
||||
dev_dataset = list(corpus(nlp))
|
||||
scores = nlp.evaluate(dev_dataset, per_component=per_component)
|
||||
if per_component:
|
||||
data = scores
|
||||
if output is None:
|
||||
msg.warn(
|
||||
"The per-component option is enabled but there is no output JSON file provided to save the scores to."
|
||||
)
|
||||
else:
|
||||
msg.info("Per-component scores will be saved to output JSON file.")
|
||||
else:
|
||||
metrics = {
|
||||
"TOK": "token_acc",
|
||||
"TAG": "tag_acc",
|
||||
"POS": "pos_acc",
|
||||
"MORPH": "morph_acc",
|
||||
"LEMMA": "lemma_acc",
|
||||
"UAS": "dep_uas",
|
||||
"LAS": "dep_las",
|
||||
"NER P": "ents_p",
|
||||
"NER R": "ents_r",
|
||||
"NER F": "ents_f",
|
||||
"TEXTCAT": "cats_score",
|
||||
"SENT P": "sents_p",
|
||||
"SENT R": "sents_r",
|
||||
"SENT F": "sents_f",
|
||||
"SPAN P": f"spans_{spans_key}_p",
|
||||
"SPAN R": f"spans_{spans_key}_r",
|
||||
"SPAN F": f"spans_{spans_key}_f",
|
||||
"SPEED": "speed",
|
||||
}
|
||||
results = {}
|
||||
data = {}
|
||||
for metric, key in metrics.items():
|
||||
if key in scores:
|
||||
if key == "cats_score":
|
||||
metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
|
||||
if isinstance(scores[key], (int, float)):
|
||||
if key == "speed":
|
||||
results[metric] = f"{scores[key]:.0f}"
|
||||
else:
|
||||
results[metric] = f"{scores[key]*100:.2f}"
|
||||
scores = nlp.evaluate(dev_dataset)
|
||||
metrics = {
|
||||
"TOK": "token_acc",
|
||||
"TAG": "tag_acc",
|
||||
"POS": "pos_acc",
|
||||
"MORPH": "morph_acc",
|
||||
"LEMMA": "lemma_acc",
|
||||
"UAS": "dep_uas",
|
||||
"LAS": "dep_las",
|
||||
"NER P": "ents_p",
|
||||
"NER R": "ents_r",
|
||||
"NER F": "ents_f",
|
||||
"TEXTCAT": "cats_score",
|
||||
"SENT P": "sents_p",
|
||||
"SENT R": "sents_r",
|
||||
"SENT F": "sents_f",
|
||||
"SPAN P": f"spans_{spans_key}_p",
|
||||
"SPAN R": f"spans_{spans_key}_r",
|
||||
"SPAN F": f"spans_{spans_key}_f",
|
||||
"SPEED": "speed",
|
||||
}
|
||||
results = {}
|
||||
data = {}
|
||||
for metric, key in metrics.items():
|
||||
if key in scores:
|
||||
if key == "cats_score":
|
||||
metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
|
||||
if isinstance(scores[key], (int, float)):
|
||||
if key == "speed":
|
||||
results[metric] = f"{scores[key]:.0f}"
|
||||
else:
|
||||
results[metric] = "-"
|
||||
data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
|
||||
results[metric] = f"{scores[key]*100:.2f}"
|
||||
else:
|
||||
results[metric] = "-"
|
||||
data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
|
||||
|
||||
msg.table(results, title="Results")
|
||||
data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
|
||||
msg.table(results, title="Results")
|
||||
data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
|
||||
|
||||
if displacy_path:
|
||||
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
|
||||
docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit]))
|
||||
render_deps = "parser" in factory_names
|
||||
render_ents = "ner" in factory_names
|
||||
render_spans = "spancat" in factory_names
|
||||
|
||||
render_parses(
|
||||
docs,
|
||||
displacy_path,
|
||||
|
@ -145,7 +126,6 @@ def evaluate(
|
|||
limit=displacy_limit,
|
||||
deps=render_deps,
|
||||
ents=render_ents,
|
||||
spans=render_spans,
|
||||
)
|
||||
msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
|
||||
|
||||
|
@ -199,7 +179,6 @@ def render_parses(
|
|||
limit: int = 250,
|
||||
deps: bool = True,
|
||||
ents: bool = True,
|
||||
spans: bool = True,
|
||||
):
|
||||
docs[0].user_data["title"] = model_name
|
||||
if ents:
|
||||
|
@ -213,11 +192,6 @@ def render_parses(
|
|||
with (output_path / "parses.html").open("w", encoding="utf8") as file_:
|
||||
file_.write(html)
|
||||
|
||||
if spans:
|
||||
html = displacy.render(docs[:limit], style="span", page=True)
|
||||
with (output_path / "spans.html").open("w", encoding="utf8") as file_:
|
||||
file_.write(html)
|
||||
|
||||
|
||||
def print_prf_per_type(
|
||||
msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str
|
||||
|
|
|
@@ -1,69 +0,0 @@
from typing import Optional, Tuple

from catalogue import RegistryError
from wasabi import msg

from ..util import registry
from ._util import Arg, Opt, app


@app.command("find-function")
def find_function_cli(
    # fmt: off
    func_name: str = Arg(..., help="Name of the registered function."),
    registry_name: Optional[str] = Opt(None, "--registry", "-r", help="Name of the catalogue registry."),
    # fmt: on
):
    """
    Find the module, path and line number to the file the registered
    function is defined in, if available.

    func_name (str): Name of the registered function.
    registry_name (Optional[str]): Name of the catalogue registry.

    DOCS: https://spacy.io/api/cli#find-function
    """
    if not registry_name:
        registry_names = registry.get_registry_names()
        for name in registry_names:
            if registry.has(name, func_name):
                registry_name = name
                break

    if not registry_name:
        msg.fail(
            f"Couldn't find registered function: '{func_name}'",
            exits=1,
        )

    assert registry_name is not None
    find_function(func_name, registry_name)


def find_function(func_name: str, registry_name: str) -> Tuple[str, int]:
    registry_desc = None
    try:
        registry_desc = registry.find(registry_name, func_name)
    except RegistryError as e:
        msg.fail(
            f"Couldn't find registered function: '{func_name}' in registry '{registry_name}'",
        )
        msg.fail(f"{e}", exits=1)
    assert registry_desc is not None

    registry_path = None
    line_no = None
    if registry_desc["file"]:
        registry_path = registry_desc["file"]
        line_no = registry_desc["line_no"]

    if not registry_path or not line_no:
        msg.fail(
            f"Couldn't find path to registered function: '{func_name}' in registry '{registry_name}'",
            exits=1,
        )
    assert registry_path is not None
    assert line_no is not None

    msg.good(f"Found registered function '{func_name}' at {registry_path}:{line_no}")
    return str(registry_path), int(line_no)
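For context, the find-function command above is a thin wrapper around the registry lookups shown in its body. A hedged example of using those same calls directly follows; the function name is only an illustrative registered name and the registry it lives in may differ per installation.

# Example of the registry calls used by find_function above; the function
# name is an assumption chosen for illustration.
from spacy.util import registry

func_name = "spacy.ngram_suggester.v1"
for reg_name in registry.get_registry_names():
    if registry.has(reg_name, func_name):
        desc = registry.find(reg_name, func_name)
        print(reg_name, desc["file"], desc["line_no"])
        break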
@ -1,233 +0,0 @@
|
|||
import functools
|
||||
import logging
|
||||
import operator
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import numpy
|
||||
import wasabi.tables
|
||||
|
||||
from .. import util
|
||||
from ..errors import Errors
|
||||
from ..pipeline import MultiLabel_TextCategorizer, TextCategorizer
|
||||
from ..training import Corpus
|
||||
from ._util import Arg, Opt, app, import_code, setup_gpu
|
||||
|
||||
_DEFAULTS = {
|
||||
"n_trials": 11,
|
||||
"use_gpu": -1,
|
||||
"gold_preproc": False,
|
||||
}
|
||||
|
||||
|
||||
@app.command(
|
||||
"find-threshold",
|
||||
context_settings={"allow_extra_args": False, "ignore_unknown_options": True},
|
||||
)
|
||||
def find_threshold_cli(
|
||||
# fmt: off
|
||||
model: str = Arg(..., help="Model name or path"),
|
||||
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
|
||||
pipe_name: str = Arg(..., help="Name of pipe to examine thresholds for"),
|
||||
threshold_key: str = Arg(..., help="Key of threshold attribute in component's configuration"),
|
||||
scores_key: str = Arg(..., help="Metric to optimize"),
|
||||
n_trials: int = Opt(_DEFAULTS["n_trials"], "--n_trials", "-n", help="Number of trials to determine optimal thresholds"),
|
||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||
gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Runs prediction trials for a trained model with varying thresholds to maximize
|
||||
the specified metric. The search space for the threshold is traversed linearly
|
||||
from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
|
||||
(the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
|
||||
returns all results).
|
||||
|
||||
This is applicable only for components whose predictions are influenced by
|
||||
thresholds - e.g. `textcat_multilabel` and `spancat`, but not `textcat`. Note
|
||||
that the full path to the corresponding threshold attribute in the config has to
|
||||
be provided.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#find-threshold
|
||||
"""
|
||||
if verbose:
|
||||
util.logger.setLevel(logging.DEBUG)
|
||||
import_code(code_path)
|
||||
find_threshold(
|
||||
model=model,
|
||||
data_path=data_path,
|
||||
pipe_name=pipe_name,
|
||||
threshold_key=threshold_key,
|
||||
scores_key=scores_key,
|
||||
n_trials=n_trials,
|
||||
use_gpu=use_gpu,
|
||||
gold_preproc=gold_preproc,
|
||||
silent=False,
|
||||
)
|
||||
|
||||
|
||||
def find_threshold(
|
||||
model: str,
|
||||
data_path: Path,
|
||||
pipe_name: str,
|
||||
threshold_key: str,
|
||||
scores_key: str,
|
||||
*,
|
||||
n_trials: int = _DEFAULTS["n_trials"], # type: ignore
|
||||
use_gpu: int = _DEFAULTS["use_gpu"], # type: ignore
|
||||
gold_preproc: bool = _DEFAULTS["gold_preproc"], # type: ignore
|
||||
silent: bool = True,
|
||||
) -> Tuple[float, float, Dict[float, float]]:
|
||||
"""
|
||||
Runs prediction trials for models with varying thresholds to maximize the specified metric.
|
||||
model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory.
|
||||
data_path (Path): Path to file with DocBin with docs to use for threshold search.
|
||||
pipe_name (str): Name of pipe to examine thresholds for.
|
||||
threshold_key (str): Key of threshold attribute in component's configuration.
|
||||
scores_key (str): Name of score to metric to optimize.
|
||||
n_trials (int): Number of trials to determine optimal thresholds.
|
||||
use_gpu (int): GPU ID or -1 for CPU.
|
||||
gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the
|
||||
tokenization, and may result in sequences of more consistent length. However, it may reduce runtime accuracy due
|
||||
to train/test skew.
|
||||
silent (bool): Whether to print non-error-related output to stdout.
|
||||
RETURNS (Tuple[float, float, Dict[float, float]]): Best found threshold, the corresponding score, scores for all
|
||||
evaluated thresholds.
|
||||
"""
|
||||
|
||||
setup_gpu(use_gpu, silent=silent)
|
||||
data_path = util.ensure_path(data_path)
|
||||
if not data_path.exists():
|
||||
wasabi.msg.fail("Evaluation data not found", data_path, exits=1)
|
||||
nlp = util.load_model(model)
|
||||
|
||||
if pipe_name not in nlp.component_names:
|
||||
raise AttributeError(
|
||||
Errors.E001.format(name=pipe_name, opts=nlp.component_names)
|
||||
)
|
||||
pipe = nlp.get_pipe(pipe_name)
|
||||
if not hasattr(pipe, "scorer"):
|
||||
raise AttributeError(Errors.E1045)
|
||||
|
||||
if type(pipe) == TextCategorizer:
|
||||
wasabi.msg.warn(
|
||||
"The `textcat` component doesn't use a threshold as it's not applicable to the concept of "
|
||||
"exclusive classes. All thresholds will yield the same results."
|
||||
)
|
||||
|
||||
if not silent:
|
||||
wasabi.msg.info(
|
||||
title=f"Optimizing for {scores_key} for component '{pipe_name}' with {n_trials} "
|
||||
f"trials."
|
||||
)
|
||||
|
||||
# Load evaluation corpus.
|
||||
corpus = Corpus(data_path, gold_preproc=gold_preproc)
|
||||
dev_dataset = list(corpus(nlp))
|
||||
config_keys = threshold_key.split(".")
|
||||
|
||||
def set_nested_item(
|
||||
config: Dict[str, Any], keys: List[str], value: float
|
||||
) -> Dict[str, Any]:
|
||||
"""Set item in nested dictionary. Adapted from https://stackoverflow.com/a/54138200.
|
||||
config (Dict[str, Any]): Configuration dictionary.
|
||||
keys (List[Any]): Path to value to set.
|
||||
value (float): Value to set.
|
||||
RETURNS (Dict[str, Any]): Updated dictionary.
|
||||
"""
|
||||
functools.reduce(operator.getitem, keys[:-1], config)[keys[-1]] = value
|
||||
return config
|
||||
|
||||
def filter_config(
|
||||
config: Dict[str, Any], keys: List[str], full_key: str
|
||||
) -> Dict[str, Any]:
|
||||
"""Filters provided config dictionary so that only the specified keys path remains.
|
||||
config (Dict[str, Any]): Configuration dictionary.
|
||||
keys (List[Any]): Path to value to set.
|
||||
full_key (str): Full user-specified key.
|
||||
RETURNS (Dict[str, Any]): Filtered dictionary.
|
||||
"""
|
||||
if keys[0] not in config:
|
||||
wasabi.msg.fail(
|
||||
title=f"Failed to look up `{full_key}` in config: sub-key {[keys[0]]} not found.",
|
||||
text=f"Make sure you specified {[keys[0]]} correctly. The following sub-keys are available instead: "
|
||||
f"{list(config.keys())}",
|
||||
exits=1,
|
||||
)
|
||||
return {
|
||||
keys[0]: filter_config(config[keys[0]], keys[1:], full_key)
|
||||
if len(keys) > 1
|
||||
else config[keys[0]]
|
||||
}
|
||||
|
||||
# Evaluate with varying threshold values.
|
||||
scores: Dict[float, float] = {}
|
||||
config_keys_full = ["components", pipe_name, *config_keys]
|
||||
table_col_widths = (10, 10)
|
||||
thresholds = numpy.linspace(0, 1, n_trials)
|
||||
print(wasabi.tables.row(["Threshold", f"{scores_key}"], widths=table_col_widths))
|
||||
for threshold in thresholds:
|
||||
# Reload pipeline with overrides specifying the new threshold.
|
||||
nlp = util.load_model(
|
||||
model,
|
||||
config=set_nested_item(
|
||||
filter_config(
|
||||
nlp.config, config_keys_full, ".".join(config_keys_full)
|
||||
).copy(),
|
||||
config_keys_full,
|
||||
threshold,
|
||||
),
|
||||
)
|
||||
if hasattr(pipe, "cfg"):
|
||||
setattr(
|
||||
nlp.get_pipe(pipe_name),
|
||||
"cfg",
|
||||
set_nested_item(getattr(pipe, "cfg"), config_keys, threshold),
|
||||
)
|
||||
|
||||
eval_scores = nlp.evaluate(dev_dataset)
|
||||
if scores_key not in eval_scores:
|
||||
wasabi.msg.fail(
|
||||
title=f"Failed to look up score `{scores_key}` in evaluation results.",
|
||||
text=f"Make sure you specified the correct value for `scores_key`. The following scores are "
|
||||
f"available: {list(eval_scores.keys())}",
|
||||
exits=1,
|
||||
)
|
||||
scores[threshold] = eval_scores[scores_key]
|
||||
|
||||
if not isinstance(scores[threshold], (float, int)):
|
||||
wasabi.msg.fail(
|
||||
f"Returned score for key '{scores_key}' is not numeric. Threshold optimization only works for numeric "
|
||||
f"scores.",
|
||||
exits=1,
|
||||
)
|
||||
print(
|
||||
wasabi.row(
|
||||
[round(threshold, 3), round(scores[threshold], 3)],
|
||||
widths=table_col_widths,
|
||||
)
|
||||
)
|
||||
|
||||
best_threshold = max(scores.keys(), key=(lambda key: scores[key]))
|
||||
|
||||
# If all scores are identical, emit warning.
|
||||
if len(set(scores.values())) == 1:
|
||||
wasabi.msg.warn(
|
||||
title="All scores are identical. Verify that all settings are correct.",
|
||||
text=""
|
||||
if (
|
||||
not isinstance(pipe, MultiLabel_TextCategorizer)
|
||||
or scores_key in ("cats_macro_f", "cats_micro_f")
|
||||
)
|
||||
else "Use `cats_macro_f` or `cats_micro_f` when optimizing the threshold for `textcat_multilabel`.",
|
||||
)
|
||||
|
||||
else:
|
||||
if not silent:
|
||||
print(
|
||||
f"\nBest threshold: {round(best_threshold, ndigits=4)} with {scores_key} value of {scores[best_threshold]}."
|
||||
)
|
||||
|
||||
return best_threshold, scores[best_threshold], scores
|
|
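The find_threshold code above walks the threshold linearly from 0 to 1 in n_trials steps and keeps the best-scoring value. A toy sketch of that search loop follows, with a stand-in scoring function instead of re-running nlp.evaluate at each step.

# Toy version of the linear threshold search; score_fn is a stand-in
# for re-evaluating the pipeline at each threshold value.
import numpy

def sweep(score_fn, n_trials=11):
    scores = {t: score_fn(t) for t in numpy.linspace(0, 1, n_trials)}
    best = max(scores, key=lambda t: scores[t])
    return best, scores[best], scores

best, best_score, _ = sweep(lambda t: 1.0 - abs(t - 0.4))
print(f"Best threshold: {round(best, 4)} with score {round(best_score, 4)}")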
@ -1,15 +1,12 @@
|
|||
import json
|
||||
from typing import Optional, Dict, Any, Union, List
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from wasabi import Printer, MarkdownRenderer
|
||||
import srsly
|
||||
from wasabi import MarkdownRenderer, Printer
|
||||
|
||||
from .. import about, util
|
||||
from ..compat import importlib_metadata
|
||||
from ._util import Arg, Opt, app, string_to_list
|
||||
from .download import get_latest_version, get_model_filename
|
||||
from ._util import app, Arg, Opt, string_to_list
|
||||
from .. import util
|
||||
from .. import about
|
||||
|
||||
|
||||
@app.command("info")
|
||||
|
@ -19,7 +16,6 @@ def info_cli(
|
|||
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
|
||||
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
|
||||
exclude: str = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"),
|
||||
url: bool = Opt(False, "--url", "-u", help="Print the URL to download the most recent compatible version of the pipeline"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
@ -27,19 +23,10 @@ def info_cli(
|
|||
print its meta information. Flag --markdown prints details in Markdown for easy
|
||||
copy-pasting to GitHub issues.
|
||||
|
||||
Flag --url prints only the download URL of the most recent compatible
|
||||
version of the pipeline.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#info
|
||||
"""
|
||||
exclude = string_to_list(exclude)
|
||||
info(
|
||||
model,
|
||||
markdown=markdown,
|
||||
silent=silent,
|
||||
exclude=exclude,
|
||||
url=url,
|
||||
)
|
||||
info(model, markdown=markdown, silent=silent, exclude=exclude)
|
||||
|
||||
|
||||
def info(
|
||||
|
@ -48,20 +35,11 @@ def info(
|
|||
markdown: bool = False,
|
||||
silent: bool = True,
|
||||
exclude: Optional[List[str]] = None,
|
||||
url: bool = False,
|
||||
) -> Union[str, dict]:
|
||||
msg = Printer(no_print=silent, pretty=not silent)
|
||||
if not exclude:
|
||||
exclude = []
|
||||
if url:
|
||||
if model is not None:
|
||||
title = f"Download info for pipeline '{model}'"
|
||||
data = info_model_url(model)
|
||||
print(data["download_url"])
|
||||
return data
|
||||
else:
|
||||
msg.fail("--url option requires a pipeline name", exits=1)
|
||||
elif model:
|
||||
if model:
|
||||
title = f"Info about pipeline '{model}'"
|
||||
data = info_model(model, silent=silent)
|
||||
else:
|
||||
|
@ -121,43 +99,11 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
|
|||
meta["source"] = str(model_path.resolve())
|
||||
else:
|
||||
meta["source"] = str(model_path)
|
||||
download_url = info_installed_model_url(model)
|
||||
if download_url:
|
||||
meta["download_url"] = download_url
|
||||
return {
|
||||
k: v for k, v in meta.items() if k not in ("accuracy", "performance", "speed")
|
||||
}
|
||||
|
||||
|
||||
def info_installed_model_url(model: str) -> Optional[str]:
|
||||
"""Given a pipeline name, get the download URL if available, otherwise
|
||||
return None.
|
||||
|
||||
This is only available for pipelines installed as modules that have
|
||||
dist-info available.
|
||||
"""
|
||||
try:
|
||||
dist = importlib_metadata.distribution(model)
|
||||
text = dist.read_text("direct_url.json")
|
||||
if isinstance(text, str):
|
||||
data = json.loads(text)
|
||||
return data["url"]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def info_model_url(model: str) -> Dict[str, Any]:
|
||||
"""Return the download URL for the latest version of a pipeline."""
|
||||
version = get_latest_version(model)
|
||||
|
||||
filename = get_model_filename(model, version)
|
||||
download_url = about.__download_url__ + "/" + filename
|
||||
release_tpl = "https://github.com/explosion/spacy-models/releases/tag/{m}-{v}"
|
||||
release_url = release_tpl.format(m=model, v=version)
|
||||
return {"download_url": download_url, "release_url": release_url}
|
||||
|
||||
|
||||
def get_markdown(
|
||||
data: Dict[str, Any],
|
||||
title: Optional[str] = None,
|
||||
|
|
|
@ -1,26 +1,18 @@
|
|||
import re
|
||||
from typing import Optional, List, Tuple
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import srsly
|
||||
from jinja2 import Template
|
||||
from thinc.api import Config
|
||||
from wasabi import Printer, diff_strings
|
||||
from thinc.api import Config
|
||||
import srsly
|
||||
import re
|
||||
from jinja2 import Template
|
||||
|
||||
from .. import util
|
||||
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
|
||||
from ..schemas import RecommendationSchema
|
||||
from ..util import SimpleFrozenList
|
||||
from ._util import (
|
||||
COMMAND,
|
||||
Arg,
|
||||
Opt,
|
||||
import_code,
|
||||
init_cli,
|
||||
show_validation_error,
|
||||
string_to_list,
|
||||
)
|
||||
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
|
||||
from ._util import string_to_list, import_code
|
||||
|
||||
|
||||
ROOT = Path(__file__).parent / "templates"
|
||||
TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
|
||||
|
@ -32,30 +24,16 @@ class Optimizations(str, Enum):
|
|||
accuracy = "accuracy"
|
||||
|
||||
|
||||
class InitValues:
|
||||
"""
|
||||
Default values for initialization. Dedicated class to allow synchronized default values for init_config_cli() and
|
||||
init_config(), i.e. initialization calls made via the CLI and via Python, respectively.
|
||||
"""
|
||||
|
||||
lang = "en"
|
||||
pipeline = SimpleFrozenList(["tagger", "parser", "ner"])
|
||||
optimize = Optimizations.efficiency
|
||||
gpu = False
|
||||
pretraining = False
|
||||
force_overwrite = False
|
||||
|
||||
|
||||
@init_cli.command("config")
|
||||
def init_config_cli(
|
||||
# fmt: off
|
||||
output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
|
||||
lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"),
|
||||
pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
|
||||
optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||
gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||
pretraining: bool = Opt(InitValues.pretraining, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
|
||||
force_overwrite: bool = Opt(InitValues.force_overwrite, "--force", "-F", help="Force overwriting the output file"),
|
||||
lang: str = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
|
||||
pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
|
||||
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||
gpu: bool = Opt(False, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
|
||||
force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
@ -155,11 +133,11 @@ def fill_config(
|
|||
|
||||
def init_config(
|
||||
*,
|
||||
lang: str = InitValues.lang,
|
||||
pipeline: List[str] = InitValues.pipeline,
|
||||
optimize: str = InitValues.optimize,
|
||||
gpu: bool = InitValues.gpu,
|
||||
pretraining: bool = InitValues.pretraining,
|
||||
lang: str,
|
||||
pipeline: List[str],
|
||||
optimize: str,
|
||||
gpu: bool,
|
||||
pretraining: bool = False,
|
||||
silent: bool = True,
|
||||
) -> Config:
|
||||
msg = Printer(no_print=silent)
|
||||
|
|
|
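The InitValues class in this hunk exists to keep the CLI defaults and the Python API defaults in sync. A hedged sketch of what that looks like from Python, assuming a spaCy version that ships InitValues (the output path is hypothetical):

# Generate a training config with the shared defaults shown above.
from spacy.cli.init_config import InitValues, init_config

config = init_config(
    lang=InitValues.lang,                # "en"
    pipeline=list(InitValues.pipeline),  # ["tagger", "parser", "ner"]
    optimize=InitValues.optimize,        # Optimizations.efficiency
    gpu=InitValues.gpu,                  # False
    pretraining=InitValues.pretraining,  # False
)
config.to_disk("config.cfg")  # hypothetical output path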
@ -1,23 +1,15 @@
|
|||
from typing import Optional
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import srsly
|
||||
import typer
|
||||
from wasabi import msg
|
||||
import typer
|
||||
import srsly
|
||||
|
||||
from .. import util
|
||||
from ..training.initialize import init_nlp, convert_vectors
|
||||
from ..language import Language
|
||||
from ..training.initialize import convert_vectors, init_nlp
|
||||
from ._util import (
|
||||
Arg,
|
||||
Opt,
|
||||
import_code,
|
||||
init_cli,
|
||||
parse_config_overrides,
|
||||
setup_gpu,
|
||||
show_validation_error,
|
||||
)
|
||||
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
|
||||
from ._util import import_code, setup_gpu
|
||||
|
||||
|
||||
@init_cli.command("vectors")
|
||||
|
@ -32,15 +24,13 @@ def init_vectors_cli(
|
|||
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
|
||||
attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
|
||||
# fmt: on
|
||||
):
|
||||
"""Convert word vectors for use with spaCy. Will export an nlp object that
|
||||
you can use in the [initialize] block of your config to initialize
|
||||
a model with vectors.
|
||||
"""
|
||||
if verbose:
|
||||
util.logger.setLevel(logging.DEBUG)
|
||||
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||
msg.info(f"Creating blank nlp object for language '{lang}'")
|
||||
nlp = util.get_lang_class(lang)()
|
||||
if jsonl_loc is not None:
|
||||
|
@ -52,7 +42,6 @@ def init_vectors_cli(
|
|||
prune=prune,
|
||||
name=name,
|
||||
mode=mode,
|
||||
attr=attr,
|
||||
)
|
||||
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
|
||||
nlp.to_disk(output_dir)
|
||||
|
@ -88,8 +77,7 @@ def init_pipeline_cli(
|
|||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
|
||||
# fmt: on
|
||||
):
|
||||
if verbose:
|
||||
util.logger.setLevel(logging.DEBUG)
|
||||
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
import_code(code_path)
|
||||
setup_gpu(use_gpu)
|
||||
|
@ -118,8 +106,7 @@ def init_labels_cli(
|
|||
"""Generate JSON files for the labels in the data. This helps speed up the
|
||||
training process, since spaCy won't have to preprocess the data to
|
||||
extract the labels."""
|
||||
if verbose:
|
||||
util.logger.setLevel(logging.DEBUG)
|
||||
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||
if not output_path.exists():
|
||||
output_path.mkdir(parents=True)
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
|
|
|
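The same one-line logging switch appears in init vectors, init pipeline and init labels above: the old form only raised the log level when --verbose was set, the new form always sets it explicitly. A small sketch of the new behaviour (the verbose value stands in for the --verbose flag):

import logging
from spacy import util

verbose = True  # hypothetical value of the --verbose flag
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
util.logger.debug("only visible when --verbose is set")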
@ -1,21 +1,18 @@
|
|||
import os
|
||||
import re
|
||||
from typing import Optional, Union, Any, Dict, List, Tuple, cast
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union, cast
|
||||
|
||||
import srsly
|
||||
from catalogue import RegistryError
|
||||
from wasabi import Printer, MarkdownRenderer, get_raw_input
|
||||
from thinc.api import Config
|
||||
from wasabi import MarkdownRenderer, Printer, get_raw_input
|
||||
from collections import defaultdict
|
||||
from catalogue import RegistryError
|
||||
import srsly
|
||||
import sys
|
||||
import re
|
||||
|
||||
from .. import about, util
|
||||
from ..compat import importlib_metadata
|
||||
from ..schemas import ModelMetaSchema, validate
|
||||
from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
|
||||
from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
|
||||
from ..schemas import validate, ModelMetaSchema
|
||||
from .. import util
|
||||
from .. import about
|
||||
|
||||
|
||||
@app.command("package")
|
||||
|
@ -30,7 +27,6 @@ def package_cli(
|
|||
version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
|
||||
build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."),
|
||||
force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"),
|
||||
require_parent: bool = Opt(True, "--require-parent/--no-require-parent", "-R", "-R", help="Include the parent package (e.g. spacy) in the requirements"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
@ -39,7 +35,7 @@ def package_cli(
|
|||
specified output directory, and the data will be copied over. If
|
||||
--create-meta is set and a meta.json already exists in the output directory,
|
||||
the existing values will be used as the defaults in the command-line prompt.
|
||||
After packaging, "python -m build --sdist" is run in the package directory,
|
||||
After packaging, "python setup.py sdist" is run in the package directory,
|
||||
which will create a .tar.gz archive that can be installed via "pip install".
|
||||
|
||||
If additional code files are provided (e.g. Python files containing custom
|
||||
|
@ -61,7 +57,6 @@ def package_cli(
|
|||
create_sdist=create_sdist,
|
||||
create_wheel=create_wheel,
|
||||
force=force,
|
||||
require_parent=require_parent,
|
||||
silent=False,
|
||||
)
|
||||
|
||||
|
@ -76,7 +71,6 @@ def package(
|
|||
create_meta: bool = False,
|
||||
create_sdist: bool = True,
|
||||
create_wheel: bool = False,
|
||||
require_parent: bool = False,
|
||||
force: bool = False,
|
||||
silent: bool = True,
|
||||
) -> None:
|
||||
|
@ -84,17 +78,9 @@ def package(
|
|||
input_path = util.ensure_path(input_dir)
|
||||
output_path = util.ensure_path(output_dir)
|
||||
meta_path = util.ensure_path(meta_path)
|
||||
if create_wheel and not has_wheel() and not has_build():
|
||||
err = (
|
||||
"Generating wheels requires 'build' or 'wheel' (deprecated) to be installed"
|
||||
)
|
||||
msg.fail(err, "pip install build", exits=1)
|
||||
if not has_build():
|
||||
msg.warn(
|
||||
"Generating packages without the 'build' package is deprecated and "
|
||||
"will not be supported in the future. To install 'build': pip "
|
||||
"install build"
|
||||
)
|
||||
if create_wheel and not has_wheel():
|
||||
err = "Generating a binary .whl file requires wheel to be installed"
|
||||
msg.fail(err, "pip install wheel", exits=1)
|
||||
if not input_path or not input_path.exists():
|
||||
msg.fail("Can't locate pipeline data", input_path, exits=1)
|
||||
if not output_path or not output_path.exists():
|
||||
|
@ -116,7 +102,7 @@ def package(
|
|||
if not meta_path.exists() or not meta_path.is_file():
|
||||
msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
|
||||
meta = srsly.read_json(meta_path)
|
||||
meta = get_meta(input_dir, meta, require_parent=require_parent)
|
||||
meta = get_meta(input_dir, meta)
|
||||
if meta["requirements"]:
|
||||
msg.good(
|
||||
f"Including {len(meta['requirements'])} package requirement(s) from "
|
||||
|
@ -189,7 +175,6 @@ def package(
|
|||
imports.append(code_path.stem)
|
||||
shutil.copy(str(code_path), str(package_path))
|
||||
create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
|
||||
|
||||
create_file(main_path / "setup.py", TEMPLATE_SETUP)
|
||||
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
||||
init_py = TEMPLATE_INIT.format(
|
||||
|
@@ -199,37 +184,12 @@ def package(
    msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
    if create_sdist:
        with util.working_dir(main_path):
            # run directly, since util.run_command is not designed to continue
            # after a command fails
            ret = subprocess.run(
                [sys.executable, "-m", "build", ".", "--sdist"],
                env=os.environ.copy(),
            )
            if ret.returncode != 0:
                msg.warn(
                    "Creating sdist with 'python -m build' failed. Falling "
                    "back to deprecated use of 'python setup.py sdist'"
                )
                util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
            util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
        zip_file = main_path / "dist" / f"{model_name_v}{SDIST_SUFFIX}"
        msg.good(f"Successfully created zipped Python package", zip_file)
    if create_wheel:
        with util.working_dir(main_path):
            # run directly, since util.run_command is not designed to continue
            # after a command fails
            ret = subprocess.run(
                [sys.executable, "-m", "build", ".", "--wheel"],
                env=os.environ.copy(),
            )
            if ret.returncode != 0:
                msg.warn(
                    "Creating wheel with 'python -m build' failed. Falling "
                    "back to deprecated use of 'wheel' with "
                    "'python setup.py bdist_wheel'"
                )
                util.run_command(
                    [sys.executable, "setup.py", "bdist_wheel"], capture=False
                )
            util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False)
        wheel_name_squashed = re.sub("_+", "_", model_name_v)
        wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
        msg.good(f"Successfully created binary wheel", wheel)
|
||||
|
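A hedged sketch of the packaging fallback in this hunk: build via PEP 517 with `python -m build` first, and fall back to the deprecated setup.py invocation if that fails. The helper name and directory argument are illustrative, not part of the diff:

import os
import subprocess
import sys

def build_sdist(package_dir: str) -> None:
    # Preferred path: requires the 'build' package to be installed.
    ret = subprocess.run(
        [sys.executable, "-m", "build", ".", "--sdist"],
        cwd=package_dir,
        env=os.environ.copy(),
    )
    if ret.returncode != 0:
        # Deprecated fallback, matching the warning emitted above.
        subprocess.run([sys.executable, "setup.py", "sdist"], cwd=package_dir)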
@ -249,17 +209,6 @@ def has_wheel() -> bool:
|
|||
return False
|
||||
|
||||
|
||||
def has_build() -> bool:
|
||||
# it's very likely that there is a local directory named build/ (especially
|
||||
# in an editable install), so an import check is not sufficient; instead
|
||||
# check that there is a package version
|
||||
try:
|
||||
importlib_metadata.version("build")
|
||||
return True
|
||||
except importlib_metadata.PackageNotFoundError: # type: ignore[attr-defined]
|
||||
return False
|
||||
|
||||
|
||||
def get_third_party_dependencies(
|
||||
config: Config, exclude: List[str] = util.SimpleFrozenList()
|
||||
) -> List[str]:
|
||||
|
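The has_build() check above deliberately avoids importing 'build', since an editable install usually has a local build/ directory that would shadow the package; it asks the installed distribution metadata instead. A standalone sketch of the same idea using only the standard library (the function name here is illustrative):

from importlib.metadata import PackageNotFoundError, version

def build_is_installed() -> bool:
    try:
        version("build")  # raises if no 'build' distribution is installed
        return True
    except PackageNotFoundError:
        return False

print(build_is_installed())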
@ -303,11 +252,9 @@ def get_third_party_dependencies(
|
|||
raise regerr from None
|
||||
module_name = func_info.get("module") # type: ignore[attr-defined]
|
||||
if module_name: # the code is part of a module, not a --code file
|
||||
modules.add(func_info["module"].split(".")[0]) # type: ignore[union-attr]
|
||||
modules.add(func_info["module"].split(".")[0]) # type: ignore[index]
|
||||
dependencies = []
|
||||
for module_name in modules:
|
||||
if module_name == about.__title__:
|
||||
continue
|
||||
if module_name in distributions:
|
||||
dist = distributions.get(module_name)
|
||||
if dist:
|
||||
|
@ -338,9 +285,7 @@ def create_file(file_path: Path, contents: str) -> None:
|
|||
|
||||
|
||||
def get_meta(
|
||||
model_path: Union[str, Path],
|
||||
existing_meta: Dict[str, Any],
|
||||
require_parent: bool = False,
|
||||
model_path: Union[str, Path], existing_meta: Dict[str, Any]
|
||||
) -> Dict[str, Any]:
|
||||
meta: Dict[str, Any] = {
|
||||
"lang": "en",
|
||||
|
@ -354,8 +299,8 @@ def get_meta(
|
|||
}
|
||||
nlp = util.load_model_from_path(Path(model_path))
|
||||
meta.update(nlp.meta)
|
||||
meta["spacy_version"] = util.get_minor_version_range(about.__version__)
|
||||
meta.update(existing_meta)
|
||||
meta["spacy_version"] = util.get_minor_version_range(about.__version__)
|
||||
meta["vectors"] = {
|
||||
"width": nlp.vocab.vectors_length,
|
||||
"vectors": len(nlp.vocab.vectors),
|
||||
|
@ -369,8 +314,6 @@ def get_meta(
|
|||
existing_reqs = [util.split_requirement(req)[0] for req in meta["requirements"]]
|
||||
reqs = get_third_party_dependencies(nlp.config, exclude=existing_reqs)
|
||||
meta["requirements"].extend(reqs)
|
||||
if require_parent and about.__title__ not in meta["requirements"]:
|
||||
meta["requirements"].append(about.__title__ + meta["spacy_version"])
|
||||
return meta
|
||||
|
||||
|
||||
|
@ -460,7 +403,7 @@ def _format_sources(data: Any) -> str:
|
|||
if author:
|
||||
result += " ({})".format(author)
|
||||
sources.append(result)
|
||||
return "<br>".join(sources)
|
||||
return "<br />".join(sources)
|
||||
|
||||
|
||||
def _format_accuracy(data: Dict[str, Any], exclude: List[str] = ["speed"]) -> str:
|
||||
|
@ -545,11 +488,8 @@ def list_files(data_dir):
|
|||
|
||||
|
||||
def list_requirements(meta):
|
||||
# Up to version 3.7, we included the parent package
|
||||
# in requirements by default. This behaviour is removed
|
||||
# in 3.8, with a setting to include the parent package in
|
||||
# the requirements list in the meta if desired.
|
||||
requirements = []
|
||||
parent_package = meta.get('parent_package', 'spacy')
|
||||
requirements = [parent_package + meta['spacy_version']]
|
||||
if 'setup_requires' in meta:
|
||||
requirements += meta['setup_requires']
|
||||
if 'requirements' in meta:
|
||||
|
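A hedged sketch of how list_requirements() assembles a packaged pipeline's install requirements from its meta, using a hypothetical meta.json; the version pins are examples only:

meta = {
    "parent_package": "spacy",
    "spacy_version": ">=3.7.0,<3.8.0",
    "requirements": ["spacy-transformers>=1.2.0,<1.3.0"],
}
requirements = [meta.get("parent_package", "spacy") + meta["spacy_version"]]
requirements += meta.get("setup_requires", [])
requirements += meta.get("requirements", [])
print(requirements)
# ['spacy>=3.7.0,<3.8.0', 'spacy-transformers>=1.2.0,<1.3.0']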
|
|
@ -1,21 +1,13 @@
|
|||
import re
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import typer
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
import typer
|
||||
import re
|
||||
|
||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
||||
from ._util import import_code, setup_gpu
|
||||
from ..training.pretrain import pretrain
|
||||
from ..util import load_config
|
||||
from ._util import (
|
||||
Arg,
|
||||
Opt,
|
||||
app,
|
||||
import_code,
|
||||
parse_config_overrides,
|
||||
setup_gpu,
|
||||
show_validation_error,
|
||||
)
|
||||
|
||||
|
||||
@app.command(
|
||||
|
@ -31,7 +23,6 @@ def pretrain_cli(
|
|||
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
|
||||
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
|
||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||
skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
@ -70,7 +61,7 @@ def pretrain_cli(
|
|||
# TODO: What's the solution here? How do we handle optional blocks?
|
||||
msg.fail("The [pretraining] block in your config is empty", exits=1)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir(parents=True)
|
||||
output_dir.mkdir()
|
||||
msg.good(f"Created output directory: {output_dir}")
|
||||
# Save non-interpolated config
|
||||
raw_config.to_disk(output_dir / "config.cfg")
|
||||
|
@ -83,7 +74,6 @@ def pretrain_cli(
|
|||
epoch_resume=epoch_resume,
|
||||
use_gpu=use_gpu,
|
||||
silent=False,
|
||||
skip_last=skip_last,
|
||||
)
|
||||
msg.good("Successfully finished pretrain")
|
||||
|
||||
|
|
|
@ -1,18 +1,17 @@
|
|||
from typing import Optional, Sequence, Union, Iterator
|
||||
import tqdm
|
||||
from pathlib import Path
|
||||
import srsly
|
||||
import cProfile
|
||||
import itertools
|
||||
import pstats
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Iterator, Optional, Sequence, Union
|
||||
|
||||
import srsly
|
||||
import tqdm
|
||||
import itertools
|
||||
from wasabi import msg, Printer
|
||||
import typer
|
||||
from wasabi import Printer, msg
|
||||
|
||||
from ._util import app, debug_cli, Arg, Opt, NAME
|
||||
from ..language import Language
|
||||
from ..util import load_model
|
||||
from ._util import NAME, Arg, Opt, app, debug_cli
|
||||
|
||||
|
||||
@debug_cli.command("profile")
|
||||
|
@ -71,7 +70,7 @@ def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) ->
|
|||
|
||||
|
||||
def parse_texts(nlp: Language, texts: Sequence[str]) -> None:
|
||||
for doc in nlp.pipe(tqdm.tqdm(texts, disable=None), batch_size=16):
|
||||
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
|
||||
pass
|
||||
|
||||
|
||||
|
|
|
@ -1 +1,202 @@
|
|||
from weasel.cli.assets import *
|
||||
from typing import Any, Dict, Optional
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import requests
|
||||
import typer
|
||||
|
||||
from ...util import ensure_path, working_dir
|
||||
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
|
||||
from .._util import get_checksum, download_file, git_checkout, get_git_version
|
||||
from .._util import SimpleFrozenDict, parse_config_overrides
|
||||
|
||||
# Whether assets are extra if `extra` is not set.
|
||||
EXTRA_DEFAULT = False
|
||||
|
||||
|
||||
@project_cli.command(
|
||||
"assets",
|
||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||
)
|
||||
def project_assets_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v2.22+."),
|
||||
extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
|
||||
# fmt: on
|
||||
):
|
||||
"""Fetch project assets like datasets and pretrained weights. Assets are
|
||||
defined in the "assets" section of the project.yml. If a checksum is
|
||||
provided in the project.yml, the file is only downloaded if no local file
|
||||
with the same checksum exists.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#project-assets
|
||||
"""
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
project_assets(
|
||||
project_dir,
|
||||
overrides=overrides,
|
||||
sparse_checkout=sparse_checkout,
|
||||
extra=extra,
|
||||
)
|
||||
|
||||
|
||||
def project_assets(
|
||||
project_dir: Path,
|
||||
*,
|
||||
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
||||
sparse_checkout: bool = False,
|
||||
extra: bool = False,
|
||||
) -> None:
|
||||
"""Fetch assets for a project using DVC if possible.
|
||||
|
||||
project_dir (Path): Path to project directory.
|
||||
sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files
|
||||
needed.
|
||||
extra (bool): Whether to download all assets, including those marked as 'extra'.
|
||||
"""
|
||||
project_path = ensure_path(project_dir)
|
||||
config = load_project_config(project_path, overrides=overrides)
|
||||
assets = [
|
||||
asset
|
||||
for asset in config.get("assets", [])
|
||||
if extra or not asset.get("extra", EXTRA_DEFAULT)
|
||||
]
|
||||
if not assets:
|
||||
msg.warn(
|
||||
f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
|
||||
exits=0,
|
||||
)
|
||||
msg.info(f"Fetching {len(assets)} asset(s)")
|
||||
|
||||
for asset in assets:
|
||||
dest = (project_dir / asset["dest"]).resolve()
|
||||
checksum = asset.get("checksum")
|
||||
if "git" in asset:
|
||||
git_err = (
|
||||
f"Cloning spaCy project templates requires Git and the 'git' command. "
|
||||
f"Make sure it's installed and that the executable is available."
|
||||
)
|
||||
get_git_version(error=git_err)
|
||||
if dest.exists():
|
||||
# If there's already a file, check for checksum
|
||||
if checksum and checksum == get_checksum(dest):
|
||||
msg.good(
|
||||
f"Skipping download with matching checksum: {asset['dest']}"
|
||||
)
|
||||
continue
|
||||
else:
|
||||
if dest.is_dir():
|
||||
shutil.rmtree(dest)
|
||||
else:
|
||||
dest.unlink()
|
||||
if "repo" not in asset["git"] or asset["git"]["repo"] is None:
|
||||
msg.fail(
|
||||
"A git asset must include 'repo', the repository address.", exits=1
|
||||
)
|
||||
if "path" not in asset["git"] or asset["git"]["path"] is None:
|
||||
msg.fail(
|
||||
"A git asset must include 'path' - use \"\" to get the entire repository.",
|
||||
exits=1,
|
||||
)
|
||||
git_checkout(
|
||||
asset["git"]["repo"],
|
||||
asset["git"]["path"],
|
||||
dest,
|
||||
branch=asset["git"].get("branch"),
|
||||
sparse=sparse_checkout,
|
||||
)
|
||||
msg.good(f"Downloaded asset {dest}")
|
||||
else:
|
||||
url = asset.get("url")
|
||||
if not url:
|
||||
# project.yml defines asset without URL that the user has to place
|
||||
check_private_asset(dest, checksum)
|
||||
continue
|
||||
fetch_asset(project_path, url, dest, checksum)
|
||||
|
||||
|
||||
def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
|
||||
"""Check and validate assets without a URL (private assets that the user
|
||||
has to provide themselves) and give feedback about the checksum.
|
||||
|
||||
dest (Path): Destination path of the asset.
|
||||
checksum (Optional[str]): Optional checksum of the expected file.
|
||||
"""
|
||||
if not Path(dest).exists():
|
||||
err = f"No URL provided for asset. You need to add this file yourself: {dest}"
|
||||
msg.warn(err)
|
||||
else:
|
||||
if not checksum:
|
||||
msg.good(f"Asset already exists: {dest}")
|
||||
elif checksum == get_checksum(dest):
|
||||
msg.good(f"Asset exists with matching checksum: {dest}")
|
||||
else:
|
||||
msg.fail(f"Asset available but with incorrect checksum: {dest}")
|
||||
|
||||
|
||||
def fetch_asset(
|
||||
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
|
||||
) -> None:
|
||||
"""Fetch an asset from a given URL or path. If a checksum is provided and a
|
||||
local file exists, it's only re-downloaded if the checksum doesn't match.
|
||||
|
||||
project_path (Path): Path to project directory.
|
||||
url (str): URL or path to asset.
|
||||
checksum (Optional[str]): Optional expected checksum of local file.
|
||||
RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
|
||||
the asset failed.
|
||||
"""
|
||||
dest_path = (project_path / dest).resolve()
|
||||
if dest_path.exists():
|
||||
# If there's already a file, check for checksum
|
||||
if checksum:
|
||||
if checksum == get_checksum(dest_path):
|
||||
msg.good(f"Skipping download with matching checksum: {dest}")
|
||||
return
|
||||
else:
|
||||
# If there's not a checksum, make sure the file is a possibly valid size
|
||||
if os.path.getsize(dest_path) == 0:
|
||||
msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}")
|
||||
os.remove(dest_path)
|
||||
# We might as well support the user here and create parent directories in
|
||||
# case the asset dir isn't listed as a dir to create in the project.yml
|
||||
if not dest_path.parent.exists():
|
||||
dest_path.parent.mkdir(parents=True)
|
||||
with working_dir(project_path):
|
||||
url = convert_asset_url(url)
|
||||
try:
|
||||
download_file(url, dest_path)
|
||||
msg.good(f"Downloaded asset {dest}")
|
||||
except requests.exceptions.RequestException as e:
|
||||
if Path(url).exists() and Path(url).is_file():
|
||||
# If it's a local file, copy to destination
|
||||
shutil.copy(url, str(dest_path))
|
||||
msg.good(f"Copied local asset {dest}")
|
||||
else:
|
||||
msg.fail(f"Download failed: {dest}", e)
|
||||
if checksum and checksum != get_checksum(dest_path):
|
||||
msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
|
||||
|
||||
|
||||
def convert_asset_url(url: str) -> str:
|
||||
"""Check and convert the asset URL if needed.
|
||||
|
||||
url (str): The asset URL.
|
||||
RETURNS (str): The converted URL.
|
||||
"""
|
||||
# If the asset URL is a regular GitHub URL it's likely a mistake
|
||||
if re.match(r"(http(s?)):\/\/github.com", url) and "releases/download" not in url:
|
||||
converted = url.replace("github.com", "raw.githubusercontent.com")
|
||||
converted = re.sub(r"/(tree|blob)/", "/", converted)
|
||||
msg.warn(
|
||||
"Downloading from a regular GitHub URL. This will only download "
|
||||
"the source of the page, not the actual file. Converting the URL "
|
||||
"to a raw URL.",
|
||||
converted,
|
||||
)
|
||||
return converted
|
||||
return url
|
||||
|
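A small sketch of the URL rewriting done by convert_asset_url() above; the GitHub URL is a made-up example:

import re

url = "https://github.com/explosion/projects/blob/v3/pipelines/data/train.jsonl"
converted = url.replace("github.com", "raw.githubusercontent.com")
converted = re.sub(r"/(tree|blob)/", "/", converted)
print(converted)
# https://raw.githubusercontent.com/explosion/projects/v3/pipelines/data/train.jsonl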
|
|
@ -1 +1,99 @@
|
|||
from weasel.cli.clone import *
|
||||
from typing import Optional
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
import subprocess
|
||||
import re
|
||||
|
||||
from ... import about
|
||||
from ...util import ensure_path
|
||||
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
|
||||
from .._util import git_checkout, get_git_version
|
||||
|
||||
DEFAULT_REPO = about.__projects__
|
||||
DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
|
||||
DEFAULT_BRANCH = "master"
|
||||
|
||||
|
||||
@project_cli.command("clone")
|
||||
def project_clone_cli(
|
||||
# fmt: off
|
||||
name: str = Arg(..., help="The name of the template to clone"),
|
||||
dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
|
||||
repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
|
||||
branch: Optional[str] = Opt(None, "--branch", "-b", help="The branch to clone from"),
|
||||
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v2.22+.")
|
||||
# fmt: on
|
||||
):
|
||||
"""Clone a project template from a repository. Calls into "git" and will
|
||||
only download the files from the given subdirectory. The GitHub repo
|
||||
defaults to the official spaCy template repo, but can be customized
|
||||
(including using a private repo).
|
||||
|
||||
DOCS: https://spacy.io/api/cli#project-clone
|
||||
"""
|
||||
if dest is None:
|
||||
dest = Path.cwd() / Path(name).parts[-1]
|
||||
if branch is None:
|
||||
# If it's a user repo, we want to default to other branch
|
||||
branch = DEFAULT_PROJECTS_BRANCH if repo == DEFAULT_REPO else DEFAULT_BRANCH
|
||||
project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout)
|
||||
|
||||
|
||||
def project_clone(
|
||||
name: str,
|
||||
dest: Path,
|
||||
*,
|
||||
repo: str = about.__projects__,
|
||||
branch: str = about.__projects_branch__,
|
||||
sparse_checkout: bool = False,
|
||||
) -> None:
|
||||
"""Clone a project template from a repository.
|
||||
|
||||
name (str): Name of subdirectory to clone.
|
||||
dest (Path): Destination path of cloned project.
|
||||
repo (str): URL of Git repo containing project templates.
|
||||
branch (str): The branch to clone from
|
||||
"""
|
||||
dest = ensure_path(dest)
|
||||
check_clone(name, dest, repo)
|
||||
project_dir = dest.resolve()
|
||||
repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
|
||||
try:
|
||||
git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
|
||||
except subprocess.CalledProcessError:
|
||||
err = f"Could not clone '{name}' from repo '{repo_name}'"
|
||||
msg.fail(err, exits=1)
|
||||
msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
|
||||
if not (project_dir / PROJECT_FILE).exists():
|
||||
msg.warn(f"No {PROJECT_FILE} found in directory")
|
||||
else:
|
||||
msg.good(f"Your project is now ready!")
|
||||
print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
|
||||
|
||||
|
||||
def check_clone(name: str, dest: Path, repo: str) -> None:
|
||||
"""Check and validate that the destination path can be used to clone. Will
|
||||
check that Git is available and that the destination path is suitable.
|
||||
|
||||
name (str): Name of the directory to clone from the repo.
|
||||
dest (Path): Local destination of cloned directory.
|
||||
repo (str): URL of the repo to clone from.
|
||||
"""
|
||||
git_err = (
|
||||
f"Cloning spaCy project templates requires Git and the 'git' command. "
|
||||
f"To clone a project without Git, copy the files from the '{name}' "
|
||||
f"directory in the {repo} to {dest} manually."
|
||||
)
|
||||
get_git_version(error=git_err)
|
||||
if not dest:
|
||||
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
|
||||
if dest.exists():
|
||||
# Directory already exists (not allowed, clone needs to create it)
|
||||
msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
|
||||
if not dest.parent.exists():
|
||||
# We're not creating parents, parent dir should exist
|
||||
msg.fail(
|
||||
f"Can't clone project, parent directory doesn't exist: {dest.parent}. "
|
||||
f"Create the necessary folder(s) first before continuing.",
|
||||
exits=1,
|
||||
)
|
||||
|
|
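A hedged sketch of calling project_clone() from Python instead of the CLI, assuming the official spaCy projects repo layout; the template name and destination are examples:

from pathlib import Path
from spacy.cli.project.clone import project_clone

project_clone(
    "pipelines/tagger_parser_ud",            # subdirectory in the templates repo
    Path.cwd() / "tagger_parser_ud",         # local destination (must not exist yet)
    repo="https://github.com/explosion/projects",
    branch="v3",
    sparse_checkout=False,
)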
|
@ -1 +1,115 @@
|
|||
from weasel.cli.document import *
|
||||
from pathlib import Path
|
||||
from wasabi import msg, MarkdownRenderer
|
||||
|
||||
from ...util import working_dir
|
||||
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
|
||||
|
||||
|
||||
DOCS_URL = "https://spacy.io"
|
||||
INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
|
||||
project, as well as the available commands and workflows. For details, see the
|
||||
[spaCy projects documentation]({DOCS_URL}/usage/projects)."""
|
||||
INTRO_COMMANDS = f"""The following commands are defined by the project. They
|
||||
can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run).
|
||||
Commands are only re-run if their inputs have changed."""
|
||||
INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They
|
||||
can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run)
|
||||
and will run the specified commands in order. Commands are only re-run if their
|
||||
inputs have changed."""
|
||||
INTRO_ASSETS = f"""The following assets are defined by the project. They can
|
||||
be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets)
|
||||
in the project directory."""
|
||||
# These markers are added to the Markdown and can be used to update the file in
|
||||
# place if it already exists. Only the auto-generated part will be replaced.
|
||||
MARKER_START = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS START (do not remove) -->"
|
||||
MARKER_END = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->"
|
||||
# If this marker is used in an existing README, it's ignored and not replaced
|
||||
MARKER_IGNORE = "<!-- SPACY PROJECT: IGNORE -->"
|
||||
|
||||
|
||||
@project_cli.command("document")
|
||||
def project_document_cli(
|
||||
# fmt: off
|
||||
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"),
|
||||
no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji")
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Auto-generate a README.md for a project. If the content is saved to a file,
|
||||
hidden markers are added so you can add custom content before or after the
|
||||
auto-generated section and only the auto-generated docs will be replaced
|
||||
when you re-run the command.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#project-document
|
||||
"""
|
||||
project_document(project_dir, output_file, no_emoji=no_emoji)
|
||||
|
||||
|
||||
def project_document(
|
||||
project_dir: Path, output_file: Path, *, no_emoji: bool = False
|
||||
) -> None:
|
||||
is_stdout = str(output_file) == "-"
|
||||
config = load_project_config(project_dir)
|
||||
md = MarkdownRenderer(no_emoji=no_emoji)
|
||||
md.add(MARKER_START)
|
||||
title = config.get("title")
|
||||
description = config.get("description")
|
||||
md.add(md.title(1, f"spaCy Project{f': {title}' if title else ''}", "🪐"))
|
||||
if description:
|
||||
md.add(description)
|
||||
md.add(md.title(2, PROJECT_FILE, "📋"))
|
||||
md.add(INTRO_PROJECT)
|
||||
# Commands
|
||||
cmds = config.get("commands", [])
|
||||
data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds]
|
||||
if data:
|
||||
md.add(md.title(3, "Commands", "⏯"))
|
||||
md.add(INTRO_COMMANDS)
|
||||
md.add(md.table(data, ["Command", "Description"]))
|
||||
# Workflows
|
||||
wfs = config.get("workflows", {}).items()
|
||||
data = [(md.code(n), " → ".join(md.code(w) for w in stp)) for n, stp in wfs]
|
||||
if data:
|
||||
md.add(md.title(3, "Workflows", "⏭"))
|
||||
md.add(INTRO_WORKFLOWS)
|
||||
md.add(md.table(data, ["Workflow", "Steps"]))
|
||||
# Assets
|
||||
assets = config.get("assets", [])
|
||||
data = []
|
||||
for a in assets:
|
||||
source = "Git" if a.get("git") else "URL" if a.get("url") else "Local"
|
||||
dest_path = a["dest"]
|
||||
dest = md.code(dest_path)
|
||||
if source == "Local":
|
||||
# Only link assets if they're in the repo
|
||||
with working_dir(project_dir) as p:
|
||||
if (p / dest_path).exists():
|
||||
dest = md.link(dest, dest_path)
|
||||
data.append((dest, source, a.get("description", "")))
|
||||
if data:
|
||||
md.add(md.title(3, "Assets", "🗂"))
|
||||
md.add(INTRO_ASSETS)
|
||||
md.add(md.table(data, ["File", "Source", "Description"]))
|
||||
md.add(MARKER_END)
|
||||
# Output result
|
||||
if is_stdout:
|
||||
print(md.text)
|
||||
else:
|
||||
content = md.text
|
||||
if output_file.exists():
|
||||
with output_file.open("r", encoding="utf8") as f:
|
||||
existing = f.read()
|
||||
if MARKER_IGNORE in existing:
|
||||
msg.warn("Found ignore marker in existing file: skipping", output_file)
|
||||
return
|
||||
if MARKER_START in existing and MARKER_END in existing:
|
||||
msg.info("Found existing file: only replacing auto-generated docs")
|
||||
before = existing.split(MARKER_START)[0]
|
||||
after = existing.split(MARKER_END)[1]
|
||||
content = f"{before}{content}{after}"
|
||||
else:
|
||||
msg.warn("Replacing existing file")
|
||||
with output_file.open("w", encoding="utf8") as f:
|
||||
f.write(content)
|
||||
msg.good("Saved project documentation", output_file)
|
||||
|
|
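A minimal sketch of the marker logic used by project document above: when a README already exists, only the text between the start and end markers is replaced. The surrounding text here is invented:

MARKER_START = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS START (do not remove) -->"
MARKER_END = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->"

existing = f"Intro kept as-is\n{MARKER_START}\nold docs\n{MARKER_END}\nOutro kept as-is\n"
generated = f"{MARKER_START}\nnew docs\n{MARKER_END}"

before = existing.split(MARKER_START)[0]
after = existing.split(MARKER_END)[1]
print(f"{before}{generated}{after}")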
|
@ -1 +1,204 @@
|
|||
from weasel.cli.dvc import *
|
||||
"""This module contains helpers and subcommands for integrating spaCy projects
|
||||
with Data Version Control (DVC). https://dvc.org"""
|
||||
from typing import Dict, Any, List, Optional, Iterable
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
|
||||
from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli
|
||||
from .._util import Arg, Opt, NAME, COMMAND
|
||||
from ...util import working_dir, split_command, join_command, run_command
|
||||
from ...util import SimpleFrozenList
|
||||
|
||||
|
||||
DVC_CONFIG = "dvc.yaml"
|
||||
DVC_DIR = ".dvc"
|
||||
UPDATE_COMMAND = "dvc"
|
||||
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
|
||||
# edited your {PROJECT_FILE}, you can regenerate this file by running:
|
||||
# {COMMAND} project {UPDATE_COMMAND}"""
|
||||
|
||||
|
||||
@project_cli.command(UPDATE_COMMAND)
|
||||
def project_update_dvc_cli(
|
||||
# fmt: off
|
||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
|
||||
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
|
||||
# fmt: on
|
||||
):
|
||||
"""Auto-generate Data Version Control (DVC) config. A DVC
|
||||
project can only define one pipeline, so you need to specify one workflow
|
||||
defined in the project.yml. If no workflow is specified, the first defined
|
||||
workflow is used. The DVC config will only be updated if the project.yml
|
||||
changed.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#project-dvc
|
||||
"""
|
||||
project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
|
||||
|
||||
|
||||
def project_update_dvc(
|
||||
project_dir: Path,
|
||||
workflow: Optional[str] = None,
|
||||
*,
|
||||
verbose: bool = False,
|
||||
force: bool = False,
|
||||
) -> None:
|
||||
"""Update the auto-generated Data Version Control (DVC) config file. A DVC
|
||||
project can only define one pipeline, so you need to specify one workflow
|
||||
defined in the project.yml. Will only update the file if the checksum changed.
|
||||
|
||||
project_dir (Path): The project directory.
|
||||
workflow (Optional[str]): Optional name of workflow defined in project.yml.
|
||||
If not set, the first workflow will be used.
|
||||
verbose (bool): Print more info.
|
||||
force (bool): Force update DVC config.
|
||||
"""
|
||||
config = load_project_config(project_dir)
|
||||
updated = update_dvc_config(
|
||||
project_dir, config, workflow, verbose=verbose, force=force
|
||||
)
|
||||
help_msg = "To execute the workflow with DVC, run: dvc repro"
|
||||
if updated:
|
||||
msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
|
||||
else:
|
||||
msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)
|
||||
|
||||
|
||||
def update_dvc_config(
|
||||
path: Path,
|
||||
config: Dict[str, Any],
|
||||
workflow: Optional[str] = None,
|
||||
verbose: bool = False,
|
||||
silent: bool = False,
|
||||
force: bool = False,
|
||||
) -> bool:
|
||||
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
|
||||
project directory. The file is auto-generated based on the config. The
|
||||
first line of the auto-generated file specifies the hash of the config
|
||||
dict, so if any of the config values change, the DVC config is regenerated.
|
||||
|
||||
path (Path): The path to the project directory.
|
||||
config (Dict[str, Any]): The loaded project.yml.
|
||||
verbose (bool): Whether to print additional info (via DVC).
|
||||
silent (bool): Don't output anything (via DVC).
|
||||
force (bool): Force update, even if hashes match.
|
||||
RETURNS (bool): Whether the DVC config file was updated.
|
||||
"""
|
||||
ensure_dvc(path)
|
||||
workflows = config.get("workflows", {})
|
||||
workflow_names = list(workflows.keys())
|
||||
check_workflows(workflow_names, workflow)
|
||||
if not workflow:
|
||||
workflow = workflow_names[0]
|
||||
config_hash = get_hash(config)
|
||||
path = path.resolve()
|
||||
dvc_config_path = path / DVC_CONFIG
|
||||
if dvc_config_path.exists():
|
||||
# Check if the file was generated using the current config, if not, redo
|
||||
with dvc_config_path.open("r", encoding="utf8") as f:
|
||||
ref_hash = f.readline().strip().replace("# ", "")
|
||||
if ref_hash == config_hash and not force:
|
||||
return False # Nothing has changed in project.yml, don't need to update
|
||||
dvc_config_path.unlink()
|
||||
dvc_commands = []
|
||||
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
||||
for name in workflows[workflow]:
|
||||
command = config_commands[name]
|
||||
deps = command.get("deps", [])
|
||||
outputs = command.get("outputs", [])
|
||||
outputs_no_cache = command.get("outputs_no_cache", [])
|
||||
if not deps and not outputs and not outputs_no_cache:
|
||||
continue
|
||||
# Default to the working dir as the project path since dvc.yaml is auto-generated
|
||||
# and we don't want arbitrary paths in there
|
||||
project_cmd = ["python", "-m", NAME, "project", "run", name]
|
||||
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
|
||||
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
|
||||
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
|
||||
dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
|
||||
if command.get("no_skip"):
|
||||
dvc_cmd.append("--always-changed")
|
||||
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
|
||||
dvc_commands.append(join_command(full_cmd))
|
||||
with working_dir(path):
|
||||
dvc_flags = {"--verbose": verbose, "--quiet": silent}
|
||||
run_dvc_commands(dvc_commands, flags=dvc_flags)
|
||||
with dvc_config_path.open("r+", encoding="utf8") as f:
|
||||
content = f.read()
|
||||
f.seek(0, 0)
|
||||
f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
|
||||
return True
|
||||
|
||||
|
||||
def run_dvc_commands(
|
||||
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
|
||||
) -> None:
|
||||
"""Run a sequence of DVC commands in a subprocess, in order.
|
||||
|
||||
commands (List[str]): The string commands without the leading "dvc".
|
||||
flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
|
||||
easier to pass flags like --quiet that depend on a variable or
|
||||
command-line setting while avoiding lots of nested conditionals.
|
||||
"""
|
||||
for c in commands:
|
||||
command = split_command(c)
|
||||
dvc_command = ["dvc", *command]
|
||||
# Add the flags if they are set to True
|
||||
for flag, is_active in flags.items():
|
||||
if is_active:
|
||||
dvc_command.append(flag)
|
||||
run_command(dvc_command)
|
||||
|
||||
|
||||
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
|
||||
"""Validate workflows provided in project.yml and check that a given
|
||||
workflow can be used to generate a DVC config.
|
||||
|
||||
workflows (List[str]): Names of the available workflows.
|
||||
workflow (Optional[str]): The name of the workflow to convert.
|
||||
"""
|
||||
if not workflows:
|
||||
msg.fail(
|
||||
f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
|
||||
f"define at least one list of commands.",
|
||||
exits=1,
|
||||
)
|
||||
if workflow is not None and workflow not in workflows:
|
||||
msg.fail(
|
||||
f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
|
||||
f"Available workflows: {', '.join(workflows)}",
|
||||
exits=1,
|
||||
)
|
||||
if not workflow:
|
||||
msg.warn(
|
||||
f"No workflow specified for DVC pipeline. Using the first workflow "
|
||||
f"defined in {PROJECT_FILE}: '{workflows[0]}'"
|
||||
)
|
||||
|
||||
|
||||
def ensure_dvc(project_dir: Path) -> None:
|
||||
"""Ensure that the "dvc" command is available and that the current project
|
||||
directory is an initialized DVC project.
|
||||
"""
|
||||
try:
|
||||
subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
|
||||
except Exception:
|
||||
msg.fail(
|
||||
"To use spaCy projects with DVC (Data Version Control), DVC needs "
|
||||
"to be installed and the 'dvc' command needs to be available",
|
||||
"You can install the Python package from pip (pip install dvc) or "
|
||||
"conda (conda install -c conda-forge dvc). For more details, see the "
|
||||
"documentation: https://dvc.org/doc/install",
|
||||
exits=1,
|
||||
)
|
||||
if not (project_dir / ".dvc").exists():
|
||||
msg.fail(
|
||||
"Project not initialized as a DVC project",
|
||||
"To initialize a DVC project, you can run 'dvc init' in the project "
|
||||
"directory. For more details, see the documentation: "
|
||||
"https://dvc.org/doc/command-reference/init",
|
||||
exits=1,
|
||||
)
|
||||
|
|
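A quick sketch of how update_dvc_config() above flattens a command's deps and outputs into dvc run arguments; the file names are hypothetical:

deps = ["assets/train.jsonl", "assets/dev.jsonl"]
outputs = ["training/model-best"]

deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
print(deps_cmd + outputs_cmd)
# ['-d', 'assets/train.jsonl', '-d', 'assets/dev.jsonl', '-o', 'training/model-best']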
|
@ -1 +1,64 @@
|
|||
from weasel.cli.pull import *
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
from .remote_storage import RemoteStorage
|
||||
from .remote_storage import get_command_hash
|
||||
from .._util import project_cli, Arg, logger
|
||||
from .._util import load_project_config
|
||||
from .run import update_lockfile
|
||||
|
||||
|
||||
@project_cli.command("pull")
|
||||
def project_pull_cli(
|
||||
# fmt: off
|
||||
remote: str = Arg("default", help="Name or path of remote storage"),
|
||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
# fmt: on
|
||||
):
|
||||
"""Retrieve available precomputed outputs from a remote storage.
|
||||
You can alias remotes in your project.yml by mapping them to storage paths.
|
||||
A storage can be anything that the smart-open library can upload to, e.g.
|
||||
AWS, Google Cloud Storage, SSH, local directories etc.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#project-pull
|
||||
"""
|
||||
for url, output_path in project_pull(project_dir, remote):
|
||||
if url is not None:
|
||||
msg.good(f"Pulled {output_path} from {url}")
|
||||
|
||||
|
||||
def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
|
||||
# TODO: We don't have tests for this :(. It would take a bit of mockery to
|
||||
# set up. I guess see if it breaks first?
|
||||
config = load_project_config(project_dir)
|
||||
if remote in config.get("remotes", {}):
|
||||
remote = config["remotes"][remote]
|
||||
storage = RemoteStorage(project_dir, remote)
|
||||
commands = list(config.get("commands", []))
|
||||
# We use a while loop here because we don't know how the commands
|
||||
# will be ordered. A command might need dependencies from one that's later
|
||||
# in the list.
|
||||
while commands:
|
||||
for i, cmd in enumerate(list(commands)):
|
||||
logger.debug(f"CMD: {cmd['name']}.")
|
||||
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
||||
if all(dep.exists() for dep in deps):
|
||||
cmd_hash = get_command_hash("", "", deps, cmd["script"])
|
||||
for output_path in cmd.get("outputs", []):
|
||||
url = storage.pull(output_path, command_hash=cmd_hash)
|
||||
logger.debug(
|
||||
f"URL: {url} for {output_path} with command hash {cmd_hash}"
|
||||
)
|
||||
yield url, output_path
|
||||
|
||||
out_locs = [project_dir / out for out in cmd.get("outputs", [])]
|
||||
if all(loc.exists() for loc in out_locs):
|
||||
update_lockfile(project_dir, cmd)
|
||||
# We remove the command from the list here, and break, so that
|
||||
# we iterate over the loop again.
|
||||
commands.pop(i)
|
||||
break
|
||||
else:
|
||||
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.")
|
||||
else:
|
||||
# If we didn't break the for loop, break the while loop.
|
||||
break
|
||||
|
|
|
@ -1 +1,69 @@
|
|||
from weasel.cli.push import *
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
from .remote_storage import RemoteStorage
|
||||
from .remote_storage import get_content_hash, get_command_hash
|
||||
from .._util import load_project_config
|
||||
from .._util import project_cli, Arg, logger
|
||||
|
||||
|
||||
@project_cli.command("push")
|
||||
def project_push_cli(
|
||||
# fmt: off
|
||||
remote: str = Arg("default", help="Name or path of remote storage"),
|
||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
# fmt: on
|
||||
):
|
||||
"""Persist outputs to a remote storage. You can alias remotes in your
|
||||
project.yml by mapping them to storage paths. A storage can be anything that
|
||||
the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
|
||||
local directories etc.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#project-push
|
||||
"""
|
||||
for output_path, url in project_push(project_dir, remote):
|
||||
if url is None:
|
||||
msg.info(f"Skipping {output_path}")
|
||||
else:
|
||||
msg.good(f"Pushed {output_path} to {url}")
|
||||
|
||||
|
||||
def project_push(project_dir: Path, remote: str):
|
||||
"""Persist outputs to a remote storage. You can alias remotes in your project.yml
|
||||
by mapping them to storage paths. A storage can be anything that the smart-open
|
||||
library can upload to, e.g. gcs, aws, ssh, local directories etc
|
||||
"""
|
||||
config = load_project_config(project_dir)
|
||||
if remote in config.get("remotes", {}):
|
||||
remote = config["remotes"][remote]
|
||||
storage = RemoteStorage(project_dir, remote)
|
||||
for cmd in config.get("commands", []):
|
||||
logger.debug(f"CMD: cmd['name']")
|
||||
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
||||
if any(not dep.exists() for dep in deps):
|
||||
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs")
|
||||
continue
|
||||
cmd_hash = get_command_hash(
|
||||
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
|
||||
)
|
||||
logger.debug(f"CMD_HASH: {cmd_hash}")
|
||||
for output_path in cmd.get("outputs", []):
|
||||
output_loc = project_dir / output_path
|
||||
if output_loc.exists() and _is_not_empty_dir(output_loc):
|
||||
url = storage.push(
|
||||
output_path,
|
||||
command_hash=cmd_hash,
|
||||
content_hash=get_content_hash(output_loc),
|
||||
)
|
||||
logger.debug(
|
||||
f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}"
|
||||
)
|
||||
yield output_path, url
|
||||
|
||||
|
||||
def _is_not_empty_dir(loc: Path):
|
||||
if not loc.is_dir():
|
||||
return True
|
||||
elif any(_is_not_empty_dir(child) for child in loc.iterdir()):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
|
|
@ -1 +1,176 @@
|
|||
from weasel.cli.remote_storage import *
|
||||
from typing import Optional, List, Dict, TYPE_CHECKING
|
||||
import os
|
||||
import site
|
||||
import hashlib
|
||||
import urllib.parse
|
||||
import tarfile
|
||||
from pathlib import Path
|
||||
|
||||
from .._util import get_hash, get_checksum, download_file, ensure_pathy
|
||||
from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
|
||||
from ...git_info import GIT_VERSION
|
||||
from ... import about
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathy import Pathy # noqa: F401
|
||||
|
||||
|
||||
class RemoteStorage:
|
||||
"""Push and pull outputs to and from a remote file storage.
|
    Remotes can be anything that `smart-open` can support: AWS, GCS, file system,
    ssh, etc.
    """

    def __init__(self, project_root: Path, url: str, *, compression="gz"):
        self.root = project_root
        self.url = ensure_pathy(url)
        self.compression = compression

    def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
        """Compress a file or directory within a project and upload it to a remote
        storage. If an object exists at the full URL, nothing is done.

        Within the remote storage, files are addressed by their project path
        (url encoded) and two user-supplied hashes, representing their creation
        context and their file contents. If the URL already exists, the data is
        not uploaded. Paths are archived and compressed prior to upload.
        """
        loc = self.root / path
        if not loc.exists():
            raise IOError(f"Cannot push {loc}: does not exist.")
        url = self.make_url(path, command_hash, content_hash)
        if url.exists():
            return url
        tmp: Path
        with make_tempdir() as tmp:
            tar_loc = tmp / self.encode_name(str(path))
            mode_string = f"w:{self.compression}" if self.compression else "w"
            with tarfile.open(tar_loc, mode=mode_string) as tar_file:
                tar_file.add(str(loc), arcname=str(path))
            with tar_loc.open(mode="rb") as input_file:
                with url.open(mode="wb") as output_file:
                    output_file.write(input_file.read())
        return url

    def pull(
        self,
        path: Path,
        *,
        command_hash: Optional[str] = None,
        content_hash: Optional[str] = None,
    ) -> Optional["Pathy"]:
        """Retrieve a file from the remote cache. If the file already exists,
        nothing is done.

        If the command_hash and/or content_hash are specified, only matching
        results are returned. If no results are available, an error is raised.
        """
        dest = self.root / path
        if dest.exists():
            return None
        url = self.find(path, command_hash=command_hash, content_hash=content_hash)
        if url is None:
            return url
        else:
            # Make sure the destination exists
            if not dest.parent.exists():
                dest.parent.mkdir(parents=True)
            tmp: Path
            with make_tempdir() as tmp:
                tar_loc = tmp / url.parts[-1]
                download_file(url, tar_loc)
                mode_string = f"r:{self.compression}" if self.compression else "r"
                with tarfile.open(tar_loc, mode=mode_string) as tar_file:
                    # This requires that the path is added correctly, relative
                    # to root. This is how we set things up in push()
                    tar_file.extractall(self.root)
        return url

    def find(
        self,
        path: Path,
        *,
        command_hash: Optional[str] = None,
        content_hash: Optional[str] = None,
    ) -> Optional["Pathy"]:
        """Find the best matching version of a file within the storage,
        or `None` if no match can be found. If both the creation and content hash
        are specified, only exact matches will be returned. Otherwise, the most
        recent matching file is preferred.
        """
        name = self.encode_name(str(path))
        if command_hash is not None and content_hash is not None:
            url = self.make_url(path, command_hash, content_hash)
            urls = [url] if url.exists() else []
        elif command_hash is not None:
            urls = list((self.url / name / command_hash).iterdir())
        else:
            urls = list((self.url / name).iterdir())
            if content_hash is not None:
                urls = [url for url in urls if url.parts[-1] == content_hash]
        return urls[-1] if urls else None

    def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
        """Construct a URL from a subpath, a creation hash and a content hash."""
        return self.url / self.encode_name(str(path)) / command_hash / content_hash

    def encode_name(self, name: str) -> str:
        """Encode a subpath into a URL-safe name."""
        return urllib.parse.quote_plus(name)
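As a rough illustration of the addressing scheme above (not part of this diff), the sketch below builds the same "encoded project path / command hash / content hash" layout with plain strings; the remote base and hash values are made-up placeholders.

import urllib.parse

# Hypothetical inputs: a remote base URL and the two hashes push() receives.
remote_base = "s3://my-bucket/spacy-cache"          # placeholder remote
project_path = "training/model-best"                # path inside the project
command_hash = "1d8cb33a06154ee8"                   # made-up creation hash
content_hash = "9a6f21c0b7de4c11"                   # made-up content hash

# Same idea as encode_name() + make_url(): URL-encode the project-relative
# path, then nest the two hashes underneath it.
encoded = urllib.parse.quote_plus(project_path)     # "training%2Fmodel-best"
url = "/".join([remote_base, encoded, command_hash, content_hash])
print(url)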


def get_content_hash(loc: Path) -> str:
    return get_checksum(loc)


def get_command_hash(
    site_hash: str, env_hash: str, deps: List[Path], cmd: List[str]
) -> str:
    """Create a hash representing the execution of a command. This includes the
    currently installed packages, whatever environment variables have been marked
    as relevant, and the command.
    """
    if check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION):
        spacy_v = GIT_VERSION
    else:
        spacy_v = str(get_minor_version(about.__version__) or "")
    dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
    hashes = [spacy_v, site_hash, env_hash] + dep_checksums
    hashes.extend(cmd)
    creation_bytes = "".join(hashes).encode("utf8")
    return hashlib.md5(creation_bytes).hexdigest()


def get_site_hash():
    """Hash the current Python environment's site-packages contents, including
    the name and version of the libraries. The list we're hashing is what
    `pip freeze` would output.
    """
    site_dirs = site.getsitepackages()
    if site.ENABLE_USER_SITE:
        site_dirs.extend(site.getusersitepackages())
    packages = set()
    for site_dir in site_dirs:
        site_dir = Path(site_dir)
        for subpath in site_dir.iterdir():
            if subpath.parts[-1].endswith("dist-info"):
                packages.add(subpath.parts[-1].replace(".dist-info", ""))
    package_bytes = "".join(sorted(packages)).encode("utf8")
    return hashlib.md5(package_bytes).hexdigest()


def get_env_hash(env: Dict[str, str]) -> str:
    """Construct a hash of the environment variables that will be passed into
    the commands.

    Values in the env dict may be references to the current os.environ, using
    the syntax $ENV_VAR to mean os.environ[ENV_VAR]
    """
    env_vars = {}
    for key, value in env.items():
        if value.startswith("$"):
            env_vars[key] = os.environ.get(value[1:], "")
        else:
            env_vars[key] = value
    return get_hash(env_vars)
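A minimal stand-alone sketch of the hashing idea used above, relying only on the standard library: "$VAR" references are resolved against os.environ and the pieces are concatenated into a single MD5 digest. The helper names resolve_env and toy_command_hash are invented here and do not use spaCy's get_hash/get_checksum utilities.

import hashlib
import os

def resolve_env(env):
    # Mirror get_env_hash(): values starting with "$" are looked up in os.environ.
    return {
        key: os.environ.get(value[1:], "") if value.startswith("$") else value
        for key, value in env.items()
    }

def toy_command_hash(spacy_version, site_hash, env_hash, dep_checksums, cmd):
    # Mirror get_command_hash(): join all components and hash the UTF-8 bytes.
    parts = [spacy_version, site_hash, env_hash, *sorted(dep_checksums), *cmd]
    return hashlib.md5("".join(parts).encode("utf8")).hexdigest()

env = resolve_env({"CUDA_VISIBLE_DEVICES": "$CUDA_VISIBLE_DEVICES", "MODE": "dev"})
print(env)
print(toy_command_hash("3.5", "abc123", "def456", ["111", "222"], ["python", "train.py"]))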
@ -1 +1,310 @@
|
|||
from weasel.cli.run import *
|
||||
from typing import Optional, List, Dict, Sequence, Any, Iterable
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
from wasabi.util import locale_escape
|
||||
import sys
|
||||
import srsly
|
||||
import typer
|
||||
|
||||
from ... import about
|
||||
from ...git_info import GIT_VERSION
|
||||
from ...util import working_dir, run_command, split_command, is_cwd, join_command
|
||||
from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
|
||||
from ...util import check_bool_env_var, SimpleFrozenDict
|
||||
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
|
||||
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides
|
||||
|
||||
|
||||
@project_cli.command(
|
||||
"run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
|
||||
)
|
||||
def project_run_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
|
||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
|
||||
dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
|
||||
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
|
||||
# fmt: on
|
||||
):
|
||||
"""Run a named command or workflow defined in the project.yml. If a workflow
|
||||
name is specified, all commands in the workflow are run, in order. If
|
||||
commands define dependencies and/or outputs, they will only be re-run if
|
||||
state has changed.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#project-run
|
||||
"""
|
||||
if show_help or not subcommand:
|
||||
print_run_help(project_dir, subcommand)
|
||||
else:
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry)
|
||||
|
||||
|
||||
def project_run(
|
||||
project_dir: Path,
|
||||
subcommand: str,
|
||||
*,
|
||||
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
||||
force: bool = False,
|
||||
dry: bool = False,
|
||||
capture: bool = False,
|
||||
) -> None:
|
||||
"""Run a named script defined in the project.yml. If the script is part
|
||||
of the default pipeline (defined in the "run" section), DVC is used to
|
||||
execute the command, so it can determine whether to rerun it. It then
|
||||
calls into "exec" to execute it.
|
||||
|
||||
project_dir (Path): Path to project directory.
|
||||
subcommand (str): Name of command to run.
|
||||
overrides (Dict[str, Any]): Optional config overrides.
|
||||
force (bool): Force re-running, even if nothing changed.
|
||||
dry (bool): Perform a dry run and don't execute commands.
|
||||
capture (bool): Whether to capture the output and errors of individual commands.
|
||||
If False, the stdout and stderr will not be redirected, and if there's an error,
|
||||
sys.exit will be called with the return code. You should use capture=False
|
||||
when you want to turn over execution to the command, and capture=True
|
||||
when you want to run the command more like a function.
|
||||
"""
|
||||
config = load_project_config(project_dir, overrides=overrides)
|
||||
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
||||
workflows = config.get("workflows", {})
|
||||
validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
|
||||
if subcommand in workflows:
|
||||
msg.info(f"Running workflow '{subcommand}'")
|
||||
for cmd in workflows[subcommand]:
|
||||
project_run(
|
||||
project_dir,
|
||||
cmd,
|
||||
overrides=overrides,
|
||||
force=force,
|
||||
dry=dry,
|
||||
capture=capture,
|
||||
)
|
||||
else:
|
||||
cmd = commands[subcommand]
|
||||
for dep in cmd.get("deps", []):
|
||||
if not (project_dir / dep).exists():
|
||||
err = f"Missing dependency specified by command '{subcommand}': {dep}"
|
||||
err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
|
||||
err_kwargs = {"exits": 1} if not dry else {}
|
||||
msg.fail(err, err_help, **err_kwargs)
|
||||
check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
|
||||
with working_dir(project_dir) as current_dir:
|
||||
msg.divider(subcommand)
|
||||
rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
|
||||
if not rerun and not force:
|
||||
msg.info(f"Skipping '{cmd['name']}': nothing changed")
|
||||
else:
|
||||
run_commands(cmd["script"], dry=dry, capture=capture)
|
||||
if not dry:
|
||||
update_lockfile(current_dir, cmd)
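The workflow dispatch above can be pictured with a small self-contained sketch. Everything here is illustrative: the config dict is a made-up stand-in for a loaded project.yml and run_cmd just prints instead of spawning a subprocess.

# Made-up, minimal stand-in for a parsed project.yml.
config = {
    "commands": [
        {"name": "preprocess", "script": ["python scripts/preprocess.py"]},
        {"name": "train", "script": ["python -m spacy train config.cfg"]},
    ],
    "workflows": {"all": ["preprocess", "train"]},
}

def run_cmd(name):
    print(f"running command: {name}")

def run(subcommand):
    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    workflows = config.get("workflows", {})
    if subcommand in workflows:
        # A workflow is just an ordered list of command names, run one by one.
        for step in workflows[subcommand]:
            run(step)
    elif subcommand in commands:
        run_cmd(subcommand)
    else:
        raise ValueError(f"unknown command or workflow: {subcommand}")

run("all")  # -> preprocess, then train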
|
||||
|
||||
|
||||
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
|
||||
"""Simulate a CLI help prompt using the info available in the project.yml.
|
||||
|
||||
project_dir (Path): The project directory.
|
||||
subcommand (Optional[str]): The subcommand or None. If a subcommand is
|
||||
provided, the subcommand help is shown. Otherwise, the top-level help
|
||||
and a list of available commands is printed.
|
||||
"""
|
||||
config = load_project_config(project_dir)
|
||||
config_commands = config.get("commands", [])
|
||||
commands = {cmd["name"]: cmd for cmd in config_commands}
|
||||
workflows = config.get("workflows", {})
|
||||
project_loc = "" if is_cwd(project_dir) else project_dir
|
||||
if subcommand:
|
||||
validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
|
||||
print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
|
||||
if subcommand in commands:
|
||||
help_text = commands[subcommand].get("help")
|
||||
if help_text:
|
||||
print(f"\n{help_text}\n")
|
||||
elif subcommand in workflows:
|
||||
steps = workflows[subcommand]
|
||||
print(f"\nWorkflow consisting of {len(steps)} commands:")
|
||||
steps_data = [
|
||||
(f"{i + 1}. {step}", commands[step].get("help", ""))
|
||||
for i, step in enumerate(steps)
|
||||
]
|
||||
msg.table(steps_data)
|
||||
help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help"
|
||||
print(f"For command details, run: {help_cmd}")
|
||||
else:
|
||||
print("")
|
||||
title = config.get("title")
|
||||
if title:
|
||||
print(f"{locale_escape(title)}\n")
|
||||
if config_commands:
|
||||
print(f"Available commands in {PROJECT_FILE}")
|
||||
print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
|
||||
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
|
||||
if workflows:
|
||||
print(f"Available workflows in {PROJECT_FILE}")
|
||||
print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
|
||||
msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()])
|
||||
|
||||
|
||||
def run_commands(
|
||||
commands: Iterable[str] = SimpleFrozenList(),
|
||||
silent: bool = False,
|
||||
dry: bool = False,
|
||||
capture: bool = False,
|
||||
) -> None:
|
||||
"""Run a sequence of commands in a subprocess, in order.
|
||||
|
||||
commands (List[str]): The string commands.
|
||||
silent (bool): Don't print the commands.
|
||||
dry (bool): Perform a dry run and don't execute anything.
|
||||
capture (bool): Whether to capture the output and errors of individual commands.
|
||||
If False, the stdout and stderr will not be redirected, and if there's an error,
|
||||
sys.exit will be called with the return code. You should use capture=False
|
||||
when you want to turn over execution to the command, and capture=True
|
||||
when you want to run the command more like a function.
|
||||
"""
|
||||
for c in commands:
|
||||
command = split_command(c)
|
||||
# Not sure if this is needed or a good idea. Motivation: users may often
|
||||
# use commands in their config that reference "python" and we want to
|
||||
# make sure that it's always executing the same Python that spaCy is
|
||||
# executed with and the pip in the same env, not some other Python/pip.
|
||||
# Also ensures cross-compatibility if user 1 writes "python3" (because
|
||||
# that's how it's set up on their system), and user 2 without the
|
||||
# shortcut tries to re-run the command.
|
||||
if len(command) and command[0] in ("python", "python3"):
|
||||
command[0] = sys.executable
|
||||
elif len(command) and command[0] in ("pip", "pip3"):
|
||||
command = [sys.executable, "-m", "pip", *command[1:]]
|
||||
if not silent:
|
||||
print(f"Running command: {join_command(command)}")
|
||||
if not dry:
|
||||
run_command(command, capture=capture)
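A small sketch of the interpreter-rewriting trick in run_commands(), standalone and using shlex in place of spaCy's split_command helper: "python"/"pip" prefixes are replaced so the command runs with the same interpreter and environment as the current process.

import shlex
import sys

def normalize(command_string):
    command = shlex.split(command_string)
    if command and command[0] in ("python", "python3"):
        # Use the interpreter that is running this process.
        command[0] = sys.executable
    elif command and command[0] in ("pip", "pip3"):
        # Run pip as a module of the current interpreter.
        command = [sys.executable, "-m", "pip", *command[1:]]
    return command

print(normalize("python scripts/train.py --gpu 0"))
print(normalize("pip install -r requirements.txt"))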
|
||||
|
||||
|
||||
def validate_subcommand(
|
||||
commands: Sequence[str], workflows: Sequence[str], subcommand: str
|
||||
) -> None:
|
||||
"""Check that a subcommand is valid and defined. Raises an error otherwise.
|
||||
|
||||
commands (Sequence[str]): The available commands.
|
||||
subcommand (str): The subcommand.
|
||||
"""
|
||||
if not commands and not workflows:
|
||||
msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
|
||||
if subcommand not in commands and subcommand not in workflows:
|
||||
help_msg = []
|
||||
if commands:
|
||||
help_msg.append(f"Available commands: {', '.join(commands)}")
|
||||
if workflows:
|
||||
help_msg.append(f"Available workflows: {', '.join(workflows)}")
|
||||
msg.fail(
|
||||
f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
|
||||
". ".join(help_msg),
|
||||
exits=1,
|
||||
)
|
||||
|
||||
|
||||
def check_rerun(
|
||||
project_dir: Path,
|
||||
command: Dict[str, Any],
|
||||
*,
|
||||
check_spacy_version: bool = True,
|
||||
check_spacy_commit: bool = False,
|
||||
) -> bool:
|
||||
"""Check if a command should be rerun because its settings or inputs/outputs
|
||||
changed.
|
||||
|
||||
project_dir (Path): The current project directory.
|
||||
command (Dict[str, Any]): The command, as defined in the project.yml.
|
||||
strict_version (bool):
|
||||
RETURNS (bool): Whether to re-run the command.
|
||||
"""
|
||||
# Always rerun if no-skip is set
|
||||
if command.get("no_skip", False):
|
||||
return True
|
||||
lock_path = project_dir / PROJECT_LOCK
|
||||
if not lock_path.exists(): # We don't have a lockfile, run command
|
||||
return True
|
||||
data = srsly.read_yaml(lock_path)
|
||||
if command["name"] not in data: # We don't have info about this command
|
||||
return True
|
||||
entry = data[command["name"]]
|
||||
# Always run commands with no outputs (otherwise they'd always be skipped)
|
||||
if not entry.get("outs", []):
|
||||
return True
|
||||
# Always rerun if spaCy version or commit hash changed
|
||||
spacy_v = entry.get("spacy_version")
|
||||
commit = entry.get("spacy_git_version")
|
||||
if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__):
|
||||
info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
|
||||
msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}")
|
||||
return True
|
||||
if check_spacy_commit and commit != GIT_VERSION:
|
||||
info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
|
||||
msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
|
||||
return True
|
||||
# If the entry in the lockfile matches the lockfile entry that would be
|
||||
# generated from the current command, we don't rerun because it means that
|
||||
# all inputs/outputs, hashes and scripts are the same and nothing changed
|
||||
lock_entry = get_lock_entry(project_dir, command)
|
||||
exclude = ["spacy_version", "spacy_git_version"]
|
||||
return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude)
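To make the skip/rerun decision above concrete, here is a self-contained sketch; hash_entry is a made-up stand-in for spaCy's get_hash. Two lockfile entries are compared with the version keys excluded, so only changes to the script, deps or outputs would trigger a rerun.

import hashlib
import json

def hash_entry(entry, exclude=("spacy_version", "spacy_git_version")):
    # Stable hash of the entry with the excluded keys dropped.
    filtered = {k: v for k, v in entry.items() if k not in exclude}
    return hashlib.md5(json.dumps(filtered, sort_keys=True).encode("utf8")).hexdigest()

old = {"cmd": "run train", "script": ["python train.py"], "deps": [], "outs": [],
       "spacy_version": "3.4.0", "spacy_git_version": "aaaa111"}
new = dict(old, spacy_version="3.5.0", spacy_git_version="bbbb222")

# Same content, only version metadata differs -> hashes match, no rerun needed.
print(hash_entry(old) == hash_entry(new))  # True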
|
||||
|
||||
|
||||
def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
|
||||
"""Update the lockfile after running a command. Will create a lockfile if
|
||||
it doesn't yet exist and will add an entry for the current command, its
|
||||
script and dependencies/outputs.
|
||||
|
||||
project_dir (Path): The current project directory.
|
||||
command (Dict[str, Any]): The command, as defined in the project.yml.
|
||||
"""
|
||||
lock_path = project_dir / PROJECT_LOCK
|
||||
if not lock_path.exists():
|
||||
srsly.write_yaml(lock_path, {})
|
||||
data = {}
|
||||
else:
|
||||
data = srsly.read_yaml(lock_path)
|
||||
data[command["name"]] = get_lock_entry(project_dir, command)
|
||||
srsly.write_yaml(lock_path, data)
|
||||
|
||||
|
||||
def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Get a lockfile entry for a given command. An entry includes the command,
|
||||
the script (command steps) and a list of dependencies and outputs with
|
||||
their paths and file hashes, if available. The format is based on the
|
||||
dvc.lock files, to keep things consistent.
|
||||
|
||||
project_dir (Path): The current project directory.
|
||||
command (Dict[str, Any]): The command, as defined in the project.yml.
|
||||
RETURNS (Dict[str, Any]): The lockfile entry.
|
||||
"""
|
||||
deps = get_fileinfo(project_dir, command.get("deps", []))
|
||||
outs = get_fileinfo(project_dir, command.get("outputs", []))
|
||||
outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []))
|
||||
return {
|
||||
"cmd": f"{COMMAND} run {command['name']}",
|
||||
"script": command["script"],
|
||||
"deps": deps,
|
||||
"outs": [*outs, *outs_nc],
|
||||
"spacy_version": about.__version__,
|
||||
"spacy_git_version": GIT_VERSION,
|
||||
}
|
||||
|
||||
|
||||
def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]:
|
||||
"""Generate the file information for a list of paths (dependencies, outputs).
|
||||
Includes the file path and the file's checksum.
|
||||
|
||||
project_dir (Path): The current project directory.
|
||||
paths (List[str]): The file paths.
|
||||
RETURNS (List[Dict[str, str]]): The lockfile entry for a file.
|
||||
"""
|
||||
data = []
|
||||
for path in paths:
|
||||
file_path = project_dir / path
|
||||
md5 = get_checksum(file_path) if file_path.exists() else None
|
||||
data.append({"path": path, "md5": md5})
|
||||
return data
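For reference, an entry produced by get_lock_entry() and stored by update_lockfile() has roughly the shape sketched below, keyed by command name; every value here is an invented placeholder, not real output.

# Illustrative only: shows the shape of a project.lock entry, not real hashes.
lock_data = {
    "train": {
        "cmd": "python -m spacy run train",
        "script": ["python -m spacy train configs/config.cfg -o training/"],
        "deps": [{"path": "corpus/train.spacy", "md5": "0f1e2d3c4b5a6978"}],
        "outs": [{"path": "training/model-best", "md5": None}],
        "spacy_version": "3.5.0",
        "spacy_git_version": "0123abc",
    }
}
# srsly.write_yaml("project.lock", lock_data) would persist it, as in update_lockfile().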
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
{# This is a template for training configs used for the quickstart widget in
|
||||
the docs and the init config command. It encodes various best practices and
|
||||
can help generate the best possible configuration, given a user's requirements. #}
|
||||
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
|
||||
{%- set use_transformer = hardware != "cpu" -%}
|
||||
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
||||
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
|
||||
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
|
||||
[paths]
|
||||
train = null
|
||||
dev = null
|
||||
|
@ -24,11 +24,8 @@ gpu_allocator = null
|
|||
lang = "{{ lang }}"
|
||||
{%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
|
||||
{%- set with_accuracy = optimize == "accuracy" -%}
|
||||
{# The BOW textcat doesn't need a source of features, so it can omit the
|
||||
tok2vec/transformer. #}
|
||||
{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
|
||||
{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
|
||||
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "span_finder" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
|
||||
{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
|
||||
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%}
|
||||
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
|
||||
{%- else -%}
|
||||
{%- set full_pipeline = components -%}
|
||||
|
@ -127,30 +124,6 @@ grad_factor = 1.0
|
|||
@layers = "reduce_mean.v1"
|
||||
{% endif -%}
|
||||
|
||||
{% if "span_finder" in components -%}
|
||||
[components.span_finder]
|
||||
factory = "span_finder"
|
||||
max_length = 25
|
||||
min_length = null
|
||||
scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
|
||||
spans_key = "sc"
|
||||
threshold = 0.5
|
||||
|
||||
[components.span_finder.model]
|
||||
@architectures = "spacy.SpanFinder.v1"
|
||||
|
||||
[components.span_finder.model.scorer]
|
||||
@layers = "spacy.LinearLogistic.v1"
|
||||
nO = 2
|
||||
|
||||
[components.span_finder.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.span_finder.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
{% endif -%}
|
||||
|
||||
{% if "spancat" in components -%}
|
||||
[components.spancat]
|
||||
factory = "spancat"
|
||||
|
@ -183,36 +156,6 @@ grad_factor = 1.0
|
|||
sizes = [1,2,3]
|
||||
{% endif -%}
|
||||
|
||||
{% if "spancat_singlelabel" in components %}
|
||||
[components.spancat_singlelabel]
|
||||
factory = "spancat_singlelabel"
|
||||
negative_weight = 1.0
|
||||
allow_overlap = true
|
||||
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
|
||||
spans_key = "sc"
|
||||
|
||||
[components.spancat_singlelabel.model]
|
||||
@architectures = "spacy.SpanCategorizer.v1"
|
||||
|
||||
[components.spancat_singlelabel.model.reducer]
|
||||
@layers = "spacy.mean_max_reducer.v1"
|
||||
hidden_size = 128
|
||||
|
||||
[components.spancat_singlelabel.model.scorer]
|
||||
@layers = "Softmax.v2"
|
||||
|
||||
[components.spancat_singlelabel.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.spancat_singlelabel.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
|
||||
[components.spancat_singlelabel.suggester]
|
||||
@misc = "spacy.ngram_suggester.v1"
|
||||
sizes = [1,2,3]
|
||||
{% endif %}
|
||||
|
||||
{% if "trainable_lemmatizer" in components -%}
|
||||
[components.trainable_lemmatizer]
|
||||
factory = "trainable_lemmatizer"
|
||||
|
@ -271,24 +214,17 @@ grad_factor = 1.0
|
|||
@layers = "reduce_mean.v1"
|
||||
|
||||
[components.textcat.model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v3"
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
exclusive_classes = true
|
||||
length = 262144
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
|
||||
{% else -%}
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatCNN.v2"
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
exclusive_classes = true
|
||||
nO = null
|
||||
|
||||
[components.textcat.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.textcat.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
|
||||
|
@ -309,24 +245,17 @@ grad_factor = 1.0
|
|||
@layers = "reduce_mean.v1"
|
||||
|
||||
[components.textcat_multilabel.model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v3"
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
exclusive_classes = false
|
||||
length = 262144
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
|
||||
{% else -%}
|
||||
[components.textcat_multilabel.model]
|
||||
@architectures = "spacy.TextCatCNN.v2"
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
exclusive_classes = false
|
||||
nO = null
|
||||
|
||||
[components.textcat_multilabel.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.textcat_multilabel.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
|
||||
|
@ -342,8 +271,13 @@ factory = "tok2vec"
|
|||
[components.tok2vec.model.embed]
|
||||
@architectures = "spacy.MultiHashEmbed.v2"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
{% if has_letters -%}
|
||||
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
|
||||
rows = [5000, 1000, 2500, 2500]
|
||||
rows = [5000, 2500, 2500, 2500]
|
||||
{% else -%}
|
||||
attrs = ["ORTH", "SHAPE"]
|
||||
rows = [5000, 2500]
|
||||
{% endif -%}
|
||||
include_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
|
||||
|
||||
[components.tok2vec.model.encode]
|
||||
|
@ -357,7 +291,6 @@ maxout_pieces = 3
|
|||
{% if "morphologizer" in components %}
|
||||
[components.morphologizer]
|
||||
factory = "morphologizer"
|
||||
label_smoothing = 0.05
|
||||
|
||||
[components.morphologizer.model]
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
|
@ -371,7 +304,6 @@ width = ${components.tok2vec.model.encode.width}
|
|||
{% if "tagger" in components %}
|
||||
[components.tagger]
|
||||
factory = "tagger"
|
||||
label_smoothing = 0.05
|
||||
|
||||
[components.tagger.model]
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
|
@ -418,27 +350,6 @@ nO = null
|
|||
width = ${components.tok2vec.model.encode.width}
|
||||
{% endif %}
|
||||
|
||||
{% if "span_finder" in components %}
|
||||
[components.span_finder]
|
||||
factory = "span_finder"
|
||||
max_length = 25
|
||||
min_length = null
|
||||
scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
|
||||
spans_key = "sc"
|
||||
threshold = 0.5
|
||||
|
||||
[components.span_finder.model]
|
||||
@architectures = "spacy.SpanFinder.v1"
|
||||
|
||||
[components.span_finder.model.scorer]
|
||||
@layers = "spacy.LinearLogistic.v1"
|
||||
nO = 2
|
||||
|
||||
[components.span_finder.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
{% endif %}
|
||||
|
||||
{% if "spancat" in components %}
|
||||
[components.spancat]
|
||||
factory = "spancat"
|
||||
|
@ -468,33 +379,6 @@ width = ${components.tok2vec.model.encode.width}
|
|||
sizes = [1,2,3]
|
||||
{% endif %}
|
||||
|
||||
{% if "spancat_singlelabel" in components %}
|
||||
[components.spancat_singlelabel]
|
||||
factory = "spancat_singlelabel"
|
||||
negative_weight = 1.0
|
||||
allow_overlap = true
|
||||
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
|
||||
spans_key = "sc"
|
||||
|
||||
[components.spancat_singlelabel.model]
|
||||
@architectures = "spacy.SpanCategorizer.v1"
|
||||
|
||||
[components.spancat_singlelabel.model.reducer]
|
||||
@layers = "spacy.mean_max_reducer.v1"
|
||||
hidden_size = 128
|
||||
|
||||
[components.spancat_singlelabel.model.scorer]
|
||||
@layers = "Softmax.v2"
|
||||
|
||||
[components.spancat_singlelabel.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
|
||||
[components.spancat_singlelabel.suggester]
|
||||
@misc = "spacy.ngram_suggester.v1"
|
||||
sizes = [1,2,3]
|
||||
{% endif %}
|
||||
|
||||
{% if "trainable_lemmatizer" in components -%}
|
||||
[components.trainable_lemmatizer]
|
||||
factory = "trainable_lemmatizer"
|
||||
|
@ -544,15 +428,14 @@ nO = null
|
|||
width = ${components.tok2vec.model.encode.width}
|
||||
|
||||
[components.textcat.model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v3"
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
exclusive_classes = true
|
||||
length = 262144
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
|
||||
{% else -%}
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatBOW.v3"
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
exclusive_classes = true
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
|
@ -573,17 +456,15 @@ nO = null
|
|||
width = ${components.tok2vec.model.encode.width}
|
||||
|
||||
[components.textcat_multilabel.model.linear_model]
|
||||
@architectures = "spacy.TextCatBOW.v3"
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
exclusive_classes = false
|
||||
length = 262144
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
|
||||
{% else -%}
|
||||
[components.textcat_multilabel.model]
|
||||
@architectures = "spacy.TextCatBOW.v3"
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
exclusive_classes = false
|
||||
length = 262144
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
{%- endif %}
|
||||
|
|
|
@ -37,15 +37,6 @@ bn:
|
|||
accuracy:
|
||||
name: sagorsarker/bangla-bert-base
|
||||
size_factor: 3
|
||||
ca:
|
||||
word_vectors: null
|
||||
transformer:
|
||||
efficiency:
|
||||
name: projecte-aina/roberta-base-ca-v2
|
||||
size_factor: 3
|
||||
accuracy:
|
||||
name: projecte-aina/roberta-base-ca-v2
|
||||
size_factor: 3
|
||||
da:
|
||||
word_vectors: da_core_news_lg
|
||||
transformer:
|
||||
|
@ -280,3 +271,4 @@ zh:
|
|||
accuracy:
|
||||
name: bert-base-chinese
|
||||
size_factor: 3
|
||||
has_letters: false
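These per-language recommendations are plain YAML. A minimal sketch of how such a mapping could be consumed is shown below; the dict literal stands in for the loaded file and the lookup helper is illustrative, not spaCy's own code.

# Stand-in for the loaded recommendations YAML above (zh entry copied from the data).
recommendations = {
    "zh": {
        "word_vectors": None,
        "transformer": {
            "efficiency": {"name": "bert-base-chinese", "size_factor": 3},
            "accuracy": {"name": "bert-base-chinese", "size_factor": 3},
        },
        "has_letters": False,
    }
}

def pick_transformer(lang, optimize="efficiency"):
    entry = recommendations.get(lang, {})
    transformer = entry.get("transformer") or {}
    return transformer.get(optimize, {}).get("name")

print(pick_transformer("zh", "accuracy"))  # bert-base-chinese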
|
||||
|
|
|
@ -1,23 +1,15 @@
|
|||
from typing import Optional, Dict, Any, Union
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
import typer
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional, Union
|
||||
|
||||
import typer
|
||||
from wasabi import msg
|
||||
|
||||
from .. import util
|
||||
from ..training.initialize import init_nlp
|
||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
||||
from ._util import import_code, setup_gpu
|
||||
from ..training.loop import train as train_nlp
|
||||
from ._util import (
|
||||
Arg,
|
||||
Opt,
|
||||
app,
|
||||
import_code,
|
||||
parse_config_overrides,
|
||||
setup_gpu,
|
||||
show_validation_error,
|
||||
)
|
||||
from ..training.initialize import init_nlp
|
||||
from .. import util
|
||||
|
||||
|
||||
@app.command(
|
||||
|
@ -47,8 +39,7 @@ def train_cli(
|
|||
|
||||
DOCS: https://spacy.io/api/cli#train
|
||||
"""
|
||||
if verbose:
|
||||
util.logger.setLevel(logging.DEBUG)
|
||||
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
import_code(code_path)
|
||||
train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
|
||||
|
|
|
@ -1,21 +1,14 @@
|
|||
import sys
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import Tuple
|
||||
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import requests
|
||||
from wasabi import Printer, msg
|
||||
from wasabi import msg, Printer
|
||||
import warnings
|
||||
|
||||
from .. import about
|
||||
from ..util import (
|
||||
get_installed_models,
|
||||
get_minor_version,
|
||||
get_model_meta,
|
||||
get_package_path,
|
||||
get_package_version,
|
||||
is_compatible_version,
|
||||
)
|
||||
from ._util import app
|
||||
from .. import about
|
||||
from ..util import get_package_version, get_installed_models, get_minor_version
|
||||
from ..util import get_package_path, get_model_meta, is_compatible_version
|
||||
|
||||
|
||||
@app.command("validate")
|
||||
|
|
|
@ -1,6 +1,5 @@
"""Helpers for Python and platform compatibility."""
import sys

from thinc.util import copy_array

try:
@ -26,9 +26,6 @@ batch_size = 1000
|
|||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.Tokenizer.v1"
|
||||
|
||||
[nlp.vectors]
|
||||
@vectors = "spacy.Vectors.v1"
|
||||
|
||||
# The pipeline components and their models
|
||||
[components]
|
||||
|
||||
|
@ -93,8 +90,6 @@ dev_corpus = "corpora.dev"
|
|||
train_corpus = "corpora.train"
|
||||
# Optional callback before nlp object is saved to disk after training
|
||||
before_to_disk = null
|
||||
# Optional callback that is invoked at the start of each training step
|
||||
before_update = null
|
||||
|
||||
[training.logger]
|
||||
@loggers = "spacy.ConsoleLogger.v1"
|
||||
|
|
|
@ -4,13 +4,14 @@ spaCy's built in visualization suite for dependencies and named entities.
|
|||
DOCS: https://spacy.io/api/top-level#displacy
|
||||
USAGE: https://spacy.io/usage/visualizers
|
||||
"""
|
||||
from typing import Union, Iterable, Optional, Dict, Any, Callable
|
||||
import warnings
|
||||
from typing import Any, Callable, Dict, Iterable, Optional, Union
|
||||
|
||||
from ..errors import Errors, Warnings
|
||||
from ..tokens import Doc, Span
|
||||
from ..util import find_available_port, is_in_jupyter
|
||||
from .render import DependencyRenderer, EntityRenderer, SpanRenderer
|
||||
from ..tokens import Doc, Span
|
||||
from ..errors import Errors, Warnings
|
||||
from ..util import is_in_jupyter
|
||||
|
||||
|
||||
_html = {}
|
||||
RENDER_WRAPPER = None
|
||||
|
@ -35,7 +36,7 @@ def render(
|
|||
jupyter (bool): Override Jupyter auto-detection.
|
||||
options (dict): Visualiser-specific options, e.g. colors.
|
||||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||
RETURNS (str): Rendered SVG or HTML markup.
|
||||
RETURNS (str): Rendered HTML markup.
|
||||
|
||||
DOCS: https://spacy.io/api/top-level#displacy.render
|
||||
USAGE: https://spacy.io/usage/visualizers
|
||||
|
@ -66,7 +67,7 @@ def render(
|
|||
if jupyter or (jupyter is None and is_in_jupyter()):
|
||||
# return HTML rendered by IPython display()
|
||||
# See #4840 for details on span wrapper to disable mathjax
|
||||
from IPython.core.display import HTML, display
|
||||
from IPython.core.display import display, HTML
|
||||
|
||||
return display(HTML('<span class="tex2jax_ignore">{}</span>'.format(html)))
|
||||
return html
|
||||
|
@ -81,7 +82,6 @@ def serve(
|
|||
manual: bool = False,
|
||||
port: int = 5000,
|
||||
host: str = "0.0.0.0",
|
||||
auto_select_port: bool = False,
|
||||
) -> None:
|
||||
"""Serve displaCy visualisation.
|
||||
|
||||
|
@ -93,15 +93,12 @@ def serve(
|
|||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||
port (int): Port to serve visualisation.
|
||||
host (str): Host to serve visualisation.
|
||||
auto_select_port (bool): Automatically select a port if the specified port is in use.
|
||||
|
||||
DOCS: https://spacy.io/api/top-level#displacy.serve
|
||||
USAGE: https://spacy.io/usage/visualizers
|
||||
"""
|
||||
from wsgiref import simple_server
|
||||
|
||||
port = find_available_port(port, host, auto_select_port)
|
||||
|
||||
if is_in_jupyter():
|
||||
warnings.warn(Warnings.W011)
|
||||
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
|
||||
|
@ -123,17 +120,12 @@ def app(environ, start_response):
|
|||
return [res]
|
||||
|
||||
|
||||
def parse_deps(
|
||||
orig_doc: Union[Doc, Span], options: Dict[str, Any] = {}
|
||||
) -> Dict[str, Any]:
|
||||
def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
||||
"""Generate dependency parse in {'words': [], 'arcs': []} format.
|
||||
|
||||
orig_doc (Union[Doc, Span]): Document to parse.
|
||||
options (Dict[str, Any]): Dependency parse specific visualisation options.
|
||||
doc (Doc): Document do parse.
|
||||
RETURNS (dict): Generated dependency parse keyed by words and arcs.
|
||||
"""
|
||||
if isinstance(orig_doc, Span):
|
||||
orig_doc = orig_doc.as_doc()
|
||||
doc = Doc(orig_doc.vocab).from_bytes(
|
||||
orig_doc.to_bytes(exclude=["user_data", "user_hooks"])
|
||||
)
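Since parse_deps() ultimately produces the {'words': [...], 'arcs': [...]} structure, the same structure can also be fed back in by hand; this is standard displaCy manual usage (spaCy must be installed, the parse below is invented).

from spacy import displacy

# Hand-built parse in the format parse_deps() returns.
parsed = {
    "words": [{"text": "This", "tag": "DT"}, {"text": "works", "tag": "VBZ"}],
    "arcs": [{"start": 0, "end": 1, "label": "nsubj", "dir": "left"}],
}
html = displacy.render(parsed, style="dep", manual=True)  # returns the SVG markup as a str
print(html[:60])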
|
||||
|
@ -217,7 +209,7 @@ def parse_ents(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
|||
|
||||
|
||||
def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
||||
"""Generate spans in [{start_token: i, end_token: i, label: 'label'}] format.
|
||||
"""Generate spans in [{start: i, end: i, label: 'label'}] format.
|
||||
|
||||
doc (Doc): Document to parse.
|
||||
options (Dict[str, any]): Span-specific visualisation options.
|
||||
|
|
|
@ -1,28 +1,15 @@
|
|||
import uuid
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
import uuid
|
||||
import itertools
|
||||
|
||||
from ..errors import Errors
|
||||
from ..util import escape_html, minify_html, registry
|
||||
from .templates import (
|
||||
TPL_DEP_ARCS,
|
||||
TPL_DEP_SVG,
|
||||
TPL_DEP_WORDS,
|
||||
TPL_DEP_WORDS_LEMMA,
|
||||
TPL_ENT,
|
||||
TPL_ENT_RTL,
|
||||
TPL_ENTS,
|
||||
TPL_FIGURE,
|
||||
TPL_KB_LINK,
|
||||
TPL_PAGE,
|
||||
TPL_SPAN,
|
||||
TPL_SPAN_RTL,
|
||||
TPL_SPAN_SLICE,
|
||||
TPL_SPAN_SLICE_RTL,
|
||||
TPL_SPAN_START,
|
||||
TPL_SPAN_START_RTL,
|
||||
TPL_SPANS,
|
||||
TPL_TITLE,
|
||||
)
|
||||
from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS
|
||||
from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS
|
||||
from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN
|
||||
from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL
|
||||
from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS
|
||||
from .templates import TPL_TITLE
|
||||
|
||||
DEFAULT_LANG = "en"
|
||||
DEFAULT_DIR = "ltr"
|
||||
|
@ -77,11 +64,8 @@ class SpanRenderer:
|
|||
# Set up how the text and labels will be rendered
|
||||
self.direction = DEFAULT_DIR
|
||||
self.lang = DEFAULT_LANG
|
||||
# These values are in px
|
||||
self.top_offset = options.get("top_offset", 40)
|
||||
# This is how far under the top offset the span labels appear
|
||||
self.span_label_offset = options.get("span_label_offset", 20)
|
||||
self.offset_step = options.get("top_offset_step", 17)
|
||||
self.top_offset_step = options.get("top_offset_step", 17)
|
||||
|
||||
# Set up which templates will be used
|
||||
template = options.get("template")
|
||||
|
@ -107,7 +91,7 @@ class SpanRenderer:
|
|||
parsed (list): Dependency parses to render.
|
||||
page (bool): Render parses wrapped as full HTML page.
|
||||
minify (bool): Minify HTML markup.
|
||||
RETURNS (str): Rendered SVG or HTML markup.
|
||||
RETURNS (str): Rendered HTML markup.
|
||||
"""
|
||||
rendered = []
|
||||
for i, p in enumerate(parsed):
|
||||
|
@ -142,106 +126,43 @@ class SpanRenderer:
|
|||
spans (list): Individual entity spans and their start, end, label, kb_id and kb_url.
|
||||
title (str / None): Document title set in Doc.user_data['title'].
|
||||
"""
|
||||
per_token_info = self._assemble_per_token_info(tokens, spans)
|
||||
markup = self._render_markup(per_token_info)
|
||||
markup = TPL_SPANS.format(content=markup, dir=self.direction)
|
||||
if title:
|
||||
markup = TPL_TITLE.format(title=title) + markup
|
||||
return markup
|
||||
|
||||
@staticmethod
|
||||
def _assemble_per_token_info(
|
||||
tokens: List[str], spans: List[Dict[str, Any]]
|
||||
) -> List[Dict[str, List[Dict[str, Any]]]]:
|
||||
"""Assembles token info used to generate markup in render_spans().
|
||||
tokens (List[str]): Tokens in text.
|
||||
spans (List[Dict[str, Any]]): Spans in text.
|
||||
RETURNS (List[Dict[str, List[Dict, str, Any]]]): Per token info needed to render HTML markup for given tokens
|
||||
and spans.
|
||||
"""
|
||||
per_token_info: List[Dict[str, List[Dict[str, Any]]]] = []
|
||||
|
||||
# we must sort so that we can correctly describe when spans need to "stack"
|
||||
# which is determined by their start token, then span length (longer spans on top),
|
||||
# then break any remaining ties with the span label
|
||||
spans = sorted(
|
||||
spans,
|
||||
key=lambda s: (
|
||||
s["start_token"],
|
||||
-(s["end_token"] - s["start_token"]),
|
||||
s["label"],
|
||||
),
|
||||
)
|
||||
|
||||
for s in spans:
|
||||
# this is the vertical 'slot' that the span will be rendered in
|
||||
# vertical_position = span_label_offset + (offset_step * (slot - 1))
|
||||
s["render_slot"] = 0
|
||||
|
||||
per_token_info = []
|
||||
for idx, token in enumerate(tokens):
|
||||
# Identify if a token belongs to a Span (and which) and if it's a
|
||||
# start token of said Span. We'll use this for the final HTML render
|
||||
token_markup: Dict[str, Any] = {}
|
||||
token_markup["text"] = token
|
||||
intersecting_spans: List[Dict[str, Any]] = []
|
||||
entities = []
|
||||
for span in spans:
|
||||
ent = {}
|
||||
if span["start_token"] <= idx < span["end_token"]:
|
||||
span_start = idx == span["start_token"]
|
||||
ent["label"] = span["label"]
|
||||
ent["is_start"] = span_start
|
||||
if span_start:
|
||||
# When the span starts, we need to know how many other
|
||||
# spans are on the 'span stack' and will be rendered.
|
||||
# This value becomes the vertical render slot for this entire span
|
||||
span["render_slot"] = (
|
||||
intersecting_spans[-1]["render_slot"]
|
||||
if len(intersecting_spans)
|
||||
else 0
|
||||
) + 1
|
||||
intersecting_spans.append(span)
|
||||
ent["render_slot"] = span["render_slot"]
|
||||
ent["is_start"] = True if idx == span["start_token"] else False
|
||||
kb_id = span.get("kb_id", "")
|
||||
kb_url = span.get("kb_url", "#")
|
||||
ent["kb_link"] = (
|
||||
TPL_KB_LINK.format(kb_id=kb_id, kb_url=kb_url) if kb_id else ""
|
||||
)
|
||||
entities.append(ent)
|
||||
else:
|
||||
# We don't specifically need to do this since we loop
|
||||
# over tokens and spans sorted by their start_token,
|
||||
# so we'll never use a span again after the last token it appears in,
|
||||
# but if we were to use these spans again we'd want to make sure
|
||||
# this value was reset correctly.
|
||||
span["render_slot"] = 0
|
||||
token_markup["entities"] = entities
|
||||
per_token_info.append(token_markup)
|
||||
|
||||
return per_token_info
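The slot bookkeeping above can be illustrated with a standalone sketch, with no spaCy imports: spans are sorted by start token, then length, then label, and each span takes the slot one above whatever span is still open when it starts.

def assign_render_slots(spans):
    # spans: list of dicts with start_token, end_token, label.
    spans = sorted(
        spans,
        key=lambda s: (s["start_token"], -(s["end_token"] - s["start_token"]), s["label"]),
    )
    open_spans = []
    for span in spans:
        # Drop spans that ended before this one starts.
        open_spans = [s for s in open_spans if s["end_token"] > span["start_token"]]
        span["render_slot"] = (open_spans[-1]["render_slot"] if open_spans else 0) + 1
        open_spans.append(span)
    return spans

spans = [
    {"start_token": 0, "end_token": 3, "label": "A"},
    {"start_token": 1, "end_token": 2, "label": "B"},
    {"start_token": 4, "end_token": 5, "label": "C"},
]
for s in assign_render_slots(spans):
    print(s["label"], s["render_slot"])  # A 1, B 2, C 1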
|
||||
markup = self._render_markup(per_token_info)
|
||||
markup = TPL_SPANS.format(content=markup, dir=self.direction)
|
||||
if title:
|
||||
markup = TPL_TITLE.format(title=title) + markup
|
||||
return markup
|
||||
|
||||
def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str:
|
||||
"""Render the markup from per-token information"""
|
||||
markup = ""
|
||||
for token in per_token_info:
|
||||
entities = sorted(token["entities"], key=lambda d: d["render_slot"])
|
||||
# Whitespace tokens disrupt the vertical space (no line height) so that the
|
||||
# span indicators get misaligned. We don't render them as individual
|
||||
# tokens anyway, so we'll just not display a span indicator either.
|
||||
is_whitespace = token["text"].strip() == ""
|
||||
if entities and not is_whitespace:
|
||||
entities = sorted(token["entities"], key=lambda d: d["label"])
|
||||
if entities:
|
||||
slices = self._get_span_slices(token["entities"])
|
||||
starts = self._get_span_starts(token["entities"])
|
||||
total_height = (
|
||||
self.top_offset
|
||||
+ self.span_label_offset
|
||||
+ (self.offset_step * (len(entities) - 1))
|
||||
)
|
||||
markup += self.span_template.format(
|
||||
text=escape_html(token["text"]),
|
||||
span_slices=slices,
|
||||
span_starts=starts,
|
||||
total_height=total_height,
|
||||
text=token["text"], span_slices=slices, span_starts=starts
|
||||
)
|
||||
else:
|
||||
markup += escape_html(token["text"] + " ")
|
||||
|
@ -250,18 +171,10 @@ class SpanRenderer:
|
|||
def _get_span_slices(self, entities: List[Dict]) -> str:
|
||||
"""Get the rendered markup of all Span slices"""
|
||||
span_slices = []
|
||||
for entity in entities:
|
||||
# rather than iterate over multiples of offset_step, we use entity['render_slot']
|
||||
# to determine the vertical position, since that tells where
|
||||
# the span starts vertically so we can extend it horizontally,
|
||||
# past other spans that might have already ended
|
||||
for entity, step in zip(entities, itertools.count(step=self.top_offset_step)):
|
||||
color = self.colors.get(entity["label"].upper(), self.default_color)
|
||||
top_offset = self.top_offset + (
|
||||
self.offset_step * (entity["render_slot"] - 1)
|
||||
)
|
||||
span_slice = self.span_slice_template.format(
|
||||
bg=color,
|
||||
top_offset=top_offset,
|
||||
bg=color, top_offset=self.top_offset + step
|
||||
)
|
||||
span_slices.append(span_slice)
|
||||
return "".join(span_slices)
|
||||
|
@ -269,15 +182,12 @@ class SpanRenderer:
|
|||
def _get_span_starts(self, entities: List[Dict]) -> str:
|
||||
"""Get the rendered markup of all Span start tokens"""
|
||||
span_starts = []
|
||||
for entity in entities:
|
||||
for entity, step in zip(entities, itertools.count(step=self.top_offset_step)):
|
||||
color = self.colors.get(entity["label"].upper(), self.default_color)
|
||||
top_offset = self.top_offset + (
|
||||
self.offset_step * (entity["render_slot"] - 1)
|
||||
)
|
||||
span_start = (
|
||||
self.span_start_template.format(
|
||||
bg=color,
|
||||
top_offset=top_offset,
|
||||
top_offset=self.top_offset + step,
|
||||
label=entity["label"],
|
||||
kb_link=entity["kb_link"],
|
||||
)
|
||||
|
@ -334,8 +244,6 @@ class DependencyRenderer:
|
|||
self.lang = settings.get("lang", DEFAULT_LANG)
|
||||
render_id = f"{id_prefix}-{i}"
|
||||
svg = self.render_svg(render_id, p["words"], p["arcs"])
|
||||
if p.get("title"):
|
||||
svg = TPL_TITLE.format(title=p.get("title")) + svg
|
||||
rendered.append(svg)
|
||||
if page:
|
||||
content = "".join([TPL_FIGURE.format(content=svg) for svg in rendered])
|
||||
|
@ -546,7 +454,7 @@ class EntityRenderer:
|
|||
parsed (list): Dependency parses to render.
|
||||
page (bool): Render parses wrapped as full HTML page.
|
||||
minify (bool): Minify HTML markup.
|
||||
RETURNS (str): Rendered SVG or HTML markup.
|
||||
RETURNS (str): Rendered HTML markup.
|
||||
"""
|
||||
rendered = []
|
||||
for i, p in enumerate(parsed):
|
||||
|
@ -588,7 +496,7 @@ class EntityRenderer:
|
|||
for i, fragment in enumerate(fragments):
|
||||
markup += escape_html(fragment)
|
||||
if len(fragments) > 1 and i != len(fragments) - 1:
|
||||
markup += "<br>"
|
||||
markup += "</br>"
|
||||
if self.ents is None or label.upper() in self.ents:
|
||||
color = self.colors.get(label.upper(), self.default_color)
|
||||
ent_settings = {
|
||||
|
@ -606,7 +514,7 @@ class EntityRenderer:
|
|||
for i, fragment in enumerate(fragments):
|
||||
markup += escape_html(fragment)
|
||||
if len(fragments) > 1 and i != len(fragments) - 1:
|
||||
markup += "<br>"
|
||||
markup += "</br>"
|
||||
markup = TPL_ENTS.format(content=markup, dir=self.direction)
|
||||
if title:
|
||||
markup = TPL_TITLE.format(title=title) + markup
|
||||
|
|
|
@ -67,7 +67,7 @@ TPL_SPANS = """
|
|||
"""
|
||||
|
||||
TPL_SPAN = """
|
||||
<span style="font-weight: bold; display: inline-block; position: relative; height: {total_height}px;">
|
||||
<span style="font-weight: bold; display: inline-block; position: relative;">
|
||||
{text}
|
||||
{span_slices}
|
||||
{span_starts}
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
import warnings
|
||||
|
||||
from .compat import Literal
|
||||
|
||||
|
||||
class ErrorsWithCodes(type):
|
||||
def __getattribute__(self, code):
|
||||
|
@ -17,8 +15,8 @@ def setup_default_warnings():
|
|||
filter_warning("ignore", error_msg="numpy.dtype size changed") # noqa
|
||||
filter_warning("ignore", error_msg="numpy.ufunc size changed") # noqa
|
||||
|
||||
# warn about entity_ruler, span_ruler & matcher having no patterns only once
|
||||
for pipe in ["matcher", "entity_ruler", "span_ruler"]:
|
||||
# warn about entity_ruler & matcher having no patterns only once
|
||||
for pipe in ["matcher", "entity_ruler"]:
|
||||
filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
|
||||
|
||||
# warn once about lemmatizer without required POS
|
||||
|
@ -28,10 +26,7 @@ def setup_default_warnings():
|
|||
filter_warning("once", error_msg="[W114]")
|
||||
|
||||
|
||||
def filter_warning(
|
||||
action: Literal["default", "error", "ignore", "always", "module", "once"],
|
||||
error_msg: str,
|
||||
):
|
||||
def filter_warning(action: str, error_msg: str):
|
||||
"""Customize how spaCy should handle a certain warning.
|
||||
|
||||
error_msg (str): e.g. "W006", or a full error message
|
||||
|
@ -210,17 +205,6 @@ class Warnings(metaclass=ErrorsWithCodes):
|
|||
"Only the last span group will be loaded under "
|
||||
"Doc.spans['{group_name}']. Skipping span group with values: "
|
||||
"{group_values}")
|
||||
W121 = ("Attempting to trace non-existent method '{method}' in pipe '{pipe}'")
|
||||
W122 = ("Couldn't trace method '{method}' in pipe '{pipe}'. This can happen if the pipe class "
|
||||
"is a Cython extension type.")
|
||||
W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
|
||||
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
|
||||
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
|
||||
W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
|
||||
"key attribute for vectors, configure it through Vectors(attr=) or "
|
||||
"'spacy init vectors --attr'")
|
||||
W126 = ("These keys are unsupported: {unsupported}")
|
||||
W127 = ("Not all `Language.pipe` worker processes completed successfully")
|
||||
|
||||
|
||||
class Errors(metaclass=ErrorsWithCodes):
|
||||
|
@ -228,6 +212,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
|
||||
"This usually happens when spaCy calls `nlp.{method}` with a custom "
|
||||
"component name that's not registered on the current language class. "
|
||||
"If you're using a Transformer, make sure to install 'spacy-transformers'. "
|
||||
"If you're using a custom component, make sure you've added the "
|
||||
"decorator `@Language.component` (for function components) or "
|
||||
"`@Language.factory` (for class components).\n\nAvailable "
|
||||
|
@ -238,9 +223,8 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"initialized component.")
|
||||
E004 = ("Can't set up pipeline component: a factory for '{name}' already "
|
||||
"exists. Existing factory: {func}. New factory: {new_func}")
|
||||
E005 = ("Pipeline component '{name}' returned {returned_type} instead of a "
|
||||
"Doc. If you're using a custom component, maybe you forgot to "
|
||||
"return the processed Doc?")
|
||||
E005 = ("Pipeline component '{name}' returned None. If you're using a "
|
||||
"custom component, maybe you forgot to return the processed Doc?")
|
||||
E006 = ("Invalid constraints for adding pipeline component. You can only "
|
||||
"set one of the following: before (component name or index), "
|
||||
"after (component name or index), first (True) or last (True). "
|
||||
|
@ -403,7 +387,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"consider using doc.spans instead.")
|
||||
E106 = ("Can't find `doc._.{attr}` attribute specified in the underscore "
|
||||
"settings: {opts}")
|
||||
E107 = ("Value of custom attribute `{attr}` is not JSON-serializable: {value}")
|
||||
E107 = ("Value of `doc._.{attr}` is not JSON-serializable: {value}")
|
||||
E109 = ("Component '{name}' could not be run. Did you forget to "
|
||||
"call `initialize()`?")
|
||||
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
|
||||
|
@ -449,7 +433,8 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E133 = ("The sum of prior probabilities for alias '{alias}' should not "
|
||||
"exceed 1, but found {sum}.")
|
||||
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
|
||||
E139 = ("Knowledge base for component '{name}' is empty.")
|
||||
E139 = ("Knowledge base for component '{name}' is empty. Use the methods "
|
||||
"`kb.add_entity` and `kb.add_alias` to add entries.")
|
||||
E140 = ("The list of entities, prior probabilities and entity vectors "
|
||||
"should be of equal length.")
|
||||
E141 = ("Entity vectors should be of length {required} instead of the "
|
||||
|
@ -548,20 +533,11 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E198 = ("Unable to return {n} most similar vectors for the current vectors "
|
||||
"table, which contains {n_rows} vectors.")
|
||||
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
|
||||
E200 = ("Can't set {attr} from Span.")
|
||||
E200 = ("Can't yet set {attr} from Span. Vote for this feature on the "
|
||||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
|
||||
E203 = ("If the {name} embedding layer is not updated "
|
||||
"during training, make sure to include it in 'annotating components'")
|
||||
|
||||
# New errors added in v3.x
|
||||
E849 = ("The vocab only supports {method} for vectors of type "
|
||||
"spacy.vectors.Vectors, not {vectors_type}.")
|
||||
E850 = ("The PretrainVectors objective currently only supports default or "
|
||||
"floret vectors, not {mode} vectors.")
|
||||
E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
|
||||
"but found value of '{val}'.")
|
||||
E853 = ("Unsupported component factory name '{name}'. The character '.' is "
|
||||
"not permitted in factory names.")
|
||||
E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
|
||||
"permit overlapping spans.")
|
||||
E855 = ("Invalid {obj}: {obj} is not from the same doc.")
|
||||
|
@ -727,11 +703,11 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"need to modify the pipeline, use the built-in methods like "
|
||||
"`nlp.add_pipe`, `nlp.remove_pipe`, `nlp.disable_pipe` or "
|
||||
"`nlp.enable_pipe` instead.")
|
||||
E927 = ("Can't write to frozen list. Maybe you're trying to modify a computed "
|
||||
E927 = ("Can't write to frozen list Maybe you're trying to modify a computed "
|
||||
"property or default function argument?")
|
||||
E928 = ("An InMemoryLookupKB can only be serialized to/from from a directory, "
|
||||
E928 = ("A KnowledgeBase can only be serialized to/from from a directory, "
|
||||
"but the provided argument {loc} points to a file.")
|
||||
E929 = ("Couldn't read InMemoryLookupKB from {loc}. The path does not seem to exist.")
|
||||
E929 = ("Couldn't read KnowledgeBase from {loc}. The path does not seem to exist.")
|
||||
E930 = ("Received invalid get_examples callback in `{method}`. "
|
||||
"Expected function that returns an iterable of Example objects but "
|
||||
"got: {obj}")
|
||||
|
@ -743,8 +719,8 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"model from a shortcut, which is obsolete as of spaCy v3.0. To "
|
||||
"load the model, use its full name instead:\n\n"
|
||||
"nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
|
||||
"models, see the models directory: https://spacy.io/models and if "
|
||||
"you want to create a blank model, use spacy.blank: "
|
||||
"models, see the models directory: https://spacy.io/models. If you "
|
||||
"want to create a blank model, use spacy.blank: "
|
||||
"nlp = spacy.blank(\"{name}\")")
|
||||
E942 = ("Executing `after_{name}` callback failed. Expected the function to "
|
||||
"return an initialized nlp object but got: {value}. Maybe "
|
||||
|
@ -957,37 +933,6 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E1040 = ("Doc.from_json requires all tokens to have the same attributes. "
|
||||
"Some tokens do not contain annotation for: {partial_attrs}")
|
||||
E1041 = ("Expected a string, Doc, or bytes as input, but got: {type}")
|
||||
E1042 = ("`enable={enable}` and `disable={disable}` are inconsistent with each other.\nIf you only passed "
|
||||
"one of `enable` or `disable`, the other argument is specified in your pipeline's configuration.\nIn that "
|
||||
"case pass an empty list for the previously not specified argument to avoid this error.")
|
||||
E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
|
||||
"{value}.")
|
||||
E1044 = ("Expected `candidates_batch_size` to be >= 1, but got: {value}")
|
||||
E1045 = ("Encountered {parent} subclass without `{parent}.{method}` "
|
||||
"method in '{name}'. If you want to use this method, make "
|
||||
"sure it's overwritten on the subclass.")
|
||||
E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
|
||||
"knowledge base, use `InMemoryLookupKB`.")
|
||||
E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.")
|
||||
E1048 = ("Got '{unexpected}' as console progress bar type, but expected one of the following: {expected}")
|
||||
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
|
||||
"with `displacy.serve(doc, port=port)`")
|
||||
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
|
||||
"or use `auto_select_port=True` to pick an available port automatically.")
|
||||
E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
|
||||
E1052 = ("Unable to copy spans: the character offsets for the span at "
|
||||
"index {i} in the span group do not align with the tokenization "
|
||||
"in the target doc.")
|
||||
E1053 = ("Both 'min_length' and 'max_length' should be larger than 0, but found"
|
||||
" 'min_length': {min_length}, 'max_length': {max_length}")
|
||||
E1054 = ("The text, including whitespace, must match between reference and "
|
||||
"predicted docs when training {component}.")
|
||||
E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
|
||||
"but only callbacks with one or three parameters are supported")
|
||||
E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
|
||||
E1057 = ("The `TextCatReduce` architecture must be used with at least one "
|
||||
"reduction. Please enable one of `use_reduce_first`, "
|
||||
"`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
|
||||
|
||||
|
||||
# Deprecated model shortcuts, only used in errors and warnings
|
||||
|
|
|
@@ -1,5 +1,4 @@
import warnings

from .errors import Warnings
@@ -1,12 +1,14 @@
"""Knowledge-base for entity or concept linking."""

from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE
from libcpp.vector cimport vector
from preshed.maps cimport PreshMap

from ..structs cimport AliasC, KBEntryC
from ..typedefs cimport hash_t
from .kb cimport KnowledgeBase
from .vocab cimport Vocab
from .typedefs cimport hash_t
from .structs cimport KBEntryC, AliasC

ctypedef vector[KBEntryC] entry_vec
ctypedef vector[AliasC] alias_vec

@@ -14,7 +16,21 @@ ctypedef vector[float] float_vec
ctypedef vector[float_vec] float_matrix


cdef class InMemoryLookupKB(KnowledgeBase):
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
cdef class Candidate:
    cdef readonly KnowledgeBase kb
    cdef hash_t entity_hash
    cdef float entity_freq
    cdef vector[float] entity_vector
    cdef hash_t alias_hash
    cdef float prior_prob


cdef class KnowledgeBase:
    cdef Pool mem
    cdef readonly Vocab vocab
    cdef int64_t entity_vector_length

    # This maps 64bit keys (hash of unique entity string)
    # to 64bit values (position of the _KBEntryC struct in the _entries vector).
    # The PreshMap is pretty space efficient, as it uses open addressing. So

@@ -55,28 +71,23 @@ cdef class InMemoryLookupKB(KnowledgeBase):
    # optional data, we can let users configure a DB as the backend for this.
    cdef object _features_table


    cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil:
        """Add an entity vector to the vectors table."""
        cdef int64_t new_index = self._vectors_table.size()
        self._vectors_table.push_back(entity_vector)
        return new_index

    cdef inline int64_t c_add_entity(
        self,
        hash_t entity_hash,
        float freq,
        int32_t vector_index,
        int feats_row
    ) nogil:

    cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq,
                                     int32_t vector_index, int feats_row) nogil:
        """Add an entry to the vector of entries.
        After calling this method, make sure to update also the _entry_index
        using the return value"""
        After calling this method, make sure to update also the _entry_index using the return value"""
        # This is what we'll map the entity hash key to. It's where the entry will sit
        # in the vector of entries, so we can get it later.
        cdef int64_t new_index = self._entries.size()

        # Avoid struct initializer to enable nogil, cf.
        # https://github.com/cython/cython/issues/1642
        # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
        cdef KBEntryC entry
        entry.entity_hash = entity_hash
        entry.vector_index = vector_index

@@ -86,17 +97,11 @@ cdef class InMemoryLookupKB(KnowledgeBase):
        self._entries.push_back(entry)
        return new_index

    cdef inline int64_t c_add_aliases(
        self,
        hash_t alias_hash,
        vector[int64_t] entry_indices,
        vector[float] probs
    ) nogil:
        """Connect a mention to a list of potential entities with their prior
        probabilities. After calling this method, make sure to update also the
        _alias_index using the return value"""
        # This is what we'll map the alias hash key to. It's where the alias will be
        # defined in the vector of aliases.
    cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs) nogil:
        """Connect a mention to a list of potential entities with their prior probabilities .
        After calling this method, make sure to update also the _alias_index using the return value"""
        # This is what we'll map the alias hash key to. It's where the alias will be defined
        # in the vector of aliases.
        cdef int64_t new_index = self._aliases_table.size()

        # Avoid struct initializer to enable nogil

@@ -109,9 +114,8 @@ cdef class InMemoryLookupKB(KnowledgeBase):

    cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
        """
        Initializing the vectors and making sure the first element of each vector is a
        dummy, because the PreshMap maps pointing to indices in these vectors can not
        contain 0 as value.
        Initializing the vectors and making sure the first element of each vector is a dummy,
        because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
        cf. https://github.com/explosion/preshed/issues/17
        """
        cdef int32_t dummy_value = 0

@@ -142,18 +146,12 @@ cdef class InMemoryLookupKB(KnowledgeBase):
cdef class Writer:
    cdef FILE* _fp

    cdef int write_header(
        self, int64_t nr_entries, int64_t entity_vector_length
    ) except -1
    cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1
    cdef int write_vector_element(self, float element) except -1
    cdef int write_entry(
        self, hash_t entry_hash, float entry_freq, int32_t vector_index
    ) except -1
    cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1

    cdef int write_alias_length(self, int64_t alias_length) except -1
    cdef int write_alias_header(
        self, hash_t alias_hash, int64_t candidate_length
    ) except -1
    cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1
    cdef int write_alias(self, int64_t entry_index, float prob) except -1

    cdef int _write(self, void* value, size_t size) except -1

@@ -161,18 +159,12 @@ cdef class Writer:
cdef class Reader:
    cdef FILE* _fp

    cdef int read_header(
        self, int64_t* nr_entries, int64_t* entity_vector_length
    ) except -1
    cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1
    cdef int read_vector_element(self, float* element) except -1
    cdef int read_entry(
        self, hash_t* entity_hash, float* freq, int32_t* vector_index
    ) except -1
    cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1

    cdef int read_alias_length(self, int64_t* alias_length) except -1
    cdef int read_alias_header(
        self, hash_t* alias_hash, int64_t* candidate_length
    ) except -1
    cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1
    cdef int read_alias(self, int64_t* entry_index, float* prob) except -1

    cdef int _read(self, void* value, size_t size) except -1
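Editor's note: for orientation, here is a minimal sketch of how the structures declared above are used through the Python API (as exposed in spaCy v3.5+, where the in-memory KB is spacy.kb.InMemoryLookupKB; earlier versions expose the same methods on KnowledgeBase). The entity IDs, frequencies and vectors are invented for illustration.

import spacy
from spacy.kb import InMemoryLookupKB

nlp = spacy.blank("en")
kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)

# Entities and aliases end up in the _entries / _aliases_table vectors declared above.
kb.add_entity(entity="Q42", freq=342, entity_vector=[1.0, 9.0, -3.0])
kb.add_entity(entity="Q1004791", freq=6, entity_vector=[0.0, 3.0, 5.0])
kb.add_alias(alias="Douglas", entities=["Q42", "Q1004791"], probabilities=[0.6, 0.3])

for candidate in kb.get_alias_candidates("Douglas"):
    print(candidate.entity_, candidate.alias_, candidate.prior_prob)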
@ -1,59 +1,113 @@
|
|||
# cython: infer_types=True
|
||||
from typing import Any, Callable, Dict, Iterable
|
||||
# cython: infer_types=True, profile=True
|
||||
from typing import Iterator, Iterable, Callable, Dict, Any
|
||||
|
||||
import srsly
|
||||
|
||||
from cpython.exc cimport PyErr_SetFromErrno
|
||||
from libc.stdint cimport int32_t, int64_t
|
||||
from libc.stdio cimport fclose, feof, fopen, fread, fseek, fwrite
|
||||
from libcpp.vector cimport vector
|
||||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
from cpython.exc cimport PyErr_SetFromErrno
|
||||
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
|
||||
from libc.stdint cimport int32_t, int64_t
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
import warnings
|
||||
|
||||
from ..tokens import Span
|
||||
from .typedefs cimport hash_t
|
||||
from .errors import Errors, Warnings
|
||||
from . import util
|
||||
from .util import SimpleFrozenList, ensure_path
|
||||
|
||||
from ..typedefs cimport hash_t
|
||||
cdef class Candidate:
|
||||
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
|
||||
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
|
||||
algorithm which will disambiguate the various candidates to the correct one.
|
||||
Each candidate (alias, entity) pair is assigned to a certain prior probability.
|
||||
|
||||
from .. import util
|
||||
from ..errors import Errors, Warnings
|
||||
from ..util import SimpleFrozenList, ensure_path
|
||||
DOCS: https://spacy.io/api/kb/#candidate_init
|
||||
"""
|
||||
|
||||
from ..vocab cimport Vocab
|
||||
from .kb cimport KnowledgeBase
|
||||
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
|
||||
self.kb = kb
|
||||
self.entity_hash = entity_hash
|
||||
self.entity_freq = entity_freq
|
||||
self.entity_vector = entity_vector
|
||||
self.alias_hash = alias_hash
|
||||
self.prior_prob = prior_prob
|
||||
|
||||
from .candidate import Candidate as Candidate
|
||||
@property
|
||||
def entity(self):
|
||||
"""RETURNS (uint64): hash of the entity's KB ID/name"""
|
||||
return self.entity_hash
|
||||
|
||||
@property
|
||||
def entity_(self):
|
||||
"""RETURNS (str): ID/name of this entity in the KB"""
|
||||
return self.kb.vocab.strings[self.entity_hash]
|
||||
|
||||
@property
|
||||
def alias(self):
|
||||
"""RETURNS (uint64): hash of the alias"""
|
||||
return self.alias_hash
|
||||
|
||||
@property
|
||||
def alias_(self):
|
||||
"""RETURNS (str): ID of the original alias"""
|
||||
return self.kb.vocab.strings[self.alias_hash]
|
||||
|
||||
@property
|
||||
def entity_freq(self):
|
||||
return self.entity_freq
|
||||
|
||||
@property
|
||||
def entity_vector(self):
|
||||
return self.entity_vector
|
||||
|
||||
@property
|
||||
def prior_prob(self):
|
||||
return self.prior_prob
|
||||
|
||||
|
||||
cdef class InMemoryLookupKB(KnowledgeBase):
|
||||
"""An `InMemoryLookupKB` instance stores unique identifiers for entities
|
||||
and their textual aliases, to support entity linking of named entities to
|
||||
real-world concepts.
|
||||
def get_candidates(KnowledgeBase kb, span) -> Iterator[Candidate]:
|
||||
"""
|
||||
Return candidate entities for a given span by using the text of the span as the alias
|
||||
and fetching appropriate entries from the index.
|
||||
This particular function is optimized to work with the built-in KB functionality,
|
||||
but any other custom candidate generation method can be used in combination with the KB as well.
|
||||
"""
|
||||
return kb.get_alias_candidates(span.text)
|
||||
|
||||
DOCS: https://spacy.io/api/inmemorylookupkb
|
||||
|
||||
cdef class KnowledgeBase:
|
||||
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
|
||||
to support entity linking of named entities to real-world concepts.
|
||||
|
||||
DOCS: https://spacy.io/api/kb
|
||||
"""
|
||||
|
||||
def __init__(self, Vocab vocab, entity_vector_length):
|
||||
"""Create an InMemoryLookupKB."""
|
||||
super().__init__(vocab, entity_vector_length)
|
||||
"""Create a KnowledgeBase."""
|
||||
self.mem = Pool()
|
||||
self.entity_vector_length = entity_vector_length
|
||||
self._entry_index = PreshMap()
|
||||
self._alias_index = PreshMap()
|
||||
self.vocab = vocab
|
||||
self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
|
||||
|
||||
def _initialize_entities(self, int64_t nr_entities):
|
||||
def initialize_entities(self, int64_t nr_entities):
|
||||
self._entry_index = PreshMap(nr_entities + 1)
|
||||
self._entries = entry_vec(nr_entities + 1)
|
||||
|
||||
def _initialize_vectors(self, int64_t nr_entities):
|
||||
def initialize_vectors(self, int64_t nr_entities):
|
||||
self._vectors_table = float_matrix(nr_entities + 1)
|
||||
|
||||
def _initialize_aliases(self, int64_t nr_aliases):
|
||||
def initialize_aliases(self, int64_t nr_aliases):
|
||||
self._alias_index = PreshMap(nr_aliases + 1)
|
||||
self._aliases_table = alias_vec(nr_aliases + 1)
|
||||
|
||||
def is_empty(self):
|
||||
return len(self) == 0
|
||||
@property
|
||||
def entity_vector_length(self):
|
||||
"""RETURNS (uint64): length of the entity vectors"""
|
||||
return self.entity_vector_length
|
||||
|
||||
def __len__(self):
|
||||
return self.get_size_entities()
|
||||
|
@ -72,8 +126,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
|
||||
def add_entity(self, str entity, float freq, vector[float] entity_vector):
|
||||
"""
|
||||
Add an entity to the KB, optionally specifying its log probability
|
||||
based on corpus frequency.
|
||||
Add an entity to the KB, optionally specifying its log probability based on corpus frequency
|
||||
Return the hash of the entity ID/name at the end.
|
||||
"""
|
||||
cdef hash_t entity_hash = self.vocab.strings.add(entity)
|
||||
|
@ -85,20 +138,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
|
||||
# Raise an error if the provided entity vector is not of the correct length
|
||||
if len(entity_vector) != self.entity_vector_length:
|
||||
raise ValueError(
|
||||
Errors.E141.format(
|
||||
found=len(entity_vector), required=self.entity_vector_length
|
||||
)
|
||||
)
|
||||
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
|
||||
|
||||
vector_index = self.c_add_vector(entity_vector=entity_vector)
|
||||
|
||||
new_index = self.c_add_entity(
|
||||
entity_hash=entity_hash,
|
||||
freq=freq,
|
||||
vector_index=vector_index,
|
||||
feats_row=-1
|
||||
) # Features table currently not implemented
|
||||
new_index = self.c_add_entity(entity_hash=entity_hash,
|
||||
freq=freq,
|
||||
vector_index=vector_index,
|
||||
feats_row=-1) # Features table currently not implemented
|
||||
self._entry_index[entity_hash] = new_index
|
||||
|
||||
return entity_hash
|
||||
|
@ -108,8 +155,8 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
raise ValueError(Errors.E140)
|
||||
|
||||
nr_entities = len(set(entity_list))
|
||||
self._initialize_entities(nr_entities)
|
||||
self._initialize_vectors(nr_entities)
|
||||
self.initialize_entities(nr_entities)
|
||||
self.initialize_vectors(nr_entities)
|
||||
|
||||
i = 0
|
||||
cdef KBEntryC entry
|
||||
|
@ -123,12 +170,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
else:
|
||||
entity_vector = vector_list[i]
|
||||
if len(entity_vector) != self.entity_vector_length:
|
||||
raise ValueError(
|
||||
Errors.E141.format(
|
||||
found=len(entity_vector),
|
||||
required=self.entity_vector_length
|
||||
)
|
||||
)
|
||||
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
|
||||
|
||||
entry.entity_hash = entity_hash
|
||||
entry.freq = freq_list[i]
|
||||
|
@ -162,15 +204,11 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
previous_alias_nr = self.get_size_aliases()
|
||||
# Throw an error if the length of entities and probabilities are not the same
|
||||
if not len(entities) == len(probabilities):
|
||||
raise ValueError(
|
||||
Errors.E132.format(
|
||||
alias=alias,
|
||||
entities_length=len(entities),
|
||||
probabilities_length=len(probabilities))
|
||||
)
|
||||
raise ValueError(Errors.E132.format(alias=alias,
|
||||
entities_length=len(entities),
|
||||
probabilities_length=len(probabilities)))
|
||||
|
||||
# Throw an error if the probabilities sum up to more than 1 (allow for
|
||||
# some rounding errors)
|
||||
# Throw an error if the probabilities sum up to more than 1 (allow for some rounding errors)
|
||||
prob_sum = sum(probabilities)
|
||||
if prob_sum > 1.00001:
|
||||
raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
|
||||
|
@ -187,47 +225,40 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
|
||||
for entity, prob in zip(entities, probabilities):
|
||||
entity_hash = self.vocab.strings[entity]
|
||||
if entity_hash not in self._entry_index:
|
||||
if not entity_hash in self._entry_index:
|
||||
raise ValueError(Errors.E134.format(entity=entity))
|
||||
|
||||
entry_index = <int64_t>self._entry_index.get(entity_hash)
|
||||
entry_indices.push_back(int(entry_index))
|
||||
probs.push_back(float(prob))
|
||||
|
||||
new_index = self.c_add_aliases(
|
||||
alias_hash=alias_hash, entry_indices=entry_indices, probs=probs
|
||||
)
|
||||
new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
|
||||
self._alias_index[alias_hash] = new_index
|
||||
|
||||
if previous_alias_nr + 1 != self.get_size_aliases():
|
||||
raise RuntimeError(Errors.E891.format(alias=alias))
|
||||
return alias_hash
|
||||
|
||||
def append_alias(
|
||||
self, str alias, str entity, float prior_prob, ignore_warnings=False
|
||||
):
|
||||
def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
|
||||
"""
|
||||
For an alias already existing in the KB, extend its potential entities
|
||||
with one more.
|
||||
For an alias already existing in the KB, extend its potential entities with one more.
|
||||
Throw a warning if either the alias or the entity is unknown,
|
||||
or when the combination is already previously recorded.
|
||||
Throw an error if this entity+prior prob would exceed the sum of 1.
|
||||
For efficiency, it's best to use the method `add_alias` as much as
|
||||
possible instead of this one.
|
||||
For efficiency, it's best to use the method `add_alias` as much as possible instead of this one.
|
||||
"""
|
||||
# Check if the alias exists in the KB
|
||||
cdef hash_t alias_hash = self.vocab.strings[alias]
|
||||
if alias_hash not in self._alias_index:
|
||||
if not alias_hash in self._alias_index:
|
||||
raise ValueError(Errors.E176.format(alias=alias))
|
||||
|
||||
# Check if the entity exists in the KB
|
||||
cdef hash_t entity_hash = self.vocab.strings[entity]
|
||||
if entity_hash not in self._entry_index:
|
||||
if not entity_hash in self._entry_index:
|
||||
raise ValueError(Errors.E134.format(entity=entity))
|
||||
entry_index = <int64_t>self._entry_index.get(entity_hash)
|
||||
|
||||
# Throw an error if the prior probabilities (including the new one)
|
||||
# sum up to more than 1
|
||||
# Throw an error if the prior probabilities (including the new one) sum up to more than 1
|
||||
alias_index = <int64_t>self._alias_index.get(alias_hash)
|
||||
alias_entry = self._aliases_table[alias_index]
|
||||
current_sum = sum([p for p in alias_entry.probs])
|
||||
|
@ -255,18 +286,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
alias_entry.probs = probs
|
||||
self._aliases_table[alias_index] = alias_entry
|
||||
|
||||
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
|
||||
return self.get_alias_candidates(mention.text) # type: ignore
|
||||
|
||||
def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
|
||||
def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
|
||||
"""
|
||||
Return candidate entities for an alias. Each candidate defines the
|
||||
entity, the original alias, and the prior probability of that alias
|
||||
resolving to that entity.
|
||||
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
|
||||
and the prior probability of that alias resolving to that entity.
|
||||
If the alias is not known in the KB, and empty list is returned.
|
||||
"""
|
||||
cdef hash_t alias_hash = self.vocab.strings[alias]
|
||||
if alias_hash not in self._alias_index:
|
||||
if not alias_hash in self._alias_index:
|
||||
return []
|
||||
alias_index = <int64_t>self._alias_index.get(alias_hash)
|
||||
alias_entry = self._aliases_table[alias_index]
|
||||
|
@ -274,14 +301,10 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
return [Candidate(kb=self,
|
||||
entity_hash=self._entries[entry_index].entity_hash,
|
||||
entity_freq=self._entries[entry_index].freq,
|
||||
entity_vector=self._vectors_table[
|
||||
self._entries[entry_index].vector_index
|
||||
],
|
||||
entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
|
||||
alias_hash=alias_hash,
|
||||
prior_prob=prior_prob)
|
||||
for (entry_index, prior_prob) in zip(
|
||||
alias_entry.entry_indices, alias_entry.probs
|
||||
)
|
||||
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
|
||||
if entry_index != 0]
|
||||
|
||||
def get_vector(self, str entity):
|
||||
|
@ -295,9 +318,8 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
return self._vectors_table[self._entries[entry_index].vector_index]
|
||||
|
||||
def get_prior_prob(self, str entity, str alias):
|
||||
""" Return the prior probability of a given alias being linked to a
|
||||
given entity, or return 0.0 when this combination is not known in the
|
||||
knowledge base."""
|
||||
""" Return the prior probability of a given alias being linked to a given entity,
|
||||
or return 0.0 when this combination is not known in the knowledge base"""
|
||||
cdef hash_t alias_hash = self.vocab.strings[alias]
|
||||
cdef hash_t entity_hash = self.vocab.strings[entity]
|
||||
|
||||
|
@ -308,9 +330,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
entry_index = self._entry_index[entity_hash]
|
||||
|
||||
alias_entry = self._aliases_table[alias_index]
|
||||
for (entry_index, prior_prob) in zip(
|
||||
alias_entry.entry_indices, alias_entry.probs
|
||||
):
|
||||
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs):
|
||||
if self._entries[entry_index].entity_hash == entity_hash:
|
||||
return prior_prob
|
||||
|
||||
|
@ -320,19 +340,13 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
"""Serialize the current state to a binary string.
|
||||
"""
|
||||
def serialize_header():
|
||||
header = (
|
||||
self.get_size_entities(),
|
||||
self.get_size_aliases(),
|
||||
self.entity_vector_length
|
||||
)
|
||||
header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length)
|
||||
return srsly.json_dumps(header)
|
||||
|
||||
def serialize_entries():
|
||||
i = 1
|
||||
tuples = []
|
||||
for entry_hash, entry_index in sorted(
|
||||
self._entry_index.items(), key=lambda x: x[1]
|
||||
):
|
||||
for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
|
||||
entry = self._entries[entry_index]
|
||||
assert entry.entity_hash == entry_hash
|
||||
assert entry_index == i
|
||||
|
@ -345,9 +359,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
headers = []
|
||||
indices_lists = []
|
||||
probs_lists = []
|
||||
for alias_hash, alias_index in sorted(
|
||||
self._alias_index.items(), key=lambda x: x[1]
|
||||
):
|
||||
for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
|
||||
alias = self._aliases_table[alias_index]
|
||||
assert alias_index == i
|
||||
candidate_length = len(alias.entry_indices)
|
||||
|
@ -376,9 +388,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
nr_entities = header[0]
|
||||
nr_aliases = header[1]
|
||||
entity_vector_length = header[2]
|
||||
self._initialize_entities(nr_entities)
|
||||
self._initialize_vectors(nr_entities)
|
||||
self._initialize_aliases(nr_aliases)
|
||||
self.initialize_entities(nr_entities)
|
||||
self.initialize_vectors(nr_entities)
|
||||
self.initialize_aliases(nr_aliases)
|
||||
self.entity_vector_length = entity_vector_length
|
||||
|
||||
def deserialize_vectors(b):
|
||||
|
@ -405,7 +417,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
indices = srsly.json_loads(all_data[1])
|
||||
probs = srsly.json_loads(all_data[2])
|
||||
for header, indices, probs in zip(headers, indices, probs):
|
||||
alias_hash, _candidate_length = header
|
||||
alias_hash, candidate_length = header
|
||||
alias.entry_indices = indices
|
||||
alias.probs = probs
|
||||
self._aliases_table[i] = alias
|
||||
|
@ -454,14 +466,10 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
writer.write_vector_element(element)
|
||||
i = i+1
|
||||
|
||||
# dumping the entry records in the order in which they are in the
|
||||
# _entries vector.
|
||||
# index 0 is a dummy object not stored in the _entry_index and can
|
||||
# be ignored.
|
||||
# dumping the entry records in the order in which they are in the _entries vector.
|
||||
# index 0 is a dummy object not stored in the _entry_index and can be ignored.
|
||||
i = 1
|
||||
for entry_hash, entry_index in sorted(
|
||||
self._entry_index.items(), key=lambda x: x[1]
|
||||
):
|
||||
for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
|
||||
entry = self._entries[entry_index]
|
||||
assert entry.entity_hash == entry_hash
|
||||
assert entry_index == i
|
||||
|
@ -473,9 +481,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
# dumping the aliases in the order in which they are in the _alias_index vector.
|
||||
# index 0 is a dummy object not stored in the _aliases_table and can be ignored.
|
||||
i = 1
|
||||
for alias_hash, alias_index in sorted(
|
||||
self._alias_index.items(), key=lambda x: x[1]
|
||||
):
|
||||
for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
|
||||
alias = self._aliases_table[alias_index]
|
||||
assert alias_index == i
|
||||
|
||||
|
@ -506,8 +512,8 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
cdef int64_t entity_vector_length
|
||||
reader.read_header(&nr_entities, &entity_vector_length)
|
||||
|
||||
self._initialize_entities(nr_entities)
|
||||
self._initialize_vectors(nr_entities)
|
||||
self.initialize_entities(nr_entities)
|
||||
self.initialize_vectors(nr_entities)
|
||||
self.entity_vector_length = entity_vector_length
|
||||
|
||||
# STEP 1: load entity vectors
|
||||
|
@ -546,7 +552,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
# STEP 3: load aliases
|
||||
cdef int64_t nr_aliases
|
||||
reader.read_alias_length(&nr_aliases)
|
||||
self._initialize_aliases(nr_aliases)
|
||||
self.initialize_aliases(nr_aliases)
|
||||
|
||||
cdef int64_t nr_candidates
|
||||
cdef vector[int64_t] entry_indices
|
||||
|
@ -581,8 +587,7 @@ cdef class Writer:
|
|||
def __init__(self, path):
|
||||
assert isinstance(path, Path)
|
||||
content = bytes(path)
|
||||
cdef bytes bytes_loc = content.encode('utf8') \
|
||||
if type(content) == str else content
|
||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
|
||||
self._fp = fopen(<char*>bytes_loc, 'wb')
|
||||
if not self._fp:
|
||||
raise IOError(Errors.E146.format(path=path))
|
||||
|
@ -592,18 +597,14 @@ cdef class Writer:
|
|||
cdef size_t status = fclose(self._fp)
|
||||
assert status == 0
|
||||
|
||||
cdef int write_header(
|
||||
self, int64_t nr_entries, int64_t entity_vector_length
|
||||
) except -1:
|
||||
cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1:
|
||||
self._write(&nr_entries, sizeof(nr_entries))
|
||||
self._write(&entity_vector_length, sizeof(entity_vector_length))
|
||||
|
||||
cdef int write_vector_element(self, float element) except -1:
|
||||
self._write(&element, sizeof(element))
|
||||
|
||||
cdef int write_entry(
|
||||
self, hash_t entry_hash, float entry_freq, int32_t vector_index
|
||||
) except -1:
|
||||
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1:
|
||||
self._write(&entry_hash, sizeof(entry_hash))
|
||||
self._write(&entry_freq, sizeof(entry_freq))
|
||||
self._write(&vector_index, sizeof(vector_index))
|
||||
|
@ -612,9 +613,7 @@ cdef class Writer:
|
|||
cdef int write_alias_length(self, int64_t alias_length) except -1:
|
||||
self._write(&alias_length, sizeof(alias_length))
|
||||
|
||||
cdef int write_alias_header(
|
||||
self, hash_t alias_hash, int64_t candidate_length
|
||||
) except -1:
|
||||
cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1:
|
||||
self._write(&alias_hash, sizeof(alias_hash))
|
||||
self._write(&candidate_length, sizeof(candidate_length))
|
||||
|
||||
|
@ -630,19 +629,16 @@ cdef class Writer:
|
|||
cdef class Reader:
|
||||
def __init__(self, path):
|
||||
content = bytes(path)
|
||||
cdef bytes bytes_loc = content.encode('utf8') \
|
||||
if type(content) == str else content
|
||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
|
||||
self._fp = fopen(<char*>bytes_loc, 'rb')
|
||||
if not self._fp:
|
||||
PyErr_SetFromErrno(IOError)
|
||||
fseek(self._fp, 0, 0) # this can be 0 if there is no header
|
||||
status = fseek(self._fp, 0, 0) # this can be 0 if there is no header
|
||||
|
||||
def __dealloc__(self):
|
||||
fclose(self._fp)
|
||||
|
||||
cdef int read_header(
|
||||
self, int64_t* nr_entries, int64_t* entity_vector_length
|
||||
) except -1:
|
||||
cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1:
|
||||
status = self._read(nr_entries, sizeof(int64_t))
|
||||
if status < 1:
|
||||
if feof(self._fp):
|
||||
|
@ -662,9 +658,7 @@ cdef class Reader:
|
|||
return 0 # end of file
|
||||
raise IOError(Errors.E145.format(param="vector element"))
|
||||
|
||||
cdef int read_entry(
|
||||
self, hash_t* entity_hash, float* freq, int32_t* vector_index
|
||||
) except -1:
|
||||
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1:
|
||||
status = self._read(entity_hash, sizeof(hash_t))
|
||||
if status < 1:
|
||||
if feof(self._fp):
|
||||
|
@ -695,9 +689,7 @@ cdef class Reader:
|
|||
return 0 # end of file
|
||||
raise IOError(Errors.E145.format(param="alias length"))
|
||||
|
||||
cdef int read_alias_header(
|
||||
self, hash_t* alias_hash, int64_t* candidate_length
|
||||
) except -1:
|
||||
cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1:
|
||||
status = self._read(alias_hash, sizeof(hash_t))
|
||||
if status < 1:
|
||||
if feof(self._fp):
|
|
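Editor's note: the Writer and Reader helpers above implement the binary format behind the KB's to_disk/from_disk round trip. A hedged usage sketch follows (the path "./my_kb" and the toy entity are invented; API shown as in spaCy v3.5+):

import spacy
from spacy.kb import InMemoryLookupKB

nlp = spacy.blank("en")
kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=1)
kb.add_entity(entity="Q42", freq=12.0, entity_vector=[7.0])

kb.to_disk("./my_kb")                  # Writer: header, entries, vectors, aliases
kb2 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=1)
kb2.from_disk("./my_kb")               # Reader: restores the same tables
assert kb2.get_size_entities() == kb.get_size_entities()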
@@ -1,11 +0,0 @@
from .candidate import Candidate, get_candidates, get_candidates_batch
from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB

__all__ = [
    "Candidate",
    "KnowledgeBase",
    "InMemoryLookupKB",
    "get_candidates",
    "get_candidates_batch",
]
@@ -1,15 +0,0 @@
from libcpp.vector cimport vector

from ..typedefs cimport hash_t
from .kb cimport KnowledgeBase


# Object used by the Entity Linker that summarizes one entity-alias candidate
# combination.
cdef class Candidate:
    cdef readonly KnowledgeBase kb
    cdef hash_t entity_hash
    cdef float entity_freq
    cdef vector[float] entity_vector
    cdef hash_t alias_hash
    cdef float prior_prob
@ -1,90 +0,0 @@
|
|||
# cython: infer_types=True
|
||||
|
||||
from typing import Iterable
|
||||
|
||||
from .kb cimport KnowledgeBase
|
||||
|
||||
from ..tokens import Span
|
||||
|
||||
|
||||
cdef class Candidate:
|
||||
"""A `Candidate` object refers to a textual mention (`alias`) that may or
|
||||
may not be resolved to a specific `entity` from a Knowledge Base. This
|
||||
will be used as input for the entity linking algorithm which will
|
||||
disambiguate the various candidates to the correct one.
|
||||
Each candidate (alias, entity) pair is assigned a certain prior probability.
|
||||
|
||||
DOCS: https://spacy.io/api/kb/#candidate-init
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
KnowledgeBase kb,
|
||||
entity_hash,
|
||||
entity_freq,
|
||||
entity_vector,
|
||||
alias_hash,
|
||||
prior_prob
|
||||
):
|
||||
self.kb = kb
|
||||
self.entity_hash = entity_hash
|
||||
self.entity_freq = entity_freq
|
||||
self.entity_vector = entity_vector
|
||||
self.alias_hash = alias_hash
|
||||
self.prior_prob = prior_prob
|
||||
|
||||
@property
|
||||
def entity(self) -> int:
|
||||
"""RETURNS (uint64): hash of the entity's KB ID/name"""
|
||||
return self.entity_hash
|
||||
|
||||
@property
|
||||
def entity_(self) -> str:
|
||||
"""RETURNS (str): ID/name of this entity in the KB"""
|
||||
return self.kb.vocab.strings[self.entity_hash]
|
||||
|
||||
@property
|
||||
def alias(self) -> int:
|
||||
"""RETURNS (uint64): hash of the alias"""
|
||||
return self.alias_hash
|
||||
|
||||
@property
|
||||
def alias_(self) -> str:
|
||||
"""RETURNS (str): ID of the original alias"""
|
||||
return self.kb.vocab.strings[self.alias_hash]
|
||||
|
||||
@property
|
||||
def entity_freq(self) -> float:
|
||||
return self.entity_freq
|
||||
|
||||
@property
|
||||
def entity_vector(self) -> Iterable[float]:
|
||||
return self.entity_vector
|
||||
|
||||
@property
|
||||
def prior_prob(self) -> float:
|
||||
return self.prior_prob
|
||||
|
||||
|
||||
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
|
||||
"""
|
||||
Return candidate entities for a given mention and fetching appropriate
|
||||
entries from the index.
|
||||
kb (KnowledgeBase): Knowledge base to query.
|
||||
mention (Span): Entity mention for which to identify candidates.
|
||||
RETURNS (Iterable[Candidate]): Identified candidates.
|
||||
"""
|
||||
return kb.get_candidates(mention)
|
||||
|
||||
|
||||
def get_candidates_batch(
|
||||
kb: KnowledgeBase, mentions: Iterable[Span]
|
||||
) -> Iterable[Iterable[Candidate]]:
|
||||
"""
|
||||
Return candidate entities for the given mentions and fetching appropriate entries
|
||||
from the index.
|
||||
kb (KnowledgeBase): Knowledge base to query.
|
||||
mention (Iterable[Span]): Entity mentions for which to identify candidates.
|
||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||
"""
|
||||
return kb.get_candidates_batch(mentions)
|
|
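Editor's note: the module-level get_candidates/get_candidates_batch functions above are what the entity_linker component calls by default. As a hedged sketch, a custom generator with the same signature can be registered through the misc registry and referenced from the component's get_candidates setting; the registry name "custom_candidates.v1" and the proper-noun filter below are invented for illustration.

from typing import Iterable

import spacy
from spacy.kb import Candidate, InMemoryLookupKB
from spacy.tokens import Span


@spacy.registry.misc("custom_candidates.v1")
def create_candidate_getter():
    def get_candidates(kb: InMemoryLookupKB, mention: Span) -> Iterable[Candidate]:
        # Only consider proper-noun mentions; otherwise return no candidates.
        if mention.root.pos_ == "PROPN":
            return kb.get_alias_candidates(mention.text)
        return []

    return get_candidates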
@@ -1,12 +0,0 @@
"""Knowledge-base for entity or concept linking."""

from cymem.cymem cimport Pool
from libc.stdint cimport int64_t

from ..vocab cimport Vocab


cdef class KnowledgeBase:
    cdef Pool mem
    cdef readonly Vocab vocab
    cdef readonly int64_t entity_vector_length
spacy/kb/kb.pyx (130 changed lines)
@ -1,130 +0,0 @@
|
|||
# cython: infer_types=True
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Tuple, Union
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
|
||||
from ..errors import Errors
|
||||
from ..tokens import Span
|
||||
from ..util import SimpleFrozenList
|
||||
from .candidate import Candidate
|
||||
|
||||
|
||||
cdef class KnowledgeBase:
|
||||
"""A `KnowledgeBase` instance stores unique identifiers for entities and
|
||||
their textual aliases, to support entity linking of named entities to
|
||||
real-world concepts.
|
||||
This is an abstract class and requires its operations to be implemented.
|
||||
|
||||
DOCS: https://spacy.io/api/kb
|
||||
"""
|
||||
|
||||
def __init__(self, vocab: Vocab, entity_vector_length: int):
|
||||
"""Create a KnowledgeBase."""
|
||||
# Make sure abstract KB is not instantiated.
|
||||
if self.__class__ == KnowledgeBase:
|
||||
raise TypeError(
|
||||
Errors.E1046.format(cls_name=self.__class__.__name__)
|
||||
)
|
||||
|
||||
self.vocab = vocab
|
||||
self.entity_vector_length = entity_vector_length
|
||||
self.mem = Pool()
|
||||
|
||||
def get_candidates_batch(
|
||||
self, mentions: Iterable[Span]
|
||||
) -> Iterable[Iterable[Candidate]]:
|
||||
"""
|
||||
Return candidate entities for specified texts. Each candidate defines
|
||||
the entity, the original alias, and the prior probability of that
|
||||
alias resolving to that entity.
|
||||
If no candidate is found for a given text, an empty list is returned.
|
||||
mentions (Iterable[Span]): Mentions for which to get candidates.
|
||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||
"""
|
||||
return [self.get_candidates(span) for span in mentions]
|
||||
|
||||
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
|
||||
"""
|
||||
Return candidate entities for specified text. Each candidate defines
|
||||
the entity, the original alias,
|
||||
and the prior probability of that alias resolving to that entity.
|
||||
If the no candidate is found for a given text, an empty list is returned.
|
||||
mention (Span): Mention for which to get candidates.
|
||||
RETURNS (Iterable[Candidate]): Identified candidates.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(
|
||||
parent="KnowledgeBase", method="get_candidates", name=self.__name__
|
||||
)
|
||||
)
|
||||
|
||||
def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
|
||||
"""
|
||||
Return vectors for entities.
|
||||
entity (str): Entity name/ID.
|
||||
RETURNS (Iterable[Iterable[float]]): Vectors for specified entities.
|
||||
"""
|
||||
return [self.get_vector(entity) for entity in entities]
|
||||
|
||||
def get_vector(self, str entity) -> Iterable[float]:
|
||||
"""
|
||||
Return vector for entity.
|
||||
entity (str): Entity name/ID.
|
||||
RETURNS (Iterable[float]): Vector for specified entity.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(
|
||||
parent="KnowledgeBase", method="get_vector", name=self.__name__
|
||||
)
|
||||
)
|
||||
|
||||
def to_bytes(self, **kwargs) -> bytes:
|
||||
"""Serialize the current state to a binary string.
|
||||
RETURNS (bytes): Current state as binary string.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(
|
||||
parent="KnowledgeBase", method="to_bytes", name=self.__name__
|
||||
)
|
||||
)
|
||||
|
||||
def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
|
||||
"""Load state from a binary string.
|
||||
bytes_data (bytes): KB state.
|
||||
exclude (Tuple[str]): Properties to exclude when restoring KB.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(
|
||||
parent="KnowledgeBase", method="from_bytes", name=self.__name__
|
||||
)
|
||||
)
|
||||
|
||||
def to_disk(
|
||||
self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> None:
|
||||
"""
|
||||
Write KnowledgeBase content to disk.
|
||||
path (Union[str, Path]): Target file path.
|
||||
exclude (Iterable[str]): List of components to exclude.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(
|
||||
parent="KnowledgeBase", method="to_disk", name=self.__name__
|
||||
)
|
||||
)
|
||||
|
||||
def from_disk(
|
||||
self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> None:
|
||||
"""
|
||||
Load KnowledgeBase content from disk.
|
||||
path (Union[str, Path]): Target file path.
|
||||
exclude (Iterable[str]): List of components to exclude.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(
|
||||
parent="KnowledgeBase", method="from_disk", name=self.__name__
|
||||
)
|
||||
)
|
|
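Editor's note: the abstract KnowledgeBase above raises E1045 for any method a subclass does not override, and E1046 if instantiated directly. Below is a minimal sketch of a custom subclass; the StaticKB name and its hard-coded table are invented, and the serialization hooks are omitted (they would still raise E1045 if called).

from typing import Iterable

from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import Span
from spacy.vocab import Vocab


class StaticKB(KnowledgeBase):
    """Toy subclass that resolves a few hard-coded aliases."""

    _table = {"Douglas Adams": "Q42"}

    def __init__(self, vocab: Vocab, entity_vector_length: int):
        super().__init__(vocab, entity_vector_length)

    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
        kb_id = self._table.get(mention.text)
        if kb_id is None:
            return []  # unknown mention: empty list, like the built-in KB
        return [
            Candidate(
                kb=self,
                entity_hash=self.vocab.strings.add(kb_id),
                entity_freq=1.0,
                entity_vector=[0.0] * self.entity_vector_length,
                alias_hash=self.vocab.strings.add(mention.text),
                prior_prob=1.0,
            )
        ]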
@@ -1,5 +1,5 @@
from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults


class AfrikaansDefaults(BaseDefaults):
@ -1,11 +1,12 @@
|
|||
from ...attrs import LANG
|
||||
from ...language import BaseDefaults, Language
|
||||
from ...util import update_exc
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language, BaseDefaults
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
class AmharicDefaults(BaseDefaults):
|
||||
|
|
|
@ -1,11 +1,5 @@
|
|||
from ..char_classes import (
|
||||
ALPHA_UPPER,
|
||||
CURRENCY,
|
||||
LIST_ELLIPSES,
|
||||
LIST_PUNCT,
|
||||
LIST_QUOTES,
|
||||
UNITS,
|
||||
)
|
||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
||||
from ..char_classes import UNITS, ALPHA_UPPER
|
||||
|
||||
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
|
||||
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
from ...symbols import NORM, ORTH
|
||||
from ...symbols import ORTH, NORM
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
from ...language import BaseDefaults, Language
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from ...language import Language, BaseDefaults
|
||||
|
||||
|
||||
class ArabicDefaults(BaseDefaults):
|
||||
|
|
|
@ -1,11 +1,5 @@
|
|||
from ..char_classes import (
|
||||
ALPHA_UPPER,
|
||||
CURRENCY,
|
||||
LIST_ELLIPSES,
|
||||
LIST_PUNCT,
|
||||
LIST_QUOTES,
|
||||
UNITS,
|
||||
)
|
||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
||||
from ..char_classes import UNITS, ALPHA_UPPER
|
||||
|
||||
_suffixes = (
|
||||
LIST_PUNCT
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
from ...symbols import NORM, ORTH
|
||||
from ...util import update_exc
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, NORM
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from ...language import BaseDefaults, Language
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language, BaseDefaults
|
||||
|
||||
|
||||
class AzerbaijaniDefaults(BaseDefaults):
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
# Eleven, twelve etc. are written separate: on bir, on iki
|
||||
|
||||
_num_words = [
|
||||
|
|
|
@ -1,14 +1,11 @@
|
|||
from ...attrs import LANG
|
||||
from ...language import BaseDefaults, Language
|
||||
from ...util import update_exc
|
||||
from ..punctuation import (
|
||||
COMBINING_DIACRITICS_TOKENIZER_INFIXES,
|
||||
COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
|
||||
)
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
|
||||
from ...language import Language, BaseDefaults
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
class BulgarianDefaults(BaseDefaults):
|
||||
|
@ -19,8 +16,6 @@ class BulgarianDefaults(BaseDefaults):
|
|||
|
||||
stop_words = STOP_WORDS
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
|
||||
infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES
|
||||
|
||||
|
||||
class Bulgarian(Language):
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
_num_words = [
|
||||
"нула",
|
||||
"едно",
|
||||
|
|
|
@ -4,7 +4,8 @@ References:
|
|||
(countries, occupations, fields of studies and more).
|
||||
"""
|
||||
|
||||
from ...symbols import NORM, ORTH
|
||||
from ...symbols import ORTH, NORM
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
||||
|
|
|
@ -1,12 +1,10 @@
|
|||
from typing import Callable, Optional
|
||||
|
||||
from typing import Optional, Callable
|
||||
from thinc.api import Model
|
||||
|
||||
from ...language import BaseDefaults, Language
|
||||
from ...pipeline import Lemmatizer
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language, BaseDefaults
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
class BengaliDefaults(BaseDefaults):
|
||||
|
|
|
@ -1,14 +1,6 @@
|
|||
from ..char_classes import (
|
||||
ALPHA,
|
||||
ALPHA_LOWER,
|
||||
CONCAT_QUOTES,
|
||||
HYPHENS,
|
||||
LIST_ELLIPSES,
|
||||
LIST_ICONS,
|
||||
LIST_PUNCT,
|
||||
LIST_QUOTES,
|
||||
UNITS,
|
||||
)
|
||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
|
||||
from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS
|
||||
|
||||
|
||||
_currency = r"\$¢£€¥฿৳"
|
||||
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
from ...symbols import NORM, ORTH
|
||||
from ...util import update_exc
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, NORM
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
||||
|
|
|
@@ -1,16 +0,0 @@
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS


class TibetanDefaults(BaseDefaults):
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS


class Tibetan(Language):
    lang = "bo"
    Defaults = TibetanDefaults


__all__ = ["Tibetan"]
Some files were not shown because too many files have changed in this diff.