mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-02 11:20:19 +03:00
Merge branch 'master' into feature/visualisation
This commit is contained in:
commit
fd66bce8c1
4
.github/ISSUE_TEMPLATE/01_bugs.md
vendored
4
.github/ISSUE_TEMPLATE/01_bugs.md
vendored
|
@ -4,11 +4,13 @@ about: Use this template if you came across a bug or unexpected behaviour differ
|
|||
|
||||
---
|
||||
|
||||
<!-- NOTE: For questions or install related issues, please open a Discussion instead. -->
|
||||
|
||||
## How to reproduce the behaviour
|
||||
<!-- Include a code example or the steps that led to the problem. Please try to be as specific as possible. -->
|
||||
|
||||
## Your Environment
|
||||
<!-- Include details of your environment. If you're using spaCy 1.7+, you can also type `python -m spacy info --markdown` and copy-paste the result here.-->
|
||||
<!-- Include details of your environment. You can also type `python -m spacy info --markdown` and copy-paste the result here.-->
|
||||
* Operating System:
|
||||
* Python Version Used:
|
||||
* spaCy Version Used:
|
||||
|
|
3
.github/ISSUE_TEMPLATE/config.yml
vendored
3
.github/ISSUE_TEMPLATE/config.yml
vendored
|
@ -1,8 +1,5 @@
|
|||
blank_issues_enabled: false
|
||||
contact_links:
|
||||
- name: ⚠️ Python 3.10 Support
|
||||
url: https://github.com/explosion/spaCy/discussions/9418
|
||||
about: Python 3.10 wheels haven't been released yet, see the link for details.
|
||||
- name: 🗯 Discussions Forum
|
||||
url: https://github.com/explosion/spaCy/discussions
|
||||
about: Install issues, usage questions, general discussion and anything else that isn't a bug report.
|
||||
|
|
78
.github/azure-steps.yml
vendored
78
.github/azure-steps.yml
vendored
|
@ -1,68 +1,56 @@
|
|||
parameters:
|
||||
python_version: ''
|
||||
architecture: ''
|
||||
prefix: ''
|
||||
gpu: false
|
||||
num_build_jobs: 1
|
||||
architecture: 'x64'
|
||||
num_build_jobs: 2
|
||||
|
||||
steps:
|
||||
- task: UsePythonVersion@0
|
||||
inputs:
|
||||
versionSpec: ${{ parameters.python_version }}
|
||||
architecture: ${{ parameters.architecture }}
|
||||
allowUnstable: true
|
||||
|
||||
- bash: |
|
||||
echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
|
||||
displayName: 'Set variables'
|
||||
|
||||
- script: |
|
||||
${{ parameters.prefix }} python -m pip install -U pip setuptools
|
||||
${{ parameters.prefix }} python -m pip install -U -r requirements.txt
|
||||
python -m pip install -U build pip setuptools
|
||||
python -m pip install -U -r requirements.txt
|
||||
displayName: "Install dependencies"
|
||||
|
||||
- script: |
|
||||
${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
|
||||
${{ parameters.prefix }} python setup.py sdist --formats=gztar
|
||||
displayName: "Compile and build sdist"
|
||||
python -m build --sdist
|
||||
displayName: "Build sdist"
|
||||
|
||||
- script: python -m mypy spacy
|
||||
- script: |
|
||||
python -m mypy spacy
|
||||
displayName: 'Run mypy'
|
||||
condition: ne(variables['python_version'], '3.10')
|
||||
condition: ne(variables['python_version'], '3.6')
|
||||
|
||||
- task: DeleteFiles@1
|
||||
inputs:
|
||||
contents: "spacy"
|
||||
displayName: "Delete source directory"
|
||||
|
||||
- task: DeleteFiles@1
|
||||
inputs:
|
||||
contents: "*.egg-info"
|
||||
displayName: "Delete egg-info directory"
|
||||
|
||||
- script: |
|
||||
${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
|
||||
${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
|
||||
python -m pip freeze > installed.txt
|
||||
python -m pip uninstall -y -r installed.txt
|
||||
displayName: "Uninstall all packages"
|
||||
|
||||
- bash: |
|
||||
${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
|
||||
${{ parameters.prefix }} python -m pip install dist/$SDIST
|
||||
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
|
||||
SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
|
||||
displayName: "Install from sdist"
|
||||
|
||||
- script: |
|
||||
${{ parameters.prefix }} python -m pip install -U -r requirements.txt
|
||||
displayName: "Install test requirements"
|
||||
|
||||
- script: |
|
||||
${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
|
||||
${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
|
||||
displayName: "Install GPU requirements"
|
||||
condition: eq(${{ parameters.gpu }}, true)
|
||||
|
||||
- script: |
|
||||
${{ parameters.prefix }} python -m pytest --pyargs spacy
|
||||
displayName: "Run CPU tests"
|
||||
condition: eq(${{ parameters.gpu }}, false)
|
||||
|
||||
- script: |
|
||||
${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
|
||||
displayName: "Run GPU tests"
|
||||
condition: eq(${{ parameters.gpu }}, true)
|
||||
python -W error -c "import spacy"
|
||||
displayName: "Test import"
|
||||
|
||||
- script: |
|
||||
python -m spacy download ca_core_news_sm
|
||||
|
@ -71,6 +59,11 @@ steps:
|
|||
displayName: 'Test download CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
||||
displayName: 'Test no warnings on load (#11713)'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
|
||||
displayName: 'Test convert CLI'
|
||||
|
@ -105,13 +98,22 @@ steps:
|
|||
displayName: 'Test assemble CLI vectors warning'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -m pip install -U -r requirements.txt
|
||||
displayName: "Install test requirements"
|
||||
|
||||
- script: |
|
||||
python -m pytest --pyargs spacy -W error
|
||||
displayName: "Run CPU tests"
|
||||
|
||||
- script: |
|
||||
python -m pip install 'spacy[apple]'
|
||||
python -m pytest --pyargs spacy
|
||||
displayName: "Run CPU tests with thinc-apple-ops"
|
||||
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
|
||||
|
||||
- script: |
|
||||
python .github/validate_universe_json.py website/meta/universe.json
|
||||
displayName: 'Test website/meta/universe.json'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
${{ parameters.prefix }} python -m pip install thinc-apple-ops
|
||||
${{ parameters.prefix }} python -m pytest --pyargs spacy
|
||||
displayName: "Run CPU tests with thinc-apple-ops"
|
||||
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.9'))
|
||||
|
|
106
.github/contributors/Lucaterre.md
vendored
Normal file
106
.github/contributors/Lucaterre.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
|||
# spaCy contributor agreement
|
||||
|
||||
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||
The SCA applies to any contribution that you make to any product or project
|
||||
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||
**"you"** shall mean the person or entity identified below.
|
||||
|
||||
If you agree to be bound by these terms, fill in the information requested
|
||||
below and include the filled-in version with your first pull request, under the
|
||||
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||
should be your GitHub username, with the extension `.md`. For example, the user
|
||||
example_user would create the file `.github/contributors/example_user.md`.
|
||||
|
||||
Read this agreement carefully before signing. These terms and conditions
|
||||
constitute a binding legal agreement.
|
||||
|
||||
## Contributor Agreement
|
||||
|
||||
1. The term "contribution" or "contributed materials" means any source code,
|
||||
object code, patch, tool, sample, graphic, specification, manual,
|
||||
documentation, or any other material posted or submitted by you to the project.
|
||||
|
||||
2. With respect to any worldwide copyrights, or copyright applications and
|
||||
registrations, in your contribution:
|
||||
|
||||
* you hereby assign to us joint ownership, and to the extent that such
|
||||
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||
royalty-free, unrestricted license to exercise all rights under those
|
||||
copyrights. This includes, at our option, the right to sublicense these same
|
||||
rights to third parties through multiple levels of sublicensees or other
|
||||
licensing arrangements;
|
||||
|
||||
* you agree that each of us can do all things in relation to your
|
||||
contribution as if each of us were the sole owners, and if one of us makes
|
||||
a derivative work of your contribution, the one who makes the derivative
|
||||
work (or has it made will be the sole owner of that derivative work;
|
||||
|
||||
* you agree that you will not assert any moral rights in your contribution
|
||||
against us, our licensees or transferees;
|
||||
|
||||
* you agree that we may register a copyright in your contribution and
|
||||
exercise all ownership rights associated with it; and
|
||||
|
||||
* you agree that neither of us has any duty to consult with, obtain the
|
||||
consent of, pay or render an accounting to the other for any use or
|
||||
distribution of your contribution.
|
||||
|
||||
3. With respect to any patents you own, or that you can license without payment
|
||||
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||
|
||||
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||
your contribution in whole or in part, alone or in combination with or
|
||||
included in any product, work or materials arising out of the project to
|
||||
which your contribution was submitted, and
|
||||
|
||||
* at our option, to sublicense these same rights to third parties through
|
||||
multiple levels of sublicensees or other licensing arrangements.
|
||||
|
||||
4. Except as set out above, you keep all right, title, and interest in your
|
||||
contribution. The rights that you grant to us under these terms are effective
|
||||
on the date you first submitted a contribution to us, even if your submission
|
||||
took place before the date you sign these terms.
|
||||
|
||||
5. You covenant, represent, warrant and agree that:
|
||||
|
||||
* Each contribution that you submit is and shall be an original work of
|
||||
authorship and you can legally grant the rights set out in this SCA;
|
||||
|
||||
* to the best of your knowledge, each contribution will not violate any
|
||||
third party's copyrights, trademarks, patents, or other intellectual
|
||||
property rights; and
|
||||
|
||||
* each contribution shall be in compliance with U.S. export control laws and
|
||||
other applicable export and import laws. You agree to notify us if you
|
||||
become aware of any circumstance which would make any of the foregoing
|
||||
representations inaccurate in any respect. We may publicly disclose your
|
||||
participation in the project, including the fact that you have signed the SCA.
|
||||
|
||||
6. This SCA is governed by the laws of the State of California and applicable
|
||||
U.S. Federal law. Any choice of law rules will not apply.
|
||||
|
||||
7. Please place an “x” on one of the applicable statement below. Please do NOT
|
||||
mark both statements:
|
||||
|
||||
* [x] I am signing on behalf of myself as an individual and no other person
|
||||
or entity, including my employer, has or will have rights with respect to my
|
||||
contributions.
|
||||
|
||||
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||
actual authority to contractually bind that entity.
|
||||
|
||||
## Contributor Details
|
||||
|
||||
| Field | Entry |
|
||||
|------------------------------- |---------------|
|
||||
| Name | Lucas Terriel |
|
||||
| Company name (if applicable) | |
|
||||
| Title or role (if applicable) | |
|
||||
| Date | 2022-06-20 |
|
||||
| GitHub username | Lucaterre |
|
||||
| Website (optional) | |
|
106
.github/contributors/fonfonx.md
vendored
Normal file
106
.github/contributors/fonfonx.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
|||
# spaCy contributor agreement
|
||||
|
||||
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||
The SCA applies to any contribution that you make to any product or project
|
||||
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||
**"you"** shall mean the person or entity identified below.
|
||||
|
||||
If you agree to be bound by these terms, fill in the information requested
|
||||
below and include the filled-in version with your first pull request, under the
|
||||
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||
should be your GitHub username, with the extension `.md`. For example, the user
|
||||
example_user would create the file `.github/contributors/example_user.md`.
|
||||
|
||||
Read this agreement carefully before signing. These terms and conditions
|
||||
constitute a binding legal agreement.
|
||||
|
||||
## Contributor Agreement
|
||||
|
||||
1. The term "contribution" or "contributed materials" means any source code,
|
||||
object code, patch, tool, sample, graphic, specification, manual,
|
||||
documentation, or any other material posted or submitted by you to the project.
|
||||
|
||||
2. With respect to any worldwide copyrights, or copyright applications and
|
||||
registrations, in your contribution:
|
||||
|
||||
* you hereby assign to us joint ownership, and to the extent that such
|
||||
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||
royalty-free, unrestricted license to exercise all rights under those
|
||||
copyrights. This includes, at our option, the right to sublicense these same
|
||||
rights to third parties through multiple levels of sublicensees or other
|
||||
licensing arrangements;
|
||||
|
||||
* you agree that each of us can do all things in relation to your
|
||||
contribution as if each of us were the sole owners, and if one of us makes
|
||||
a derivative work of your contribution, the one who makes the derivative
|
||||
work (or has it made will be the sole owner of that derivative work;
|
||||
|
||||
* you agree that you will not assert any moral rights in your contribution
|
||||
against us, our licensees or transferees;
|
||||
|
||||
* you agree that we may register a copyright in your contribution and
|
||||
exercise all ownership rights associated with it; and
|
||||
|
||||
* you agree that neither of us has any duty to consult with, obtain the
|
||||
consent of, pay or render an accounting to the other for any use or
|
||||
distribution of your contribution.
|
||||
|
||||
3. With respect to any patents you own, or that you can license without payment
|
||||
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||
|
||||
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||
your contribution in whole or in part, alone or in combination with or
|
||||
included in any product, work or materials arising out of the project to
|
||||
which your contribution was submitted, and
|
||||
|
||||
* at our option, to sublicense these same rights to third parties through
|
||||
multiple levels of sublicensees or other licensing arrangements.
|
||||
|
||||
4. Except as set out above, you keep all right, title, and interest in your
|
||||
contribution. The rights that you grant to us under these terms are effective
|
||||
on the date you first submitted a contribution to us, even if your submission
|
||||
took place before the date you sign these terms.
|
||||
|
||||
5. You covenant, represent, warrant and agree that:
|
||||
|
||||
* Each contribution that you submit is and shall be an original work of
|
||||
authorship and you can legally grant the rights set out in this SCA;
|
||||
|
||||
* to the best of your knowledge, each contribution will not violate any
|
||||
third party's copyrights, trademarks, patents, or other intellectual
|
||||
property rights; and
|
||||
|
||||
* each contribution shall be in compliance with U.S. export control laws and
|
||||
other applicable export and import laws. You agree to notify us if you
|
||||
become aware of any circumstance which would make any of the foregoing
|
||||
representations inaccurate in any respect. We may publicly disclose your
|
||||
participation in the project, including the fact that you have signed the SCA.
|
||||
|
||||
6. This SCA is governed by the laws of the State of California and applicable
|
||||
U.S. Federal law. Any choice of law rules will not apply.
|
||||
|
||||
7. Please place an “x” on one of the applicable statement below. Please do NOT
|
||||
mark both statements:
|
||||
|
||||
* [x] I am signing on behalf of myself as an individual and no other person
|
||||
or entity, including my employer, has or will have rights with respect to my
|
||||
contributions.
|
||||
|
||||
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||
actual authority to contractually bind that entity.
|
||||
|
||||
## Contributor Details
|
||||
|
||||
| Field | Entry |
|
||||
|------------------------------- | -------------------- |
|
||||
| Name | Xavier Fontaine |
|
||||
| Company name (if applicable) | |
|
||||
| Title or role (if applicable) | |
|
||||
| Date | 2022-04-13 |
|
||||
| GitHub username | fonfonx |
|
||||
| Website (optional) | |
|
13
.github/no-response.yml
vendored
13
.github/no-response.yml
vendored
|
@ -1,13 +0,0 @@
|
|||
# Configuration for probot-no-response - https://github.com/probot/no-response
|
||||
|
||||
# Number of days of inactivity before an Issue is closed for lack of response
|
||||
daysUntilClose: 14
|
||||
# Label requiring a response
|
||||
responseRequiredLabel: more-info-needed
|
||||
# Comment to post when closing an Issue for lack of response. Set to `false` to disable
|
||||
closeComment: >
|
||||
This issue has been automatically closed because there has been no response
|
||||
to a request for more information from the original author. With only the
|
||||
information that is currently in the issue, there's not enough information
|
||||
to take action. If you're the original author, feel free to reopen the issue
|
||||
if you have or find the answers needed to investigate further.
|
67
.github/spacy_universe_alert.py
vendored
Normal file
67
.github/spacy_universe_alert.py
vendored
Normal file
|
@ -0,0 +1,67 @@
|
|||
import os
|
||||
import sys
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
from slack_sdk.web.client import WebClient
|
||||
|
||||
CHANNEL = "#alerts-universe"
|
||||
SLACK_TOKEN = os.environ.get("SLACK_BOT_TOKEN", "ENV VAR not available!")
|
||||
DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
|
||||
|
||||
client = WebClient(SLACK_TOKEN)
|
||||
github_context = json.loads(sys.argv[1])
|
||||
|
||||
event = github_context['event']
|
||||
pr_title = event['pull_request']["title"]
|
||||
pr_link = event['pull_request']["patch_url"].replace(".patch", "")
|
||||
pr_author_url = event['sender']["html_url"]
|
||||
pr_author_name = pr_author_url.rsplit('/')[-1]
|
||||
pr_created_at_dt = datetime.strptime(
|
||||
event['pull_request']["created_at"],
|
||||
DATETIME_FORMAT
|
||||
)
|
||||
pr_created_at = pr_created_at_dt.strftime("%c")
|
||||
pr_updated_at_dt = datetime.strptime(
|
||||
event['pull_request']["updated_at"],
|
||||
DATETIME_FORMAT
|
||||
)
|
||||
pr_updated_at = pr_updated_at_dt.strftime("%c")
|
||||
|
||||
blocks = [
|
||||
{
|
||||
"type": "section",
|
||||
"text": {
|
||||
"type": "mrkdwn",
|
||||
"text": "📣 New spaCy Universe Project Alert ✨"
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "section",
|
||||
"fields": [
|
||||
{
|
||||
"type": "mrkdwn",
|
||||
"text": f"*Pull Request:*\n<{pr_link}|{pr_title}>"
|
||||
},
|
||||
{
|
||||
"type": "mrkdwn",
|
||||
"text": f"*Author:*\n<{pr_author_url}|{pr_author_name}>"
|
||||
},
|
||||
{
|
||||
"type": "mrkdwn",
|
||||
"text": f"*Created at:*\n {pr_created_at}"
|
||||
},
|
||||
{
|
||||
"type": "mrkdwn",
|
||||
"text": f"*Last Updated:*\n {pr_updated_at}"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
client.chat_postMessage(
|
||||
channel=CHANNEL,
|
||||
text="spaCy universe project PR alert",
|
||||
blocks=blocks
|
||||
)
|
9
.github/workflows/autoblack.yml
vendored
9
.github/workflows/autoblack.yml
vendored
|
@ -12,10 +12,10 @@ jobs:
|
|||
if: github.repository_owner == 'explosion'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ github.head_ref }}
|
||||
- uses: actions/setup-python@v2
|
||||
- uses: actions/setup-python@v4
|
||||
- run: pip install black
|
||||
- name: Auto-format code if needed
|
||||
run: black spacy
|
||||
|
@ -23,10 +23,11 @@ jobs:
|
|||
# code and makes GitHub think the action failed
|
||||
- name: Check for modified files
|
||||
id: git-check
|
||||
run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi)
|
||||
run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Create Pull Request
|
||||
if: steps.git-check.outputs.modified == 'true'
|
||||
uses: peter-evans/create-pull-request@v3
|
||||
uses: peter-evans/create-pull-request@v4
|
||||
with:
|
||||
title: Auto-format code with black
|
||||
labels: meta
|
||||
|
|
8
.github/workflows/explosionbot.yml
vendored
8
.github/workflows/explosionbot.yml
vendored
|
@ -8,14 +8,14 @@ on:
|
|||
|
||||
jobs:
|
||||
explosion-bot:
|
||||
runs-on: ubuntu-18.04
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Dump GitHub context
|
||||
env:
|
||||
GITHUB_CONTEXT: ${{ toJson(github) }}
|
||||
run: echo "$GITHUB_CONTEXT"
|
||||
- uses: actions/checkout@v1
|
||||
- uses: actions/setup-python@v1
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/setup-python@v4
|
||||
- name: Install and run explosion-bot
|
||||
run: |
|
||||
pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot
|
||||
|
@ -23,5 +23,5 @@ jobs:
|
|||
env:
|
||||
INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }}
|
||||
INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
|
||||
ENABLED_COMMANDS: "test_gpu,test_slow"
|
||||
ENABLED_COMMANDS: "test_gpu,test_slow,test_slow_gpu"
|
||||
ALLOWED_TEAMS: "spaCy"
|
||||
|
|
1
.github/workflows/gputests.yml
vendored
1
.github/workflows/gputests.yml
vendored
|
@ -10,6 +10,7 @@ jobs:
|
|||
fail-fast: false
|
||||
matrix:
|
||||
branch: [master, v4]
|
||||
if: github.repository_owner == 'explosion'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Trigger buildkite build
|
||||
|
|
8
.github/workflows/issue-manager.yml
vendored
8
.github/workflows/issue-manager.yml
vendored
|
@ -15,7 +15,7 @@ jobs:
|
|||
issue-manager:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: tiangolo/issue-manager@0.2.1
|
||||
- uses: tiangolo/issue-manager@0.4.0
|
||||
with:
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
config: >
|
||||
|
@ -25,5 +25,11 @@ jobs:
|
|||
"message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.",
|
||||
"remove_label_on_comment": true,
|
||||
"remove_label_on_close": true
|
||||
},
|
||||
"more-info-needed": {
|
||||
"delay": "P7D",
|
||||
"message": "This issue has been automatically closed because there has been no response to a request for more information from the original author. With only the information that is currently in the issue, there's not enough information to take action. If you're the original author, feel free to reopen the issue if you have or find the answers needed to investigate further.",
|
||||
"remove_label_on_comment": true,
|
||||
"remove_label_on_close": true
|
||||
}
|
||||
}
|
||||
|
|
8
.github/workflows/lock.yml
vendored
8
.github/workflows/lock.yml
vendored
|
@ -15,11 +15,11 @@ jobs:
|
|||
action:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: dessant/lock-threads@v3
|
||||
- uses: dessant/lock-threads@v4
|
||||
with:
|
||||
process-only: 'issues'
|
||||
issue-inactive-days: '30'
|
||||
issue-comment: >
|
||||
This thread has been automatically locked since there
|
||||
has not been any recent activity after it was closed.
|
||||
issue-comment: >
|
||||
This thread has been automatically locked since there
|
||||
has not been any recent activity after it was closed.
|
||||
Please open a new issue for related bugs.
|
||||
|
|
7
.github/workflows/slowtests.yml
vendored
7
.github/workflows/slowtests.yml
vendored
|
@ -10,10 +10,11 @@ jobs:
|
|||
fail-fast: false
|
||||
matrix:
|
||||
branch: [master, v4]
|
||||
if: github.repository_owner == 'explosion'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ matrix.branch }}
|
||||
- name: Get commits from past 24 hours
|
||||
|
@ -22,9 +23,9 @@ jobs:
|
|||
today=$(date '+%Y-%m-%d %H:%M:%S')
|
||||
yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
|
||||
if git log --after="$yesterday" --before="$today" | grep commit ; then
|
||||
echo "::set-output name=run_tests::true"
|
||||
echo run_tests=true >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "::set-output name=run_tests::false"
|
||||
echo run_tests=false >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Trigger buildkite build
|
||||
|
|
32
.github/workflows/spacy_universe_alert.yml
vendored
Normal file
32
.github/workflows/spacy_universe_alert.yml
vendored
Normal file
|
@ -0,0 +1,32 @@
|
|||
name: spaCy universe project alert
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
paths:
|
||||
- "website/meta/universe.json"
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Dump GitHub context
|
||||
env:
|
||||
GITHUB_CONTEXT: ${{ toJson(github) }}
|
||||
PR_NUMBER: ${{github.event.number}}
|
||||
run: |
|
||||
echo "$GITHUB_CONTEXT"
|
||||
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.10'
|
||||
- name: Install Bernadette app dependency and send an alert
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
GITHUB_CONTEXT: ${{ toJson(github) }}
|
||||
CHANNEL: "#alerts-universe"
|
||||
run: |
|
||||
pip install slack-sdk==3.17.2 aiohttp==3.8.1
|
||||
echo "$CHANNEL"
|
||||
python .github/spacy_universe_alert.py "$GITHUB_CONTEXT"
|
11
.gitignore
vendored
11
.gitignore
vendored
|
@ -10,20 +10,11 @@ spacy/tests/package/setup.cfg
|
|||
spacy/tests/package/pyproject.toml
|
||||
spacy/tests/package/requirements.txt
|
||||
|
||||
# Website
|
||||
website/.cache/
|
||||
website/public/
|
||||
website/node_modules
|
||||
website/.npm
|
||||
website/logs
|
||||
*.log
|
||||
npm-debug.log*
|
||||
quickstart-training-generator.js
|
||||
|
||||
# Cython / C extensions
|
||||
cythonize.json
|
||||
spacy/*.html
|
||||
*.cpp
|
||||
*.c
|
||||
*.so
|
||||
|
||||
# Vim / VSCode / editors
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
repos:
|
||||
- repo: https://github.com/ambv/black
|
||||
rev: 21.6b0
|
||||
rev: 22.3.0
|
||||
hooks:
|
||||
- id: black
|
||||
language_version: python3.7
|
||||
- repo: https://gitlab.com/pycqa/flake8
|
||||
rev: 3.9.2
|
||||
additional_dependencies: ['click==8.0.4']
|
||||
- repo: https://github.com/pycqa/flake8
|
||||
rev: 5.0.4
|
||||
hooks:
|
||||
- id: flake8
|
||||
args:
|
||||
|
|
|
@ -144,7 +144,7 @@ Changes to `.py` files will be effective immediately.
|
|||
|
||||
When fixing a bug, first create an
|
||||
[issue](https://github.com/explosion/spaCy/issues) if one does not already
|
||||
exist. The description text can be very short – we don't want to make this too
|
||||
exist. The description text can be very short – we don't want to make this too
|
||||
bureaucratic.
|
||||
|
||||
Next, add a test to the relevant file in the
|
||||
|
@ -233,7 +233,7 @@ also want to keep an eye on unused declared variables or repeated
|
|||
(i.e. overwritten) dictionary keys. If your code was formatted with `black`
|
||||
(see above), you shouldn't see any formatting-related warnings.
|
||||
|
||||
The [`.flake8`](.flake8) config defines the configuration we use for this
|
||||
The `flake8` section in [`setup.cfg`](setup.cfg) defines the configuration we use for this
|
||||
codebase. For example, we're not super strict about the line length, and we're
|
||||
excluding very large files like lemmatization and tokenizer exception tables.
|
||||
|
||||
|
@ -271,7 +271,8 @@ except: # noqa: E722
|
|||
|
||||
### Python conventions
|
||||
|
||||
All Python code must be written **compatible with Python 3.6+**.
|
||||
All Python code must be written **compatible with Python 3.6+**. More detailed
|
||||
code conventions can be found in the [developer docs](https://github.com/explosion/spaCy/blob/master/extra/DEVELOPER_DOCS/Code%20Conventions.md).
|
||||
|
||||
#### I/O and handling paths
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml
|
||||
recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml *.hh
|
||||
include LICENSE
|
||||
include README.md
|
||||
include pyproject.toml
|
||||
|
|
10
README.md
10
README.md
|
@ -8,15 +8,15 @@ be used in real products.
|
|||
|
||||
spaCy comes with
|
||||
[pretrained pipelines](https://spacy.io/models) and
|
||||
currently supports tokenization and training for **60+ languages**. It features
|
||||
currently supports tokenization and training for **70+ languages**. It features
|
||||
state-of-the-art speed and **neural network models** for tagging,
|
||||
parsing, **named entity recognition**, **text classification** and more,
|
||||
multi-task learning with pretrained **transformers** like BERT, as well as a
|
||||
production-ready [**training system**](https://spacy.io/usage/training) and easy
|
||||
model packaging, deployment and workflow management. spaCy is commercial
|
||||
open-source software, released under the MIT license.
|
||||
open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
|
||||
|
||||
💫 **Version 3.2 out now!**
|
||||
💫 **Version 3.5 out now!**
|
||||
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
|
||||
|
||||
[](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
|
||||
|
@ -46,6 +46,7 @@ open-source software, released under the MIT license.
|
|||
| 🛠 **[Changelog]** | Changes and version history. |
|
||||
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
|
||||
| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
|
||||
| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |
|
||||
|
||||
[spacy 101]: https://spacy.io/usage/spacy-101
|
||||
[new in v3.0]: https://spacy.io/usage/v3
|
||||
|
@ -59,6 +60,7 @@ open-source software, released under the MIT license.
|
|||
[changelog]: https://spacy.io/usage#changelog
|
||||
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
|
||||
|
||||
|
||||
## 💬 Where to ask questions
|
||||
|
||||
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
|
||||
|
@ -79,7 +81,7 @@ more people can benefit from it.
|
|||
|
||||
## Features
|
||||
|
||||
- Support for **60+ languages**
|
||||
- Support for **70+ languages**
|
||||
- **Trained pipelines** for different languages and tasks
|
||||
- Multi-task learning with pretrained **transformers** like BERT
|
||||
- Support for pretrained **word vectors** and embeddings
|
||||
|
|
|
@ -31,8 +31,8 @@ jobs:
|
|||
inputs:
|
||||
versionSpec: "3.7"
|
||||
- script: |
|
||||
pip install flake8==3.9.2
|
||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
|
||||
pip install flake8==5.0.4
|
||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
||||
displayName: "flake8"
|
||||
|
||||
- job: "Test"
|
||||
|
@ -41,7 +41,7 @@ jobs:
|
|||
matrix:
|
||||
# We're only running one platform per Python version to speed up builds
|
||||
Python36Linux:
|
||||
imageName: "ubuntu-latest"
|
||||
imageName: "ubuntu-20.04"
|
||||
python.version: "3.6"
|
||||
# Python36Windows:
|
||||
# imageName: "windows-latest"
|
||||
|
@ -50,7 +50,7 @@ jobs:
|
|||
# imageName: "macos-latest"
|
||||
# python.version: "3.6"
|
||||
# Python37Linux:
|
||||
# imageName: "ubuntu-latest"
|
||||
# imageName: "ubuntu-20.04"
|
||||
# python.version: "3.7"
|
||||
Python37Windows:
|
||||
imageName: "windows-latest"
|
||||
|
@ -76,15 +76,24 @@ jobs:
|
|||
# Python39Mac:
|
||||
# imageName: "macos-latest"
|
||||
# python.version: "3.9"
|
||||
Python310Linux:
|
||||
imageName: "ubuntu-latest"
|
||||
python.version: "3.10"
|
||||
# Python310Linux:
|
||||
# imageName: "ubuntu-latest"
|
||||
# python.version: "3.10"
|
||||
Python310Windows:
|
||||
imageName: "windows-latest"
|
||||
python.version: "3.10"
|
||||
Python310Mac:
|
||||
imageName: "macos-latest"
|
||||
python.version: "3.10"
|
||||
# Python310Mac:
|
||||
# imageName: "macos-latest"
|
||||
# python.version: "3.10"
|
||||
Python311Linux:
|
||||
imageName: 'ubuntu-latest'
|
||||
python.version: '3.11'
|
||||
Python311Windows:
|
||||
imageName: 'windows-latest'
|
||||
python.version: '3.11'
|
||||
Python311Mac:
|
||||
imageName: 'macos-latest'
|
||||
python.version: '3.11'
|
||||
maxParallel: 4
|
||||
pool:
|
||||
vmImage: $(imageName)
|
||||
|
@ -92,20 +101,3 @@ jobs:
|
|||
- template: .github/azure-steps.yml
|
||||
parameters:
|
||||
python_version: '$(python.version)'
|
||||
architecture: 'x64'
|
||||
|
||||
# - job: "TestGPU"
|
||||
# dependsOn: "Validate"
|
||||
# strategy:
|
||||
# matrix:
|
||||
# Python38LinuxX64_GPU:
|
||||
# python.version: '3.8'
|
||||
# pool:
|
||||
# name: "LinuxX64_GPU"
|
||||
# steps:
|
||||
# - template: .github/azure-steps.yml
|
||||
# parameters:
|
||||
# python_version: '$(python.version)'
|
||||
# architecture: 'x64'
|
||||
# gpu: true
|
||||
# num_build_jobs: 24
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
# build version constraints for use with wheelwright + multibuild
|
||||
numpy==1.15.0; python_version<='3.7'
|
||||
numpy==1.17.3; python_version=='3.8'
|
||||
numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
|
||||
numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
|
||||
numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
|
||||
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
|
||||
numpy==1.19.3; python_version=='3.9'
|
||||
numpy==1.21.3; python_version=='3.10'
|
||||
numpy; python_version>='3.11'
|
||||
numpy==1.23.2; python_version=='3.11'
|
||||
numpy; python_version>='3.12'
|
||||
|
|
|
@ -137,7 +137,7 @@ If any of the TODOs you've added are important and should be fixed soon, you sho
|
|||
|
||||
## Type hints
|
||||
|
||||
We use Python type hints across the `.py` files wherever possible. This makes it easy to understand what a function expects and returns, and modern editors will be able to show this information to you when you call an annotated function. Type hints are not currently used in the `.pyx` (Cython) code, except for definitions of registered functions and component factories, where they're used for config validation.
|
||||
We use Python type hints across the `.py` files wherever possible. This makes it easy to understand what a function expects and returns, and modern editors will be able to show this information to you when you call an annotated function. Type hints are not currently used in the `.pyx` (Cython) code, except for definitions of registered functions and component factories, where they're used for config validation. Ideally when developing, run `mypy spacy` on the code base to inspect any issues.
|
||||
|
||||
If possible, you should always use the more descriptive type hints like `List[str]` or even `List[Any]` instead of only `list`. We also annotate arguments and return types of `Callable` – although, you can simplify this if the type otherwise gets too verbose (e.g. functions that return factories to create callbacks). Remember that `Callable` takes two values: a **list** of the argument type(s) in order, and the return values.
|
||||
|
||||
|
@ -155,6 +155,13 @@ def create_callback(some_arg: bool) -> Callable[[str, int], List[str]]:
|
|||
return callback
|
||||
```
|
||||
|
||||
For typing variables, we prefer the explicit format.
|
||||
|
||||
```diff
|
||||
- var = value # type: Type
|
||||
+ var: Type = value
|
||||
```
|
||||
|
||||
For model architectures, Thinc also provides a collection of [custom types](https://thinc.ai/docs/api-types), including more specific types for arrays and model inputs/outputs. Even outside of static type checking, using these types will make the code a lot easier to read and follow, since it's always clear what array types are expected (and what might go wrong if the output is different from the expected type).
|
||||
|
||||
```python
|
||||
|
@ -184,6 +191,8 @@ def load_model(name: str) -> "Language":
|
|||
...
|
||||
```
|
||||
|
||||
Note that we typically put the `from typing` import statements on the first line(s) of the Python module.
|
||||
|
||||
## Structuring logic
|
||||
|
||||
### Positional and keyword arguments
|
||||
|
@ -268,6 +277,27 @@ If you have to use `try`/`except`, make sure to only include what's **absolutely
|
|||
+ return [v.strip() for v in value.split(",")]
|
||||
```
|
||||
|
||||
### Numeric comparisons
|
||||
|
||||
For numeric comparisons, as a general rule we always use `<` and `>=` and avoid the usage of `<=` and `>`. This is to ensure we consistently
|
||||
apply inclusive lower bounds and exclusive upper bounds, helping to prevent off-by-one errors.
|
||||
|
||||
One exception to this rule is the ternary case. With a chain like
|
||||
|
||||
```python
|
||||
if value >= 0 and value < max:
|
||||
...
|
||||
```
|
||||
|
||||
it's fine to rewrite this to the shorter form
|
||||
|
||||
```python
|
||||
if 0 <= value < max:
|
||||
...
|
||||
```
|
||||
|
||||
even though this requires the usage of the `<=` operator.
|
||||
|
||||
### Iteration and comprehensions
|
||||
|
||||
We generally avoid using built-in functions like `filter` or `map` in favor of list or generator comprehensions.
|
||||
|
@ -444,10 +474,14 @@ spaCy uses the [`pytest`](http://doc.pytest.org/) framework for testing. Tests f
|
|||
|
||||
When adding tests, make sure to use descriptive names and only test for one behavior at a time. Tests should be grouped into modules dedicated to the same type of functionality and some test modules are organized as directories of test files related to the same larger area of the library, e.g. `matcher` or `tokenizer`.
|
||||
|
||||
Regression tests are tests that refer to bugs reported in specific issues. They should live in the relevant module of the test suite, named according to the issue number (e.g., `test_issue1234.py`), and [marked](https://docs.pytest.org/en/6.2.x/example/markers.html#working-with-custom-markers) appropriately (e.g. `@pytest.mark.issue(1234)`). This system allows us to relate tests for specific bugs back to the original reported issue, which is especially useful if we introduce a regression and a previously passing regression tests suddenly fails again. When fixing a bug, it's often useful to create a regression test for it first.
|
||||
Regression tests are tests that refer to bugs reported in specific issues. They should live in the relevant module of the test suite, named according to the issue number (e.g., `test_issue1234.py`), and [marked](https://docs.pytest.org/en/6.2.x/example/markers.html#working-with-custom-markers) appropriately (e.g. `@pytest.mark.issue(1234)`). This system allows us to relate tests for specific bugs back to the original reported issue, which is especially useful if we introduce a regression and a previously passing regression tests suddenly fails again. When fixing a bug, it's often useful to create a regression test for it first.
|
||||
|
||||
The test suite also provides [fixtures](https://github.com/explosion/spaCy/blob/master/spacy/tests/conftest.py) for different language tokenizers that can be used as function arguments of the same name and will be passed in automatically. Those should only be used for tests related to those specific languages. We also have [test utility functions](https://github.com/explosion/spaCy/blob/master/spacy/tests/util.py) for common operations, like creating a temporary file.
|
||||
|
||||
### Testing Cython Code
|
||||
|
||||
If you're developing Cython code (`.pyx` files), those extensions will need to be built before the test runner can test that code - otherwise it's going to run the tests with stale code from the last time the extension was built. You can build the extensions locally with `python setup.py build_ext -i`.
|
||||
|
||||
### Constructing objects and state
|
||||
|
||||
Test functions usually follow the same simple structure: they set up some state, perform the operation you want to test and `assert` conditions that you expect to be true, usually before and after the operation.
|
||||
|
|
56
extra/DEVELOPER_DOCS/ExplosionBot.md
Normal file
56
extra/DEVELOPER_DOCS/ExplosionBot.md
Normal file
|
@ -0,0 +1,56 @@
|
|||
# Explosion-bot
|
||||
|
||||
Explosion-bot is a robot that can be invoked to help with running particular test commands.
|
||||
|
||||
## Permissions
|
||||
|
||||
Only maintainers have permissions to summon explosion-bot. Each of the open source repos that use explosion-bot has its own team(s) of maintainers, and only github users who are members of those teams can successfully run bot commands.
|
||||
|
||||
## Running robot commands
|
||||
|
||||
To summon the robot, write a github comment on the issue/PR you wish to test. The comment must be in the following format:
|
||||
|
||||
```
|
||||
@explosion-bot please test_gpu
|
||||
```
|
||||
|
||||
Some things to note:
|
||||
|
||||
- The `@explosion-bot please` must be the beginning of the command - you cannot add anything in front of this or else the robot won't know how to parse it. Adding anything at the end aside from the test name will also confuse the robot, so keep it simple!
|
||||
- The command name (such as `test_gpu`) must be one of the tests that the bot knows how to run. The available commands are documented in the bot's [workflow config](https://github.com/explosion/spaCy/blob/master/.github/workflows/explosionbot.yml#L26) and must match exactly one of the commands listed there.
|
||||
- The robot can't do multiple things at once, so if you want it to run multiple tests, you'll have to summon it with one comment per test.
|
||||
|
||||
### Examples
|
||||
|
||||
- Execute spaCy slow GPU tests with a custom thinc branch from a spaCy PR:
|
||||
|
||||
```
|
||||
@explosion-bot please test_slow_gpu --thinc-branch <branch_name>
|
||||
```
|
||||
|
||||
`branch_name` can either be a named branch, e.g: `develop`, or an unmerged PR, e.g: `refs/pull/<pr_number>/head`.
|
||||
|
||||
- Execute spaCy Transformers GPU tests from a spaCy PR:
|
||||
|
||||
```
|
||||
@explosion-bot please test_gpu --run-on spacy-transformers --run-on-branch master --spacy-branch current_pr
|
||||
```
|
||||
|
||||
This will launch the GPU pipeline for the `spacy-transformers` repo on its `master` branch, using the current spaCy PR's branch to build spaCy. The name of the repository passed to `--run-on` is case-sensitive, e.g: use `spaCy` instead of `spacy`.
|
||||
|
||||
- General info about supported commands.
|
||||
|
||||
```
|
||||
@explosion-bot please info
|
||||
```
|
||||
|
||||
- Help text for a specific command
|
||||
```
|
||||
@explosion-bot please <command> --help
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
If the robot isn't responding to commands as expected, you can check its logs in the [Github Action](https://github.com/explosion/spaCy/actions/workflows/explosionbot.yml).
|
||||
|
||||
For each command sent to the bot, there should be a run of the `explosion-bot` workflow. In the `Install and run explosion-bot` step, towards the ends of the logs you should see info about the configuration that the bot was run with, as well as any errors that the bot encountered.
|
82
extra/DEVELOPER_DOCS/Satellite Packages.md
Normal file
82
extra/DEVELOPER_DOCS/Satellite Packages.md
Normal file
|
@ -0,0 +1,82 @@
|
|||
# spaCy Satellite Packages
|
||||
|
||||
This is a list of all the active repos relevant to spaCy besides the main one, with short descriptions, history, and current status. Archived repos will not be covered.
|
||||
|
||||
## Always Included in spaCy
|
||||
|
||||
These packages are always pulled in when you install spaCy. Most of them are direct dependencies, but some are transitive dependencies through other packages.
|
||||
|
||||
- [spacy-legacy](https://github.com/explosion/spacy-legacy): When an architecture in spaCy changes enough to get a new version, the old version is frozen and moved to spacy-legacy. This allows us to keep the core library slim while also preserving backwards compatability.
|
||||
- [thinc](https://github.com/explosion/thinc): Thinc is the machine learning library that powers trainable components in spaCy. It wraps backends like Numpy, PyTorch, and Tensorflow to provide a functional interface for specifying architectures.
|
||||
- [catalogue](https://github.com/explosion/catalogue): Small library for adding function registries, like those used for model architectures in spaCy.
|
||||
- [confection](https://github.com/explosion/confection): This library contains the functionality for config parsing that was formerly contained directly in Thinc.
|
||||
- [spacy-loggers](https://github.com/explosion/spacy-loggers): Contains loggers beyond the default logger available in spaCy's core code base. This includes loggers integrated with third-party services, which may differ in release cadence from spaCy itself.
|
||||
- [wasabi](https://github.com/explosion/wasabi): A command line formatting library, used for terminal output in spaCy.
|
||||
- [srsly](https://github.com/explosion/srsly): A wrapper that vendors several serialization libraries for spaCy. Includes parsers for JSON, JSONL, MessagePack, (extended) Pickle, and YAML.
|
||||
- [preshed](https://github.com/explosion/preshed): A Cython library for low-level data structures like hash maps, used for memory efficient data storage.
|
||||
- [cython-blis](https://github.com/explosion/cython-blis): Fast matrix multiplication using BLIS without depending on system libraries. Required by Thinc, rather than spaCy directly.
|
||||
- [murmurhash](https://github.com/explosion/murmurhash): A wrapper library for a C++ murmurhash implementation, used for string IDs in spaCy and preshed.
|
||||
- [cymem](https://github.com/explosion/cymem): A small library for RAII-style memory management in Cython.
|
||||
|
||||
## Optional Extensions for spaCy
|
||||
|
||||
These are repos that can be used by spaCy but aren't part of a default installation. Many of these are wrappers to integrate various kinds of third-party libraries.
|
||||
|
||||
- [spacy-transformers](https://github.com/explosion/spacy-transformers): A wrapper for the [HuggingFace Transformers](https://huggingface.co/docs/transformers/index) library, this handles the extensive conversion necessary to coordinate spaCy's powerful `Doc` representation, training pipeline, and the Transformer embeddings. When released, this was known as `spacy-pytorch-transformers`, but it changed to the current name when HuggingFace update the name of their library as well.
|
||||
- [spacy-huggingface-hub](https://github.com/explosion/spacy-huggingface-hub): This package has a CLI script for uploading a packaged spaCy pipeline (created with `spacy package`) to the [Hugging Face Hub](https://huggingface.co/models).
|
||||
- [spacy-alignments](https://github.com/explosion/spacy-alignments): A wrapper for the tokenizations library (mentioned below) with a modified build system to simplify cross-platform wheel creation. Used in spacy-transformers for aligning spaCy and HuggingFace tokenizations.
|
||||
- [spacy-experimental](https://github.com/explosion/spacy-experimental): Experimental components that are not quite ready for inclusion in the main spaCy library. Usually there are unresolved questions around their APIs, so the experimental library allows us to expose them to the community for feedback before fully integrating them.
|
||||
- [spacy-lookups-data](https://github.com/explosion/spacy-lookups-data): A repository of linguistic data, such as lemmas, that takes up a lot of disk space. Originally created to reduce the size of the spaCy core library. This is mainly useful if you want the data included but aren't using a pretrained pipeline; for the affected languages, the relevant data is included in pretrained pipelines directly.
|
||||
- [coreferee](https://github.com/explosion/coreferee): Coreference resolution for English, French, German and Polish, optimised for limited training data and easily extensible for further languages. Used as a spaCy pipeline component.
|
||||
- [spacy-stanza](https://github.com/explosion/spacy-stanza): This is a wrapper that allows the use of Stanford's Stanza library in spaCy.
|
||||
- [spacy-streamlit](https://github.com/explosion/spacy-streamlit): A wrapper for the Streamlit dashboard building library to help with integrating [displaCy](https://spacy.io/api/top-level/#displacy).
|
||||
- [spacymoji](https://github.com/explosion/spacymoji): A library to add extra support for emoji to spaCy, such as including character names.
|
||||
- [thinc-apple-ops](https://github.com/explosion/thinc-apple-ops): A special backend for OSX that uses Apple's native libraries for improved performance.
|
||||
- [os-signpost](https://github.com/explosion/os-signpost): A Python package that allows you to use the `OSSignposter` API in OSX for performance analysis.
|
||||
- [spacy-ray](https://github.com/explosion/spacy-ray): A wrapper to integrate spaCy with Ray, a distributed training framework. Currently a work in progress.
|
||||
|
||||
## Prodigy
|
||||
|
||||
[Prodigy](https://prodi.gy) is Explosion's easy to use and highly customizable tool for annotating data. Prodigy itself requires a license, but the repos below contain documentation, examples, and editor or notebook integrations.
|
||||
|
||||
- [prodigy-recipes](https://github.com/explosion/prodigy-recipes): Sample recipes for Prodigy, along with notebooks and other examples of usage.
|
||||
- [vscode-prodigy](https://github.com/explosion/vscode-prodigy): A VS Code extension that lets you run Prodigy inside VS Code.
|
||||
- [jupyterlab-prodigy](https://github.com/explosion/jupyterlab-prodigy): An extension for JupyterLab that lets you run Prodigy inside JupyterLab.
|
||||
|
||||
## Independent Tools or Projects
|
||||
|
||||
These are tools that may be related to or use spaCy, but are functional independent projects in their own right as well.
|
||||
|
||||
- [floret](https://github.com/explosion/floret): A modification of fastText to use Bloom Embeddings. Can be used to add vectors with subword features to spaCy, and also works independently in the same manner as fastText.
|
||||
- [sense2vec](https://github.com/explosion/sense2vec): A library to make embeddings of noun phrases or words coupled with their part of speech. This library uses spaCy.
|
||||
- [spacy-vectors-builder](https://github.com/explosion/spacy-vectors-builder): This is a spaCy project that builds vectors using floret and a lot of input text. It handles downloading the input data as well as the actual building of vectors.
|
||||
- [holmes-extractor](https://github.com/explosion/holmes-extractor): Information extraction from English and German texts based on predicate logic. Uses spaCy.
|
||||
- [healthsea](https://github.com/explosion/healthsea): Healthsea is a project to extract information from comments about health supplements. Structurally, it's a self-contained, large spaCy project.
|
||||
- [spacy-pkuseg](https://github.com/explosion/spacy-pkuseg): A fork of the pkuseg Chinese tokenizer. Used for Chinese support in spaCy, but also works independently.
|
||||
- [ml-datasets](https://github.com/explosion/ml-datasets): This repo includes loaders for several standard machine learning datasets, like MNIST or WikiNER, and has historically been used in spaCy example code and documentation.
|
||||
|
||||
## Documentation and Informational Repos
|
||||
|
||||
These repos are used to support the spaCy docs or otherwise present information about spaCy or other Explosion projects.
|
||||
|
||||
- [projects](https://github.com/explosion/projects): The projects repo is used to show detailed examples of spaCy usage. Individual projects can be checked out using the spaCy command line tool, rather than checking out the projects repo directly.
|
||||
- [spacy-course](https://github.com/explosion/spacy-course): Home to the interactive spaCy course for learning about how to use the library and some basic NLP principles.
|
||||
- [spacy-io-binder](https://github.com/explosion/spacy-io-binder): Home to the notebooks used for interactive examples in the documentation.
|
||||
|
||||
## Organizational / Meta
|
||||
|
||||
These repos are used for organizing data around spaCy, but are not something an end user would need to install as part of using the library.
|
||||
|
||||
- [spacy-models](https://github.com/explosion/spacy-models): This repo contains metadata (but not training data) for all the spaCy models. This includes information about where their training data came from, version compatability, and performance information. It also includes tests for the model packages, and the built models are hosted as releases of this repo.
|
||||
- [wheelwright](https://github.com/explosion/wheelwright): A tool for automating our PyPI builds and releases.
|
||||
- [ec2buildwheel](https://github.com/explosion/ec2buildwheel): A small project that allows you to build Python packages in the manner of cibuildwheel, but on any EC2 image. Used by wheelwright.
|
||||
|
||||
## Other
|
||||
|
||||
Repos that don't fit in any of the above categories.
|
||||
|
||||
- [blis](https://github.com/explosion/blis): A fork of the official BLIS library. The main branch is not updated, but work continues in various branches. This is used for cython-blis.
|
||||
- [tokenizations](https://github.com/explosion/tokenizations): A library originally by Yohei Tamura to align strings with tolerance to some variations in features like case and diacritics, used for aligning tokens and wordpieces. Adopted and maintained by Explosion, but usually spacy-alignments is used instead.
|
||||
- [conll-2012](https://github.com/explosion/conll-2012): A repo to hold some slightly cleaned up versions of the official scripts for the CoNLL 2012 shared task involving coreference resolution. Used in the coref project.
|
||||
- [fastapi-explosion-extras](https://github.com/explosion/fastapi-explosion-extras): Some small tweaks to FastAPI used at Explosion.
|
||||
|
|
@ -127,3 +127,34 @@ distributed under the License is distributed on an "AS IS" BASIS,
|
|||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
||||
|
||||
polyleven
|
||||
---------
|
||||
|
||||
* Files: spacy/matcher/polyleven.c
|
||||
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2021 Fujimoto Seiji <fujimoto@ceptord.net>
|
||||
Copyright (c) 2021 Max Bachmann <kontakt@maxbachmann.de>
|
||||
Copyright (c) 2022 Nick Mazuk
|
||||
Copyright (c) 2022 Michael Weiss <code@mweiss.ch>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
|
|
@ -5,9 +5,7 @@ requires = [
|
|||
"cymem>=2.0.2,<2.1.0",
|
||||
"preshed>=3.0.2,<3.1.0",
|
||||
"murmurhash>=0.28.0,<1.1.0",
|
||||
"thinc>=8.0.12,<8.1.0",
|
||||
"blis>=0.4.0,<0.8.0",
|
||||
"pathy",
|
||||
"thinc>=8.1.0,<8.2.0",
|
||||
"numpy>=1.15.0",
|
||||
]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
|
|
@ -1,38 +1,40 @@
|
|||
# Our libraries
|
||||
spacy-legacy>=3.0.8,<3.1.0
|
||||
spacy-legacy>=3.0.11,<3.1.0
|
||||
spacy-loggers>=1.0.0,<2.0.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.0.12,<8.1.0
|
||||
blis>=0.4.0,<0.8.0
|
||||
thinc>=8.1.0,<8.2.0
|
||||
ml_datasets>=0.2.0,<0.3.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
wasabi>=0.9.0,<1.1.0
|
||||
srsly>=2.4.1,<3.0.0
|
||||
wasabi>=0.9.1,<1.2.0
|
||||
srsly>=2.4.3,<3.0.0
|
||||
catalogue>=2.0.6,<2.1.0
|
||||
typer>=0.3.0,<0.5.0
|
||||
pathy>=0.3.5
|
||||
typer>=0.3.0,<0.8.0
|
||||
pathy>=0.10.0
|
||||
smart-open>=5.2.1,<7.0.0
|
||||
# Third party dependencies
|
||||
numpy>=1.15.0
|
||||
requests>=2.13.0,<3.0.0
|
||||
tqdm>=4.38.0,<5.0.0
|
||||
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
|
||||
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
|
||||
jinja2
|
||||
langcodes>=3.2.0,<4.0.0
|
||||
# Official Python utilities
|
||||
setuptools
|
||||
packaging>=20.0
|
||||
typing_extensions>=3.7.4.1,<4.0.0.0; python_version < "3.8"
|
||||
typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
|
||||
# Development dependencies
|
||||
pre-commit>=2.13.0
|
||||
cython>=0.25,<3.0
|
||||
pytest>=5.2.0
|
||||
pytest>=5.2.0,!=7.1.0
|
||||
pytest-timeout>=1.3.0,<2.0.0
|
||||
mock>=2.0.0,<3.0.0
|
||||
flake8>=3.8.0,<3.10.0
|
||||
flake8>=3.8.0,<6.0.0
|
||||
hypothesis>=3.27.0,<7.0.0
|
||||
mypy==0.910
|
||||
mypy>=0.990,<0.1000; platform_machine != "aarch64" and python_version >= "3.7"
|
||||
types-dataclasses>=0.1.3; python_version < "3.7"
|
||||
types-mock>=0.1.1
|
||||
types-setuptools>=57.0.0
|
||||
types-requests
|
||||
types-setuptools>=57.0.0
|
||||
black>=22.0,<23.0
|
||||
|
|
63
setup.cfg
63
setup.cfg
|
@ -22,6 +22,7 @@ classifiers =
|
|||
Programming Language :: Python :: 3.8
|
||||
Programming Language :: Python :: 3.9
|
||||
Programming Language :: Python :: 3.10
|
||||
Programming Language :: Python :: 3.11
|
||||
Topic :: Scientific/Engineering
|
||||
project_urls =
|
||||
Release notes = https://github.com/explosion/spaCy/releases
|
||||
|
@ -38,31 +39,31 @@ setup_requires =
|
|||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
thinc>=8.0.12,<8.1.0
|
||||
thinc>=8.1.0,<8.2.0
|
||||
install_requires =
|
||||
# Our libraries
|
||||
spacy-legacy>=3.0.8,<3.1.0
|
||||
spacy-legacy>=3.0.11,<3.1.0
|
||||
spacy-loggers>=1.0.0,<2.0.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.0.12,<8.1.0
|
||||
blis>=0.4.0,<0.8.0
|
||||
wasabi>=0.9.0,<1.1.0
|
||||
srsly>=2.4.1,<3.0.0
|
||||
thinc>=8.1.0,<8.2.0
|
||||
wasabi>=0.9.1,<1.2.0
|
||||
srsly>=2.4.3,<3.0.0
|
||||
catalogue>=2.0.6,<2.1.0
|
||||
typer>=0.3.0,<0.5.0
|
||||
pathy>=0.3.5
|
||||
# Third-party dependencies
|
||||
typer>=0.3.0,<0.8.0
|
||||
pathy>=0.10.0
|
||||
smart-open>=5.2.1,<7.0.0
|
||||
tqdm>=4.38.0,<5.0.0
|
||||
numpy>=1.15.0
|
||||
requests>=2.13.0,<3.0.0
|
||||
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
|
||||
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
|
||||
jinja2
|
||||
# Official Python utilities
|
||||
setuptools
|
||||
packaging>=20.0
|
||||
typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
|
||||
typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
|
||||
langcodes>=3.2.0,<4.0.0
|
||||
|
||||
[options.entry_points]
|
||||
|
@ -73,45 +74,53 @@ console_scripts =
|
|||
lookups =
|
||||
spacy_lookups_data>=1.0.3,<1.1.0
|
||||
transformers =
|
||||
spacy_transformers>=1.1.2,<1.2.0
|
||||
spacy_transformers>=1.1.2,<1.3.0
|
||||
ray =
|
||||
spacy_ray>=0.1.0,<1.0.0
|
||||
cuda =
|
||||
cupy>=5.0.0b4,<11.0.0
|
||||
cupy>=5.0.0b4,<12.0.0
|
||||
cuda80 =
|
||||
cupy-cuda80>=5.0.0b4,<11.0.0
|
||||
cupy-cuda80>=5.0.0b4,<12.0.0
|
||||
cuda90 =
|
||||
cupy-cuda90>=5.0.0b4,<11.0.0
|
||||
cupy-cuda90>=5.0.0b4,<12.0.0
|
||||
cuda91 =
|
||||
cupy-cuda91>=5.0.0b4,<11.0.0
|
||||
cupy-cuda91>=5.0.0b4,<12.0.0
|
||||
cuda92 =
|
||||
cupy-cuda92>=5.0.0b4,<11.0.0
|
||||
cupy-cuda92>=5.0.0b4,<12.0.0
|
||||
cuda100 =
|
||||
cupy-cuda100>=5.0.0b4,<11.0.0
|
||||
cupy-cuda100>=5.0.0b4,<12.0.0
|
||||
cuda101 =
|
||||
cupy-cuda101>=5.0.0b4,<11.0.0
|
||||
cupy-cuda101>=5.0.0b4,<12.0.0
|
||||
cuda102 =
|
||||
cupy-cuda102>=5.0.0b4,<11.0.0
|
||||
cupy-cuda102>=5.0.0b4,<12.0.0
|
||||
cuda110 =
|
||||
cupy-cuda110>=5.0.0b4,<11.0.0
|
||||
cupy-cuda110>=5.0.0b4,<12.0.0
|
||||
cuda111 =
|
||||
cupy-cuda111>=5.0.0b4,<11.0.0
|
||||
cupy-cuda111>=5.0.0b4,<12.0.0
|
||||
cuda112 =
|
||||
cupy-cuda112>=5.0.0b4,<11.0.0
|
||||
cupy-cuda112>=5.0.0b4,<12.0.0
|
||||
cuda113 =
|
||||
cupy-cuda113>=5.0.0b4,<11.0.0
|
||||
cupy-cuda113>=5.0.0b4,<12.0.0
|
||||
cuda114 =
|
||||
cupy-cuda114>=5.0.0b4,<11.0.0
|
||||
cupy-cuda114>=5.0.0b4,<12.0.0
|
||||
cuda115 =
|
||||
cupy-cuda115>=5.0.0b4,<11.0.0
|
||||
cupy-cuda115>=5.0.0b4,<12.0.0
|
||||
cuda116 =
|
||||
cupy-cuda116>=5.0.0b4,<12.0.0
|
||||
cuda117 =
|
||||
cupy-cuda117>=5.0.0b4,<12.0.0
|
||||
cuda11x =
|
||||
cupy-cuda11x>=11.0.0,<12.0.0
|
||||
cuda-autodetect =
|
||||
cupy-wheel>=11.0.0,<12.0.0
|
||||
apple =
|
||||
thinc-apple-ops>=0.0.4,<1.0.0
|
||||
thinc-apple-ops>=0.1.0.dev0,<1.0.0
|
||||
# Language tokenizers with external dependencies
|
||||
ja =
|
||||
sudachipy>=0.5.2,!=0.6.1
|
||||
sudachidict_core>=20211220
|
||||
ko =
|
||||
natto-py==0.9.0
|
||||
natto-py>=0.9.0
|
||||
th =
|
||||
pythainlp>=2.0
|
||||
|
||||
|
|
25
setup.py
25
setup.py
|
@ -23,16 +23,20 @@ Options.docstrings = True
|
|||
|
||||
PACKAGES = find_packages()
|
||||
MOD_NAMES = [
|
||||
"spacy.training.alignment_array",
|
||||
"spacy.training.example",
|
||||
"spacy.parts_of_speech",
|
||||
"spacy.strings",
|
||||
"spacy.lexeme",
|
||||
"spacy.vocab",
|
||||
"spacy.attrs",
|
||||
"spacy.kb",
|
||||
"spacy.kb.candidate",
|
||||
"spacy.kb.kb",
|
||||
"spacy.kb.kb_in_memory",
|
||||
"spacy.ml.parser_model",
|
||||
"spacy.morphology",
|
||||
"spacy.pipeline.dep_parser",
|
||||
"spacy.pipeline._edit_tree_internals.edit_trees",
|
||||
"spacy.pipeline.morphologizer",
|
||||
"spacy.pipeline.multitask",
|
||||
"spacy.pipeline.ner",
|
||||
|
@ -124,6 +128,8 @@ class build_ext_options:
|
|||
|
||||
class build_ext_subclass(build_ext, build_ext_options):
|
||||
def build_extensions(self):
|
||||
if self.parallel is None and os.environ.get("SPACY_NUM_BUILD_JOBS") is not None:
|
||||
self.parallel = int(os.environ.get("SPACY_NUM_BUILD_JOBS"))
|
||||
build_ext_options.build_options(self)
|
||||
build_ext.build_extensions(self)
|
||||
|
||||
|
@ -201,10 +207,25 @@ def setup_package():
|
|||
get_python_inc(plat_specific=True),
|
||||
]
|
||||
ext_modules = []
|
||||
ext_modules.append(
|
||||
Extension(
|
||||
"spacy.matcher.levenshtein",
|
||||
[
|
||||
"spacy/matcher/levenshtein.pyx",
|
||||
"spacy/matcher/polyleven.c",
|
||||
],
|
||||
language="c",
|
||||
include_dirs=include_dirs,
|
||||
)
|
||||
)
|
||||
for name in MOD_NAMES:
|
||||
mod_path = name.replace(".", "/") + ".pyx"
|
||||
ext = Extension(
|
||||
name, [mod_path], language="c++", include_dirs=include_dirs, extra_compile_args=["-std=c++11"]
|
||||
name,
|
||||
[mod_path],
|
||||
language="c++",
|
||||
include_dirs=include_dirs,
|
||||
extra_compile_args=["-std=c++11"],
|
||||
)
|
||||
ext_modules.append(ext)
|
||||
print("Cythonizing sources")
|
||||
|
|
|
@ -31,25 +31,33 @@ def load(
|
|||
name: Union[str, Path],
|
||||
*,
|
||||
vocab: Union[Vocab, bool] = True,
|
||||
disable: Iterable[str] = util.SimpleFrozenList(),
|
||||
exclude: Iterable[str] = util.SimpleFrozenList(),
|
||||
disable: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
|
||||
enable: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
|
||||
exclude: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
|
||||
config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
|
||||
) -> Language:
|
||||
"""Load a spaCy model from an installed package or a local path.
|
||||
|
||||
name (str): Package name or model path.
|
||||
vocab (Vocab): A Vocab object. If True, a vocab is created.
|
||||
disable (Iterable[str]): Names of pipeline components to disable. Disabled
|
||||
disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled
|
||||
pipes will be loaded but they won't be run unless you explicitly
|
||||
enable them by calling nlp.enable_pipe.
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
|
||||
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
|
||||
pipes will be disabled (but can be enabled later using nlp.enable_pipe).
|
||||
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded
|
||||
components won't be loaded.
|
||||
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
|
||||
keyed by section values in dot notation.
|
||||
RETURNS (Language): The loaded nlp object.
|
||||
"""
|
||||
return util.load_model(
|
||||
name, vocab=vocab, disable=disable, exclude=exclude, config=config
|
||||
name,
|
||||
vocab=vocab,
|
||||
disable=disable,
|
||||
enable=enable,
|
||||
exclude=exclude,
|
||||
config=config,
|
||||
)
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# fmt: off
|
||||
__title__ = "spacy"
|
||||
__version__ = "3.2.2"
|
||||
__version__ = "3.5.0"
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
__projects__ = "https://github.com/explosion/projects"
|
||||
|
|
|
@ -4,6 +4,7 @@ from ._util import app, setup_cli # noqa: F401
|
|||
|
||||
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
|
||||
# are registered automatically and won't have to be imported here.
|
||||
from .benchmark_speed import benchmark_speed_cli # noqa: F401
|
||||
from .download import download # noqa: F401
|
||||
from .info import info # noqa: F401
|
||||
from .package import package # noqa: F401
|
||||
|
@ -14,7 +15,9 @@ from .pretrain import pretrain # noqa: F401
|
|||
from .debug_data import debug_data # noqa: F401
|
||||
from .debug_config import debug_config # noqa: F401
|
||||
from .debug_model import debug_model # noqa: F401
|
||||
from .debug_diff import debug_diff # noqa: F401
|
||||
from .evaluate import evaluate # noqa: F401
|
||||
from .apply import apply # noqa: F401
|
||||
from .convert import convert # noqa: F401
|
||||
from .init_pipeline import init_pipeline_cli # noqa: F401
|
||||
from .init_config import init_config, fill_config # noqa: F401
|
||||
|
@ -26,6 +29,7 @@ from .project.dvc import project_update_dvc # noqa: F401
|
|||
from .project.push import project_push # noqa: F401
|
||||
from .project.pull import project_pull # noqa: F401
|
||||
from .project.document import project_document # noqa: F401
|
||||
from .find_threshold import find_threshold # noqa: F401
|
||||
|
||||
|
||||
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
||||
|
|
|
@ -12,7 +12,7 @@ from click.parser import split_arg_string
|
|||
from typer.main import get_command
|
||||
from contextlib import contextmanager
|
||||
from thinc.api import Config, ConfigValidationError, require_gpu
|
||||
from thinc.util import has_cupy, gpu_is_available
|
||||
from thinc.util import gpu_is_available
|
||||
from configparser import InterpolationError
|
||||
import os
|
||||
|
||||
|
@ -23,7 +23,7 @@ from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
|
|||
from .. import about
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathy import Pathy # noqa: F401
|
||||
from pathy import FluidPath # noqa: F401
|
||||
|
||||
|
||||
SDIST_SUFFIX = ".tar.gz"
|
||||
|
@ -46,6 +46,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
|
|||
commands to check and validate your config files, training and evaluation data,
|
||||
and custom model implementations.
|
||||
"""
|
||||
BENCHMARK_HELP = """Commands for benchmarking pipelines."""
|
||||
INIT_HELP = """Commands for initializing configs and pipeline packages."""
|
||||
|
||||
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
||||
|
@ -54,12 +55,14 @@ Arg = typer.Argument
|
|||
Opt = typer.Option
|
||||
|
||||
app = typer.Typer(name=NAME, help=HELP)
|
||||
benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
|
||||
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
|
||||
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
|
||||
init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
|
||||
|
||||
app.add_typer(project_cli)
|
||||
app.add_typer(debug_cli)
|
||||
app.add_typer(benchmark_cli)
|
||||
app.add_typer(init_cli)
|
||||
|
||||
|
||||
|
@ -158,15 +161,15 @@ def load_project_config(
|
|||
sys.exit(1)
|
||||
validate_project_version(config)
|
||||
validate_project_commands(config)
|
||||
if interpolate:
|
||||
err = f"{PROJECT_FILE} validation error"
|
||||
with show_validation_error(title=err, hint_fill=False):
|
||||
config = substitute_project_variables(config, overrides)
|
||||
# Make sure directories defined in config exist
|
||||
for subdir in config.get("directories", []):
|
||||
dir_path = path / subdir
|
||||
if not dir_path.exists():
|
||||
dir_path.mkdir(parents=True)
|
||||
if interpolate:
|
||||
err = f"{PROJECT_FILE} validation error"
|
||||
with show_validation_error(title=err, hint_fill=False):
|
||||
config = substitute_project_variables(config, overrides)
|
||||
return config
|
||||
|
||||
|
||||
|
@ -331,7 +334,7 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
|
|||
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
|
||||
|
||||
|
||||
def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
|
||||
def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None:
|
||||
"""Upload a file.
|
||||
|
||||
src (Path): The source path.
|
||||
|
@ -339,13 +342,20 @@ def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
|
|||
"""
|
||||
import smart_open
|
||||
|
||||
# Create parent directories for local paths
|
||||
if isinstance(dest, Path):
|
||||
if not dest.parent.exists():
|
||||
dest.parent.mkdir(parents=True)
|
||||
|
||||
dest = str(dest)
|
||||
with smart_open.open(dest, mode="wb") as output_file:
|
||||
with src.open(mode="rb") as input_file:
|
||||
output_file.write(input_file.read())
|
||||
|
||||
|
||||
def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None:
|
||||
def download_file(
|
||||
src: Union[str, "FluidPath"], dest: Path, *, force: bool = False
|
||||
) -> None:
|
||||
"""Download a file using smart_open.
|
||||
|
||||
url (str): The URL of the file.
|
||||
|
@ -358,9 +368,9 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
|
|||
if dest.exists() and not force:
|
||||
return None
|
||||
src = str(src)
|
||||
with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
|
||||
with smart_open.open(src, mode="rb", compression="disable") as input_file:
|
||||
with dest.open(mode="wb") as output_file:
|
||||
output_file.write(input_file.read())
|
||||
shutil.copyfileobj(input_file, output_file)
|
||||
|
||||
|
||||
def ensure_pathy(path):
|
||||
|
@ -368,7 +378,7 @@ def ensure_pathy(path):
|
|||
slow and annoying Google Cloud warning)."""
|
||||
from pathy import Pathy # noqa: F811
|
||||
|
||||
return Pathy(path)
|
||||
return Pathy.fluid(path)
|
||||
|
||||
|
||||
def git_checkout(
|
||||
|
@ -462,6 +472,23 @@ def git_sparse_checkout(repo, subpath, dest, branch):
|
|||
shutil.move(str(source_path), str(dest))
|
||||
|
||||
|
||||
def git_repo_branch_exists(repo: str, branch: str) -> bool:
|
||||
"""Uses 'git ls-remote' to check if a repository and branch exists
|
||||
|
||||
repo (str): URL to get repo.
|
||||
branch (str): Branch on repo to check.
|
||||
RETURNS (bool): True if repo:branch exists.
|
||||
"""
|
||||
get_git_version()
|
||||
cmd = f"git ls-remote {repo} {branch}"
|
||||
# We might be tempted to use `--exit-code` with `git ls-remote`, but
|
||||
# `run_command` handles the `returncode` for us, so we'll rely on
|
||||
# the fact that stdout returns '' if the requested branch doesn't exist
|
||||
ret = run_command(cmd, capture=True)
|
||||
exists = ret.stdout != ""
|
||||
return exists
|
||||
|
||||
|
||||
def get_git_version(
|
||||
error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
|
||||
) -> Tuple[int, int]:
|
||||
|
@ -554,5 +581,41 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
|
|||
require_gpu(use_gpu)
|
||||
else:
|
||||
local_msg.info("Using CPU")
|
||||
if has_cupy and gpu_is_available():
|
||||
if gpu_is_available():
|
||||
local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
|
||||
|
||||
|
||||
def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
|
||||
"""Given a directory and a suffix, recursively find all files matching the suffix.
|
||||
Directories or files with names beginning with a . are ignored, but hidden flags on
|
||||
filesystems are not checked.
|
||||
When provided with a suffix `None`, there is no suffix-based filtering."""
|
||||
if not path.is_dir():
|
||||
return [path]
|
||||
paths = [path]
|
||||
locs = []
|
||||
seen = set()
|
||||
for path in paths:
|
||||
if str(path) in seen:
|
||||
continue
|
||||
seen.add(str(path))
|
||||
if path.parts[-1].startswith("."):
|
||||
continue
|
||||
elif path.is_dir():
|
||||
paths.extend(path.iterdir())
|
||||
elif suffix is not None and not path.parts[-1].endswith(suffix):
|
||||
continue
|
||||
else:
|
||||
locs.append(path)
|
||||
# It's good to sort these, in case the ordering messes up cache.
|
||||
locs.sort()
|
||||
return locs
|
||||
|
||||
|
||||
def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
|
||||
"""Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s,
|
||||
as happens with `round(number, ndigits)`"""
|
||||
if isinstance(number, float):
|
||||
return f"{number:.{ndigits}f}"
|
||||
else:
|
||||
return str(number)
|
||||
|
|
143
spacy/cli/apply.py
Normal file
143
spacy/cli/apply.py
Normal file
|
@ -0,0 +1,143 @@
|
|||
import tqdm
|
||||
import srsly
|
||||
|
||||
from itertools import chain
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Iterable, cast, Union
|
||||
|
||||
from wasabi import msg
|
||||
|
||||
from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
|
||||
|
||||
from ..tokens import Doc, DocBin
|
||||
from ..vocab import Vocab
|
||||
from ..util import ensure_path, load_model
|
||||
|
||||
|
||||
path_help = """Location of the documents to predict on.
|
||||
Can be a single file in .spacy format or a .jsonl file.
|
||||
Files with other extensions are treated as single plain text documents.
|
||||
If a directory is provided it is traversed recursively to grab
|
||||
all files to be processed.
|
||||
The files can be a mixture of .spacy, .jsonl and text files.
|
||||
If .jsonl is provided the specified field is going
|
||||
to be grabbed ("text" by default)."""
|
||||
|
||||
out_help = "Path to save the resulting .spacy file"
|
||||
code_help = (
|
||||
"Path to Python file with additional " "code (registered functions) to be imported"
|
||||
)
|
||||
gold_help = "Use gold preprocessing provided in the .spacy files"
|
||||
force_msg = (
|
||||
"The provided output file already exists. "
|
||||
"To force overwriting the output file, set the --force or -F flag."
|
||||
)
|
||||
|
||||
|
||||
DocOrStrStream = Union[Iterable[str], Iterable[Doc]]
|
||||
|
||||
|
||||
def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
|
||||
"""
|
||||
Stream Doc objects from DocBin.
|
||||
"""
|
||||
docbin = DocBin().from_disk(path)
|
||||
for doc in docbin.get_docs(vocab):
|
||||
yield doc
|
||||
|
||||
|
||||
def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
|
||||
"""
|
||||
Stream "text" field from JSONL. If the field "text" is
|
||||
not found it raises error.
|
||||
"""
|
||||
for entry in srsly.read_jsonl(path):
|
||||
if field not in entry:
|
||||
msg.fail(f"{path} does not contain the required '{field}' field.", exits=1)
|
||||
else:
|
||||
yield entry[field]
|
||||
|
||||
|
||||
def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
|
||||
"""
|
||||
Yields strings from text files in paths.
|
||||
"""
|
||||
for path in paths:
|
||||
with open(path, "r") as fin:
|
||||
text = fin.read()
|
||||
yield text
|
||||
|
||||
|
||||
@app.command("apply")
|
||||
def apply_cli(
|
||||
# fmt: off
|
||||
model: str = Arg(..., help="Model name or path"),
|
||||
data_path: Path = Arg(..., help=path_help, exists=True),
|
||||
output_file: Path = Arg(..., help=out_help, dir_okay=False),
|
||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
|
||||
text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"),
|
||||
force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
|
||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
|
||||
batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
|
||||
n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use.")
|
||||
):
|
||||
"""
|
||||
Apply a trained pipeline to documents to get predictions.
|
||||
Expects a loadable spaCy pipeline and path to the data, which
|
||||
can be a directory or a file.
|
||||
The data files can be provided in multiple formats:
|
||||
1. .spacy files
|
||||
2. .jsonl files with a specified "field" to read the text from.
|
||||
3. Files with any other extension are assumed to be containing
|
||||
a single document.
|
||||
DOCS: https://spacy.io/api/cli#apply
|
||||
"""
|
||||
data_path = ensure_path(data_path)
|
||||
output_file = ensure_path(output_file)
|
||||
code_path = ensure_path(code_path)
|
||||
if output_file.exists() and not force_overwrite:
|
||||
msg.fail(force_msg, exits=1)
|
||||
if not data_path.exists():
|
||||
msg.fail(f"Couldn't find data path: {data_path}", exits=1)
|
||||
import_code(code_path)
|
||||
setup_gpu(use_gpu)
|
||||
apply(data_path, output_file, model, text_key, batch_size, n_process)
|
||||
|
||||
|
||||
def apply(
|
||||
data_path: Path,
|
||||
output_file: Path,
|
||||
model: str,
|
||||
json_field: str,
|
||||
batch_size: int,
|
||||
n_process: int,
|
||||
):
|
||||
docbin = DocBin(store_user_data=True)
|
||||
paths = walk_directory(data_path)
|
||||
if len(paths) == 0:
|
||||
docbin.to_disk(output_file)
|
||||
msg.warn(
|
||||
"Did not find data to process,"
|
||||
f" {data_path} seems to be an empty directory."
|
||||
)
|
||||
return
|
||||
nlp = load_model(model)
|
||||
msg.good(f"Loaded model {model}")
|
||||
vocab = nlp.vocab
|
||||
streams: List[DocOrStrStream] = []
|
||||
text_files = []
|
||||
for path in paths:
|
||||
if path.suffix == ".spacy":
|
||||
streams.append(_stream_docbin(path, vocab))
|
||||
elif path.suffix == ".jsonl":
|
||||
streams.append(_stream_jsonl(path, json_field))
|
||||
else:
|
||||
text_files.append(path)
|
||||
if len(text_files) > 0:
|
||||
streams.append(_stream_texts(text_files))
|
||||
datagen = cast(DocOrStrStream, chain(*streams))
|
||||
for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
|
||||
docbin.add(doc)
|
||||
if output_file.suffix == "":
|
||||
output_file = output_file.with_suffix(".spacy")
|
||||
docbin.to_disk(output_file)
|
174
spacy/cli/benchmark_speed.py
Normal file
174
spacy/cli/benchmark_speed.py
Normal file
|
@ -0,0 +1,174 @@
|
|||
from typing import Iterable, List, Optional
|
||||
import random
|
||||
from itertools import islice
|
||||
import numpy
|
||||
from pathlib import Path
|
||||
import time
|
||||
from tqdm import tqdm
|
||||
import typer
|
||||
from wasabi import msg
|
||||
|
||||
from .. import util
|
||||
from ..language import Language
|
||||
from ..tokens import Doc
|
||||
from ..training import Corpus
|
||||
from ._util import Arg, Opt, benchmark_cli, setup_gpu
|
||||
|
||||
|
||||
@benchmark_cli.command(
|
||||
"speed",
|
||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||
)
|
||||
def benchmark_speed_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context,
|
||||
model: str = Arg(..., help="Model name or path"),
|
||||
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
|
||||
batch_size: Optional[int] = Opt(None, "--batch-size", "-b", min=1, help="Override the pipeline batch size"),
|
||||
no_shuffle: bool = Opt(False, "--no-shuffle", help="Do not shuffle benchmark data"),
|
||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||
n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
|
||||
warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
|
||||
data in the binary .spacy format.
|
||||
"""
|
||||
setup_gpu(use_gpu=use_gpu, silent=False)
|
||||
|
||||
nlp = util.load_model(model)
|
||||
batch_size = batch_size if batch_size is not None else nlp.batch_size
|
||||
corpus = Corpus(data_path)
|
||||
docs = [eg.predicted for eg in corpus(nlp)]
|
||||
|
||||
if len(docs) == 0:
|
||||
msg.fail("Cannot benchmark speed using an empty corpus.", exits=1)
|
||||
|
||||
print(f"Warming up for {warmup_epochs} epochs...")
|
||||
warmup(nlp, docs, warmup_epochs, batch_size)
|
||||
|
||||
print()
|
||||
print(f"Benchmarking {n_batches} batches...")
|
||||
wps = benchmark(nlp, docs, n_batches, batch_size, not no_shuffle)
|
||||
|
||||
print()
|
||||
print_outliers(wps)
|
||||
print_mean_with_ci(wps)
|
||||
|
||||
|
||||
# Lowercased, behaves as a context manager function.
|
||||
class time_context:
|
||||
"""Register the running time of a context."""
|
||||
|
||||
def __enter__(self):
|
||||
self.start = time.perf_counter()
|
||||
return self
|
||||
|
||||
def __exit__(self, type, value, traceback):
|
||||
self.elapsed = time.perf_counter() - self.start
|
||||
|
||||
|
||||
class Quartiles:
|
||||
"""Calculate the q1, q2, q3 quartiles and the inter-quartile range (iqr)
|
||||
of a sample."""
|
||||
|
||||
q1: float
|
||||
q2: float
|
||||
q3: float
|
||||
iqr: float
|
||||
|
||||
def __init__(self, sample: numpy.ndarray) -> None:
|
||||
self.q1 = numpy.quantile(sample, 0.25)
|
||||
self.q2 = numpy.quantile(sample, 0.5)
|
||||
self.q3 = numpy.quantile(sample, 0.75)
|
||||
self.iqr = self.q3 - self.q1
|
||||
|
||||
|
||||
def annotate(
|
||||
nlp: Language, docs: List[Doc], batch_size: Optional[int]
|
||||
) -> numpy.ndarray:
|
||||
docs = nlp.pipe(tqdm(docs, unit="doc"), batch_size=batch_size)
|
||||
wps = []
|
||||
while True:
|
||||
with time_context() as elapsed:
|
||||
batch_docs = list(
|
||||
islice(docs, batch_size if batch_size else nlp.batch_size)
|
||||
)
|
||||
if len(batch_docs) == 0:
|
||||
break
|
||||
n_tokens = count_tokens(batch_docs)
|
||||
wps.append(n_tokens / elapsed.elapsed)
|
||||
|
||||
return numpy.array(wps)
|
||||
|
||||
|
||||
def benchmark(
|
||||
nlp: Language,
|
||||
docs: List[Doc],
|
||||
n_batches: int,
|
||||
batch_size: int,
|
||||
shuffle: bool,
|
||||
) -> numpy.ndarray:
|
||||
if shuffle:
|
||||
bench_docs = [
|
||||
nlp.make_doc(random.choice(docs).text)
|
||||
for _ in range(n_batches * batch_size)
|
||||
]
|
||||
else:
|
||||
bench_docs = [
|
||||
nlp.make_doc(docs[i % len(docs)].text)
|
||||
for i in range(n_batches * batch_size)
|
||||
]
|
||||
|
||||
return annotate(nlp, bench_docs, batch_size)
|
||||
|
||||
|
||||
def bootstrap(x, statistic=numpy.mean, iterations=10000) -> numpy.ndarray:
|
||||
"""Apply a statistic to repeated random samples of an array."""
|
||||
return numpy.fromiter(
|
||||
(
|
||||
statistic(numpy.random.choice(x, len(x), replace=True))
|
||||
for _ in range(iterations)
|
||||
),
|
||||
numpy.float64,
|
||||
)
|
||||
|
||||
|
||||
def count_tokens(docs: Iterable[Doc]) -> int:
|
||||
return sum(len(doc) for doc in docs)
|
||||
|
||||
|
||||
def print_mean_with_ci(sample: numpy.ndarray):
|
||||
mean = numpy.mean(sample)
|
||||
bootstrap_means = bootstrap(sample)
|
||||
bootstrap_means.sort()
|
||||
|
||||
# 95% confidence interval
|
||||
low = bootstrap_means[int(len(bootstrap_means) * 0.025)]
|
||||
high = bootstrap_means[int(len(bootstrap_means) * 0.975)]
|
||||
|
||||
print(f"Mean: {mean:.1f} words/s (95% CI: {low-mean:.1f} +{high-mean:.1f})")
|
||||
|
||||
|
||||
def print_outliers(sample: numpy.ndarray):
|
||||
quartiles = Quartiles(sample)
|
||||
|
||||
n_outliers = numpy.sum(
|
||||
(sample < (quartiles.q1 - 1.5 * quartiles.iqr))
|
||||
| (sample > (quartiles.q3 + 1.5 * quartiles.iqr))
|
||||
)
|
||||
n_extreme_outliers = numpy.sum(
|
||||
(sample < (quartiles.q1 - 3.0 * quartiles.iqr))
|
||||
| (sample > (quartiles.q3 + 3.0 * quartiles.iqr))
|
||||
)
|
||||
print(
|
||||
f"Outliers: {(100 * n_outliers) / len(sample):.1f}%, extreme outliers: {(100 * n_extreme_outliers) / len(sample)}%"
|
||||
)
|
||||
|
||||
|
||||
def warmup(
|
||||
nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
|
||||
) -> numpy.ndarray:
|
||||
docs = warmup_epochs * docs
|
||||
return annotate(nlp, docs, batch_size)
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
|
||||
from typing import Callable, Iterable, Mapping, Optional, Any, Union
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from wasabi import Printer
|
||||
|
@ -7,7 +7,7 @@ import re
|
|||
import sys
|
||||
import itertools
|
||||
|
||||
from ._util import app, Arg, Opt
|
||||
from ._util import app, Arg, Opt, walk_directory
|
||||
from ..training import docs_to_json
|
||||
from ..tokens import Doc, DocBin
|
||||
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
|
||||
|
@ -28,6 +28,8 @@ CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = {
|
|||
"json": json_to_docs,
|
||||
}
|
||||
|
||||
AUTO = "auto"
|
||||
|
||||
|
||||
# File types that can be written to stdout
|
||||
FILE_TYPES_STDOUT = ("json",)
|
||||
|
@ -49,7 +51,7 @@ def convert_cli(
|
|||
model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
|
||||
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
|
||||
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
|
||||
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
|
||||
converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
|
||||
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
|
||||
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
|
||||
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
|
||||
|
@ -70,8 +72,8 @@ def convert_cli(
|
|||
output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
|
||||
silent = output_dir == "-"
|
||||
msg = Printer(no_print=silent)
|
||||
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
|
||||
converter = _get_converter(msg, converter, input_path)
|
||||
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
|
||||
convert(
|
||||
input_path,
|
||||
output_dir,
|
||||
|
@ -100,7 +102,7 @@ def convert(
|
|||
model: Optional[str] = None,
|
||||
morphology: bool = False,
|
||||
merge_subtokens: bool = False,
|
||||
converter: str = "auto",
|
||||
converter: str,
|
||||
ner_map: Optional[Path] = None,
|
||||
lang: Optional[str] = None,
|
||||
concatenate: bool = False,
|
||||
|
@ -189,33 +191,6 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
|
|||
return None
|
||||
|
||||
|
||||
def walk_directory(path: Path, converter: str) -> List[Path]:
|
||||
if not path.is_dir():
|
||||
return [path]
|
||||
paths = [path]
|
||||
locs = []
|
||||
seen = set()
|
||||
for path in paths:
|
||||
if str(path) in seen:
|
||||
continue
|
||||
seen.add(str(path))
|
||||
if path.parts[-1].startswith("."):
|
||||
continue
|
||||
elif path.is_dir():
|
||||
paths.extend(path.iterdir())
|
||||
elif converter == "json" and not path.parts[-1].endswith("json"):
|
||||
continue
|
||||
elif converter == "conll" and not path.parts[-1].endswith("conll"):
|
||||
continue
|
||||
elif converter == "iob" and not path.parts[-1].endswith("iob"):
|
||||
continue
|
||||
else:
|
||||
locs.append(path)
|
||||
# It's good to sort these, in case the ordering messes up cache.
|
||||
locs.sort()
|
||||
return locs
|
||||
|
||||
|
||||
def verify_cli_args(
|
||||
msg: Printer,
|
||||
input_path: Path,
|
||||
|
@ -239,18 +214,22 @@ def verify_cli_args(
|
|||
input_locs = walk_directory(input_path, converter)
|
||||
if len(input_locs) == 0:
|
||||
msg.fail("No input files in directory", input_path, exits=1)
|
||||
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
|
||||
if converter == "auto" and len(file_types) >= 2:
|
||||
file_types_str = ",".join(file_types)
|
||||
msg.fail("All input files must be same type", file_types_str, exits=1)
|
||||
if converter != "auto" and converter not in CONVERTERS:
|
||||
if converter not in CONVERTERS:
|
||||
msg.fail(f"Can't find converter for {converter}", exits=1)
|
||||
|
||||
|
||||
def _get_converter(msg, converter, input_path: Path):
|
||||
if input_path.is_dir():
|
||||
input_path = walk_directory(input_path, converter)[0]
|
||||
if converter == "auto":
|
||||
if converter == AUTO:
|
||||
input_locs = walk_directory(input_path, suffix=None)
|
||||
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
|
||||
if len(file_types) >= 2:
|
||||
file_types_str = ",".join(file_types)
|
||||
msg.fail("All input files must be same type", file_types_str, exits=1)
|
||||
input_path = input_locs[0]
|
||||
else:
|
||||
input_path = walk_directory(input_path, suffix=converter)[0]
|
||||
if converter == AUTO:
|
||||
converter = input_path.suffix[1:]
|
||||
if converter == "ner" or converter == "iob":
|
||||
with input_path.open(encoding="utf8") as file_:
|
||||
|
|
|
@ -6,12 +6,14 @@ import sys
|
|||
import srsly
|
||||
from wasabi import Printer, MESSAGES, msg
|
||||
import typer
|
||||
import math
|
||||
|
||||
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
|
||||
from ._util import import_code, debug_cli
|
||||
from ..training import Example
|
||||
from ._util import import_code, debug_cli, _format_number
|
||||
from ..training import Example, remove_bilu_prefix
|
||||
from ..training.initialize import get_sourced_components
|
||||
from ..schemas import ConfigSchemaTraining
|
||||
from ..pipeline import TrainablePipe
|
||||
from ..pipeline._parser_internals import nonproj
|
||||
from ..pipeline._parser_internals.nonproj import DELIMITER
|
||||
from ..pipeline import Morphologizer, SpanCategorizer
|
||||
|
@ -19,6 +21,7 @@ from ..morphology import Morphology
|
|||
from ..language import Language
|
||||
from ..util import registry, resolve_dot_names
|
||||
from ..compat import Literal
|
||||
from ..vectors import Mode as VectorsMode
|
||||
from .. import util
|
||||
|
||||
|
||||
|
@ -29,6 +32,12 @@ DEP_LABEL_THRESHOLD = 20
|
|||
# Minimum number of expected examples to train a new pipeline
|
||||
BLANK_MODEL_MIN_THRESHOLD = 100
|
||||
BLANK_MODEL_THRESHOLD = 2000
|
||||
# Arbitrary threshold where SpanCat performs well
|
||||
SPAN_DISTINCT_THRESHOLD = 1
|
||||
# Arbitrary threshold where SpanCat performs well
|
||||
BOUNDARY_DISTINCT_THRESHOLD = 1
|
||||
# Arbitrary threshold for filtering span lengths during reporting (percentage)
|
||||
SPAN_LENGTH_THRESHOLD_PERCENTAGE = 90
|
||||
|
||||
|
||||
@debug_cli.command(
|
||||
|
@ -170,26 +179,34 @@ def debug_data(
|
|||
show=verbose,
|
||||
)
|
||||
if len(nlp.vocab.vectors):
|
||||
msg.info(
|
||||
f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
|
||||
f"unique keys, {nlp.vocab.vectors_length} dimensions)"
|
||||
)
|
||||
n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
|
||||
msg.warn(
|
||||
"{} words in training data without vectors ({:.0f}%)".format(
|
||||
n_missing_vectors,
|
||||
100 * (n_missing_vectors / gold_train_data["n_words"]),
|
||||
),
|
||||
)
|
||||
msg.text(
|
||||
"10 most common words without vectors: {}".format(
|
||||
_format_labels(
|
||||
gold_train_data["words_missing_vectors"].most_common(10),
|
||||
counts=True,
|
||||
)
|
||||
),
|
||||
show=verbose,
|
||||
)
|
||||
if nlp.vocab.vectors.mode == VectorsMode.floret:
|
||||
msg.info(
|
||||
f"floret vectors with {len(nlp.vocab.vectors)} vectors, "
|
||||
f"{nlp.vocab.vectors_length} dimensions, "
|
||||
f"{nlp.vocab.vectors.minn}-{nlp.vocab.vectors.maxn} char "
|
||||
f"n-gram subwords"
|
||||
)
|
||||
else:
|
||||
msg.info(
|
||||
f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
|
||||
f"unique keys, {nlp.vocab.vectors_length} dimensions)"
|
||||
)
|
||||
n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
|
||||
msg.warn(
|
||||
"{} words in training data without vectors ({:.0f}%)".format(
|
||||
n_missing_vectors,
|
||||
100 * (n_missing_vectors / gold_train_data["n_words"]),
|
||||
),
|
||||
)
|
||||
msg.text(
|
||||
"10 most common words without vectors: {}".format(
|
||||
_format_labels(
|
||||
gold_train_data["words_missing_vectors"].most_common(10),
|
||||
counts=True,
|
||||
)
|
||||
),
|
||||
show=verbose,
|
||||
)
|
||||
else:
|
||||
msg.info("No word vectors present in the package")
|
||||
|
||||
|
@ -238,6 +255,69 @@ def debug_data(
|
|||
msg.warn(f"No examples for texts WITHOUT new label '{label}'")
|
||||
has_no_neg_warning = True
|
||||
|
||||
with msg.loading("Obtaining span characteristics..."):
|
||||
span_characteristics = _get_span_characteristics(
|
||||
train_dataset, gold_train_data, spans_key
|
||||
)
|
||||
|
||||
msg.info(f"Span characteristics for spans_key '{spans_key}'")
|
||||
msg.info("SD = Span Distinctiveness, BD = Boundary Distinctiveness")
|
||||
_print_span_characteristics(span_characteristics)
|
||||
|
||||
_span_freqs = _get_spans_length_freq_dist(
|
||||
gold_train_data["spans_length"][spans_key]
|
||||
)
|
||||
_filtered_span_freqs = _filter_spans_length_freq_dist(
|
||||
_span_freqs, threshold=SPAN_LENGTH_THRESHOLD_PERCENTAGE
|
||||
)
|
||||
|
||||
msg.info(
|
||||
f"Over {SPAN_LENGTH_THRESHOLD_PERCENTAGE}% of spans have lengths of 1 -- "
|
||||
f"{max(_filtered_span_freqs.keys())} "
|
||||
f"(min={span_characteristics['min_length']}, max={span_characteristics['max_length']}). "
|
||||
f"The most common span lengths are: {_format_freqs(_filtered_span_freqs)}. "
|
||||
"If you are using the n-gram suggester, note that omitting "
|
||||
"infrequent n-gram lengths can greatly improve speed and "
|
||||
"memory usage."
|
||||
)
|
||||
|
||||
msg.text(
|
||||
f"Full distribution of span lengths: {_format_freqs(_span_freqs)}",
|
||||
show=verbose,
|
||||
)
|
||||
|
||||
# Add report regarding span characteristics
|
||||
if span_characteristics["avg_sd"] < SPAN_DISTINCT_THRESHOLD:
|
||||
msg.warn("Spans may not be distinct from the rest of the corpus")
|
||||
else:
|
||||
msg.good("Spans are distinct from the rest of the corpus")
|
||||
|
||||
p_spans = span_characteristics["p_spans"].values()
|
||||
all_span_tokens: Counter = sum(p_spans, Counter())
|
||||
most_common_spans = [w for w, _ in all_span_tokens.most_common(10)]
|
||||
msg.text(
|
||||
"10 most common span tokens: {}".format(
|
||||
_format_labels(most_common_spans)
|
||||
),
|
||||
show=verbose,
|
||||
)
|
||||
|
||||
# Add report regarding span boundary characteristics
|
||||
if span_characteristics["avg_bd"] < BOUNDARY_DISTINCT_THRESHOLD:
|
||||
msg.warn("Boundary tokens are not distinct from the rest of the corpus")
|
||||
else:
|
||||
msg.good("Boundary tokens are distinct from the rest of the corpus")
|
||||
|
||||
p_bounds = span_characteristics["p_bounds"].values()
|
||||
all_span_bound_tokens: Counter = sum(p_bounds, Counter())
|
||||
most_common_bounds = [w for w, _ in all_span_bound_tokens.most_common(10)]
|
||||
msg.text(
|
||||
"10 most common span boundary tokens: {}".format(
|
||||
_format_labels(most_common_bounds)
|
||||
),
|
||||
show=verbose,
|
||||
)
|
||||
|
||||
if has_low_data_warning:
|
||||
msg.text(
|
||||
f"To train a new span type, your data should include at "
|
||||
|
@ -282,7 +362,7 @@ def debug_data(
|
|||
if label != "-"
|
||||
]
|
||||
labels_with_counts = _format_labels(labels_with_counts, counts=True)
|
||||
msg.text(f"Labels in train data: {_format_labels(labels)}", show=verbose)
|
||||
msg.text(f"Labels in train data: {labels_with_counts}", show=verbose)
|
||||
missing_labels = model_labels - labels
|
||||
if missing_labels:
|
||||
msg.warn(
|
||||
|
@ -638,6 +718,9 @@ def _compile_gold(
|
|||
"words": Counter(),
|
||||
"roots": Counter(),
|
||||
"spancat": dict(),
|
||||
"spans_length": dict(),
|
||||
"spans_per_type": dict(),
|
||||
"sb_per_type": dict(),
|
||||
"ws_ents": 0,
|
||||
"boundary_cross_ents": 0,
|
||||
"n_words": 0,
|
||||
|
@ -676,21 +759,66 @@ def _compile_gold(
|
|||
# "Illegal" whitespace entity
|
||||
data["ws_ents"] += 1
|
||||
if label.startswith(("B-", "U-")):
|
||||
combined_label = label.split("-")[1]
|
||||
combined_label = remove_bilu_prefix(label)
|
||||
data["ner"][combined_label] += 1
|
||||
if sent_starts[i] == True and label.startswith(("I-", "L-")):
|
||||
if sent_starts[i] and label.startswith(("I-", "L-")):
|
||||
data["boundary_cross_ents"] += 1
|
||||
elif label == "-":
|
||||
data["ner"]["-"] += 1
|
||||
if "spancat" in factory_names:
|
||||
for span_key in list(eg.reference.spans.keys()):
|
||||
if span_key not in data["spancat"]:
|
||||
data["spancat"][span_key] = Counter()
|
||||
for i, span in enumerate(eg.reference.spans[span_key]):
|
||||
for spans_key in list(eg.reference.spans.keys()):
|
||||
# Obtain the span frequency
|
||||
if spans_key not in data["spancat"]:
|
||||
data["spancat"][spans_key] = Counter()
|
||||
for i, span in enumerate(eg.reference.spans[spans_key]):
|
||||
if span.label_ is None:
|
||||
continue
|
||||
else:
|
||||
data["spancat"][span_key][span.label_] += 1
|
||||
data["spancat"][spans_key][span.label_] += 1
|
||||
|
||||
# Obtain the span length
|
||||
if spans_key not in data["spans_length"]:
|
||||
data["spans_length"][spans_key] = dict()
|
||||
for span in gold.spans[spans_key]:
|
||||
if span.label_ is None:
|
||||
continue
|
||||
if span.label_ not in data["spans_length"][spans_key]:
|
||||
data["spans_length"][spans_key][span.label_] = []
|
||||
data["spans_length"][spans_key][span.label_].append(len(span))
|
||||
|
||||
# Obtain spans per span type
|
||||
if spans_key not in data["spans_per_type"]:
|
||||
data["spans_per_type"][spans_key] = dict()
|
||||
for span in gold.spans[spans_key]:
|
||||
if span.label_ not in data["spans_per_type"][spans_key]:
|
||||
data["spans_per_type"][spans_key][span.label_] = []
|
||||
data["spans_per_type"][spans_key][span.label_].append(span)
|
||||
|
||||
# Obtain boundary tokens per span type
|
||||
window_size = 1
|
||||
if spans_key not in data["sb_per_type"]:
|
||||
data["sb_per_type"][spans_key] = dict()
|
||||
for span in gold.spans[spans_key]:
|
||||
if span.label_ not in data["sb_per_type"][spans_key]:
|
||||
# Creating a data structure that holds the start and
|
||||
# end tokens for each span type
|
||||
data["sb_per_type"][spans_key][span.label_] = {
|
||||
"start": [],
|
||||
"end": [],
|
||||
}
|
||||
for offset in range(window_size):
|
||||
sb_start_idx = span.start - (offset + 1)
|
||||
if sb_start_idx >= 0:
|
||||
data["sb_per_type"][spans_key][span.label_]["start"].append(
|
||||
gold[sb_start_idx : sb_start_idx + 1]
|
||||
)
|
||||
|
||||
sb_end_idx = span.end + (offset + 1)
|
||||
if sb_end_idx <= len(gold):
|
||||
data["sb_per_type"][spans_key][span.label_]["end"].append(
|
||||
gold[sb_end_idx - 1 : sb_end_idx]
|
||||
)
|
||||
|
||||
if "textcat" in factory_names or "textcat_multilabel" in factory_names:
|
||||
data["cats"].update(gold.cats)
|
||||
if any(val not in (0, 1) for val in gold.cats.values()):
|
||||
|
@ -761,6 +889,16 @@ def _format_labels(
|
|||
return ", ".join([f"'{l}'" for l in cast(Iterable[str], labels)])
|
||||
|
||||
|
||||
def _format_freqs(freqs: Dict[int, float], sort: bool = True) -> str:
|
||||
if sort:
|
||||
freqs = dict(sorted(freqs.items()))
|
||||
|
||||
_freqs = [(str(k), v) for k, v in freqs.items()]
|
||||
return ", ".join(
|
||||
[f"{l} ({c}%)" for l, c in cast(Iterable[Tuple[str, float]], _freqs)]
|
||||
)
|
||||
|
||||
|
||||
def _get_examples_without_label(
|
||||
data: Sequence[Example],
|
||||
label: str,
|
||||
|
@ -771,7 +909,7 @@ def _get_examples_without_label(
|
|||
for eg in data:
|
||||
if component == "ner":
|
||||
labels = [
|
||||
label.split("-")[1]
|
||||
remove_bilu_prefix(label)
|
||||
for label in eg.get_aligned_ner()
|
||||
if label not in ("O", "-", None)
|
||||
]
|
||||
|
@ -797,6 +935,7 @@ def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]:
|
|||
labels: Set[str] = set()
|
||||
for pipe_name in pipe_names:
|
||||
pipe = nlp.get_pipe(pipe_name)
|
||||
assert isinstance(pipe, TrainablePipe)
|
||||
labels.update(pipe.labels)
|
||||
return labels
|
||||
|
||||
|
@ -815,3 +954,177 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
|
|||
labels[pipe.key] = set()
|
||||
labels[pipe.key].update(pipe.labels)
|
||||
return labels
|
||||
|
||||
|
||||
def _gmean(l: List) -> float:
|
||||
"""Compute geometric mean of a list"""
|
||||
return math.exp(math.fsum(math.log(i) for i in l) / len(l))
|
||||
|
||||
|
||||
def _wgt_average(metric: Dict[str, float], frequencies: Counter) -> float:
|
||||
total = sum(value * frequencies[span_type] for span_type, value in metric.items())
|
||||
return total / sum(frequencies.values())
|
||||
|
||||
|
||||
def _get_distribution(docs, normalize: bool = True) -> Counter:
|
||||
"""Get the frequency distribution given a set of Docs"""
|
||||
word_counts: Counter = Counter()
|
||||
for doc in docs:
|
||||
for token in doc:
|
||||
# Normalize the text
|
||||
t = token.text.lower().replace("``", '"').replace("''", '"')
|
||||
word_counts[t] += 1
|
||||
if normalize:
|
||||
total = sum(word_counts.values(), 0.0)
|
||||
word_counts = Counter({k: v / total for k, v in word_counts.items()})
|
||||
return word_counts
|
||||
|
||||
|
||||
def _get_kl_divergence(p: Counter, q: Counter) -> float:
|
||||
"""Compute the Kullback-Leibler divergence from two frequency distributions"""
|
||||
total = 0.0
|
||||
for word, p_word in p.items():
|
||||
total += p_word * math.log(p_word / q[word])
|
||||
return total
|
||||
|
||||
|
||||
def _format_span_row(span_data: List[Dict], labels: List[str]) -> List[Any]:
|
||||
"""Compile into one list for easier reporting"""
|
||||
d = {
|
||||
label: [label] + list(_format_number(d[label]) for d in span_data)
|
||||
for label in labels
|
||||
}
|
||||
return list(d.values())
|
||||
|
||||
|
||||
def _get_span_characteristics(
|
||||
examples: List[Example], compiled_gold: Dict[str, Any], spans_key: str
|
||||
) -> Dict[str, Any]:
|
||||
"""Obtain all span characteristics"""
|
||||
data_labels = compiled_gold["spancat"][spans_key]
|
||||
# Get lengths
|
||||
span_length = {
|
||||
label: _gmean(l)
|
||||
for label, l in compiled_gold["spans_length"][spans_key].items()
|
||||
}
|
||||
spans_per_type = {
|
||||
label: len(spans)
|
||||
for label, spans in compiled_gold["spans_per_type"][spans_key].items()
|
||||
}
|
||||
min_lengths = [min(l) for l in compiled_gold["spans_length"][spans_key].values()]
|
||||
max_lengths = [max(l) for l in compiled_gold["spans_length"][spans_key].values()]
|
||||
|
||||
# Get relevant distributions: corpus, spans, span boundaries
|
||||
p_corpus = _get_distribution([eg.reference for eg in examples], normalize=True)
|
||||
p_spans = {
|
||||
label: _get_distribution(spans, normalize=True)
|
||||
for label, spans in compiled_gold["spans_per_type"][spans_key].items()
|
||||
}
|
||||
p_bounds = {
|
||||
label: _get_distribution(sb["start"] + sb["end"], normalize=True)
|
||||
for label, sb in compiled_gold["sb_per_type"][spans_key].items()
|
||||
}
|
||||
|
||||
# Compute for actual span characteristics
|
||||
span_distinctiveness = {
|
||||
label: _get_kl_divergence(freq_dist, p_corpus)
|
||||
for label, freq_dist in p_spans.items()
|
||||
}
|
||||
sb_distinctiveness = {
|
||||
label: _get_kl_divergence(freq_dist, p_corpus)
|
||||
for label, freq_dist in p_bounds.items()
|
||||
}
|
||||
|
||||
return {
|
||||
"sd": span_distinctiveness,
|
||||
"bd": sb_distinctiveness,
|
||||
"spans_per_type": spans_per_type,
|
||||
"lengths": span_length,
|
||||
"min_length": min(min_lengths),
|
||||
"max_length": max(max_lengths),
|
||||
"avg_sd": _wgt_average(span_distinctiveness, data_labels),
|
||||
"avg_bd": _wgt_average(sb_distinctiveness, data_labels),
|
||||
"avg_length": _wgt_average(span_length, data_labels),
|
||||
"labels": list(data_labels.keys()),
|
||||
"p_spans": p_spans,
|
||||
"p_bounds": p_bounds,
|
||||
}
|
||||
|
||||
|
||||
def _print_span_characteristics(span_characteristics: Dict[str, Any]):
|
||||
"""Print all span characteristics into a table"""
|
||||
headers = ("Span Type", "Length", "SD", "BD", "N")
|
||||
# Wasabi has this at 30 by default, but we might have some long labels
|
||||
max_col = max(30, max(len(label) for label in span_characteristics["labels"]))
|
||||
# Prepare table data with all span characteristics
|
||||
table_data = [
|
||||
span_characteristics["lengths"],
|
||||
span_characteristics["sd"],
|
||||
span_characteristics["bd"],
|
||||
span_characteristics["spans_per_type"],
|
||||
]
|
||||
table = _format_span_row(
|
||||
span_data=table_data, labels=span_characteristics["labels"]
|
||||
)
|
||||
# Prepare table footer with weighted averages
|
||||
footer_data = [
|
||||
span_characteristics["avg_length"],
|
||||
span_characteristics["avg_sd"],
|
||||
span_characteristics["avg_bd"],
|
||||
]
|
||||
|
||||
footer = (
|
||||
["Wgt. Average"] + ["{:.2f}".format(round(f, 2)) for f in footer_data] + ["-"]
|
||||
)
|
||||
msg.table(
|
||||
table,
|
||||
footer=footer,
|
||||
header=headers,
|
||||
divider=True,
|
||||
aligns=["l"] + ["r"] * (len(footer_data) + 1),
|
||||
max_col=max_col,
|
||||
)
|
||||
|
||||
|
||||
def _get_spans_length_freq_dist(
|
||||
length_dict: Dict, threshold=SPAN_LENGTH_THRESHOLD_PERCENTAGE
|
||||
) -> Dict[int, float]:
|
||||
"""Get frequency distribution of spans length under a certain threshold"""
|
||||
all_span_lengths = []
|
||||
for _, lengths in length_dict.items():
|
||||
all_span_lengths.extend(lengths)
|
||||
|
||||
freq_dist: Counter = Counter()
|
||||
for i in all_span_lengths:
|
||||
if freq_dist.get(i):
|
||||
freq_dist[i] += 1
|
||||
else:
|
||||
freq_dist[i] = 1
|
||||
|
||||
# We will be working with percentages instead of raw counts
|
||||
freq_dist_percentage = {}
|
||||
for span_length, count in freq_dist.most_common():
|
||||
percentage = (count / len(all_span_lengths)) * 100.0
|
||||
percentage = round(percentage, 2)
|
||||
freq_dist_percentage[span_length] = percentage
|
||||
|
||||
return freq_dist_percentage
|
||||
|
||||
|
||||
def _filter_spans_length_freq_dist(
|
||||
freq_dist: Dict[int, float], threshold: int
|
||||
) -> Dict[int, float]:
|
||||
"""Filter frequency distribution with respect to a threshold
|
||||
|
||||
We're going to filter all the span lengths that fall
|
||||
around a percentage threshold when summed.
|
||||
"""
|
||||
total = 0.0
|
||||
filtered_freq_dist = {}
|
||||
for span_length, dist in freq_dist.items():
|
||||
if total >= threshold:
|
||||
break
|
||||
else:
|
||||
filtered_freq_dist[span_length] = dist
|
||||
total += dist
|
||||
return filtered_freq_dist
|
||||
|
|
89
spacy/cli/debug_diff.py
Normal file
89
spacy/cli/debug_diff.py
Normal file
|
@ -0,0 +1,89 @@
|
|||
from typing import Optional
|
||||
|
||||
import typer
|
||||
from wasabi import Printer, diff_strings, MarkdownRenderer
|
||||
from pathlib import Path
|
||||
from thinc.api import Config
|
||||
|
||||
from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
|
||||
from ..util import load_config
|
||||
from .init_config import init_config, Optimizations
|
||||
|
||||
|
||||
@debug_cli.command(
|
||||
"diff-config",
|
||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||
)
|
||||
def debug_diff_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context,
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
|
||||
compare_to: Optional[Path] = Opt(None, help="Path to a config file to diff against, or `None` to compare against default settings", exists=True, allow_dash=True),
|
||||
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether the user config was optimized for efficiency or accuracy. Only relevant when comparing against the default config."),
|
||||
gpu: bool = Opt(False, "--gpu", "-G", help="Whether the original config can run on a GPU. Only relevant when comparing against the default config."),
|
||||
pretraining: bool = Opt(False, "--pretraining", "--pt", help="Whether to compare on a config with pretraining involved. Only relevant when comparing against the default config."),
|
||||
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues")
|
||||
# fmt: on
|
||||
):
|
||||
"""Show a diff of a config file with respect to spaCy's defaults or another config file. If
|
||||
additional settings were used in the creation of the config file, then you
|
||||
must supply these as extra parameters to the command when comparing to the default settings. The generated diff
|
||||
can also be used when posting to the discussion forum to provide more
|
||||
information for the maintainers.
|
||||
|
||||
The `optimize`, `gpu`, and `pretraining` options are only relevant when
|
||||
comparing against the default configuration (or specifically when `compare_to` is None).
|
||||
|
||||
DOCS: https://spacy.io/api/cli#debug-diff
|
||||
"""
|
||||
debug_diff(
|
||||
config_path=config_path,
|
||||
compare_to=compare_to,
|
||||
gpu=gpu,
|
||||
optimize=optimize,
|
||||
pretraining=pretraining,
|
||||
markdown=markdown,
|
||||
)
|
||||
|
||||
|
||||
def debug_diff(
|
||||
config_path: Path,
|
||||
compare_to: Optional[Path],
|
||||
gpu: bool,
|
||||
optimize: Optimizations,
|
||||
pretraining: bool,
|
||||
markdown: bool,
|
||||
):
|
||||
msg = Printer()
|
||||
with show_validation_error(hint_fill=False):
|
||||
user_config = load_config(config_path)
|
||||
if compare_to:
|
||||
other_config = load_config(compare_to)
|
||||
else:
|
||||
# Recreate a default config based from user's config
|
||||
lang = user_config["nlp"]["lang"]
|
||||
pipeline = list(user_config["nlp"]["pipeline"])
|
||||
msg.info(f"Found user-defined language: '{lang}'")
|
||||
msg.info(f"Found user-defined pipelines: {pipeline}")
|
||||
other_config = init_config(
|
||||
lang=lang,
|
||||
pipeline=pipeline,
|
||||
optimize=optimize.value,
|
||||
gpu=gpu,
|
||||
pretraining=pretraining,
|
||||
silent=True,
|
||||
)
|
||||
|
||||
user = user_config.to_str()
|
||||
other = other_config.to_str()
|
||||
|
||||
if user == other:
|
||||
msg.warn("No diff to show: configs are identical")
|
||||
else:
|
||||
diff_text = diff_strings(other, user, add_symbols=markdown)
|
||||
if markdown:
|
||||
md = MarkdownRenderer()
|
||||
md.add(md.code_block(diff_text, "diff"))
|
||||
print(md.text)
|
||||
else:
|
||||
print(diff_text)
|
|
@ -7,6 +7,7 @@ import typer
|
|||
from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
|
||||
from .. import about
|
||||
from ..util import is_package, get_minor_version, run_command
|
||||
from ..util import is_prerelease_version
|
||||
from ..errors import OLD_MODEL_SHORTCUTS
|
||||
|
||||
|
||||
|
@ -19,7 +20,7 @@ def download_cli(
|
|||
ctx: typer.Context,
|
||||
model: str = Arg(..., help="Name of pipeline package to download"),
|
||||
direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"),
|
||||
sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel")
|
||||
sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
@ -35,7 +36,12 @@ def download_cli(
|
|||
download(model, direct, sdist, *ctx.args)
|
||||
|
||||
|
||||
def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -> None:
|
||||
def download(
|
||||
model: str,
|
||||
direct: bool = False,
|
||||
sdist: bool = False,
|
||||
*pip_args,
|
||||
) -> None:
|
||||
if (
|
||||
not (is_package("spacy") or is_package("spacy-nightly"))
|
||||
and "--no-deps" not in pip_args
|
||||
|
@ -49,13 +55,10 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
|
|||
"dependencies, you'll have to install them manually."
|
||||
)
|
||||
pip_args = pip_args + ("--no-deps",)
|
||||
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
|
||||
dl_tpl = "{m}-{v}/{m}-{v}{s}#egg={m}=={v}"
|
||||
if direct:
|
||||
components = model.split("-")
|
||||
model_name = "".join(components[:-1])
|
||||
version = components[-1]
|
||||
download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args)
|
||||
else:
|
||||
model_name = model
|
||||
if model in OLD_MODEL_SHORTCUTS:
|
||||
|
@ -66,15 +69,31 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
|
|||
model_name = OLD_MODEL_SHORTCUTS[model]
|
||||
compatibility = get_compatibility()
|
||||
version = get_version(model_name, compatibility)
|
||||
download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args)
|
||||
|
||||
filename = get_model_filename(model_name, version, sdist)
|
||||
|
||||
download_model(filename, pip_args)
|
||||
msg.good(
|
||||
"Download and installation successful",
|
||||
f"You can now load the package via spacy.load('{model_name}')",
|
||||
)
|
||||
|
||||
|
||||
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
|
||||
dl_tpl = "{m}-{v}/{m}-{v}{s}"
|
||||
egg_tpl = "#egg={m}=={v}"
|
||||
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
|
||||
filename = dl_tpl.format(m=model_name, v=version, s=suffix)
|
||||
if sdist:
|
||||
filename += egg_tpl.format(m=model_name, v=version)
|
||||
return filename
|
||||
|
||||
|
||||
def get_compatibility() -> dict:
|
||||
version = get_minor_version(about.__version__)
|
||||
if is_prerelease_version(about.__version__):
|
||||
version: Optional[str] = about.__version__
|
||||
else:
|
||||
version = get_minor_version(about.__version__)
|
||||
r = requests.get(about.__compatibility__)
|
||||
if r.status_code != 200:
|
||||
msg.fail(
|
||||
|
@ -101,6 +120,11 @@ def get_version(model: str, comp: dict) -> str:
|
|||
return comp[model][0]
|
||||
|
||||
|
||||
def get_latest_version(model: str) -> str:
|
||||
comp = get_compatibility()
|
||||
return get_version(model, comp)
|
||||
|
||||
|
||||
def download_model(
|
||||
filename: str, user_pip_args: Optional[Sequence[str]] = None
|
||||
) -> None:
|
||||
|
|
|
@ -7,12 +7,15 @@ from thinc.api import fix_random_seed
|
|||
|
||||
from ..training import Corpus
|
||||
from ..tokens import Doc
|
||||
from ._util import app, Arg, Opt, setup_gpu, import_code
|
||||
from ._util import app, Arg, Opt, setup_gpu, import_code, benchmark_cli
|
||||
from ..scorer import Scorer
|
||||
from .. import util
|
||||
from .. import displacy
|
||||
|
||||
|
||||
@benchmark_cli.command(
|
||||
"accuracy",
|
||||
)
|
||||
@app.command("evaluate")
|
||||
def evaluate_cli(
|
||||
# fmt: off
|
||||
|
@ -36,7 +39,7 @@ def evaluate_cli(
|
|||
dependency parses in a HTML file, set as output directory as the
|
||||
displacy_path argument.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#evaluate
|
||||
DOCS: https://spacy.io/api/cli#benchmark-accuracy
|
||||
"""
|
||||
import_code(code_path)
|
||||
evaluate(
|
||||
|
|
233
spacy/cli/find_threshold.py
Normal file
233
spacy/cli/find_threshold.py
Normal file
|
@ -0,0 +1,233 @@
|
|||
import functools
|
||||
import operator
|
||||
from pathlib import Path
|
||||
import logging
|
||||
from typing import Optional, Tuple, Any, Dict, List
|
||||
|
||||
import numpy
|
||||
import wasabi.tables
|
||||
|
||||
from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer
|
||||
from ..errors import Errors
|
||||
from ..training import Corpus
|
||||
from ._util import app, Arg, Opt, import_code, setup_gpu
|
||||
from .. import util
|
||||
|
||||
_DEFAULTS = {
|
||||
"n_trials": 11,
|
||||
"use_gpu": -1,
|
||||
"gold_preproc": False,
|
||||
}
|
||||
|
||||
|
||||
@app.command(
|
||||
"find-threshold",
|
||||
context_settings={"allow_extra_args": False, "ignore_unknown_options": True},
|
||||
)
|
||||
def find_threshold_cli(
|
||||
# fmt: off
|
||||
model: str = Arg(..., help="Model name or path"),
|
||||
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
|
||||
pipe_name: str = Arg(..., help="Name of pipe to examine thresholds for"),
|
||||
threshold_key: str = Arg(..., help="Key of threshold attribute in component's configuration"),
|
||||
scores_key: str = Arg(..., help="Metric to optimize"),
|
||||
n_trials: int = Opt(_DEFAULTS["n_trials"], "--n_trials", "-n", help="Number of trials to determine optimal thresholds"),
|
||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||
gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
|
||||
verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Runs prediction trials for a trained model with varying tresholds to maximize
|
||||
the specified metric. The search space for the threshold is traversed linearly
|
||||
from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
|
||||
(the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
|
||||
returns all results).
|
||||
|
||||
This is applicable only for components whose predictions are influenced by
|
||||
thresholds - e.g. `textcat_multilabel` and `spancat`, but not `textcat`. Note
|
||||
that the full path to the corresponding threshold attribute in the config has to
|
||||
be provided.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#find-threshold
|
||||
"""
|
||||
|
||||
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||
import_code(code_path)
|
||||
find_threshold(
|
||||
model=model,
|
||||
data_path=data_path,
|
||||
pipe_name=pipe_name,
|
||||
threshold_key=threshold_key,
|
||||
scores_key=scores_key,
|
||||
n_trials=n_trials,
|
||||
use_gpu=use_gpu,
|
||||
gold_preproc=gold_preproc,
|
||||
silent=False,
|
||||
)
|
||||
|
||||
|
||||
def find_threshold(
|
||||
model: str,
|
||||
data_path: Path,
|
||||
pipe_name: str,
|
||||
threshold_key: str,
|
||||
scores_key: str,
|
||||
*,
|
||||
n_trials: int = _DEFAULTS["n_trials"], # type: ignore
|
||||
use_gpu: int = _DEFAULTS["use_gpu"], # type: ignore
|
||||
gold_preproc: bool = _DEFAULTS["gold_preproc"], # type: ignore
|
||||
silent: bool = True,
|
||||
) -> Tuple[float, float, Dict[float, float]]:
|
||||
"""
|
||||
Runs prediction trials for models with varying tresholds to maximize the specified metric.
|
||||
model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory.
|
||||
data_path (Path): Path to file with DocBin with docs to use for threshold search.
|
||||
pipe_name (str): Name of pipe to examine thresholds for.
|
||||
threshold_key (str): Key of threshold attribute in component's configuration.
|
||||
scores_key (str): Name of score to metric to optimize.
|
||||
n_trials (int): Number of trials to determine optimal thresholds.
|
||||
use_gpu (int): GPU ID or -1 for CPU.
|
||||
gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the
|
||||
tokenization, and may result in sequences of more consistent length. However, it may reduce runtime accuracy due
|
||||
to train/test skew.
|
||||
silent (bool): Whether to print non-error-related output to stdout.
|
||||
RETURNS (Tuple[float, float, Dict[float, float]]): Best found threshold, the corresponding score, scores for all
|
||||
evaluated thresholds.
|
||||
"""
|
||||
|
||||
setup_gpu(use_gpu, silent=silent)
|
||||
data_path = util.ensure_path(data_path)
|
||||
if not data_path.exists():
|
||||
wasabi.msg.fail("Evaluation data not found", data_path, exits=1)
|
||||
nlp = util.load_model(model)
|
||||
|
||||
if pipe_name not in nlp.component_names:
|
||||
raise AttributeError(
|
||||
Errors.E001.format(name=pipe_name, opts=nlp.component_names)
|
||||
)
|
||||
pipe = nlp.get_pipe(pipe_name)
|
||||
if not hasattr(pipe, "scorer"):
|
||||
raise AttributeError(Errors.E1045)
|
||||
|
||||
if type(pipe) == TextCategorizer:
|
||||
wasabi.msg.warn(
|
||||
"The `textcat` component doesn't use a threshold as it's not applicable to the concept of "
|
||||
"exclusive classes. All thresholds will yield the same results."
|
||||
)
|
||||
|
||||
if not silent:
|
||||
wasabi.msg.info(
|
||||
title=f"Optimizing for {scores_key} for component '{pipe_name}' with {n_trials} "
|
||||
f"trials."
|
||||
)
|
||||
|
||||
# Load evaluation corpus.
|
||||
corpus = Corpus(data_path, gold_preproc=gold_preproc)
|
||||
dev_dataset = list(corpus(nlp))
|
||||
config_keys = threshold_key.split(".")
|
||||
|
||||
def set_nested_item(
|
||||
config: Dict[str, Any], keys: List[str], value: float
|
||||
) -> Dict[str, Any]:
|
||||
"""Set item in nested dictionary. Adapted from https://stackoverflow.com/a/54138200.
|
||||
config (Dict[str, Any]): Configuration dictionary.
|
||||
keys (List[Any]): Path to value to set.
|
||||
value (float): Value to set.
|
||||
RETURNS (Dict[str, Any]): Updated dictionary.
|
||||
"""
|
||||
functools.reduce(operator.getitem, keys[:-1], config)[keys[-1]] = value
|
||||
return config
|
||||
|
||||
def filter_config(
|
||||
config: Dict[str, Any], keys: List[str], full_key: str
|
||||
) -> Dict[str, Any]:
|
||||
"""Filters provided config dictionary so that only the specified keys path remains.
|
||||
config (Dict[str, Any]): Configuration dictionary.
|
||||
keys (List[Any]): Path to value to set.
|
||||
full_key (str): Full user-specified key.
|
||||
RETURNS (Dict[str, Any]): Filtered dictionary.
|
||||
"""
|
||||
if keys[0] not in config:
|
||||
wasabi.msg.fail(
|
||||
title=f"Failed to look up `{full_key}` in config: sub-key {[keys[0]]} not found.",
|
||||
text=f"Make sure you specified {[keys[0]]} correctly. The following sub-keys are available instead: "
|
||||
f"{list(config.keys())}",
|
||||
exits=1,
|
||||
)
|
||||
return {
|
||||
keys[0]: filter_config(config[keys[0]], keys[1:], full_key)
|
||||
if len(keys) > 1
|
||||
else config[keys[0]]
|
||||
}
|
||||
|
||||
# Evaluate with varying threshold values.
|
||||
scores: Dict[float, float] = {}
|
||||
config_keys_full = ["components", pipe_name, *config_keys]
|
||||
table_col_widths = (10, 10)
|
||||
thresholds = numpy.linspace(0, 1, n_trials)
|
||||
print(wasabi.tables.row(["Threshold", f"{scores_key}"], widths=table_col_widths))
|
||||
for threshold in thresholds:
|
||||
# Reload pipeline with overrides specifying the new threshold.
|
||||
nlp = util.load_model(
|
||||
model,
|
||||
config=set_nested_item(
|
||||
filter_config(
|
||||
nlp.config, config_keys_full, ".".join(config_keys_full)
|
||||
).copy(),
|
||||
config_keys_full,
|
||||
threshold,
|
||||
),
|
||||
)
|
||||
if hasattr(pipe, "cfg"):
|
||||
setattr(
|
||||
nlp.get_pipe(pipe_name),
|
||||
"cfg",
|
||||
set_nested_item(getattr(pipe, "cfg"), config_keys, threshold),
|
||||
)
|
||||
|
||||
eval_scores = nlp.evaluate(dev_dataset)
|
||||
if scores_key not in eval_scores:
|
||||
wasabi.msg.fail(
|
||||
title=f"Failed to look up score `{scores_key}` in evaluation results.",
|
||||
text=f"Make sure you specified the correct value for `scores_key`. The following scores are "
|
||||
f"available: {list(eval_scores.keys())}",
|
||||
exits=1,
|
||||
)
|
||||
scores[threshold] = eval_scores[scores_key]
|
||||
|
||||
if not isinstance(scores[threshold], (float, int)):
|
||||
wasabi.msg.fail(
|
||||
f"Returned score for key '{scores_key}' is not numeric. Threshold optimization only works for numeric "
|
||||
f"scores.",
|
||||
exits=1,
|
||||
)
|
||||
print(
|
||||
wasabi.row(
|
||||
[round(threshold, 3), round(scores[threshold], 3)],
|
||||
widths=table_col_widths,
|
||||
)
|
||||
)
|
||||
|
||||
best_threshold = max(scores.keys(), key=(lambda key: scores[key]))
|
||||
|
||||
# If all scores are identical, emit warning.
|
||||
if len(set(scores.values())) == 1:
|
||||
wasabi.msg.warn(
|
||||
title="All scores are identical. Verify that all settings are correct.",
|
||||
text=""
|
||||
if (
|
||||
not isinstance(pipe, MultiLabel_TextCategorizer)
|
||||
or scores_key in ("cats_macro_f", "cats_micro_f")
|
||||
)
|
||||
else "Use `cats_macro_f` or `cats_micro_f` when optimizing the threshold for `textcat_multilabel`.",
|
||||
)
|
||||
|
||||
else:
|
||||
if not silent:
|
||||
print(
|
||||
f"\nBest threshold: {round(best_threshold, ndigits=4)} with {scores_key} value of {scores[best_threshold]}."
|
||||
)
|
||||
|
||||
return best_threshold, scores[best_threshold], scores
|
|
@ -1,10 +1,13 @@
|
|||
from typing import Optional, Dict, Any, Union, List
|
||||
import platform
|
||||
import pkg_resources
|
||||
import json
|
||||
from pathlib import Path
|
||||
from wasabi import Printer, MarkdownRenderer
|
||||
import srsly
|
||||
|
||||
from ._util import app, Arg, Opt, string_to_list
|
||||
from .download import get_model_filename, get_latest_version
|
||||
from .. import util
|
||||
from .. import about
|
||||
|
||||
|
@ -16,6 +19,7 @@ def info_cli(
|
|||
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
|
||||
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
|
||||
exclude: str = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"),
|
||||
url: bool = Opt(False, "--url", "-u", help="Print the URL to download the most recent compatible version of the pipeline"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
@ -23,10 +27,19 @@ def info_cli(
|
|||
print its meta information. Flag --markdown prints details in Markdown for easy
|
||||
copy-pasting to GitHub issues.
|
||||
|
||||
Flag --url prints only the download URL of the most recent compatible
|
||||
version of the pipeline.
|
||||
|
||||
DOCS: https://spacy.io/api/cli#info
|
||||
"""
|
||||
exclude = string_to_list(exclude)
|
||||
info(model, markdown=markdown, silent=silent, exclude=exclude)
|
||||
info(
|
||||
model,
|
||||
markdown=markdown,
|
||||
silent=silent,
|
||||
exclude=exclude,
|
||||
url=url,
|
||||
)
|
||||
|
||||
|
||||
def info(
|
||||
|
@ -35,11 +48,20 @@ def info(
|
|||
markdown: bool = False,
|
||||
silent: bool = True,
|
||||
exclude: Optional[List[str]] = None,
|
||||
url: bool = False,
|
||||
) -> Union[str, dict]:
|
||||
msg = Printer(no_print=silent, pretty=not silent)
|
||||
if not exclude:
|
||||
exclude = []
|
||||
if model:
|
||||
if url:
|
||||
if model is not None:
|
||||
title = f"Download info for pipeline '{model}'"
|
||||
data = info_model_url(model)
|
||||
print(data["download_url"])
|
||||
return data
|
||||
else:
|
||||
msg.fail("--url option requires a pipeline name", exits=1)
|
||||
elif model:
|
||||
title = f"Info about pipeline '{model}'"
|
||||
data = info_model(model, silent=silent)
|
||||
else:
|
||||
|
@ -99,11 +121,44 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
|
|||
meta["source"] = str(model_path.resolve())
|
||||
else:
|
||||
meta["source"] = str(model_path)
|
||||
download_url = info_installed_model_url(model)
|
||||
if download_url:
|
||||
meta["download_url"] = download_url
|
||||
return {
|
||||
k: v for k, v in meta.items() if k not in ("accuracy", "performance", "speed")
|
||||
}
|
||||
|
||||
|
||||
def info_installed_model_url(model: str) -> Optional[str]:
|
||||
"""Given a pipeline name, get the download URL if available, otherwise
|
||||
return None.
|
||||
|
||||
This is only available for pipelines installed as modules that have
|
||||
dist-info available.
|
||||
"""
|
||||
try:
|
||||
dist = pkg_resources.get_distribution(model)
|
||||
data = json.loads(dist.get_metadata("direct_url.json"))
|
||||
return data["url"]
|
||||
except pkg_resources.DistributionNotFound:
|
||||
# no such package
|
||||
return None
|
||||
except Exception:
|
||||
# something else, like no file or invalid JSON
|
||||
return None
|
||||
|
||||
|
||||
def info_model_url(model: str) -> Dict[str, Any]:
|
||||
"""Return the download URL for the latest version of a pipeline."""
|
||||
version = get_latest_version(model)
|
||||
|
||||
filename = get_model_filename(model, version)
|
||||
download_url = about.__download_url__ + "/" + filename
|
||||
release_tpl = "https://github.com/explosion/spacy-models/releases/tag/{m}-{v}"
|
||||
release_url = release_tpl.format(m=model, v=version)
|
||||
return {"download_url": download_url, "release_url": release_url}
|
||||
|
||||
|
||||
def get_markdown(
|
||||
data: Dict[str, Any],
|
||||
title: Optional[str] = None,
|
||||
|
|
|
@ -10,6 +10,7 @@ from jinja2 import Template
|
|||
from .. import util
|
||||
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
|
||||
from ..schemas import RecommendationSchema
|
||||
from ..util import SimpleFrozenList
|
||||
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
|
||||
from ._util import string_to_list, import_code
|
||||
|
||||
|
@ -24,16 +25,30 @@ class Optimizations(str, Enum):
|
|||
accuracy = "accuracy"
|
||||
|
||||
|
||||
class InitValues:
|
||||
"""
|
||||
Default values for initialization. Dedicated class to allow synchronized default values for init_config_cli() and
|
||||
init_config(), i.e. initialization calls via CLI respectively Python.
|
||||
"""
|
||||
|
||||
lang = "en"
|
||||
pipeline = SimpleFrozenList(["tagger", "parser", "ner"])
|
||||
optimize = Optimizations.efficiency
|
||||
gpu = False
|
||||
pretraining = False
|
||||
force_overwrite = False
|
||||
|
||||
|
||||
@init_cli.command("config")
|
||||
def init_config_cli(
|
||||
# fmt: off
|
||||
output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
|
||||
lang: str = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
|
||||
pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
|
||||
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||
gpu: bool = Opt(False, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
|
||||
force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
|
||||
lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"),
|
||||
pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
|
||||
optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||
gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||
pretraining: bool = Opt(InitValues.pretraining, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
|
||||
force_overwrite: bool = Opt(InitValues.force_overwrite, "--force", "-F", help="Force overwriting the output file"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
@ -133,11 +148,11 @@ def fill_config(
|
|||
|
||||
def init_config(
|
||||
*,
|
||||
lang: str,
|
||||
pipeline: List[str],
|
||||
optimize: str,
|
||||
gpu: bool,
|
||||
pretraining: bool = False,
|
||||
lang: str = InitValues.lang,
|
||||
pipeline: List[str] = InitValues.pipeline,
|
||||
optimize: str = InitValues.optimize,
|
||||
gpu: bool = InitValues.gpu,
|
||||
pretraining: bool = InitValues.pretraining,
|
||||
silent: bool = True,
|
||||
) -> Config:
|
||||
msg = Printer(no_print=silent)
|
||||
|
|
|
@ -299,8 +299,8 @@ def get_meta(
|
|||
}
|
||||
nlp = util.load_model_from_path(Path(model_path))
|
||||
meta.update(nlp.meta)
|
||||
meta.update(existing_meta)
|
||||
meta["spacy_version"] = util.get_minor_version_range(about.__version__)
|
||||
meta.update(existing_meta)
|
||||
meta["vectors"] = {
|
||||
"width": nlp.vocab.vectors_length,
|
||||
"vectors": len(nlp.vocab.vectors),
|
||||
|
|
|
@ -61,7 +61,7 @@ def pretrain_cli(
|
|||
# TODO: What's the solution here? How do we handle optional blocks?
|
||||
msg.fail("The [pretraining] block in your config is empty", exits=1)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
output_dir.mkdir(parents=True)
|
||||
msg.good(f"Created output directory: {output_dir}")
|
||||
# Save non-interpolated config
|
||||
raw_config.to_disk(output_dir / "config.cfg")
|
||||
|
|
|
@ -12,6 +12,9 @@ from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
|
|||
from .._util import get_checksum, download_file, git_checkout, get_git_version
|
||||
from .._util import SimpleFrozenDict, parse_config_overrides
|
||||
|
||||
# Whether assets are extra if `extra` is not set.
|
||||
EXTRA_DEFAULT = False
|
||||
|
||||
|
||||
@project_cli.command(
|
||||
"assets",
|
||||
|
@ -21,7 +24,8 @@ def project_assets_cli(
|
|||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+.")
|
||||
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+."),
|
||||
extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
|
||||
# fmt: on
|
||||
):
|
||||
"""Fetch project assets like datasets and pretrained weights. Assets are
|
||||
|
@ -32,7 +36,12 @@ def project_assets_cli(
|
|||
DOCS: https://spacy.io/api/cli#project-assets
|
||||
"""
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
project_assets(project_dir, overrides=overrides, sparse_checkout=sparse_checkout)
|
||||
project_assets(
|
||||
project_dir,
|
||||
overrides=overrides,
|
||||
sparse_checkout=sparse_checkout,
|
||||
extra=extra,
|
||||
)
|
||||
|
||||
|
||||
def project_assets(
|
||||
|
@ -40,17 +49,29 @@ def project_assets(
|
|||
*,
|
||||
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
||||
sparse_checkout: bool = False,
|
||||
extra: bool = False,
|
||||
) -> None:
|
||||
"""Fetch assets for a project using DVC if possible.
|
||||
|
||||
project_dir (Path): Path to project directory.
|
||||
sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files
|
||||
needed.
|
||||
extra (bool): Whether to download all assets, including those marked as 'extra'.
|
||||
"""
|
||||
project_path = ensure_path(project_dir)
|
||||
config = load_project_config(project_path, overrides=overrides)
|
||||
assets = config.get("assets", {})
|
||||
assets = [
|
||||
asset
|
||||
for asset in config.get("assets", [])
|
||||
if extra or not asset.get("extra", EXTRA_DEFAULT)
|
||||
]
|
||||
if not assets:
|
||||
msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
|
||||
msg.warn(
|
||||
f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
|
||||
exits=0,
|
||||
)
|
||||
msg.info(f"Fetching {len(assets)} asset(s)")
|
||||
|
||||
for asset in assets:
|
||||
dest = (project_dir / asset["dest"]).resolve()
|
||||
checksum = asset.get("checksum")
|
||||
|
@ -168,7 +189,11 @@ def convert_asset_url(url: str) -> str:
|
|||
RETURNS (str): The converted URL.
|
||||
"""
|
||||
# If the asset URL is a regular GitHub URL it's likely a mistake
|
||||
if re.match(r"(http(s?)):\/\/github.com", url) and "releases/download" not in url:
|
||||
if (
|
||||
re.match(r"(http(s?)):\/\/github.com", url)
|
||||
and "releases/download" not in url
|
||||
and "/raw/" not in url
|
||||
):
|
||||
converted = url.replace("github.com", "raw.githubusercontent.com")
|
||||
converted = re.sub(r"/(tree|blob)/", "/", converted)
|
||||
msg.warn(
|
||||
|
|
|
@ -7,11 +7,11 @@ import re
|
|||
from ... import about
|
||||
from ...util import ensure_path
|
||||
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
|
||||
from .._util import git_checkout, get_git_version
|
||||
from .._util import git_checkout, get_git_version, git_repo_branch_exists
|
||||
|
||||
DEFAULT_REPO = about.__projects__
|
||||
DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
|
||||
DEFAULT_BRANCH = "master"
|
||||
DEFAULT_BRANCHES = ["main", "master"]
|
||||
|
||||
|
||||
@project_cli.command("clone")
|
||||
|
@ -20,7 +20,7 @@ def project_clone_cli(
|
|||
name: str = Arg(..., help="The name of the template to clone"),
|
||||
dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
|
||||
repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
|
||||
branch: Optional[str] = Opt(None, "--branch", "-b", help="The branch to clone from"),
|
||||
branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
|
||||
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.")
|
||||
# fmt: on
|
||||
):
|
||||
|
@ -33,9 +33,25 @@ def project_clone_cli(
|
|||
"""
|
||||
if dest is None:
|
||||
dest = Path.cwd() / Path(name).parts[-1]
|
||||
if repo == DEFAULT_REPO and branch is None:
|
||||
branch = DEFAULT_PROJECTS_BRANCH
|
||||
|
||||
if branch is None:
|
||||
# If it's a user repo, we want to default to other branch
|
||||
branch = DEFAULT_PROJECTS_BRANCH if repo == DEFAULT_REPO else DEFAULT_BRANCH
|
||||
for default_branch in DEFAULT_BRANCHES:
|
||||
if git_repo_branch_exists(repo, default_branch):
|
||||
branch = default_branch
|
||||
break
|
||||
if branch is None:
|
||||
default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES)
|
||||
msg.fail(
|
||||
"No branch provided and attempted default "
|
||||
f"branches {default_branches_msg} do not exist.",
|
||||
exits=1,
|
||||
)
|
||||
else:
|
||||
if not git_repo_branch_exists(repo, branch):
|
||||
msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1)
|
||||
assert isinstance(branch, str)
|
||||
project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout)
|
||||
|
||||
|
||||
|
@ -61,9 +77,9 @@ def project_clone(
|
|||
try:
|
||||
git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
|
||||
except subprocess.CalledProcessError:
|
||||
err = f"Could not clone '{name}' from repo '{repo_name}'"
|
||||
err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')"
|
||||
msg.fail(err, exits=1)
|
||||
msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
|
||||
msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir)
|
||||
if not (project_dir / PROJECT_FILE).exists():
|
||||
msg.warn(f"No {PROJECT_FILE} found in directory")
|
||||
else:
|
||||
|
|
|
@ -25,6 +25,7 @@ def project_update_dvc_cli(
|
|||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
|
||||
quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
|
||||
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
|
||||
# fmt: on
|
||||
):
|
||||
|
@ -36,7 +37,7 @@ def project_update_dvc_cli(
|
|||
|
||||
DOCS: https://spacy.io/api/cli#project-dvc
|
||||
"""
|
||||
project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
|
||||
project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
|
||||
|
||||
|
||||
def project_update_dvc(
|
||||
|
@ -44,6 +45,7 @@ def project_update_dvc(
|
|||
workflow: Optional[str] = None,
|
||||
*,
|
||||
verbose: bool = False,
|
||||
quiet: bool = False,
|
||||
force: bool = False,
|
||||
) -> None:
|
||||
"""Update the auto-generated Data Version Control (DVC) config file. A DVC
|
||||
|
@ -54,11 +56,12 @@ def project_update_dvc(
|
|||
workflow (Optional[str]): Optional name of workflow defined in project.yml.
|
||||
If not set, the first workflow will be used.
|
||||
verbose (bool): Print more info.
|
||||
quiet (bool): Print less info.
|
||||
force (bool): Force update DVC config.
|
||||
"""
|
||||
config = load_project_config(project_dir)
|
||||
updated = update_dvc_config(
|
||||
project_dir, config, workflow, verbose=verbose, force=force
|
||||
project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
|
||||
)
|
||||
help_msg = "To execute the workflow with DVC, run: dvc repro"
|
||||
if updated:
|
||||
|
@ -72,7 +75,7 @@ def update_dvc_config(
|
|||
config: Dict[str, Any],
|
||||
workflow: Optional[str] = None,
|
||||
verbose: bool = False,
|
||||
silent: bool = False,
|
||||
quiet: bool = False,
|
||||
force: bool = False,
|
||||
) -> bool:
|
||||
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
|
||||
|
@ -83,7 +86,7 @@ def update_dvc_config(
|
|||
path (Path): The path to the project directory.
|
||||
config (Dict[str, Any]): The loaded project.yml.
|
||||
verbose (bool): Whether to print additional info (via DVC).
|
||||
silent (bool): Don't output anything (via DVC).
|
||||
quiet (bool): Don't output anything (via DVC).
|
||||
force (bool): Force update, even if hashes match.
|
||||
RETURNS (bool): Whether the DVC config file was updated.
|
||||
"""
|
||||
|
@ -105,6 +108,14 @@ def update_dvc_config(
|
|||
dvc_config_path.unlink()
|
||||
dvc_commands = []
|
||||
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
||||
|
||||
# some flags that apply to every command
|
||||
flags = []
|
||||
if verbose:
|
||||
flags.append("--verbose")
|
||||
if quiet:
|
||||
flags.append("--quiet")
|
||||
|
||||
for name in workflows[workflow]:
|
||||
command = config_commands[name]
|
||||
deps = command.get("deps", [])
|
||||
|
@ -118,14 +129,26 @@ def update_dvc_config(
|
|||
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
|
||||
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
|
||||
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
|
||||
dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
|
||||
|
||||
dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
|
||||
if command.get("no_skip"):
|
||||
dvc_cmd.append("--always-changed")
|
||||
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
|
||||
dvc_commands.append(join_command(full_cmd))
|
||||
|
||||
if not dvc_commands:
|
||||
# If we don't check for this, then there will be an error when reading the
|
||||
# config, since DVC wouldn't create it.
|
||||
msg.fail(
|
||||
"No usable commands for DVC found. This can happen if none of your "
|
||||
"commands have dependencies or outputs.",
|
||||
exits=1,
|
||||
)
|
||||
|
||||
with working_dir(path):
|
||||
dvc_flags = {"--verbose": verbose, "--quiet": silent}
|
||||
run_dvc_commands(dvc_commands, flags=dvc_flags)
|
||||
for c in dvc_commands:
|
||||
dvc_command = "dvc " + c
|
||||
run_command(dvc_command)
|
||||
with dvc_config_path.open("r+", encoding="utf8") as f:
|
||||
content = f.read()
|
||||
f.seek(0, 0)
|
||||
|
@ -133,26 +156,6 @@ def update_dvc_config(
|
|||
return True
|
||||
|
||||
|
||||
def run_dvc_commands(
|
||||
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
|
||||
) -> None:
|
||||
"""Run a sequence of DVC commands in a subprocess, in order.
|
||||
|
||||
commands (List[str]): The string commands without the leading "dvc".
|
||||
flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
|
||||
easier to pass flags like --quiet that depend on a variable or
|
||||
command-line setting while avoiding lots of nested conditionals.
|
||||
"""
|
||||
for c in commands:
|
||||
command = split_command(c)
|
||||
dvc_command = ["dvc", *command]
|
||||
# Add the flags if they are set to True
|
||||
for flag, is_active in flags.items():
|
||||
if is_active:
|
||||
dvc_command.append(flag)
|
||||
run_command(dvc_command)
|
||||
|
||||
|
||||
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
|
||||
"""Validate workflows provided in project.yml and check that a given
|
||||
workflow can be used to generate a DVC config.
|
||||
|
|
|
@ -5,14 +5,17 @@ import hashlib
|
|||
import urllib.parse
|
||||
import tarfile
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
|
||||
from .._util import get_hash, get_checksum, download_file, ensure_pathy
|
||||
from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
|
||||
from .._util import get_hash, get_checksum, upload_file, download_file
|
||||
from .._util import ensure_pathy, make_tempdir
|
||||
from ...util import get_minor_version, ENV_VARS, check_bool_env_var
|
||||
from ...git_info import GIT_VERSION
|
||||
from ... import about
|
||||
from ...errors import Errors
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathy import Pathy # noqa: F401
|
||||
from pathy import FluidPath # noqa: F401
|
||||
|
||||
|
||||
class RemoteStorage:
|
||||
|
@ -27,7 +30,7 @@ class RemoteStorage:
|
|||
self.url = ensure_pathy(url)
|
||||
self.compression = compression
|
||||
|
||||
def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
|
||||
def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
|
||||
"""Compress a file or directory within a project and upload it to a remote
|
||||
storage. If an object exists at the full URL, nothing is done.
|
||||
|
||||
|
@ -48,9 +51,7 @@ class RemoteStorage:
|
|||
mode_string = f"w:{self.compression}" if self.compression else "w"
|
||||
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
|
||||
tar_file.add(str(loc), arcname=str(path))
|
||||
with tar_loc.open(mode="rb") as input_file:
|
||||
with url.open(mode="wb") as output_file:
|
||||
output_file.write(input_file.read())
|
||||
upload_file(tar_loc, url)
|
||||
return url
|
||||
|
||||
def pull(
|
||||
|
@ -59,7 +60,7 @@ class RemoteStorage:
|
|||
*,
|
||||
command_hash: Optional[str] = None,
|
||||
content_hash: Optional[str] = None,
|
||||
) -> Optional["Pathy"]:
|
||||
) -> Optional["FluidPath"]:
|
||||
"""Retrieve a file from the remote cache. If the file already exists,
|
||||
nothing is done.
|
||||
|
||||
|
@ -84,7 +85,23 @@ class RemoteStorage:
|
|||
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
|
||||
# This requires that the path is added correctly, relative
|
||||
# to root. This is how we set things up in push()
|
||||
tar_file.extractall(self.root)
|
||||
|
||||
# Disallow paths outside the current directory for the tar
|
||||
# file (CVE-2007-4559, directory traversal vulnerability)
|
||||
def is_within_directory(directory, target):
|
||||
abs_directory = os.path.abspath(directory)
|
||||
abs_target = os.path.abspath(target)
|
||||
prefix = os.path.commonprefix([abs_directory, abs_target])
|
||||
return prefix == abs_directory
|
||||
|
||||
def safe_extract(tar, path):
|
||||
for member in tar.getmembers():
|
||||
member_path = os.path.join(path, member.name)
|
||||
if not is_within_directory(path, member_path):
|
||||
raise ValueError(Errors.E852)
|
||||
tar.extractall(path)
|
||||
|
||||
safe_extract(tar_file, self.root)
|
||||
return url
|
||||
|
||||
def find(
|
||||
|
@ -93,25 +110,37 @@ class RemoteStorage:
|
|||
*,
|
||||
command_hash: Optional[str] = None,
|
||||
content_hash: Optional[str] = None,
|
||||
) -> Optional["Pathy"]:
|
||||
) -> Optional["FluidPath"]:
|
||||
"""Find the best matching version of a file within the storage,
|
||||
or `None` if no match can be found. If both the creation and content hash
|
||||
are specified, only exact matches will be returned. Otherwise, the most
|
||||
recent matching file is preferred.
|
||||
"""
|
||||
name = self.encode_name(str(path))
|
||||
urls = []
|
||||
if command_hash is not None and content_hash is not None:
|
||||
url = self.make_url(path, command_hash, content_hash)
|
||||
url = self.url / name / command_hash / content_hash
|
||||
urls = [url] if url.exists() else []
|
||||
elif command_hash is not None:
|
||||
urls = list((self.url / name / command_hash).iterdir())
|
||||
if (self.url / name / command_hash).exists():
|
||||
urls = list((self.url / name / command_hash).iterdir())
|
||||
else:
|
||||
urls = list((self.url / name).iterdir())
|
||||
if content_hash is not None:
|
||||
urls = [url for url in urls if url.parts[-1] == content_hash]
|
||||
if (self.url / name).exists():
|
||||
for sub_dir in (self.url / name).iterdir():
|
||||
urls.extend(sub_dir.iterdir())
|
||||
if content_hash is not None:
|
||||
urls = [url for url in urls if url.parts[-1] == content_hash]
|
||||
if len(urls) >= 2:
|
||||
try:
|
||||
urls.sort(key=lambda x: x.stat().last_modified) # type: ignore
|
||||
except Exception:
|
||||
msg.warn(
|
||||
"Unable to sort remote files by last modified. The file(s) "
|
||||
"pulled from the cache may not be the most recent."
|
||||
)
|
||||
return urls[-1] if urls else None
|
||||
|
||||
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
|
||||
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
|
||||
"""Construct a URL from a subpath, a creation hash and a content hash."""
|
||||
return self.url / self.encode_name(str(path)) / command_hash / content_hash
|
||||
|
||||
|
|
|
@ -1,5 +1,8 @@
|
|||
from typing import Optional, List, Dict, Sequence, Any, Iterable
|
||||
from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
|
||||
import os.path
|
||||
from pathlib import Path
|
||||
|
||||
import pkg_resources
|
||||
from wasabi import msg
|
||||
from wasabi.util import locale_escape
|
||||
import sys
|
||||
|
@ -50,6 +53,7 @@ def project_run(
|
|||
force: bool = False,
|
||||
dry: bool = False,
|
||||
capture: bool = False,
|
||||
skip_requirements_check: bool = False,
|
||||
) -> None:
|
||||
"""Run a named script defined in the project.yml. If the script is part
|
||||
of the default pipeline (defined in the "run" section), DVC is used to
|
||||
|
@ -66,11 +70,19 @@ def project_run(
|
|||
sys.exit will be called with the return code. You should use capture=False
|
||||
when you want to turn over execution to the command, and capture=True
|
||||
when you want to run the command more like a function.
|
||||
skip_requirements_check (bool): Whether to skip the requirements check.
|
||||
"""
|
||||
config = load_project_config(project_dir, overrides=overrides)
|
||||
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
||||
workflows = config.get("workflows", {})
|
||||
validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
|
||||
|
||||
req_path = project_dir / "requirements.txt"
|
||||
if not skip_requirements_check:
|
||||
if config.get("check_requirements", True) and os.path.exists(req_path):
|
||||
with req_path.open() as requirements_file:
|
||||
_check_requirements([req.strip() for req in requirements_file])
|
||||
|
||||
if subcommand in workflows:
|
||||
msg.info(f"Running workflow '{subcommand}'")
|
||||
for cmd in workflows[subcommand]:
|
||||
|
@ -81,6 +93,7 @@ def project_run(
|
|||
force=force,
|
||||
dry=dry,
|
||||
capture=capture,
|
||||
skip_requirements_check=True,
|
||||
)
|
||||
else:
|
||||
cmd = commands[subcommand]
|
||||
|
@ -88,8 +101,8 @@ def project_run(
|
|||
if not (project_dir / dep).exists():
|
||||
err = f"Missing dependency specified by command '{subcommand}': {dep}"
|
||||
err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
|
||||
err_kwargs = {"exits": 1} if not dry else {}
|
||||
msg.fail(err, err_help, **err_kwargs)
|
||||
err_exits = 1 if not dry else None
|
||||
msg.fail(err, err_help, exits=err_exits)
|
||||
check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
|
||||
with working_dir(project_dir) as current_dir:
|
||||
msg.divider(subcommand)
|
||||
|
@ -195,6 +208,8 @@ def validate_subcommand(
|
|||
msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
|
||||
if subcommand not in commands and subcommand not in workflows:
|
||||
help_msg = []
|
||||
if subcommand in ["assets", "asset"]:
|
||||
help_msg.append("Did you mean to run: python -m spacy project assets?")
|
||||
if commands:
|
||||
help_msg.append(f"Available commands: {', '.join(commands)}")
|
||||
if workflows:
|
||||
|
@ -308,3 +323,38 @@ def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional
|
|||
md5 = get_checksum(file_path) if file_path.exists() else None
|
||||
data.append({"path": path, "md5": md5})
|
||||
return data
|
||||
|
||||
|
||||
def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
|
||||
"""Checks whether requirements are installed and free of version conflicts.
|
||||
requirements (List[str]): List of requirements.
|
||||
RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
|
||||
exist.
|
||||
"""
|
||||
|
||||
failed_pkgs_msgs: List[str] = []
|
||||
conflicting_pkgs_msgs: List[str] = []
|
||||
|
||||
for req in requirements:
|
||||
try:
|
||||
pkg_resources.require(req)
|
||||
except pkg_resources.DistributionNotFound as dnf:
|
||||
failed_pkgs_msgs.append(dnf.report())
|
||||
except pkg_resources.VersionConflict as vc:
|
||||
conflicting_pkgs_msgs.append(vc.report())
|
||||
except Exception:
|
||||
msg.warn(
|
||||
f"Unable to check requirement: {req} "
|
||||
"Checks are currently limited to requirement specifiers "
|
||||
"(PEP 508)"
|
||||
)
|
||||
|
||||
if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs):
|
||||
msg.warn(
|
||||
title="Missing requirements or requirement conflicts detected. Make sure your Python environment is set up "
|
||||
"correctly and you installed all requirements specified in your project's requirements.txt: "
|
||||
)
|
||||
for pgk_msg in failed_pkgs_msgs + conflicting_pkgs_msgs:
|
||||
msg.text(pgk_msg)
|
||||
|
||||
return len(failed_pkgs_msgs) > 0, len(conflicting_pkgs_msgs) > 0
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
{# This is a template for training configs used for the quickstart widget in
|
||||
the docs and the init config command. It encodes various best practices and
|
||||
can help generate the best possible configuration, given a user's requirements. #}
|
||||
{%- set use_transformer = hardware != "cpu" -%}
|
||||
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
|
||||
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
||||
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
|
||||
[paths]
|
||||
train = null
|
||||
dev = null
|
||||
|
@ -24,10 +25,10 @@ lang = "{{ lang }}"
|
|||
{%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
|
||||
{%- set with_accuracy = optimize == "accuracy" -%}
|
||||
{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
|
||||
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "entity_linker" in components or has_accurate_textcat) -%}
|
||||
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
|
||||
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%}
|
||||
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
|
||||
{%- else -%}
|
||||
{%- set full_pipeline = components %}
|
||||
{%- set full_pipeline = components -%}
|
||||
{%- endif %}
|
||||
pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }}
|
||||
batch_size = {{ 128 if hardware == "gpu" else 1000 }}
|
||||
|
@ -54,7 +55,7 @@ stride = 96
|
|||
factory = "morphologizer"
|
||||
|
||||
[components.morphologizer.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
nO = null
|
||||
|
||||
[components.morphologizer.model.tok2vec]
|
||||
|
@ -70,7 +71,7 @@ grad_factor = 1.0
|
|||
factory = "tagger"
|
||||
|
||||
[components.tagger.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
nO = null
|
||||
|
||||
[components.tagger.model.tok2vec]
|
||||
|
@ -123,6 +124,60 @@ grad_factor = 1.0
|
|||
@layers = "reduce_mean.v1"
|
||||
{% endif -%}
|
||||
|
||||
{% if "spancat" in components -%}
|
||||
[components.spancat]
|
||||
factory = "spancat"
|
||||
max_positive = null
|
||||
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
|
||||
spans_key = "sc"
|
||||
threshold = 0.5
|
||||
|
||||
[components.spancat.model]
|
||||
@architectures = "spacy.SpanCategorizer.v1"
|
||||
|
||||
[components.spancat.model.reducer]
|
||||
@layers = "spacy.mean_max_reducer.v1"
|
||||
hidden_size = 128
|
||||
|
||||
[components.spancat.model.scorer]
|
||||
@layers = "spacy.LinearLogistic.v1"
|
||||
nO = null
|
||||
nI = null
|
||||
|
||||
[components.spancat.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.spancat.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
|
||||
[components.spancat.suggester]
|
||||
@misc = "spacy.ngram_suggester.v1"
|
||||
sizes = [1,2,3]
|
||||
{% endif -%}
|
||||
|
||||
{% if "trainable_lemmatizer" in components -%}
|
||||
[components.trainable_lemmatizer]
|
||||
factory = "trainable_lemmatizer"
|
||||
backoff = "orth"
|
||||
min_tree_freq = 3
|
||||
overwrite = false
|
||||
scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"}
|
||||
top_k = 1
|
||||
|
||||
[components.trainable_lemmatizer.model]
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
nO = null
|
||||
normalize = false
|
||||
|
||||
[components.trainable_lemmatizer.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.trainable_lemmatizer.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
{% endif -%}
|
||||
|
||||
{% if "entity_linker" in components -%}
|
||||
[components.entity_linker]
|
||||
factory = "entity_linker"
|
||||
|
@ -131,7 +186,7 @@ incl_context = true
|
|||
incl_prior = true
|
||||
|
||||
[components.entity_linker.model]
|
||||
@architectures = "spacy.EntityLinker.v1"
|
||||
@architectures = "spacy.EntityLinker.v2"
|
||||
nO = null
|
||||
|
||||
[components.entity_linker.model.tok2vec]
|
||||
|
@ -216,13 +271,8 @@ factory = "tok2vec"
|
|||
[components.tok2vec.model.embed]
|
||||
@architectures = "spacy.MultiHashEmbed.v2"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
{% if has_letters -%}
|
||||
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
|
||||
rows = [5000, 2500, 2500, 2500]
|
||||
{% else -%}
|
||||
attrs = ["ORTH", "SHAPE"]
|
||||
rows = [5000, 2500]
|
||||
{% endif -%}
|
||||
rows = [5000, 1000, 2500, 2500]
|
||||
include_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
|
||||
|
||||
[components.tok2vec.model.encode]
|
||||
|
@ -238,7 +288,7 @@ maxout_pieces = 3
|
|||
factory = "morphologizer"
|
||||
|
||||
[components.morphologizer.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
nO = null
|
||||
|
||||
[components.morphologizer.model.tok2vec]
|
||||
|
@ -251,7 +301,7 @@ width = ${components.tok2vec.model.encode.width}
|
|||
factory = "tagger"
|
||||
|
||||
[components.tagger.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
nO = null
|
||||
|
||||
[components.tagger.model.tok2vec]
|
||||
|
@ -295,6 +345,54 @@ nO = null
|
|||
width = ${components.tok2vec.model.encode.width}
|
||||
{% endif %}
|
||||
|
||||
{% if "spancat" in components %}
|
||||
[components.spancat]
|
||||
factory = "spancat"
|
||||
max_positive = null
|
||||
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
|
||||
spans_key = "sc"
|
||||
threshold = 0.5
|
||||
|
||||
[components.spancat.model]
|
||||
@architectures = "spacy.SpanCategorizer.v1"
|
||||
|
||||
[components.spancat.model.reducer]
|
||||
@layers = "spacy.mean_max_reducer.v1"
|
||||
hidden_size = 128
|
||||
|
||||
[components.spancat.model.scorer]
|
||||
@layers = "spacy.LinearLogistic.v1"
|
||||
nO = null
|
||||
nI = null
|
||||
|
||||
[components.spancat.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
|
||||
[components.spancat.suggester]
|
||||
@misc = "spacy.ngram_suggester.v1"
|
||||
sizes = [1,2,3]
|
||||
{% endif %}
|
||||
|
||||
{% if "trainable_lemmatizer" in components -%}
|
||||
[components.trainable_lemmatizer]
|
||||
factory = "trainable_lemmatizer"
|
||||
backoff = "orth"
|
||||
min_tree_freq = 3
|
||||
overwrite = false
|
||||
scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"}
|
||||
top_k = 1
|
||||
|
||||
[components.trainable_lemmatizer.model]
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
nO = null
|
||||
normalize = false
|
||||
|
||||
[components.trainable_lemmatizer.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
{% endif -%}
|
||||
|
||||
{% if "entity_linker" in components -%}
|
||||
[components.entity_linker]
|
||||
factory = "entity_linker"
|
||||
|
@ -303,7 +401,7 @@ incl_context = true
|
|||
incl_prior = true
|
||||
|
||||
[components.entity_linker.model]
|
||||
@architectures = "spacy.EntityLinker.v1"
|
||||
@architectures = "spacy.EntityLinker.v2"
|
||||
nO = null
|
||||
|
||||
[components.entity_linker.model.tok2vec]
|
||||
|
@ -369,7 +467,7 @@ no_output_layer = false
|
|||
{% endif %}
|
||||
|
||||
{% for pipe in components %}
|
||||
{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker"] %}
|
||||
{% if pipe not in listener_components %}
|
||||
{# Other components defined by the user: we just assume they're factories #}
|
||||
[components.{{ pipe }}]
|
||||
factory = "{{ pipe }}"
|
||||
|
|
|
@ -37,6 +37,15 @@ bn:
|
|||
accuracy:
|
||||
name: sagorsarker/bangla-bert-base
|
||||
size_factor: 3
|
||||
ca:
|
||||
word_vectors: null
|
||||
transformer:
|
||||
efficiency:
|
||||
name: projecte-aina/roberta-base-ca-v2
|
||||
size_factor: 3
|
||||
accuracy:
|
||||
name: projecte-aina/roberta-base-ca-v2
|
||||
size_factor: 3
|
||||
da:
|
||||
word_vectors: da_core_news_lg
|
||||
transformer:
|
||||
|
@ -271,4 +280,3 @@ zh:
|
|||
accuracy:
|
||||
name: bert-base-chinese
|
||||
size_factor: 3
|
||||
has_letters: false
|
||||
|
|
|
@ -90,6 +90,8 @@ dev_corpus = "corpora.dev"
|
|||
train_corpus = "corpora.train"
|
||||
# Optional callback before nlp object is saved to disk after training
|
||||
before_to_disk = null
|
||||
# Optional callback that is invoked at the start of each training step
|
||||
before_update = null
|
||||
|
||||
[training.logger]
|
||||
@loggers = "spacy.ConsoleLogger.v1"
|
||||
|
|
|
@ -7,10 +7,11 @@ USAGE: https://spacy.io/usage/visualizers
|
|||
from typing import Union, Iterable, Optional, Dict, Any, Callable
|
||||
import warnings
|
||||
|
||||
from .render import DependencyRenderer, EntityRenderer
|
||||
from .render import DependencyRenderer, EntityRenderer, SpanRenderer
|
||||
from ..tokens import Doc, Span
|
||||
from ..errors import Errors, Warnings
|
||||
from ..util import is_in_jupyter
|
||||
from ..util import find_available_port
|
||||
|
||||
|
||||
_html = {}
|
||||
|
@ -36,7 +37,7 @@ def render(
|
|||
jupyter (bool): Override Jupyter auto-detection.
|
||||
options (dict): Visualiser-specific options, e.g. colors.
|
||||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||
RETURNS (str): Rendered HTML markup.
|
||||
RETURNS (str): Rendered SVG or HTML markup.
|
||||
|
||||
DOCS: https://spacy.io/api/top-level#displacy.render
|
||||
USAGE: https://spacy.io/usage/visualizers
|
||||
|
@ -44,6 +45,7 @@ def render(
|
|||
factories = {
|
||||
"dep": (DependencyRenderer, parse_deps),
|
||||
"ent": (EntityRenderer, parse_ents),
|
||||
"span": (SpanRenderer, parse_spans),
|
||||
}
|
||||
if style not in factories:
|
||||
raise ValueError(Errors.E087.format(style=style))
|
||||
|
@ -55,6 +57,10 @@ def render(
|
|||
renderer_func, converter = factories[style]
|
||||
renderer = renderer_func(options=options)
|
||||
parsed = [converter(doc, options) for doc in docs] if not manual else docs # type: ignore
|
||||
if manual:
|
||||
for doc in docs:
|
||||
if isinstance(doc, dict) and "ents" in doc:
|
||||
doc["ents"] = sorted(doc["ents"], key=lambda x: (x["start"], x["end"]))
|
||||
_html["parsed"] = renderer.render(parsed, page=page, minify=minify).strip() # type: ignore
|
||||
html = _html["parsed"]
|
||||
if RENDER_WRAPPER is not None:
|
||||
|
@ -77,6 +83,7 @@ def serve(
|
|||
manual: bool = False,
|
||||
port: int = 5000,
|
||||
host: str = "0.0.0.0",
|
||||
auto_select_port: bool = False,
|
||||
) -> None:
|
||||
"""Serve displaCy visualisation.
|
||||
|
||||
|
@ -88,12 +95,15 @@ def serve(
|
|||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||
port (int): Port to serve visualisation.
|
||||
host (str): Host to serve visualisation.
|
||||
auto_select_port (bool): Automatically select a port if the specified port is in use.
|
||||
|
||||
DOCS: https://spacy.io/api/top-level#displacy.serve
|
||||
USAGE: https://spacy.io/usage/visualizers
|
||||
"""
|
||||
from wsgiref import simple_server
|
||||
|
||||
port = find_available_port(port, host, auto_select_port)
|
||||
|
||||
if is_in_jupyter():
|
||||
warnings.warn(Warnings.W011)
|
||||
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
|
||||
|
@ -118,7 +128,8 @@ def app(environ, start_response):
|
|||
def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
||||
"""Generate dependency parse in {'words': [], 'arcs': []} format.
|
||||
|
||||
doc (Doc): Document do parse.
|
||||
orig_doc (Doc): Document to parse.
|
||||
options (Dict[str, Any]): Dependency parse specific visualisation options.
|
||||
RETURNS (dict): Generated dependency parse keyed by words and arcs.
|
||||
"""
|
||||
doc = Doc(orig_doc.vocab).from_bytes(
|
||||
|
@ -203,6 +214,43 @@ def parse_ents(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
|||
return {"text": doc.text, "ents": ents, "title": title, "settings": settings}
|
||||
|
||||
|
||||
def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
||||
"""Generate spans in [{start_token: i, end_token: i, label: 'label'}] format.
|
||||
|
||||
doc (Doc): Document to parse.
|
||||
options (Dict[str, any]): Span-specific visualisation options.
|
||||
RETURNS (dict): Generated span types keyed by text (original text) and spans.
|
||||
"""
|
||||
kb_url_template = options.get("kb_url_template", None)
|
||||
spans_key = options.get("spans_key", "sc")
|
||||
spans = [
|
||||
{
|
||||
"start": span.start_char,
|
||||
"end": span.end_char,
|
||||
"start_token": span.start,
|
||||
"end_token": span.end,
|
||||
"label": span.label_,
|
||||
"kb_id": span.kb_id_ if span.kb_id_ else "",
|
||||
"kb_url": kb_url_template.format(span.kb_id_) if kb_url_template else "#",
|
||||
}
|
||||
for span in doc.spans.get(spans_key, [])
|
||||
]
|
||||
tokens = [token.text for token in doc]
|
||||
|
||||
if not spans:
|
||||
keys = list(doc.spans.keys())
|
||||
warnings.warn(Warnings.W117.format(spans_key=spans_key, keys=keys))
|
||||
title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
|
||||
settings = get_doc_settings(doc)
|
||||
return {
|
||||
"text": doc.text,
|
||||
"spans": spans,
|
||||
"title": title,
|
||||
"settings": settings,
|
||||
"tokens": tokens,
|
||||
}
|
||||
|
||||
|
||||
def set_render_wrapper(func: Callable[[str], str]) -> None:
|
||||
"""Set an optional wrapper function that is called around the generated
|
||||
HTML markup on displacy.render. This can be used to allow integration into
|
||||
|
|
|
@ -1,12 +1,15 @@
|
|||
from typing import Dict, Any, List, Optional, Union
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
import uuid
|
||||
import itertools
|
||||
|
||||
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS
|
||||
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
|
||||
from .templates import TPL_ENTS, TPL_KB_LINK
|
||||
from ..util import minify_html, escape_html, registry
|
||||
from ..errors import Errors
|
||||
|
||||
from ..util import escape_html, minify_html, registry
|
||||
from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS
|
||||
from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS
|
||||
from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN
|
||||
from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL
|
||||
from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS
|
||||
from .templates import TPL_TITLE
|
||||
|
||||
DEFAULT_LANG = "en"
|
||||
DEFAULT_DIR = "ltr"
|
||||
|
@ -33,6 +36,224 @@ DEFAULT_LABEL_COLORS = {
|
|||
}
|
||||
|
||||
|
||||
class SpanRenderer:
|
||||
"""Render Spans as SVGs."""
|
||||
|
||||
style = "span"
|
||||
|
||||
def __init__(self, options: Dict[str, Any] = {}) -> None:
|
||||
"""Initialise span renderer
|
||||
|
||||
options (dict): Visualiser-specific options (colors, spans)
|
||||
"""
|
||||
# Set up the colors and overall look
|
||||
colors = dict(DEFAULT_LABEL_COLORS)
|
||||
user_colors = registry.displacy_colors.get_all()
|
||||
for user_color in user_colors.values():
|
||||
if callable(user_color):
|
||||
# Since this comes from the function registry, we want to make
|
||||
# sure we support functions that *return* a dict of colors
|
||||
user_color = user_color()
|
||||
if not isinstance(user_color, dict):
|
||||
raise ValueError(Errors.E925.format(obj=type(user_color)))
|
||||
colors.update(user_color)
|
||||
colors.update(options.get("colors", {}))
|
||||
self.default_color = DEFAULT_ENTITY_COLOR
|
||||
self.colors = {label.upper(): color for label, color in colors.items()}
|
||||
|
||||
# Set up how the text and labels will be rendered
|
||||
self.direction = DEFAULT_DIR
|
||||
self.lang = DEFAULT_LANG
|
||||
# These values are in px
|
||||
self.top_offset = options.get("top_offset", 40)
|
||||
# This is how far under the top offset the span labels appear
|
||||
self.span_label_offset = options.get("span_label_offset", 20)
|
||||
self.offset_step = options.get("top_offset_step", 17)
|
||||
|
||||
# Set up which templates will be used
|
||||
template = options.get("template")
|
||||
if template:
|
||||
self.span_template = template["span"]
|
||||
self.span_slice_template = template["slice"]
|
||||
self.span_start_template = template["start"]
|
||||
else:
|
||||
if self.direction == "rtl":
|
||||
self.span_template = TPL_SPAN_RTL
|
||||
self.span_slice_template = TPL_SPAN_SLICE_RTL
|
||||
self.span_start_template = TPL_SPAN_START_RTL
|
||||
else:
|
||||
self.span_template = TPL_SPAN
|
||||
self.span_slice_template = TPL_SPAN_SLICE
|
||||
self.span_start_template = TPL_SPAN_START
|
||||
|
||||
def render(
|
||||
self, parsed: List[Dict[str, Any]], page: bool = False, minify: bool = False
|
||||
) -> str:
|
||||
"""Render complete markup.
|
||||
|
||||
parsed (list): Dependency parses to render.
|
||||
page (bool): Render parses wrapped as full HTML page.
|
||||
minify (bool): Minify HTML markup.
|
||||
RETURNS (str): Rendered SVG or HTML markup.
|
||||
"""
|
||||
rendered = []
|
||||
for i, p in enumerate(parsed):
|
||||
if i == 0:
|
||||
settings = p.get("settings", {})
|
||||
self.direction = settings.get("direction", DEFAULT_DIR)
|
||||
self.lang = settings.get("lang", DEFAULT_LANG)
|
||||
rendered.append(self.render_spans(p["tokens"], p["spans"], p.get("title")))
|
||||
|
||||
if page:
|
||||
docs = "".join([TPL_FIGURE.format(content=doc) for doc in rendered])
|
||||
markup = TPL_PAGE.format(content=docs, lang=self.lang, dir=self.direction)
|
||||
else:
|
||||
markup = "".join(rendered)
|
||||
if minify:
|
||||
return minify_html(markup)
|
||||
return markup
|
||||
|
||||
def render_spans(
|
||||
self,
|
||||
tokens: List[str],
|
||||
spans: List[Dict[str, Any]],
|
||||
title: Optional[str],
|
||||
) -> str:
|
||||
"""Render span types in text.
|
||||
|
||||
Spans are rendered per-token, this means that for each token, we check if it's part
|
||||
of a span slice (a member of a span type) or a span start (the starting token of a
|
||||
given span type).
|
||||
|
||||
tokens (list): Individual tokens in the text
|
||||
spans (list): Individual entity spans and their start, end, label, kb_id and kb_url.
|
||||
title (str / None): Document title set in Doc.user_data['title'].
|
||||
"""
|
||||
per_token_info = []
|
||||
# we must sort so that we can correctly describe when spans need to "stack"
|
||||
# which is determined by their start token, then span length (longer spans on top),
|
||||
# then break any remaining ties with the span label
|
||||
spans = sorted(
|
||||
spans,
|
||||
key=lambda s: (
|
||||
s["start_token"],
|
||||
-(s["end_token"] - s["start_token"]),
|
||||
s["label"],
|
||||
),
|
||||
)
|
||||
for s in spans:
|
||||
# this is the vertical 'slot' that the span will be rendered in
|
||||
# vertical_position = span_label_offset + (offset_step * (slot - 1))
|
||||
s["render_slot"] = 0
|
||||
for idx, token in enumerate(tokens):
|
||||
# Identify if a token belongs to a Span (and which) and if it's a
|
||||
# start token of said Span. We'll use this for the final HTML render
|
||||
token_markup: Dict[str, Any] = {}
|
||||
token_markup["text"] = token
|
||||
concurrent_spans = 0
|
||||
entities = []
|
||||
for span in spans:
|
||||
ent = {}
|
||||
if span["start_token"] <= idx < span["end_token"]:
|
||||
concurrent_spans += 1
|
||||
span_start = idx == span["start_token"]
|
||||
ent["label"] = span["label"]
|
||||
ent["is_start"] = span_start
|
||||
if span_start:
|
||||
# When the span starts, we need to know how many other
|
||||
# spans are on the 'span stack' and will be rendered.
|
||||
# This value becomes the vertical render slot for this entire span
|
||||
span["render_slot"] = concurrent_spans
|
||||
ent["render_slot"] = span["render_slot"]
|
||||
kb_id = span.get("kb_id", "")
|
||||
kb_url = span.get("kb_url", "#")
|
||||
ent["kb_link"] = (
|
||||
TPL_KB_LINK.format(kb_id=kb_id, kb_url=kb_url) if kb_id else ""
|
||||
)
|
||||
entities.append(ent)
|
||||
else:
|
||||
# We don't specifically need to do this since we loop
|
||||
# over tokens and spans sorted by their start_token,
|
||||
# so we'll never use a span again after the last token it appears in,
|
||||
# but if we were to use these spans again we'd want to make sure
|
||||
# this value was reset correctly.
|
||||
span["render_slot"] = 0
|
||||
token_markup["entities"] = entities
|
||||
per_token_info.append(token_markup)
|
||||
markup = self._render_markup(per_token_info)
|
||||
markup = TPL_SPANS.format(content=markup, dir=self.direction)
|
||||
if title:
|
||||
markup = TPL_TITLE.format(title=title) + markup
|
||||
return markup
|
||||
|
||||
def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str:
|
||||
"""Render the markup from per-token information"""
|
||||
markup = ""
|
||||
for token in per_token_info:
|
||||
entities = sorted(token["entities"], key=lambda d: d["render_slot"])
|
||||
# Whitespace tokens disrupt the vertical space (no line height) so that the
|
||||
# span indicators get misaligned. We don't render them as individual
|
||||
# tokens anyway, so we'll just not display a span indicator either.
|
||||
is_whitespace = token["text"].strip() == ""
|
||||
if entities and not is_whitespace:
|
||||
slices = self._get_span_slices(token["entities"])
|
||||
starts = self._get_span_starts(token["entities"])
|
||||
total_height = (
|
||||
self.top_offset
|
||||
+ self.span_label_offset
|
||||
+ (self.offset_step * (len(entities) - 1))
|
||||
)
|
||||
markup += self.span_template.format(
|
||||
text=token["text"],
|
||||
span_slices=slices,
|
||||
span_starts=starts,
|
||||
total_height=total_height,
|
||||
)
|
||||
else:
|
||||
markup += escape_html(token["text"] + " ")
|
||||
return markup
|
||||
|
||||
def _get_span_slices(self, entities: List[Dict]) -> str:
|
||||
"""Get the rendered markup of all Span slices"""
|
||||
span_slices = []
|
||||
for entity in entities:
|
||||
# rather than iterate over multiples of offset_step, we use entity['render_slot']
|
||||
# to determine the vertical position, since that tells where
|
||||
# the span starts vertically so we can extend it horizontally,
|
||||
# past other spans that might have already ended
|
||||
color = self.colors.get(entity["label"].upper(), self.default_color)
|
||||
top_offset = self.top_offset + (
|
||||
self.offset_step * (entity["render_slot"] - 1)
|
||||
)
|
||||
span_slice = self.span_slice_template.format(
|
||||
bg=color,
|
||||
top_offset=top_offset,
|
||||
)
|
||||
span_slices.append(span_slice)
|
||||
return "".join(span_slices)
|
||||
|
||||
def _get_span_starts(self, entities: List[Dict]) -> str:
|
||||
"""Get the rendered markup of all Span start tokens"""
|
||||
span_starts = []
|
||||
for entity in entities:
|
||||
color = self.colors.get(entity["label"].upper(), self.default_color)
|
||||
top_offset = self.top_offset + (
|
||||
self.offset_step * (entity["render_slot"] - 1)
|
||||
)
|
||||
span_start = (
|
||||
self.span_start_template.format(
|
||||
bg=color,
|
||||
top_offset=top_offset,
|
||||
label=entity["label"],
|
||||
kb_link=entity["kb_link"],
|
||||
)
|
||||
if entity["is_start"]
|
||||
else ""
|
||||
)
|
||||
span_starts.append(span_start)
|
||||
return "".join(span_starts)
|
||||
|
||||
|
||||
class DependencyRenderer:
|
||||
"""Render dependency parses as SVGs."""
|
||||
|
||||
|
@ -105,7 +326,7 @@ class DependencyRenderer:
|
|||
RETURNS (str): Rendered SVG markup.
|
||||
"""
|
||||
self.levels = self.get_levels(arcs)
|
||||
self.highest_level = len(self.levels)
|
||||
self.highest_level = max(self.levels.values(), default=0)
|
||||
self.offset_y = self.distance / 2 * self.highest_level + self.arrow_stroke
|
||||
self.width = self.offset_x + len(words) * self.distance
|
||||
self.height = self.offset_y + 3 * self.word_spacing
|
||||
|
@ -165,7 +386,7 @@ class DependencyRenderer:
|
|||
if start < 0 or end < 0:
|
||||
error_args = dict(start=start, end=end, label=label, dir=direction)
|
||||
raise ValueError(Errors.E157.format(**error_args))
|
||||
level = self.levels.index(end - start) + 1
|
||||
level = self.levels[(start, end, label)]
|
||||
x_start = self.offset_x + start * self.distance + self.arrow_spacing
|
||||
if self.direction == "rtl":
|
||||
x_start = self.width - x_start
|
||||
|
@ -181,7 +402,7 @@ class DependencyRenderer:
|
|||
y_curve = self.offset_y - level * self.distance / 2
|
||||
if self.compact:
|
||||
y_curve = self.offset_y - level * self.distance / 6
|
||||
if y_curve == 0 and len(self.levels) > 5:
|
||||
if y_curve == 0 and max(self.levels.values(), default=0) > 5:
|
||||
y_curve = -self.distance
|
||||
arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
|
||||
arc = self.get_arc(x_start, y, y_curve, x_end)
|
||||
|
@ -225,15 +446,23 @@ class DependencyRenderer:
|
|||
p1, p2, p3 = (end, end + self.arrow_width - 2, end - self.arrow_width + 2)
|
||||
return f"M{p1},{y + 2} L{p2},{y - self.arrow_width} {p3},{y - self.arrow_width}"
|
||||
|
||||
def get_levels(self, arcs: List[Dict[str, Any]]) -> List[int]:
|
||||
def get_levels(self, arcs: List[Dict[str, Any]]) -> Dict[Tuple[int, int, str], int]:
|
||||
"""Calculate available arc height "levels".
|
||||
Used to calculate arrow heights dynamically and without wasting space.
|
||||
|
||||
args (list): Individual arcs and their start, end, direction and label.
|
||||
RETURNS (list): Arc levels sorted from lowest to highest.
|
||||
RETURNS (dict): Arc levels keyed by (start, end, label).
|
||||
"""
|
||||
levels = set(map(lambda arc: arc["end"] - arc["start"], arcs))
|
||||
return sorted(list(levels))
|
||||
arcs = [dict(t) for t in {tuple(sorted(arc.items())) for arc in arcs}]
|
||||
length = max([arc["end"] for arc in arcs], default=0)
|
||||
max_level = [0] * length
|
||||
levels = {}
|
||||
for arc in sorted(arcs, key=lambda arc: arc["end"] - arc["start"]):
|
||||
level = max(max_level[arc["start"] : arc["end"]]) + 1
|
||||
for i in range(arc["start"], arc["end"]):
|
||||
max_level[i] = level
|
||||
levels[(arc["start"], arc["end"], arc["label"])] = level
|
||||
return levels
|
||||
|
||||
|
||||
class EntityRenderer:
|
||||
|
@ -242,7 +471,7 @@ class EntityRenderer:
|
|||
style = "ent"
|
||||
|
||||
def __init__(self, options: Dict[str, Any] = {}) -> None:
|
||||
"""Initialise dependency renderer.
|
||||
"""Initialise entity renderer.
|
||||
|
||||
options (dict): Visualiser-specific options (colors, ents)
|
||||
"""
|
||||
|
@ -281,7 +510,7 @@ class EntityRenderer:
|
|||
parsed (list): Dependency parses to render.
|
||||
page (bool): Render parses wrapped as full HTML page.
|
||||
minify (bool): Minify HTML markup.
|
||||
RETURNS (str): Rendered HTML markup.
|
||||
RETURNS (str): Rendered SVG or HTML markup.
|
||||
"""
|
||||
rendered = []
|
||||
for i, p in enumerate(parsed):
|
||||
|
|
|
@ -62,6 +62,55 @@ TPL_ENT_RTL = """
|
|||
</mark>
|
||||
"""
|
||||
|
||||
TPL_SPANS = """
|
||||
<div class="spans" style="line-height: 2.5; direction: {dir}">{content}</div>
|
||||
"""
|
||||
|
||||
TPL_SPAN = """
|
||||
<span style="font-weight: bold; display: inline-block; position: relative; height: {total_height}px;">
|
||||
{text}
|
||||
{span_slices}
|
||||
{span_starts}
|
||||
</span>
|
||||
"""
|
||||
|
||||
TPL_SPAN_SLICE = """
|
||||
<span style="background: {bg}; top: {top_offset}px; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
|
||||
</span>
|
||||
"""
|
||||
|
||||
|
||||
TPL_SPAN_START = """
|
||||
<span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
|
||||
<span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
|
||||
{label}{kb_link}
|
||||
</span>
|
||||
</span>
|
||||
|
||||
"""
|
||||
|
||||
TPL_SPAN_RTL = """
|
||||
<span style="font-weight: bold; display: inline-block; position: relative;">
|
||||
{text}
|
||||
{span_slices}
|
||||
{span_starts}
|
||||
</span>
|
||||
"""
|
||||
|
||||
TPL_SPAN_SLICE_RTL = """
|
||||
<span style="background: {bg}; top: {top_offset}px; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
|
||||
</span>
|
||||
"""
|
||||
|
||||
TPL_SPAN_START_RTL = """
|
||||
<span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
|
||||
<span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
|
||||
{label}{kb_link}
|
||||
</span>
|
||||
</span>
|
||||
"""
|
||||
|
||||
|
||||
# Important: this needs to start with a space!
|
||||
TPL_KB_LINK = """
|
||||
<a style="text-decoration: none; color: inherit; font-weight: normal" href="{kb_url}">{kb_id}</a>
|
||||
|
|
111
spacy/errors.py
111
spacy/errors.py
|
@ -1,4 +1,5 @@
|
|||
import warnings
|
||||
from .compat import Literal
|
||||
|
||||
|
||||
class ErrorsWithCodes(type):
|
||||
|
@ -15,8 +16,8 @@ def setup_default_warnings():
|
|||
filter_warning("ignore", error_msg="numpy.dtype size changed") # noqa
|
||||
filter_warning("ignore", error_msg="numpy.ufunc size changed") # noqa
|
||||
|
||||
# warn about entity_ruler & matcher having no patterns only once
|
||||
for pipe in ["matcher", "entity_ruler"]:
|
||||
# warn about entity_ruler, span_ruler & matcher having no patterns only once
|
||||
for pipe in ["matcher", "entity_ruler", "span_ruler"]:
|
||||
filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
|
||||
|
||||
# warn once about lemmatizer without required POS
|
||||
|
@ -26,7 +27,10 @@ def setup_default_warnings():
|
|||
filter_warning("once", error_msg="[W114]")
|
||||
|
||||
|
||||
def filter_warning(action: str, error_msg: str):
|
||||
def filter_warning(
|
||||
action: Literal["default", "error", "ignore", "always", "module", "once"],
|
||||
error_msg: str,
|
||||
):
|
||||
"""Customize how spaCy should handle a certain warning.
|
||||
|
||||
error_msg (str): e.g. "W006", or a full error message
|
||||
|
@ -192,6 +196,25 @@ class Warnings(metaclass=ErrorsWithCodes):
|
|||
W115 = ("Skipping {method}: the floret vector table cannot be modified. "
|
||||
"Vectors are calculated from character ngrams.")
|
||||
W116 = ("Unable to clean attribute '{attr}'.")
|
||||
W117 = ("No spans to visualize found in Doc object with spans_key: '{spans_key}'. If this is "
|
||||
"surprising to you, make sure the Doc was processed using a model "
|
||||
"that supports span categorization, and check the `doc.spans[spans_key]` "
|
||||
"property manually if necessary.\n\nAvailable keys: {keys}")
|
||||
W118 = ("Term '{term}' not found in glossary. It may however be explained in documentation "
|
||||
"for the corpora used to train the language. Please check "
|
||||
"`nlp.meta[\"sources\"]` for any relevant links.")
|
||||
W119 = ("Overriding pipe name in `config` is not supported. Ignoring override '{name_in_config}'.")
|
||||
W120 = ("Unable to load all spans in Doc.spans: more than one span group "
|
||||
"with the name '{group_name}' was found in the saved spans data. "
|
||||
"Only the last span group will be loaded under "
|
||||
"Doc.spans['{group_name}']. Skipping span group with values: "
|
||||
"{group_values}")
|
||||
W121 = ("Attempting to trace non-existent method '{method}' in pipe '{pipe}'")
|
||||
W122 = ("Couldn't trace method '{method}' in pipe '{pipe}'. This can happen if the pipe class "
|
||||
"is a Cython extension type.")
|
||||
W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
|
||||
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
|
||||
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
|
||||
|
||||
|
||||
class Errors(metaclass=ErrorsWithCodes):
|
||||
|
@ -210,8 +233,9 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"initialized component.")
|
||||
E004 = ("Can't set up pipeline component: a factory for '{name}' already "
|
||||
"exists. Existing factory: {func}. New factory: {new_func}")
|
||||
E005 = ("Pipeline component '{name}' returned None. If you're using a "
|
||||
"custom component, maybe you forgot to return the processed Doc?")
|
||||
E005 = ("Pipeline component '{name}' returned {returned_type} instead of a "
|
||||
"Doc. If you're using a custom component, maybe you forgot to "
|
||||
"return the processed Doc?")
|
||||
E006 = ("Invalid constraints for adding pipeline component. You can only "
|
||||
"set one of the following: before (component name or index), "
|
||||
"after (component name or index), first (True) or last (True). "
|
||||
|
@ -322,6 +346,11 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"clear the existing vectors and resize the table.")
|
||||
E074 = ("Error interpreting compiled match pattern: patterns are expected "
|
||||
"to end with the attribute {attr}. Got: {bad_attr}.")
|
||||
E079 = ("Error computing states in beam: number of predicted beams "
|
||||
"({pbeams}) does not equal number of gold beams ({gbeams}).")
|
||||
E080 = ("Duplicate state found in beam: {key}.")
|
||||
E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
|
||||
"does not equal number of losses ({losses}).")
|
||||
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
|
||||
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
|
||||
"match.")
|
||||
|
@ -369,7 +398,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"consider using doc.spans instead.")
|
||||
E106 = ("Can't find `doc._.{attr}` attribute specified in the underscore "
|
||||
"settings: {opts}")
|
||||
E107 = ("Value of `doc._.{attr}` is not JSON-serializable: {value}")
|
||||
E107 = ("Value of custom attribute `{attr}` is not JSON-serializable: {value}")
|
||||
E109 = ("Component '{name}' could not be run. Did you forget to "
|
||||
"call `initialize()`?")
|
||||
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
|
||||
|
@ -437,10 +466,10 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"same, but found '{nlp}' and '{vocab}' respectively.")
|
||||
E152 = ("The attribute {attr} is not supported for token patterns. "
|
||||
"Please use the option `validate=True` with the Matcher, PhraseMatcher, "
|
||||
"or EntityRuler for more details.")
|
||||
"EntityRuler or AttributeRuler for more details.")
|
||||
E153 = ("The value type {vtype} is not supported for token patterns. "
|
||||
"Please use the option validate=True with Matcher, PhraseMatcher, "
|
||||
"or EntityRuler for more details.")
|
||||
"EntityRuler or AttributeRuler for more details.")
|
||||
E154 = ("One of the attributes or values is not supported for token "
|
||||
"patterns. Please use the option `validate=True` with the Matcher, "
|
||||
"PhraseMatcher, or EntityRuler for more details.")
|
||||
|
@ -515,15 +544,28 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E198 = ("Unable to return {n} most similar vectors for the current vectors "
|
||||
"table, which contains {n_rows} vectors.")
|
||||
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
|
||||
E200 = ("Can't yet set {attr} from Span. Vote for this feature on the "
|
||||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||
E200 = ("Can't set {attr} from Span.")
|
||||
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
|
||||
E203 = ("If the {name} embedding layer is not updated "
|
||||
"during training, make sure to include it in 'annotating components'")
|
||||
|
||||
# New errors added in v3.x
|
||||
E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
|
||||
"but found value of '{val}'.")
|
||||
E852 = ("The tar file pulled from the remote attempted an unsafe path "
|
||||
"traversal.")
|
||||
E853 = ("Unsupported component factory name '{name}'. The character '.' is "
|
||||
"not permitted in factory names.")
|
||||
E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
|
||||
"permit overlapping spans.")
|
||||
E855 = ("Invalid {obj}: {obj} is not from the same doc.")
|
||||
E856 = ("Error accessing span at position {i}: out of bounds in span group "
|
||||
"of length {length}.")
|
||||
E857 = ("Entry '{name}' not found in edit tree lemmatizer labels.")
|
||||
E858 = ("The {mode} vector table does not support this operation. "
|
||||
"{alternative}")
|
||||
E859 = ("The floret vector table cannot be modified.")
|
||||
E860 = ("Can't truncate fasttext-bloom vectors.")
|
||||
E860 = ("Can't truncate floret vectors.")
|
||||
E861 = ("No 'keys' should be provided when initializing floret vectors "
|
||||
"with 'minn' and 'maxn'.")
|
||||
E862 = ("'hash_count' must be between 1-4 for floret vectors.")
|
||||
|
@ -679,11 +721,11 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"need to modify the pipeline, use the built-in methods like "
|
||||
"`nlp.add_pipe`, `nlp.remove_pipe`, `nlp.disable_pipe` or "
|
||||
"`nlp.enable_pipe` instead.")
|
||||
E927 = ("Can't write to frozen list Maybe you're trying to modify a computed "
|
||||
E927 = ("Can't write to frozen list. Maybe you're trying to modify a computed "
|
||||
"property or default function argument?")
|
||||
E928 = ("A KnowledgeBase can only be serialized to/from from a directory, "
|
||||
E928 = ("An InMemoryLookupKB can only be serialized to/from from a directory, "
|
||||
"but the provided argument {loc} points to a file.")
|
||||
E929 = ("Couldn't read KnowledgeBase from {loc}. The path does not seem to exist.")
|
||||
E929 = ("Couldn't read InMemoryLookupKB from {loc}. The path does not seem to exist.")
|
||||
E930 = ("Received invalid get_examples callback in `{method}`. "
|
||||
"Expected function that returns an iterable of Example objects but "
|
||||
"got: {obj}")
|
||||
|
@ -887,11 +929,46 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
|
||||
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
|
||||
"exist.")
|
||||
E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler "
|
||||
"patterns.")
|
||||
E1024 = ("A pattern with {attr_type} '{label}' is not present in "
|
||||
"'{component}' patterns.")
|
||||
E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
|
||||
"supported values are: 'I', 'O', 'B' and ''")
|
||||
|
||||
E1026 = ("Edit tree has an invalid format:\n{errors}")
|
||||
E1027 = ("AlignmentArray only supports slicing with a step of 1.")
|
||||
E1028 = ("AlignmentArray only supports indexing using an int or a slice.")
|
||||
E1029 = ("Edit tree cannot be applied to form.")
|
||||
E1030 = ("Edit tree identifier out of range.")
|
||||
E1031 = ("Could not find gold transition - see logs above.")
|
||||
E1032 = ("`{var}` should not be {forbidden}, but received {value}.")
|
||||
E1033 = ("Dimension {name} invalid -- only nO, nF, nP")
|
||||
E1034 = ("Node index {i} out of bounds ({length})")
|
||||
E1035 = ("Token index {i} out of bounds ({length})")
|
||||
E1036 = ("Cannot index into NoneNode")
|
||||
E1037 = ("Invalid attribute value '{attr}'.")
|
||||
E1038 = ("Invalid JSON input: {message}")
|
||||
E1039 = ("The {obj} start or end annotations (start: {start}, end: {end}) "
|
||||
"could not be aligned to token boundaries.")
|
||||
E1040 = ("Doc.from_json requires all tokens to have the same attributes. "
|
||||
"Some tokens do not contain annotation for: {partial_attrs}")
|
||||
E1041 = ("Expected a string, Doc, or bytes as input, but got: {type}")
|
||||
E1042 = ("`enable={enable}` and `disable={disable}` are inconsistent with each other.\nIf you only passed "
|
||||
"one of `enable` or `disable`, the other argument is specified in your pipeline's configuration.\nIn that "
|
||||
"case pass an empty list for the previously not specified argument to avoid this error.")
|
||||
E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
|
||||
"{value}.")
|
||||
E1044 = ("Expected `candidates_batch_size` to be >= 1, but got: {value}")
|
||||
E1045 = ("Encountered {parent} subclass without `{parent}.{method}` "
|
||||
"method in '{name}'. If you want to use this method, make "
|
||||
"sure it's overwritten on the subclass.")
|
||||
E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
|
||||
"knowledge base, use `InMemoryLookupKB`.")
|
||||
E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.")
|
||||
E1048 = ("Got '{unexpected}' as console progress bar type, but expected one of the following: {expected}")
|
||||
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
|
||||
"with `displacy.serve(doc, port=port)`")
|
||||
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
|
||||
"or use `auto_switch_port=True` to pick an available port automatically.")
|
||||
|
||||
|
||||
# Deprecated model shortcuts, only used in errors and warnings
|
||||
OLD_MODEL_SHORTCUTS = {
|
||||
|
|
|
@ -1,3 +1,7 @@
|
|||
import warnings
|
||||
from .errors import Warnings
|
||||
|
||||
|
||||
def explain(term):
|
||||
"""Get a description for a given POS tag, dependency label or entity type.
|
||||
|
||||
|
@ -11,6 +15,8 @@ def explain(term):
|
|||
"""
|
||||
if term in GLOSSARY:
|
||||
return GLOSSARY[term]
|
||||
else:
|
||||
warnings.warn(Warnings.W118.format(term=term))
|
||||
|
||||
|
||||
GLOSSARY = {
|
||||
|
@ -267,6 +273,7 @@ GLOSSARY = {
|
|||
"relcl": "relative clause modifier",
|
||||
"reparandum": "overridden disfluency",
|
||||
"root": "root",
|
||||
"ROOT": "root",
|
||||
"vocative": "vocative",
|
||||
"xcomp": "open clausal complement",
|
||||
# Dependency labels (German)
|
||||
|
|
3
spacy/kb/__init__.py
Normal file
3
spacy/kb/__init__.py
Normal file
|
@ -0,0 +1,3 @@
|
|||
from .kb import KnowledgeBase
|
||||
from .kb_in_memory import InMemoryLookupKB
|
||||
from .candidate import Candidate, get_candidates, get_candidates_batch
|
12
spacy/kb/candidate.pxd
Normal file
12
spacy/kb/candidate.pxd
Normal file
|
@ -0,0 +1,12 @@
|
|||
from .kb cimport KnowledgeBase
|
||||
from libcpp.vector cimport vector
|
||||
from ..typedefs cimport hash_t
|
||||
|
||||
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
|
||||
cdef class Candidate:
|
||||
cdef readonly KnowledgeBase kb
|
||||
cdef hash_t entity_hash
|
||||
cdef float entity_freq
|
||||
cdef vector[float] entity_vector
|
||||
cdef hash_t alias_hash
|
||||
cdef float prior_prob
|
74
spacy/kb/candidate.pyx
Normal file
74
spacy/kb/candidate.pyx
Normal file
|
@ -0,0 +1,74 @@
|
|||
# cython: infer_types=True, profile=True
|
||||
|
||||
from typing import Iterable
|
||||
from .kb cimport KnowledgeBase
|
||||
from ..tokens import Span
|
||||
|
||||
cdef class Candidate:
|
||||
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
|
||||
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
|
||||
algorithm which will disambiguate the various candidates to the correct one.
|
||||
Each candidate (alias, entity) pair is assigned a certain prior probability.
|
||||
|
||||
DOCS: https://spacy.io/api/kb/#candidate-init
|
||||
"""
|
||||
|
||||
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
|
||||
self.kb = kb
|
||||
self.entity_hash = entity_hash
|
||||
self.entity_freq = entity_freq
|
||||
self.entity_vector = entity_vector
|
||||
self.alias_hash = alias_hash
|
||||
self.prior_prob = prior_prob
|
||||
|
||||
@property
|
||||
def entity(self) -> int:
|
||||
"""RETURNS (uint64): hash of the entity's KB ID/name"""
|
||||
return self.entity_hash
|
||||
|
||||
@property
|
||||
def entity_(self) -> str:
|
||||
"""RETURNS (str): ID/name of this entity in the KB"""
|
||||
return self.kb.vocab.strings[self.entity_hash]
|
||||
|
||||
@property
|
||||
def alias(self) -> int:
|
||||
"""RETURNS (uint64): hash of the alias"""
|
||||
return self.alias_hash
|
||||
|
||||
@property
|
||||
def alias_(self) -> str:
|
||||
"""RETURNS (str): ID of the original alias"""
|
||||
return self.kb.vocab.strings[self.alias_hash]
|
||||
|
||||
@property
|
||||
def entity_freq(self) -> float:
|
||||
return self.entity_freq
|
||||
|
||||
@property
|
||||
def entity_vector(self) -> Iterable[float]:
|
||||
return self.entity_vector
|
||||
|
||||
@property
|
||||
def prior_prob(self) -> float:
|
||||
return self.prior_prob
|
||||
|
||||
|
||||
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
|
||||
"""
|
||||
Return candidate entities for a given mention and fetching appropriate entries from the index.
|
||||
kb (KnowledgeBase): Knowledge base to query.
|
||||
mention (Span): Entity mention for which to identify candidates.
|
||||
RETURNS (Iterable[Candidate]): Identified candidates.
|
||||
"""
|
||||
return kb.get_candidates(mention)
|
||||
|
||||
|
||||
def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
|
||||
"""
|
||||
Return candidate entities for the given mentions and fetching appropriate entries from the index.
|
||||
kb (KnowledgeBase): Knowledge base to query.
|
||||
mention (Iterable[Span]): Entity mentions for which to identify candidates.
|
||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||
"""
|
||||
return kb.get_candidates_batch(mentions)
|
10
spacy/kb/kb.pxd
Normal file
10
spacy/kb/kb.pxd
Normal file
|
@ -0,0 +1,10 @@
|
|||
"""Knowledge-base for entity or concept linking."""
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
from libc.stdint cimport int64_t
|
||||
from ..vocab cimport Vocab
|
||||
|
||||
cdef class KnowledgeBase:
|
||||
cdef Pool mem
|
||||
cdef readonly Vocab vocab
|
||||
cdef readonly int64_t entity_vector_length
|
108
spacy/kb/kb.pyx
Normal file
108
spacy/kb/kb.pyx
Normal file
|
@ -0,0 +1,108 @@
|
|||
# cython: infer_types=True, profile=True
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Tuple, Union
|
||||
from cymem.cymem cimport Pool
|
||||
|
||||
from .candidate import Candidate
|
||||
from ..tokens import Span
|
||||
from ..util import SimpleFrozenList
|
||||
from ..errors import Errors
|
||||
|
||||
|
||||
cdef class KnowledgeBase:
|
||||
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
|
||||
to support entity linking of named entities to real-world concepts.
|
||||
This is an abstract class and requires its operations to be implemented.
|
||||
|
||||
DOCS: https://spacy.io/api/kb
|
||||
"""
|
||||
|
||||
def __init__(self, vocab: Vocab, entity_vector_length: int):
|
||||
"""Create a KnowledgeBase."""
|
||||
# Make sure abstract KB is not instantiated.
|
||||
if self.__class__ == KnowledgeBase:
|
||||
raise TypeError(
|
||||
Errors.E1046.format(cls_name=self.__class__.__name__)
|
||||
)
|
||||
|
||||
self.vocab = vocab
|
||||
self.entity_vector_length = entity_vector_length
|
||||
self.mem = Pool()
|
||||
|
||||
def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
|
||||
"""
|
||||
Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
|
||||
and the prior probability of that alias resolving to that entity.
|
||||
If no candidate is found for a given text, an empty list is returned.
|
||||
mentions (Iterable[Span]): Mentions for which to get candidates.
|
||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||
"""
|
||||
return [self.get_candidates(span) for span in mentions]
|
||||
|
||||
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
|
||||
"""
|
||||
Return candidate entities for specified text. Each candidate defines the entity, the original alias,
|
||||
and the prior probability of that alias resolving to that entity.
|
||||
If the no candidate is found for a given text, an empty list is returned.
|
||||
mention (Span): Mention for which to get candidates.
|
||||
RETURNS (Iterable[Candidate]): Identified candidates.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
|
||||
)
|
||||
|
||||
def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
|
||||
"""
|
||||
Return vectors for entities.
|
||||
entity (str): Entity name/ID.
|
||||
RETURNS (Iterable[Iterable[float]]): Vectors for specified entities.
|
||||
"""
|
||||
return [self.get_vector(entity) for entity in entities]
|
||||
|
||||
def get_vector(self, str entity) -> Iterable[float]:
|
||||
"""
|
||||
Return vector for entity.
|
||||
entity (str): Entity name/ID.
|
||||
RETURNS (Iterable[float]): Vector for specified entity.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__)
|
||||
)
|
||||
|
||||
def to_bytes(self, **kwargs) -> bytes:
|
||||
"""Serialize the current state to a binary string.
|
||||
RETURNS (bytes): Current state as binary string.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__)
|
||||
)
|
||||
|
||||
def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
|
||||
"""Load state from a binary string.
|
||||
bytes_data (bytes): KB state.
|
||||
exclude (Tuple[str]): Properties to exclude when restoring KB.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__)
|
||||
)
|
||||
|
||||
def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
|
||||
"""
|
||||
Write KnowledgeBase content to disk.
|
||||
path (Union[str, Path]): Target file path.
|
||||
exclude (Iterable[str]): List of components to exclude.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__)
|
||||
)
|
||||
|
||||
def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
|
||||
"""
|
||||
Load KnowledgeBase content from disk.
|
||||
path (Union[str, Path]): Target file path.
|
||||
exclude (Iterable[str]): List of components to exclude.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
|
||||
)
|
|
@ -1,14 +1,12 @@
|
|||
"""Knowledge-base for entity or concept linking."""
|
||||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
from libcpp.vector cimport vector
|
||||
from libc.stdint cimport int32_t, int64_t
|
||||
from libc.stdio cimport FILE
|
||||
|
||||
from .vocab cimport Vocab
|
||||
from .typedefs cimport hash_t
|
||||
from .structs cimport KBEntryC, AliasC
|
||||
|
||||
from ..typedefs cimport hash_t
|
||||
from ..structs cimport KBEntryC, AliasC
|
||||
from .kb cimport KnowledgeBase
|
||||
|
||||
ctypedef vector[KBEntryC] entry_vec
|
||||
ctypedef vector[AliasC] alias_vec
|
||||
|
@ -16,21 +14,7 @@ ctypedef vector[float] float_vec
|
|||
ctypedef vector[float_vec] float_matrix
|
||||
|
||||
|
||||
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
|
||||
cdef class Candidate:
|
||||
cdef readonly KnowledgeBase kb
|
||||
cdef hash_t entity_hash
|
||||
cdef float entity_freq
|
||||
cdef vector[float] entity_vector
|
||||
cdef hash_t alias_hash
|
||||
cdef float prior_prob
|
||||
|
||||
|
||||
cdef class KnowledgeBase:
|
||||
cdef Pool mem
|
||||
cdef readonly Vocab vocab
|
||||
cdef int64_t entity_vector_length
|
||||
|
||||
cdef class InMemoryLookupKB(KnowledgeBase):
|
||||
# This maps 64bit keys (hash of unique entity string)
|
||||
# to 64bit values (position of the _KBEntryC struct in the _entries vector).
|
||||
# The PreshMap is pretty space efficient, as it uses open addressing. So
|
|
@ -1,8 +1,7 @@
|
|||
# cython: infer_types=True, profile=True
|
||||
from typing import Iterator, Iterable, Callable, Dict, Any
|
||||
from typing import Iterable, Callable, Dict, Any, Union
|
||||
|
||||
import srsly
|
||||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
from cpython.exc cimport PyErr_SetFromErrno
|
||||
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
|
||||
|
@ -12,103 +11,41 @@ from libcpp.vector cimport vector
|
|||
from pathlib import Path
|
||||
import warnings
|
||||
|
||||
from .typedefs cimport hash_t
|
||||
from .errors import Errors, Warnings
|
||||
from . import util
|
||||
from .util import SimpleFrozenList, ensure_path
|
||||
|
||||
cdef class Candidate:
|
||||
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
|
||||
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
|
||||
algorithm which will disambiguate the various candidates to the correct one.
|
||||
Each candidate (alias, entity) pair is assigned to a certain prior probability.
|
||||
|
||||
DOCS: https://spacy.io/api/kb/#candidate_init
|
||||
"""
|
||||
|
||||
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
|
||||
self.kb = kb
|
||||
self.entity_hash = entity_hash
|
||||
self.entity_freq = entity_freq
|
||||
self.entity_vector = entity_vector
|
||||
self.alias_hash = alias_hash
|
||||
self.prior_prob = prior_prob
|
||||
|
||||
@property
|
||||
def entity(self):
|
||||
"""RETURNS (uint64): hash of the entity's KB ID/name"""
|
||||
return self.entity_hash
|
||||
|
||||
@property
|
||||
def entity_(self):
|
||||
"""RETURNS (str): ID/name of this entity in the KB"""
|
||||
return self.kb.vocab.strings[self.entity_hash]
|
||||
|
||||
@property
|
||||
def alias(self):
|
||||
"""RETURNS (uint64): hash of the alias"""
|
||||
return self.alias_hash
|
||||
|
||||
@property
|
||||
def alias_(self):
|
||||
"""RETURNS (str): ID of the original alias"""
|
||||
return self.kb.vocab.strings[self.alias_hash]
|
||||
|
||||
@property
|
||||
def entity_freq(self):
|
||||
return self.entity_freq
|
||||
|
||||
@property
|
||||
def entity_vector(self):
|
||||
return self.entity_vector
|
||||
|
||||
@property
|
||||
def prior_prob(self):
|
||||
return self.prior_prob
|
||||
from ..tokens import Span
|
||||
from ..typedefs cimport hash_t
|
||||
from ..errors import Errors, Warnings
|
||||
from .. import util
|
||||
from ..util import SimpleFrozenList, ensure_path
|
||||
from ..vocab cimport Vocab
|
||||
from .kb cimport KnowledgeBase
|
||||
from .candidate import Candidate as Candidate
|
||||
|
||||
|
||||
def get_candidates(KnowledgeBase kb, span) -> Iterator[Candidate]:
|
||||
"""
|
||||
Return candidate entities for a given span by using the text of the span as the alias
|
||||
and fetching appropriate entries from the index.
|
||||
This particular function is optimized to work with the built-in KB functionality,
|
||||
but any other custom candidate generation method can be used in combination with the KB as well.
|
||||
"""
|
||||
return kb.get_alias_candidates(span.text)
|
||||
|
||||
|
||||
cdef class KnowledgeBase:
|
||||
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
|
||||
cdef class InMemoryLookupKB(KnowledgeBase):
|
||||
"""An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
|
||||
to support entity linking of named entities to real-world concepts.
|
||||
|
||||
DOCS: https://spacy.io/api/kb
|
||||
DOCS: https://spacy.io/api/inmemorylookupkb
|
||||
"""
|
||||
|
||||
def __init__(self, Vocab vocab, entity_vector_length):
|
||||
"""Create a KnowledgeBase."""
|
||||
self.mem = Pool()
|
||||
self.entity_vector_length = entity_vector_length
|
||||
"""Create an InMemoryLookupKB."""
|
||||
super().__init__(vocab, entity_vector_length)
|
||||
self._entry_index = PreshMap()
|
||||
self._alias_index = PreshMap()
|
||||
self.vocab = vocab
|
||||
self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
|
||||
|
||||
def initialize_entities(self, int64_t nr_entities):
|
||||
def _initialize_entities(self, int64_t nr_entities):
|
||||
self._entry_index = PreshMap(nr_entities + 1)
|
||||
self._entries = entry_vec(nr_entities + 1)
|
||||
|
||||
def initialize_vectors(self, int64_t nr_entities):
|
||||
def _initialize_vectors(self, int64_t nr_entities):
|
||||
self._vectors_table = float_matrix(nr_entities + 1)
|
||||
|
||||
def initialize_aliases(self, int64_t nr_aliases):
|
||||
def _initialize_aliases(self, int64_t nr_aliases):
|
||||
self._alias_index = PreshMap(nr_aliases + 1)
|
||||
self._aliases_table = alias_vec(nr_aliases + 1)
|
||||
|
||||
@property
|
||||
def entity_vector_length(self):
|
||||
"""RETURNS (uint64): length of the entity vectors"""
|
||||
return self.entity_vector_length
|
||||
|
||||
def __len__(self):
|
||||
return self.get_size_entities()
|
||||
|
||||
|
@ -155,8 +92,8 @@ cdef class KnowledgeBase:
|
|||
raise ValueError(Errors.E140)
|
||||
|
||||
nr_entities = len(set(entity_list))
|
||||
self.initialize_entities(nr_entities)
|
||||
self.initialize_vectors(nr_entities)
|
||||
self._initialize_entities(nr_entities)
|
||||
self._initialize_vectors(nr_entities)
|
||||
|
||||
i = 0
|
||||
cdef KBEntryC entry
|
||||
|
@ -286,7 +223,10 @@ cdef class KnowledgeBase:
|
|||
alias_entry.probs = probs
|
||||
self._aliases_table[alias_index] = alias_entry
|
||||
|
||||
def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
|
||||
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
|
||||
return self.get_alias_candidates(mention.text) # type: ignore
|
||||
|
||||
def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
|
||||
"""
|
||||
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
|
||||
and the prior probability of that alias resolving to that entity.
|
||||
|
@ -388,9 +328,9 @@ cdef class KnowledgeBase:
|
|||
nr_entities = header[0]
|
||||
nr_aliases = header[1]
|
||||
entity_vector_length = header[2]
|
||||
self.initialize_entities(nr_entities)
|
||||
self.initialize_vectors(nr_entities)
|
||||
self.initialize_aliases(nr_aliases)
|
||||
self._initialize_entities(nr_entities)
|
||||
self._initialize_vectors(nr_entities)
|
||||
self._initialize_aliases(nr_aliases)
|
||||
self.entity_vector_length = entity_vector_length
|
||||
|
||||
def deserialize_vectors(b):
|
||||
|
@ -512,8 +452,8 @@ cdef class KnowledgeBase:
|
|||
cdef int64_t entity_vector_length
|
||||
reader.read_header(&nr_entities, &entity_vector_length)
|
||||
|
||||
self.initialize_entities(nr_entities)
|
||||
self.initialize_vectors(nr_entities)
|
||||
self._initialize_entities(nr_entities)
|
||||
self._initialize_vectors(nr_entities)
|
||||
self.entity_vector_length = entity_vector_length
|
||||
|
||||
# STEP 1: load entity vectors
|
||||
|
@ -552,7 +492,7 @@ cdef class KnowledgeBase:
|
|||
# STEP 3: load aliases
|
||||
cdef int64_t nr_aliases
|
||||
reader.read_alias_length(&nr_aliases)
|
||||
self.initialize_aliases(nr_aliases)
|
||||
self._initialize_aliases(nr_aliases)
|
||||
|
||||
cdef int64_t nr_candidates
|
||||
cdef vector[int64_t] entry_indices
|
|
@ -2,7 +2,8 @@ from .stop_words import STOP_WORDS
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
|
||||
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
|
||||
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
|
||||
from ...language import Language, BaseDefaults
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
|
@ -16,6 +17,8 @@ class BulgarianDefaults(BaseDefaults):
|
|||
|
||||
stop_words = STOP_WORDS
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
|
||||
infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES
|
||||
|
||||
|
||||
class Bulgarian(Language):
|
||||
|
|
|
@ -72,10 +72,10 @@ class CatalanLemmatizer(Lemmatizer):
|
|||
oov_forms.append(form)
|
||||
if not forms:
|
||||
forms.extend(oov_forms)
|
||||
if not forms and string in lookup_table.keys():
|
||||
forms.append(self.lookup_lemmatize(token)[0])
|
||||
|
||||
# use lookups, and fall back to the token itself
|
||||
if not forms:
|
||||
forms.append(string)
|
||||
forms.append(lookup_table.get(string, [string])[0])
|
||||
forms = list(dict.fromkeys(forms))
|
||||
self.cache[cache_key] = forms
|
||||
return forms
|
||||
|
|
|
@ -258,6 +258,10 @@ ALPHA = group_chars(
|
|||
ALPHA_LOWER = group_chars(_lower + _uncased)
|
||||
ALPHA_UPPER = group_chars(_upper + _uncased)
|
||||
|
||||
_combining_diacritics = r"\u0300-\u036f"
|
||||
|
||||
COMBINING_DIACRITICS = _combining_diacritics
|
||||
|
||||
_units = (
|
||||
"km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
|
||||
"kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb "
|
||||
|
@ -276,7 +280,7 @@ _currency = (
|
|||
_punct = (
|
||||
r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ۔ ؛ ٪"
|
||||
)
|
||||
_quotes = r'\' " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉'
|
||||
_quotes = r'\' " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉 〈 〉 ⟦ ⟧'
|
||||
_hyphens = "- – — -- --- —— ~"
|
||||
|
||||
# Various symbols like dingbats, but also emoji
|
||||
|
|
16
spacy/lang/dsb/__init__.py
Normal file
16
spacy/lang/dsb/__init__.py
Normal file
|
@ -0,0 +1,16 @@
|
|||
from .lex_attrs import LEX_ATTRS
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language, BaseDefaults
|
||||
|
||||
|
||||
class LowerSorbianDefaults(BaseDefaults):
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class LowerSorbian(Language):
|
||||
lang = "dsb"
|
||||
Defaults = LowerSorbianDefaults
|
||||
|
||||
|
||||
__all__ = ["LowerSorbian"]
|
15
spacy/lang/dsb/examples.py
Normal file
15
spacy/lang/dsb/examples.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.dsb.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
|
||||
sentences = [
|
||||
"Z tym stwori so wuměnjenje a zakład za dalše wobdźěłanje přez analyzu tekstoweje struktury a semantisku anotaciju a z tym tež za tu předstajenu digitalnu online-wersiju.",
|
||||
"Mi so tu jara derje spodoba.",
|
||||
"Kotre nowniny chceće měć?",
|
||||
"Tak ako w slědnem lěśe jo teke lětosa jano doma zapustowaś móžno.",
|
||||
"Zwóstanjo pótakem hyšći wjele źěła.",
|
||||
]
|
113
spacy/lang/dsb/lex_attrs.py
Normal file
113
spacy/lang/dsb/lex_attrs.py
Normal file
|
@ -0,0 +1,113 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
_num_words = [
|
||||
"nul",
|
||||
"jaden",
|
||||
"jadna",
|
||||
"jadno",
|
||||
"dwa",
|
||||
"dwě",
|
||||
"tśi",
|
||||
"tśo",
|
||||
"styri",
|
||||
"styrjo",
|
||||
"pěś",
|
||||
"pěśo",
|
||||
"šesć",
|
||||
"šesćo",
|
||||
"sedym",
|
||||
"sedymjo",
|
||||
"wósym",
|
||||
"wósymjo",
|
||||
"źewjeś",
|
||||
"źewjeśo",
|
||||
"źaseś",
|
||||
"źaseśo",
|
||||
"jadnassćo",
|
||||
"dwanassćo",
|
||||
"tśinasćo",
|
||||
"styrnasćo",
|
||||
"pěśnasćo",
|
||||
"šesnasćo",
|
||||
"sedymnasćo",
|
||||
"wósymnasćo",
|
||||
"źewjeśnasćo",
|
||||
"dwanasćo",
|
||||
"dwaźasća",
|
||||
"tśiźasća",
|
||||
"styrźasća",
|
||||
"pěśźaset",
|
||||
"šesćźaset",
|
||||
"sedymźaset",
|
||||
"wósymźaset",
|
||||
"źewjeśźaset",
|
||||
"sto",
|
||||
"tysac",
|
||||
"milion",
|
||||
"miliarda",
|
||||
"bilion",
|
||||
"biliarda",
|
||||
"trilion",
|
||||
"triliarda",
|
||||
]
|
||||
|
||||
_ordinal_words = [
|
||||
"prědny",
|
||||
"prědna",
|
||||
"prědne",
|
||||
"drugi",
|
||||
"druga",
|
||||
"druge",
|
||||
"tśeśi",
|
||||
"tśeśa",
|
||||
"tśeśe",
|
||||
"stwórty",
|
||||
"stwórta",
|
||||
"stwórte",
|
||||
"pêty",
|
||||
"pěta",
|
||||
"pête",
|
||||
"šesty",
|
||||
"šesta",
|
||||
"šeste",
|
||||
"sedymy",
|
||||
"sedyma",
|
||||
"sedyme",
|
||||
"wósymy",
|
||||
"wósyma",
|
||||
"wósyme",
|
||||
"źewjety",
|
||||
"źewjeta",
|
||||
"źewjete",
|
||||
"źasety",
|
||||
"źaseta",
|
||||
"źasete",
|
||||
"jadnasty",
|
||||
"jadnasta",
|
||||
"jadnaste",
|
||||
"dwanasty",
|
||||
"dwanasta",
|
||||
"dwanaste",
|
||||
]
|
||||
|
||||
|
||||
def like_num(text):
|
||||
if text.startswith(("+", "-", "±", "~")):
|
||||
text = text[1:]
|
||||
text = text.replace(",", "").replace(".", "")
|
||||
if text.isdigit():
|
||||
return True
|
||||
if text.count("/") == 1:
|
||||
num, denom = text.split("/")
|
||||
if num.isdigit() and denom.isdigit():
|
||||
return True
|
||||
text_lower = text.lower()
|
||||
if text_lower in _num_words:
|
||||
return True
|
||||
# Check ordinal number
|
||||
if text_lower in _ordinal_words:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
LEX_ATTRS = {LIKE_NUM: like_num}
|
15
spacy/lang/dsb/stop_words.py
Normal file
15
spacy/lang/dsb/stop_words.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
a abo aby ako ale až
|
||||
|
||||
daniž dokulaž
|
||||
|
||||
gaž
|
||||
|
||||
jolic
|
||||
|
||||
pak pótom
|
||||
|
||||
teke togodla
|
||||
""".split()
|
||||
)
|
|
@ -35,7 +35,7 @@ for pron in ["i"]:
|
|||
|
||||
_exc[orth + "m"] = [
|
||||
{ORTH: orth, NORM: pron},
|
||||
{ORTH: "m", "tenspect": 1, "number": 1},
|
||||
{ORTH: "m"},
|
||||
]
|
||||
|
||||
_exc[orth + "'ma"] = [
|
||||
|
@ -139,26 +139,27 @@ for pron in ["he", "she", "it"]:
|
|||
|
||||
# W-words, relative pronouns, prepositions etc.
|
||||
|
||||
for word in [
|
||||
"who",
|
||||
"what",
|
||||
"when",
|
||||
"where",
|
||||
"why",
|
||||
"how",
|
||||
"there",
|
||||
"that",
|
||||
"this",
|
||||
"these",
|
||||
"those",
|
||||
for word, morph in [
|
||||
("who", None),
|
||||
("what", None),
|
||||
("when", None),
|
||||
("where", None),
|
||||
("why", None),
|
||||
("how", None),
|
||||
("there", None),
|
||||
("that", "Number=Sing|Person=3"),
|
||||
("this", "Number=Sing|Person=3"),
|
||||
("these", "Number=Plur|Person=3"),
|
||||
("those", "Number=Plur|Person=3"),
|
||||
]:
|
||||
for orth in [word, word.title()]:
|
||||
_exc[orth + "'s"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "'s", NORM: "'s"},
|
||||
]
|
||||
if morph != "Number=Plur|Person=3":
|
||||
_exc[orth + "'s"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "'s", NORM: "'s"},
|
||||
]
|
||||
|
||||
_exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
|
||||
_exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
|
||||
|
||||
_exc[orth + "'ll"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
|
@ -182,25 +183,26 @@ for word in [
|
|||
{ORTH: "ve", NORM: "have"},
|
||||
]
|
||||
|
||||
_exc[orth + "'re"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "'re", NORM: "are"},
|
||||
]
|
||||
if morph != "Number=Sing|Person=3":
|
||||
_exc[orth + "'re"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "'re", NORM: "are"},
|
||||
]
|
||||
|
||||
_exc[orth + "re"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "re", NORM: "are"},
|
||||
]
|
||||
_exc[orth + "re"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "re", NORM: "are"},
|
||||
]
|
||||
|
||||
_exc[orth + "'ve"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "'ve"},
|
||||
]
|
||||
_exc[orth + "'ve"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
{ORTH: "'ve"},
|
||||
]
|
||||
|
||||
_exc[orth + "ve"] = [
|
||||
{ORTH: orth},
|
||||
{ORTH: "ve", NORM: "have"},
|
||||
]
|
||||
_exc[orth + "ve"] = [
|
||||
{ORTH: orth},
|
||||
{ORTH: "ve", NORM: "have"},
|
||||
]
|
||||
|
||||
_exc[orth + "'d"] = [
|
||||
{ORTH: orth, NORM: word},
|
||||
|
@ -447,7 +449,6 @@ for exc_data in [
|
|||
{ORTH: "La.", NORM: "Louisiana"},
|
||||
{ORTH: "Mar.", NORM: "March"},
|
||||
{ORTH: "Mass.", NORM: "Massachusetts"},
|
||||
{ORTH: "May.", NORM: "May"},
|
||||
{ORTH: "Mich.", NORM: "Michigan"},
|
||||
{ORTH: "Minn.", NORM: "Minnesota"},
|
||||
{ORTH: "Miss.", NORM: "Mississippi"},
|
||||
|
|
|
@ -9,14 +9,14 @@ Example sentences to test spaCy and its language models.
|
|||
sentences = [
|
||||
"Apple está buscando comprar una startup del Reino Unido por mil millones de dólares.",
|
||||
"Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes.",
|
||||
"San Francisco analiza prohibir los robots delivery.",
|
||||
"San Francisco analiza prohibir los robots de reparto.",
|
||||
"Londres es una gran ciudad del Reino Unido.",
|
||||
"El gato come pescado.",
|
||||
"Veo al hombre con el telescopio.",
|
||||
"La araña come moscas.",
|
||||
"El pingüino incuba en su nido sobre el hielo.",
|
||||
"¿Dónde estais?",
|
||||
"¿Quién es el presidente Francés?",
|
||||
"¿Dónde está encuentra la capital de Argentina?",
|
||||
"¿Dónde estáis?",
|
||||
"¿Quién es el presidente francés?",
|
||||
"¿Dónde se encuentra la capital de Argentina?",
|
||||
"¿Cuándo nació José de San Martín?",
|
||||
]
|
||||
|
|
|
@ -1,82 +1,80 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
actualmente acuerdo adelante ademas además adrede afirmó agregó ahi ahora ahí
|
||||
al algo alguna algunas alguno algunos algún alli allí alrededor ambos ampleamos
|
||||
antano antaño ante anterior antes apenas aproximadamente aquel aquella aquellas
|
||||
aquello aquellos aqui aquél aquélla aquéllas aquéllos aquí arriba arribaabajo
|
||||
aseguró asi así atras aun aunque ayer añadió aún
|
||||
a acuerdo adelante ademas además afirmó agregó ahi ahora ahí al algo alguna
|
||||
algunas alguno algunos algún alli allí alrededor ambos ante anterior antes
|
||||
apenas aproximadamente aquel aquella aquellas aquello aquellos aqui aquél
|
||||
aquélla aquéllas aquéllos aquí arriba aseguró asi así atras aun aunque añadió
|
||||
aún
|
||||
|
||||
bajo bastante bien breve buen buena buenas bueno buenos
|
||||
|
||||
cada casi cerca cierta ciertas cierto ciertos cinco claro comentó como con
|
||||
conmigo conocer conseguimos conseguir considera consideró consigo consigue
|
||||
consiguen consigues contigo contra cosas creo cual cuales cualquier cuando
|
||||
cuanta cuantas cuanto cuantos cuatro cuenta cuál cuáles cuándo cuánta cuántas
|
||||
cuánto cuántos cómo
|
||||
cada casi cierta ciertas cierto ciertos cinco claro comentó como con conmigo
|
||||
conocer conseguimos conseguir considera consideró consigo consigue consiguen
|
||||
consigues contigo contra creo cual cuales cualquier cuando cuanta cuantas
|
||||
cuanto cuantos cuatro cuenta cuál cuáles cuándo cuánta cuántas cuánto cuántos
|
||||
cómo
|
||||
|
||||
da dado dan dar de debajo debe deben debido decir dejó del delante demasiado
|
||||
demás dentro deprisa desde despacio despues después detras detrás dia dias dice
|
||||
dicen dicho dieron diferente diferentes dijeron dijo dio donde dos durante día
|
||||
días dónde
|
||||
dicen dicho dieron diez diferente diferentes dijeron dijo dio doce donde dos
|
||||
durante día días dónde
|
||||
|
||||
ejemplo el ella ellas ello ellos embargo empleais emplean emplear empleas
|
||||
empleo en encima encuentra enfrente enseguida entonces entre era eramos eran
|
||||
eras eres es esa esas ese eso esos esta estaba estaban estado estados estais
|
||||
estamos estan estar estará estas este esto estos estoy estuvo está están ex
|
||||
excepto existe existen explicó expresó él ésa ésas ése ésos ésta éstas éste
|
||||
éstos
|
||||
e el ella ellas ello ellos embargo en encima encuentra enfrente enseguida
|
||||
entonces entre era eramos eran eras eres es esa esas ese eso esos esta estaba
|
||||
estaban estado estados estais estamos estan estar estará estas este esto estos
|
||||
estoy estuvo está están excepto existe existen explicó expresó él ésa ésas ése
|
||||
ésos ésta éstas éste éstos
|
||||
|
||||
fin final fue fuera fueron fui fuimos
|
||||
|
||||
general gran grandes gueno
|
||||
gran grande grandes
|
||||
|
||||
ha haber habia habla hablan habrá había habían hace haceis hacemos hacen hacer
|
||||
hacerlo haces hacia haciendo hago han hasta hay haya he hecho hemos hicieron
|
||||
hizo horas hoy hubo
|
||||
hizo hoy hubo
|
||||
|
||||
igual incluso indicó informo informó intenta intentais intentamos intentan
|
||||
intentar intentas intento ir
|
||||
igual incluso indicó informo informó ir
|
||||
|
||||
junto
|
||||
|
||||
la lado largo las le lejos les llegó lleva llevar lo los luego lugar
|
||||
la lado largo las le les llegó lleva llevar lo los luego
|
||||
|
||||
mal manera manifestó mas mayor me mediante medio mejor mencionó menos menudo mi
|
||||
mia mias mientras mio mios mis misma mismas mismo mismos modo momento mucha
|
||||
muchas mucho muchos muy más mí mía mías mío míos
|
||||
mia mias mientras mio mios mis misma mismas mismo mismos modo mucha muchas
|
||||
mucho muchos muy más mí mía mías mío míos
|
||||
|
||||
nada nadie ni ninguna ningunas ninguno ningunos ningún no nos nosotras nosotros
|
||||
nuestra nuestras nuestro nuestros nueva nuevas nuevo nuevos nunca
|
||||
nuestra nuestras nuestro nuestros nueva nuevas nueve nuevo nuevos nunca
|
||||
|
||||
ocho os otra otras otro otros
|
||||
o ocho once os otra otras otro otros
|
||||
|
||||
pais para parece parte partir pasada pasado paìs peor pero pesar poca pocas
|
||||
poco pocos podeis podemos poder podria podriais podriamos podrian podrias podrá
|
||||
para parece parte partir pasada pasado paìs peor pero pesar poca pocas poco
|
||||
pocos podeis podemos poder podria podriais podriamos podrian podrias podrá
|
||||
podrán podría podrían poner por porque posible primer primera primero primeros
|
||||
principalmente pronto propia propias propio propios proximo próximo próximos
|
||||
pudo pueda puede pueden puedo pues
|
||||
pronto propia propias propio propios proximo próximo próximos pudo pueda puede
|
||||
pueden puedo pues
|
||||
|
||||
qeu que quedó queremos quien quienes quiere quiza quizas quizá quizás quién quiénes qué
|
||||
qeu que quedó queremos quien quienes quiere quiza quizas quizá quizás quién
|
||||
quiénes qué
|
||||
|
||||
raras realizado realizar realizó repente respecto
|
||||
realizado realizar realizó repente respecto
|
||||
|
||||
sabe sabeis sabemos saben saber sabes salvo se sea sean segun segunda segundo
|
||||
según seis ser sera será serán sería señaló si sido siempre siendo siete sigue
|
||||
siguiente sin sino sobre sois sola solamente solas solo solos somos son soy
|
||||
soyos su supuesto sus suya suyas suyo sé sí sólo
|
||||
siguiente sin sino sobre sois sola solamente solas solo solos somos son soy su
|
||||
supuesto sus suya suyas suyo suyos sé sí sólo
|
||||
|
||||
tal tambien también tampoco tan tanto tarde te temprano tendrá tendrán teneis
|
||||
tenemos tener tenga tengo tenido tenía tercera ti tiempo tiene tienen toda
|
||||
todas todavia todavía todo todos total trabaja trabajais trabajamos trabajan
|
||||
trabajar trabajas trabajo tras trata través tres tu tus tuvo tuya tuyas tuyo
|
||||
tuyos tú
|
||||
tenemos tener tenga tengo tenido tenía tercera tercero ti tiene tienen toda
|
||||
todas todavia todavía todo todos total tras trata través tres tu tus tuvo tuya
|
||||
tuyas tuyo tuyos tú
|
||||
|
||||
ultimo un una unas uno unos usa usais usamos usan usar usas uso usted ustedes
|
||||
u ultimo un una unas uno unos usa usais usamos usan usar usas uso usted ustedes
|
||||
última últimas último últimos
|
||||
|
||||
va vais valor vamos van varias varios vaya veces ver verdad verdadera verdadero
|
||||
vez vosotras vosotros voy vuestra vuestras vuestro vuestros
|
||||
va vais vamos van varias varios vaya veces ver verdad verdadera verdadero vez
|
||||
vosotras vosotros voy vuestra vuestras vuestro vuestros
|
||||
|
||||
ya yo
|
||||
y ya yo
|
||||
""".split()
|
||||
)
|
||||
|
|
|
@ -53,11 +53,16 @@ class FrenchLemmatizer(Lemmatizer):
|
|||
rules = rules_table.get(univ_pos, [])
|
||||
string = string.lower()
|
||||
forms = []
|
||||
# first try lookup in table based on upos
|
||||
if string in index:
|
||||
forms.append(string)
|
||||
self.cache[cache_key] = forms
|
||||
return forms
|
||||
|
||||
# then add anything in the exceptions table
|
||||
forms.extend(exceptions.get(string, []))
|
||||
|
||||
# if nothing found yet, use the rules
|
||||
oov_forms = []
|
||||
if not forms:
|
||||
for old, new in rules:
|
||||
|
@ -69,12 +74,14 @@ class FrenchLemmatizer(Lemmatizer):
|
|||
forms.append(form)
|
||||
else:
|
||||
oov_forms.append(form)
|
||||
|
||||
# if still nothing, add the oov forms from rules
|
||||
if not forms:
|
||||
forms.extend(oov_forms)
|
||||
if not forms and string in lookup_table.keys():
|
||||
forms.append(self.lookup_lemmatize(token)[0])
|
||||
|
||||
# use lookups, which fall back to the token itself
|
||||
if not forms:
|
||||
forms.append(string)
|
||||
forms.append(lookup_table.get(string, [string])[0])
|
||||
forms = list(dict.fromkeys(forms))
|
||||
self.cache[cache_key] = forms
|
||||
return forms
|
||||
|
|
|
@ -3,7 +3,7 @@ from ...attrs import LIKE_NUM
|
|||
|
||||
_num_words = set(
|
||||
"""
|
||||
zero un deux trois quatre cinq six sept huit neuf dix
|
||||
zero un une deux trois quatre cinq six sept huit neuf dix
|
||||
onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf
|
||||
vingt trente quarante cinquante soixante soixante-dix septante quatre-vingt huitante quatre-vingt-dix nonante
|
||||
cent mille mil million milliard billion quadrillion quintillion
|
||||
|
@ -13,7 +13,7 @@ sextillion septillion octillion nonillion decillion
|
|||
|
||||
_ordinal_words = set(
|
||||
"""
|
||||
premier deuxième second troisième quatrième cinquième sixième septième huitième neuvième dixième
|
||||
premier première deuxième second seconde troisième quatrième cinquième sixième septième huitième neuvième dixième
|
||||
onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neuvième
|
||||
vingtième trentième quarantième cinquantième soixantième soixante-dixième septantième quatre-vingtième huitantième quatre-vingt-dixième nonantième
|
||||
centième millième millionnième milliardième billionnième quadrillionnième quintillionnième
|
||||
|
|
|
@ -1,11 +1,15 @@
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from ...language import Language, BaseDefaults
|
||||
|
||||
|
||||
class AncientGreekDefaults(BaseDefaults):
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
|
46
spacy/lang/grc/punctuation.py
Normal file
46
spacy/lang/grc/punctuation.py
Normal file
|
@ -0,0 +1,46 @@
|
|||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
|
||||
from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
|
||||
from ..char_classes import CONCAT_QUOTES
|
||||
|
||||
_prefixes = (
|
||||
[
|
||||
"†",
|
||||
"⸏",
|
||||
]
|
||||
+ LIST_PUNCT
|
||||
+ LIST_ELLIPSES
|
||||
+ LIST_QUOTES
|
||||
+ LIST_CURRENCY
|
||||
+ LIST_ICONS
|
||||
)
|
||||
|
||||
_suffixes = (
|
||||
LIST_PUNCT
|
||||
+ LIST_ELLIPSES
|
||||
+ LIST_QUOTES
|
||||
+ LIST_ICONS
|
||||
+ [
|
||||
"†",
|
||||
"⸎",
|
||||
r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])[\-\.⸏]",
|
||||
]
|
||||
)
|
||||
|
||||
_infixes = (
|
||||
LIST_ELLIPSES
|
||||
+ LIST_ICONS
|
||||
+ [
|
||||
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
|
||||
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
|
||||
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
|
||||
),
|
||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
|
||||
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])—",
|
||||
]
|
||||
)
|
||||
|
||||
TOKENIZER_PREFIXES = _prefixes
|
||||
TOKENIZER_SUFFIXES = _suffixes
|
||||
TOKENIZER_INFIXES = _infixes
|
18
spacy/lang/hsb/__init__.py
Normal file
18
spacy/lang/hsb/__init__.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
from .lex_attrs import LEX_ATTRS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from ...language import Language, BaseDefaults
|
||||
|
||||
|
||||
class UpperSorbianDefaults(BaseDefaults):
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
class UpperSorbian(Language):
|
||||
lang = "hsb"
|
||||
Defaults = UpperSorbianDefaults
|
||||
|
||||
|
||||
__all__ = ["UpperSorbian"]
|
15
spacy/lang/hsb/examples.py
Normal file
15
spacy/lang/hsb/examples.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.hsb.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
|
||||
sentences = [
|
||||
"To běšo wjelgin raźone a jo se wót luźi derje pśiwzeło. Tak som dožywiła wjelgin",
|
||||
"Jogo pśewóźowarce stej groniłej, až how w serbskich stronach njama Santa Claus nic pytaś.",
|
||||
"A ten sobuźěłaśeŕ Statneje biblioteki w Barlinju jo pśimjeł drogotne knigły bźez rukajcowu z nagima rukoma!",
|
||||
"Take wobchadanje z našym kulturnym derbstwom zewšym njejźo.",
|
||||
"Wopśimjeśe drugich pśinoskow jo było na wusokem niwowje, ako pśecej.",
|
||||
]
|
106
spacy/lang/hsb/lex_attrs.py
Normal file
106
spacy/lang/hsb/lex_attrs.py
Normal file
|
@ -0,0 +1,106 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
_num_words = [
|
||||
"nul",
|
||||
"jedyn",
|
||||
"jedna",
|
||||
"jedne",
|
||||
"dwaj",
|
||||
"dwě",
|
||||
"tři",
|
||||
"třo",
|
||||
"štyri",
|
||||
"štyrjo",
|
||||
"pjeć",
|
||||
"šěsć",
|
||||
"sydom",
|
||||
"wosom",
|
||||
"dźewjeć",
|
||||
"dźesać",
|
||||
"jědnaće",
|
||||
"dwanaće",
|
||||
"třinaće",
|
||||
"štyrnaće",
|
||||
"pjatnaće",
|
||||
"šěsnaće",
|
||||
"sydomnaće",
|
||||
"wosomnaće",
|
||||
"dźewjatnaće",
|
||||
"dwaceći",
|
||||
"třiceći",
|
||||
"štyrceći",
|
||||
"pjećdźesat",
|
||||
"šěsćdźesat",
|
||||
"sydomdźesat",
|
||||
"wosomdźesat",
|
||||
"dźewjećdźesat",
|
||||
"sto",
|
||||
"tysac",
|
||||
"milion",
|
||||
"miliarda",
|
||||
"bilion",
|
||||
"biliarda",
|
||||
"trilion",
|
||||
"triliarda",
|
||||
]
|
||||
|
||||
_ordinal_words = [
|
||||
"prěni",
|
||||
"prěnja",
|
||||
"prěnje",
|
||||
"druhi",
|
||||
"druha",
|
||||
"druhe",
|
||||
"třeći",
|
||||
"třeća",
|
||||
"třeće",
|
||||
"štwórty",
|
||||
"štwórta",
|
||||
"štwórte",
|
||||
"pjaty",
|
||||
"pjata",
|
||||
"pjate",
|
||||
"šěsty",
|
||||
"šěsta",
|
||||
"šěste",
|
||||
"sydmy",
|
||||
"sydma",
|
||||
"sydme",
|
||||
"wosmy",
|
||||
"wosma",
|
||||
"wosme",
|
||||
"dźewjaty",
|
||||
"dźewjata",
|
||||
"dźewjate",
|
||||
"dźesaty",
|
||||
"dźesata",
|
||||
"dźesate",
|
||||
"jědnaty",
|
||||
"jědnata",
|
||||
"jědnate",
|
||||
"dwanaty",
|
||||
"dwanata",
|
||||
"dwanate",
|
||||
]
|
||||
|
||||
|
||||
def like_num(text):
|
||||
if text.startswith(("+", "-", "±", "~")):
|
||||
text = text[1:]
|
||||
text = text.replace(",", "").replace(".", "")
|
||||
if text.isdigit():
|
||||
return True
|
||||
if text.count("/") == 1:
|
||||
num, denom = text.split("/")
|
||||
if num.isdigit() and denom.isdigit():
|
||||
return True
|
||||
text_lower = text.lower()
|
||||
if text_lower in _num_words:
|
||||
return True
|
||||
# Check ordinal number
|
||||
if text_lower in _ordinal_words:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
LEX_ATTRS = {LIKE_NUM: like_num}
|
19
spacy/lang/hsb/stop_words.py
Normal file
19
spacy/lang/hsb/stop_words.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
a abo ale ani
|
||||
|
||||
dokelž
|
||||
|
||||
hdyž
|
||||
|
||||
jeli jelizo
|
||||
|
||||
kaž
|
||||
|
||||
pak potom
|
||||
|
||||
tež tohodla
|
||||
|
||||
zo zoby
|
||||
""".split()
|
||||
)
|
18
spacy/lang/hsb/tokenizer_exceptions.py
Normal file
18
spacy/lang/hsb/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, NORM
|
||||
from ...util import update_exc
|
||||
|
||||
_exc = dict()
|
||||
for exc_data in [
|
||||
{ORTH: "mil.", NORM: "milion"},
|
||||
{ORTH: "wob.", NORM: "wobydler"},
|
||||
]:
|
||||
_exc[exc_data[ORTH]] = [exc_data]
|
||||
|
||||
for orth in [
|
||||
"resp.",
|
||||
]:
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
|
@ -7,7 +7,7 @@ from .lex_attrs import LEX_ATTRS
|
|||
from ...language import Language, BaseDefaults
|
||||
from ...tokens import Doc
|
||||
from ...scorer import Scorer
|
||||
from ...symbols import POS
|
||||
from ...symbols import POS, X
|
||||
from ...training import validate_examples
|
||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||
from ...vocab import Vocab
|
||||
|
@ -57,7 +57,10 @@ class KoreanTokenizer(DummyTokenizer):
|
|||
for token, dtoken in zip(doc, dtokens):
|
||||
first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
|
||||
token.tag_ = first_tag # stem(어간) or pre-final(선어말 어미)
|
||||
token.pos = TAG_MAP[token.tag_][POS]
|
||||
if token.tag_ in TAG_MAP:
|
||||
token.pos = TAG_MAP[token.tag_][POS]
|
||||
else:
|
||||
token.pos = X
|
||||
token.lemma_ = dtoken["lemma"]
|
||||
doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
|
||||
return doc
|
||||
|
|
|
@ -3,7 +3,7 @@ from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
|
|||
|
||||
|
||||
_infixes = (
|
||||
["·", "ㆍ", "\(", "\)"]
|
||||
["·", "ㆍ", r"\(", r"\)"]
|
||||
+ [r"(?<=[0-9])~(?=[0-9-])"]
|
||||
+ LIST_QUOTES
|
||||
+ BASE_TOKENIZER_INFIXES
|
||||
|
|
18
spacy/lang/la/__init__.py
Normal file
18
spacy/lang/la/__init__.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
from ...language import Language, BaseDefaults
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
|
||||
class LatinDefaults(BaseDefaults):
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
stop_words = STOP_WORDS
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
|
||||
|
||||
class Latin(Language):
|
||||
lang = "la"
|
||||
Defaults = LatinDefaults
|
||||
|
||||
|
||||
__all__ = ["Latin"]
|
34
spacy/lang/la/lex_attrs.py
Normal file
34
spacy/lang/la/lex_attrs.py
Normal file
|
@ -0,0 +1,34 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
import re
|
||||
|
||||
# cf. Goyvaerts/Levithan 2009; case-insensitive, allow 4
|
||||
roman_numerals_compile = re.compile(
|
||||
r"(?i)^(?=[MDCLXVI])M*(C[MD]|D?C{0,4})(X[CL]|L?X{0,4})(I[XV]|V?I{0,4})$"
|
||||
)
|
||||
|
||||
_num_words = set(
|
||||
"""
|
||||
unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem
|
||||
""".split()
|
||||
)
|
||||
|
||||
_ordinal_words = set(
|
||||
"""
|
||||
primus prima primum secundus secunda secundum tertius tertia tertium
|
||||
""".split()
|
||||
)
|
||||
|
||||
|
||||
def like_num(text):
|
||||
if text.isdigit():
|
||||
return True
|
||||
if roman_numerals_compile.match(text):
|
||||
return True
|
||||
if text.lower() in _num_words:
|
||||
return True
|
||||
if text.lower() in _ordinal_words:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
LEX_ATTRS = {LIKE_NUM: like_num}
|
37
spacy/lang/la/stop_words.py
Normal file
37
spacy/lang/la/stop_words.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
# Corrected Perseus list, cf. https://wiki.digitalclassicist.org/Stopwords_for_Greek_and_Latin
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
ab ac ad adhuc aliqui aliquis an ante apud at atque aut autem
|
||||
|
||||
cum cur
|
||||
|
||||
de deinde dum
|
||||
|
||||
ego enim ergo es est et etiam etsi ex
|
||||
|
||||
fio
|
||||
|
||||
haud hic
|
||||
|
||||
iam idem igitur ille in infra inter interim ipse is ita
|
||||
|
||||
magis modo mox
|
||||
|
||||
nam ne nec necque neque nisi non nos
|
||||
|
||||
o ob
|
||||
|
||||
per possum post pro
|
||||
|
||||
quae quam quare qui quia quicumque quidem quilibet quis quisnam quisquam quisque quisquis quo quoniam
|
||||
|
||||
sed si sic sive sub sui sum super suus
|
||||
|
||||
tam tamen trans tu tum
|
||||
|
||||
ubi uel uero
|
||||
|
||||
vel vero
|
||||
""".split()
|
||||
)
|
76
spacy/lang/la/tokenizer_exceptions.py
Normal file
76
spacy/lang/la/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,76 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
## TODO: Look into systematically handling u/v
|
||||
_exc = {
|
||||
"mecum": [{ORTH: "me"}, {ORTH: "cum"}],
|
||||
"tecum": [{ORTH: "te"}, {ORTH: "cum"}],
|
||||
"nobiscum": [{ORTH: "nobis"}, {ORTH: "cum"}],
|
||||
"vobiscum": [{ORTH: "vobis"}, {ORTH: "cum"}],
|
||||
"uobiscum": [{ORTH: "uobis"}, {ORTH: "cum"}],
|
||||
}
|
||||
|
||||
for orth in [
|
||||
"A.",
|
||||
"Agr.",
|
||||
"Ap.",
|
||||
"C.",
|
||||
"Cn.",
|
||||
"D.",
|
||||
"F.",
|
||||
"K.",
|
||||
"L.",
|
||||
"M'.",
|
||||
"M.",
|
||||
"Mam.",
|
||||
"N.",
|
||||
"Oct.",
|
||||
"Opet.",
|
||||
"P.",
|
||||
"Paul.",
|
||||
"Post.",
|
||||
"Pro.",
|
||||
"Q.",
|
||||
"S.",
|
||||
"Ser.",
|
||||
"Sert.",
|
||||
"Sex.",
|
||||
"St.",
|
||||
"Sta.",
|
||||
"T.",
|
||||
"Ti.",
|
||||
"V.",
|
||||
"Vol.",
|
||||
"Vop.",
|
||||
"U.",
|
||||
"Uol.",
|
||||
"Uop.",
|
||||
"Ian.",
|
||||
"Febr.",
|
||||
"Mart.",
|
||||
"Apr.",
|
||||
"Mai.",
|
||||
"Iun.",
|
||||
"Iul.",
|
||||
"Aug.",
|
||||
"Sept.",
|
||||
"Oct.",
|
||||
"Nov.",
|
||||
"Nou.",
|
||||
"Dec.",
|
||||
"Non.",
|
||||
"Id.",
|
||||
"A.D.",
|
||||
"Coll.",
|
||||
"Cos.",
|
||||
"Ord.",
|
||||
"Pl.",
|
||||
"S.C.",
|
||||
"Suff.",
|
||||
"Trib.",
|
||||
]:
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
18
spacy/lang/lg/__init__.py
Normal file
18
spacy/lang/lg/__init__.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
from ...language import Language, BaseDefaults
|
||||
|
||||
|
||||
class LugandaDefaults(BaseDefaults):
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Luganda(Language):
|
||||
lang = "lg"
|
||||
Defaults = LugandaDefaults
|
||||
|
||||
|
||||
__all__ = ["Luganda"]
|
17
spacy/lang/lg/examples.py
Normal file
17
spacy/lang/lg/examples.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.lg.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
sentences = [
|
||||
"Mpa ebyafaayo ku byalo Nakatu ne Nkajja",
|
||||
"Okuyita Ttembo kitegeeza kugwa ddalu",
|
||||
"Ekifumu kino kyali kya mulimu ki?",
|
||||
"Ekkovu we liyise wayitibwa mukululo",
|
||||
"Akola mulimu ki oguvaamu ssente?",
|
||||
"Emisumaali egikomerera embaawo giyitibwa nninga",
|
||||
"Abooluganda ab’emmamba ababiri",
|
||||
"Ekisaawe ky'ebyenjigiriza kya mugaso nnyo",
|
||||
]
|
95
spacy/lang/lg/lex_attrs.py
Normal file
95
spacy/lang/lg/lex_attrs.py
Normal file
|
@ -0,0 +1,95 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
_num_words = [
|
||||
"nnooti", # Zero
|
||||
"zeero", # zero
|
||||
"emu", # one
|
||||
"bbiri", # two
|
||||
"ssatu", # three
|
||||
"nnya", # four
|
||||
"ttaano", # five
|
||||
"mukaaga", # six
|
||||
"musanvu", # seven
|
||||
"munaana", # eight
|
||||
"mwenda", # nine
|
||||
"kkumi", # ten
|
||||
"kkumi n'emu", # eleven
|
||||
"kkumi na bbiri", # twelve
|
||||
"kkumi na ssatu", # thirteen
|
||||
"kkumi na nnya", # forteen
|
||||
"kkumi na ttaano", # fifteen
|
||||
"kkumi na mukaaga", # sixteen
|
||||
"kkumi na musanvu", # seventeen
|
||||
"kkumi na munaana", # eighteen
|
||||
"kkumi na mwenda", # nineteen
|
||||
"amakumi abiri", # twenty
|
||||
"amakumi asatu", # thirty
|
||||
"amakumi ana", # forty
|
||||
"amakumi ataano", # fifty
|
||||
"nkaaga", # sixty
|
||||
"nsanvu", # seventy
|
||||
"kinaana", # eighty
|
||||
"kyenda", # ninety
|
||||
"kikumi", # hundred
|
||||
"lukumi", # thousand
|
||||
"kakadde", # million
|
||||
"kawumbi", # billion
|
||||
"kase", # trillion
|
||||
"katabalika", # quadrillion
|
||||
"keesedde", # gajillion
|
||||
"kafukunya", # bazillion
|
||||
"ekisooka", # first
|
||||
"ekyokubiri", # second
|
||||
"ekyokusatu", # third
|
||||
"ekyokuna", # fourth
|
||||
"ekyokutaano", # fifith
|
||||
"ekyomukaaga", # sixth
|
||||
"ekyomusanvu", # seventh
|
||||
"eky'omunaana", # eighth
|
||||
"ekyomwenda", # nineth
|
||||
"ekyekkumi", # tenth
|
||||
"ekyekkumi n'ekimu", # eleventh
|
||||
"ekyekkumi n'ebibiri", # twelveth
|
||||
"ekyekkumi n'ebisatu", # thirteenth
|
||||
"ekyekkumi n'ebina", # fourteenth
|
||||
"ekyekkumi n'ebitaano", # fifteenth
|
||||
"ekyekkumi n'omukaaga", # sixteenth
|
||||
"ekyekkumi n'omusanvu", # seventeenth
|
||||
"ekyekkumi n'omunaana", # eigteenth
|
||||
"ekyekkumi n'omwenda", # nineteenth
|
||||
"ekyamakumi abiri", # twentieth
|
||||
"ekyamakumi asatu", # thirtieth
|
||||
"ekyamakumi ana", # fortieth
|
||||
"ekyamakumi ataano", # fiftieth
|
||||
"ekyenkaaga", # sixtieth
|
||||
"ekyensanvu", # seventieth
|
||||
"ekyekinaana", # eightieth
|
||||
"ekyekyenda", # ninetieth
|
||||
"ekyekikumi", # hundredth
|
||||
"ekyolukumi", # thousandth
|
||||
"ekyakakadde", # millionth
|
||||
"ekyakawumbi", # billionth
|
||||
"ekyakase", # trillionth
|
||||
"ekyakatabalika", # quadrillionth
|
||||
"ekyakeesedde", # gajillionth
|
||||
"ekyakafukunya", # bazillionth
|
||||
]
|
||||
|
||||
|
||||
def like_num(text):
|
||||
if text.startswith(("+", "-", "±", "~")):
|
||||
text = text[1:]
|
||||
text = text.replace(",", "").replace(".", "")
|
||||
if text.isdigit():
|
||||
return True
|
||||
if text.count("/") == 1:
|
||||
num, denom = text.split("/")
|
||||
if num.isdigit() and denom.isdigit():
|
||||
return True
|
||||
text_lower = text.lower()
|
||||
if text_lower in _num_words:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
LEX_ATTRS = {LIKE_NUM: like_num}
|
19
spacy/lang/lg/punctuation.py
Normal file
19
spacy/lang/lg/punctuation.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
|
||||
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
||||
|
||||
_infixes = (
|
||||
LIST_ELLIPSES
|
||||
+ LIST_ICONS
|
||||
+ [
|
||||
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
|
||||
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
|
||||
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
|
||||
),
|
||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
|
||||
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
TOKENIZER_INFIXES = _infixes
|
19
spacy/lang/lg/stop_words.py
Normal file
19
spacy/lang/lg/stop_words.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
abadde abalala abamu abangi abava ajja ali alina ani anti ateekeddwa atewamu
|
||||
atya awamu aweebwa ayinza ba baali babadde babalina bajja
|
||||
bajjanewankubade bali balina bandi bangi bano bateekeddwa baweebwa bayina bebombi beera bibye
|
||||
bimu bingi bino bo bokka bonna buli bulijjo bulungi bwabwe bwaffe bwayo bwe bwonna bya byabwe
|
||||
byaffe byebimu byonna ddaa ddala ddi e ebimu ebiri ebweruobulungi ebyo edda ejja ekirala ekyo
|
||||
endala engeri ennyo era erimu erina ffe ffenna ga gujja gumu gunno guno gwa gwe kaseera kati
|
||||
kennyini ki kiki kikino kikye kikyo kino kirungi kki ku kubangabyombi kubangaolwokuba kudda
|
||||
kuva kuwa kwegamba kyaffe kye kyekimuoyo kyekyo kyonna leero liryo lwa lwaki lyabwezaabwe
|
||||
lyaffe lyange mbadde mingi mpozzi mu mulinaoyina munda mwegyabwe nolwekyo nabadde nabo nandiyagadde
|
||||
nandiye nanti naye ne nedda neera nga nnyingi nnyini nnyinza nnyo nti nyinza nze oba ojja okudda
|
||||
okugenda okuggyako okutuusa okuva okuwa oli olina oluvannyuma olwekyobuva omuli ono osobola otya
|
||||
oyina oyo seetaaga si sinakindi singa talina tayina tebaali tebaalina tebayina terina tetulina
|
||||
tetuteekeddwa tewali teyalina teyayina tolina tu tuyina tulina tuyina twafuna twetaaga wa wabula
|
||||
wabweru wadde waggulunnina wakati waliwobangi waliyo wandi wange wano wansi weebwa yabadde yaffe
|
||||
ye yenna yennyini yina yonna ziba zijja zonna
|
||||
""".split()
|
||||
)
|
|
@ -15,7 +15,7 @@
|
|||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaangde aangezien achter achterna
|
||||
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaande aangezien achter achterna
|
||||
afgelopen aldus alhoewel anderzijds
|
||||
|
||||
ben bij bijna bijvoorbeeld behalve beide beiden beneden bent bepaald beter betere betreffende binnen binnenin boven
|
||||
|
|
|
@ -40,6 +40,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
|||
span_label = doc.vocab.strings.add("NP")
|
||||
|
||||
# Only NOUNS and PRONOUNS matter
|
||||
end_span = -1
|
||||
for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)):
|
||||
# For NOUNS
|
||||
# Pick children from syntactic parse (only those with certain dependencies)
|
||||
|
@ -58,15 +59,17 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
|||
children_i = [c.i for c in children] + [word.i]
|
||||
|
||||
start_span = min(children_i)
|
||||
end_span = max(children_i) + 1
|
||||
yield start_span, end_span, span_label
|
||||
if start_span >= end_span:
|
||||
end_span = max(children_i) + 1
|
||||
yield start_span, end_span, span_label
|
||||
|
||||
# PRONOUNS only if it is the subject of a verb
|
||||
elif word.pos == PRON:
|
||||
if word.dep in pronoun_deps:
|
||||
start_span = word.i
|
||||
end_span = word.i + 1
|
||||
yield start_span, end_span, span_label
|
||||
if start_span >= end_span:
|
||||
end_span = word.i + 1
|
||||
yield start_span, end_span, span_label
|
||||
|
||||
|
||||
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
|
||||
from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
|
||||
from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS, COMBINING_DIACRITICS
|
||||
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
|
||||
|
||||
|
||||
|
@ -44,3 +44,23 @@ TOKENIZER_INFIXES = (
|
|||
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
# Some languages e.g. written with the Cyrillic alphabet permit the use of diacritics
|
||||
# to mark stressed syllables in words where stress is distinctive. Such languages
|
||||
# should use the COMBINING_DIACRITICS... suffix and infix regex lists in
|
||||
# place of the standard ones.
|
||||
COMBINING_DIACRITICS_TOKENIZER_SUFFIXES = list(TOKENIZER_SUFFIXES) + [
|
||||
r"(?<=[{a}][{d}])\.".format(a=ALPHA, d=COMBINING_DIACRITICS),
|
||||
]
|
||||
|
||||
COMBINING_DIACRITICS_TOKENIZER_INFIXES = list(TOKENIZER_INFIXES) + [
|
||||
r"(?<=[{al}][{d}])\.(?=[{au}{q}])".format(
|
||||
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES, d=COMBINING_DIACRITICS
|
||||
),
|
||||
r"(?<=[{a}][{d}]),(?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS),
|
||||
r"(?<=[{a}][{d}])(?:{h})(?=[{a}])".format(
|
||||
a=ALPHA, d=COMBINING_DIACRITICS, h=HYPHENS
|
||||
),
|
||||
r"(?<=[{a}][{d}])[:<>=/](?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS),
|
||||
]
|
||||
|
|
|
@ -5,6 +5,8 @@ from .stop_words import STOP_WORDS
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .lemmatizer import RussianLemmatizer
|
||||
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
|
||||
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
|
||||
from ...language import Language, BaseDefaults
|
||||
|
||||
|
||||
|
@ -12,6 +14,8 @@ class RussianDefaults(BaseDefaults):
|
|||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
|
||||
infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES
|
||||
|
||||
|
||||
class Russian(Language):
|
||||
|
@ -24,7 +28,7 @@ class Russian(Language):
|
|||
assigns=["token.lemma"],
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "pymorphy2",
|
||||
"mode": "pymorphy3",
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
|
|
|
@ -19,33 +19,48 @@ class RussianLemmatizer(Lemmatizer):
|
|||
model: Optional[Model],
|
||||
name: str = "lemmatizer",
|
||||
*,
|
||||
mode: str = "pymorphy2",
|
||||
mode: str = "pymorphy3",
|
||||
overwrite: bool = False,
|
||||
scorer: Optional[Callable] = lemmatizer_score,
|
||||
) -> None:
|
||||
if mode == "pymorphy2":
|
||||
if mode in {"pymorphy2", "pymorphy2_lookup"}:
|
||||
try:
|
||||
from pymorphy2 import MorphAnalyzer
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"The Russian lemmatizer mode 'pymorphy2' requires the "
|
||||
"pymorphy2 library. Install it with: pip install pymorphy2"
|
||||
"The lemmatizer mode 'pymorphy2' requires the "
|
||||
"pymorphy2 library and dictionaries. Install them with: "
|
||||
"pip install pymorphy2"
|
||||
"# for Ukrainian dictionaries:"
|
||||
"pip install pymorphy2-dicts-uk"
|
||||
) from None
|
||||
if getattr(self, "_morph", None) is None:
|
||||
self._morph = MorphAnalyzer()
|
||||
self._morph = MorphAnalyzer(lang="ru")
|
||||
elif mode in {"pymorphy3", "pymorphy3_lookup"}:
|
||||
try:
|
||||
from pymorphy3 import MorphAnalyzer
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"The lemmatizer mode 'pymorphy3' requires the "
|
||||
"pymorphy3 library and dictionaries. Install them with: "
|
||||
"pip install pymorphy3"
|
||||
"# for Ukrainian dictionaries:"
|
||||
"pip install pymorphy3-dicts-uk"
|
||||
) from None
|
||||
if getattr(self, "_morph", None) is None:
|
||||
self._morph = MorphAnalyzer(lang="ru")
|
||||
super().__init__(
|
||||
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
|
||||
def pymorphy2_lemmatize(self, token: Token) -> List[str]:
|
||||
def _pymorphy_lemmatize(self, token: Token) -> List[str]:
|
||||
string = token.text
|
||||
univ_pos = token.pos_
|
||||
morphology = token.morph.to_dict()
|
||||
if univ_pos == "PUNCT":
|
||||
return [PUNCT_RULES.get(string, string)]
|
||||
if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
|
||||
# Skip unchangeable pos
|
||||
return [string.lower()]
|
||||
return self._pymorphy_lookup_lemmatize(token)
|
||||
analyses = self._morph.parse(string)
|
||||
filtered_analyses = []
|
||||
for analysis in analyses:
|
||||
|
@ -53,8 +68,10 @@ class RussianLemmatizer(Lemmatizer):
|
|||
# Skip suggested parse variant for unknown word for pymorphy
|
||||
continue
|
||||
analysis_pos, _ = oc2ud(str(analysis.tag))
|
||||
if analysis_pos == univ_pos or (
|
||||
analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
|
||||
if (
|
||||
analysis_pos == univ_pos
|
||||
or (analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN"))
|
||||
or ((analysis_pos == "PRON") and (univ_pos == "DET"))
|
||||
):
|
||||
filtered_analyses.append(analysis)
|
||||
if not len(filtered_analyses):
|
||||
|
@ -97,13 +114,28 @@ class RussianLemmatizer(Lemmatizer):
|
|||
dict.fromkeys([analysis.normal_form for analysis in filtered_analyses])
|
||||
)
|
||||
|
||||
def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
|
||||
def _pymorphy_lookup_lemmatize(self, token: Token) -> List[str]:
|
||||
string = token.text
|
||||
analyses = self._morph.parse(string)
|
||||
if len(analyses) == 1:
|
||||
return [analyses[0].normal_form]
|
||||
# often multiple forms would derive from the same normal form
|
||||
# thus check _unique_ normal forms
|
||||
normal_forms = set([an.normal_form for an in analyses])
|
||||
if len(normal_forms) == 1:
|
||||
return [next(iter(normal_forms))]
|
||||
return [string]
|
||||
|
||||
def pymorphy2_lemmatize(self, token: Token) -> List[str]:
|
||||
return self._pymorphy_lemmatize(token)
|
||||
|
||||
def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
|
||||
return self._pymorphy_lookup_lemmatize(token)
|
||||
|
||||
def pymorphy3_lemmatize(self, token: Token) -> List[str]:
|
||||
return self._pymorphy_lemmatize(token)
|
||||
|
||||
def pymorphy3_lookup_lemmatize(self, token: Token) -> List[str]:
|
||||
return self._pymorphy_lookup_lemmatize(token)
|
||||
|
||||
|
||||
def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
|
||||
gram_map = {
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user