Merge branch 'master' into feature/visualisation

This commit is contained in:
richardpaulhudson 2023-01-25 15:18:47 +01:00
commit fd66bce8c1
522 changed files with 47564 additions and 38729 deletions

View File

@ -4,11 +4,13 @@ about: Use this template if you came across a bug or unexpected behaviour differ
---
<!-- NOTE: For questions or install related issues, please open a Discussion instead. -->
## How to reproduce the behaviour
<!-- Include a code example or the steps that led to the problem. Please try to be as specific as possible. -->
## Your Environment
<!-- Include details of your environment. If you're using spaCy 1.7+, you can also type `python -m spacy info --markdown` and copy-paste the result here.-->
<!-- Include details of your environment. You can also type `python -m spacy info --markdown` and copy-paste the result here.-->
* Operating System:
* Python Version Used:
* spaCy Version Used:

View File

@ -1,8 +1,5 @@
blank_issues_enabled: false
contact_links:
- name: ⚠️ Python 3.10 Support
url: https://github.com/explosion/spaCy/discussions/9418
about: Python 3.10 wheels haven't been released yet, see the link for details.
- name: 🗯 Discussions Forum
url: https://github.com/explosion/spaCy/discussions
about: Install issues, usage questions, general discussion and anything else that isn't a bug report.

View File

@ -1,68 +1,56 @@
parameters:
python_version: ''
architecture: ''
prefix: ''
gpu: false
num_build_jobs: 1
architecture: 'x64'
num_build_jobs: 2
steps:
- task: UsePythonVersion@0
inputs:
versionSpec: ${{ parameters.python_version }}
architecture: ${{ parameters.architecture }}
allowUnstable: true
- bash: |
echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
displayName: 'Set variables'
- script: |
${{ parameters.prefix }} python -m pip install -U pip setuptools
${{ parameters.prefix }} python -m pip install -U -r requirements.txt
python -m pip install -U build pip setuptools
python -m pip install -U -r requirements.txt
displayName: "Install dependencies"
- script: |
${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
${{ parameters.prefix }} python setup.py sdist --formats=gztar
displayName: "Compile and build sdist"
python -m build --sdist
displayName: "Build sdist"
- script: python -m mypy spacy
- script: |
python -m mypy spacy
displayName: 'Run mypy'
condition: ne(variables['python_version'], '3.10')
condition: ne(variables['python_version'], '3.6')
- task: DeleteFiles@1
inputs:
contents: "spacy"
displayName: "Delete source directory"
- task: DeleteFiles@1
inputs:
contents: "*.egg-info"
displayName: "Delete egg-info directory"
- script: |
${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
python -m pip freeze > installed.txt
python -m pip uninstall -y -r installed.txt
displayName: "Uninstall all packages"
- bash: |
${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
${{ parameters.prefix }} python -m pip install dist/$SDIST
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
displayName: "Install from sdist"
- script: |
${{ parameters.prefix }} python -m pip install -U -r requirements.txt
displayName: "Install test requirements"
- script: |
${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
displayName: "Install GPU requirements"
condition: eq(${{ parameters.gpu }}, true)
- script: |
${{ parameters.prefix }} python -m pytest --pyargs spacy
displayName: "Run CPU tests"
condition: eq(${{ parameters.gpu }}, false)
- script: |
${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
displayName: "Run GPU tests"
condition: eq(${{ parameters.gpu }}, true)
python -W error -c "import spacy"
displayName: "Test import"
- script: |
python -m spacy download ca_core_news_sm
@ -71,6 +59,11 @@ steps:
displayName: 'Test download CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
displayName: 'Test no warnings on load (#11713)'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
displayName: 'Test convert CLI'
@ -105,13 +98,22 @@ steps:
displayName: 'Test assemble CLI vectors warning'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m pip install -U -r requirements.txt
displayName: "Install test requirements"
- script: |
python -m pytest --pyargs spacy -W error
displayName: "Run CPU tests"
- script: |
python -m pip install 'spacy[apple]'
python -m pytest --pyargs spacy
displayName: "Run CPU tests with thinc-apple-ops"
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
- script: |
python .github/validate_universe_json.py website/meta/universe.json
displayName: 'Test website/meta/universe.json'
condition: eq(variables['python_version'], '3.8')
- script: |
${{ parameters.prefix }} python -m pip install thinc-apple-ops
${{ parameters.prefix }} python -m pytest --pyargs spacy
displayName: "Run CPU tests with thinc-apple-ops"
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.9'))

106
.github/contributors/Lucaterre.md vendored Normal file
View File

@ -0,0 +1,106 @@
# spaCy contributor agreement
This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
## Contributor Agreement
1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:
* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;
* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and
* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- |---------------|
| Name | Lucas Terriel |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2022-06-20 |
| GitHub username | Lucaterre |
| Website (optional) | |

106
.github/contributors/fonfonx.md vendored Normal file
View File

@ -0,0 +1,106 @@
# spaCy contributor agreement
This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
## Contributor Agreement
1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:
* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;
* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and
* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Xavier Fontaine |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | 2022-04-13 |
| GitHub username | fonfonx |
| Website (optional) | |

View File

@ -1,13 +0,0 @@
# Configuration for probot-no-response - https://github.com/probot/no-response
# Number of days of inactivity before an Issue is closed for lack of response
daysUntilClose: 14
# Label requiring a response
responseRequiredLabel: more-info-needed
# Comment to post when closing an Issue for lack of response. Set to `false` to disable
closeComment: >
This issue has been automatically closed because there has been no response
to a request for more information from the original author. With only the
information that is currently in the issue, there's not enough information
to take action. If you're the original author, feel free to reopen the issue
if you have or find the answers needed to investigate further.

67
.github/spacy_universe_alert.py vendored Normal file
View File

@ -0,0 +1,67 @@
import os
import sys
import json
from datetime import datetime
from slack_sdk.web.client import WebClient
CHANNEL = "#alerts-universe"
SLACK_TOKEN = os.environ.get("SLACK_BOT_TOKEN", "ENV VAR not available!")
DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
client = WebClient(SLACK_TOKEN)
github_context = json.loads(sys.argv[1])
event = github_context['event']
pr_title = event['pull_request']["title"]
pr_link = event['pull_request']["patch_url"].replace(".patch", "")
pr_author_url = event['sender']["html_url"]
pr_author_name = pr_author_url.rsplit('/')[-1]
pr_created_at_dt = datetime.strptime(
event['pull_request']["created_at"],
DATETIME_FORMAT
)
pr_created_at = pr_created_at_dt.strftime("%c")
pr_updated_at_dt = datetime.strptime(
event['pull_request']["updated_at"],
DATETIME_FORMAT
)
pr_updated_at = pr_updated_at_dt.strftime("%c")
blocks = [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "📣 New spaCy Universe Project Alert ✨"
}
},
{
"type": "section",
"fields": [
{
"type": "mrkdwn",
"text": f"*Pull Request:*\n<{pr_link}|{pr_title}>"
},
{
"type": "mrkdwn",
"text": f"*Author:*\n<{pr_author_url}|{pr_author_name}>"
},
{
"type": "mrkdwn",
"text": f"*Created at:*\n {pr_created_at}"
},
{
"type": "mrkdwn",
"text": f"*Last Updated:*\n {pr_updated_at}"
}
]
}
]
client.chat_postMessage(
channel=CHANNEL,
text="spaCy universe project PR alert",
blocks=blocks
)
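For reference, the script above only touches a handful of fields from the `pull_request` event payload. A minimal sketch of the structure it expects (all values here are hypothetical):

```python
# Hypothetical payload containing only the fields spacy_universe_alert.py reads.
github_context = {
    "event": {
        "pull_request": {
            "title": "Add an example project to the universe",
            "patch_url": "https://github.com/explosion/spaCy/pull/0000.patch",
            "created_at": "2023-01-25T12:00:00Z",
            "updated_at": "2023-01-25T13:30:00Z",
        },
        "sender": {"html_url": "https://github.com/example-user"},
    }
}
# The workflow below serializes the real context with toJson(github) and passes
# it to the script as its single command-line argument.
```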

View File

@ -12,10 +12,10 @@ jobs:
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
with:
ref: ${{ github.head_ref }}
- uses: actions/setup-python@v2
- uses: actions/setup-python@v4
- run: pip install black
- name: Auto-format code if needed
run: black spacy
@ -23,10 +23,11 @@ jobs:
# code and makes GitHub think the action failed
- name: Check for modified files
id: git-check
run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi)
run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
- name: Create Pull Request
if: steps.git-check.outputs.modified == 'true'
uses: peter-evans/create-pull-request@v3
uses: peter-evans/create-pull-request@v4
with:
title: Auto-format code with black
labels: meta

View File

@ -8,14 +8,14 @@ on:
jobs:
explosion-bot:
runs-on: ubuntu-18.04
runs-on: ubuntu-latest
steps:
- name: Dump GitHub context
env:
GITHUB_CONTEXT: ${{ toJson(github) }}
run: echo "$GITHUB_CONTEXT"
- uses: actions/checkout@v1
- uses: actions/setup-python@v1
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
- name: Install and run explosion-bot
run: |
pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot
@ -23,5 +23,5 @@ jobs:
env:
INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }}
INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
ENABLED_COMMANDS: "test_gpu,test_slow"
ENABLED_COMMANDS: "test_gpu,test_slow,test_slow_gpu"
ALLOWED_TEAMS: "spaCy"

View File

@ -10,6 +10,7 @@ jobs:
fail-fast: false
matrix:
branch: [master, v4]
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- name: Trigger buildkite build

View File

@ -15,7 +15,7 @@ jobs:
issue-manager:
runs-on: ubuntu-latest
steps:
- uses: tiangolo/issue-manager@0.2.1
- uses: tiangolo/issue-manager@0.4.0
with:
token: ${{ secrets.GITHUB_TOKEN }}
config: >
@ -25,5 +25,11 @@ jobs:
"message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.",
"remove_label_on_comment": true,
"remove_label_on_close": true
},
"more-info-needed": {
"delay": "P7D",
"message": "This issue has been automatically closed because there has been no response to a request for more information from the original author. With only the information that is currently in the issue, there's not enough information to take action. If you're the original author, feel free to reopen the issue if you have or find the answers needed to investigate further.",
"remove_label_on_comment": true,
"remove_label_on_close": true
}
}

View File

@ -15,7 +15,7 @@ jobs:
action:
runs-on: ubuntu-latest
steps:
- uses: dessant/lock-threads@v3
- uses: dessant/lock-threads@v4
with:
process-only: 'issues'
issue-inactive-days: '30'

View File

@ -10,10 +10,11 @@ jobs:
fail-fast: false
matrix:
branch: [master, v4]
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v1
uses: actions/checkout@v3
with:
ref: ${{ matrix.branch }}
- name: Get commits from past 24 hours
@ -22,9 +23,9 @@ jobs:
today=$(date '+%Y-%m-%d %H:%M:%S')
yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
if git log --after="$yesterday" --before="$today" | grep commit ; then
echo "::set-output name=run_tests::true"
echo run_tests=true >> $GITHUB_OUTPUT
else
echo "::set-output name=run_tests::false"
echo run_tests=false >> $GITHUB_OUTPUT
fi
- name: Trigger buildkite build

View File

@ -0,0 +1,32 @@
name: spaCy universe project alert
on:
pull_request_target:
paths:
- "website/meta/universe.json"
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: Dump GitHub context
env:
GITHUB_CONTEXT: ${{ toJson(github) }}
PR_NUMBER: ${{github.event.number}}
run: |
echo "$GITHUB_CONTEXT"
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Install Bernadette app dependency and send an alert
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
GITHUB_CONTEXT: ${{ toJson(github) }}
CHANNEL: "#alerts-universe"
run: |
pip install slack-sdk==3.17.2 aiohttp==3.8.1
echo "$CHANNEL"
python .github/spacy_universe_alert.py "$GITHUB_CONTEXT"

11
.gitignore vendored
View File

@ -10,20 +10,11 @@ spacy/tests/package/setup.cfg
spacy/tests/package/pyproject.toml
spacy/tests/package/requirements.txt
# Website
website/.cache/
website/public/
website/node_modules
website/.npm
website/logs
*.log
npm-debug.log*
quickstart-training-generator.js
# Cython / C extensions
cythonize.json
spacy/*.html
*.cpp
*.c
*.so
# Vim / VSCode / editors

View File

@ -1,11 +1,12 @@
repos:
- repo: https://github.com/ambv/black
rev: 21.6b0
rev: 22.3.0
hooks:
- id: black
language_version: python3.7
- repo: https://gitlab.com/pycqa/flake8
rev: 3.9.2
additional_dependencies: ['click==8.0.4']
- repo: https://github.com/pycqa/flake8
rev: 5.0.4
hooks:
- id: flake8
args:

View File

@ -144,7 +144,7 @@ Changes to `.py` files will be effective immediately.
When fixing a bug, first create an
[issue](https://github.com/explosion/spaCy/issues) if one does not already
exist. The description text can be very short we don't want to make this too
exist. The description text can be very short we don't want to make this too
bureaucratic.
Next, add a test to the relevant file in the
@ -233,7 +233,7 @@ also want to keep an eye on unused declared variables or repeated
(i.e. overwritten) dictionary keys. If your code was formatted with `black`
(see above), you shouldn't see any formatting-related warnings.
The [`.flake8`](.flake8) config defines the configuration we use for this
The `flake8` section in [`setup.cfg`](setup.cfg) defines the configuration we use for this
codebase. For example, we're not super strict about the line length, and we're
excluding very large files like lemmatization and tokenizer exception tables.
@ -271,7 +271,8 @@ except: # noqa: E722
### Python conventions
All Python code must be written **compatible with Python 3.6+**.
All Python code must be written **compatible with Python 3.6+**. More detailed
code conventions can be found in the [developer docs](https://github.com/explosion/spaCy/blob/master/extra/DEVELOPER_DOCS/Code%20Conventions.md).
#### I/O and handling paths

View File

@ -1,4 +1,4 @@
recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml
recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml *.hh
include LICENSE
include README.md
include pyproject.toml

View File

@ -8,15 +8,15 @@ be used in real products.
spaCy comes with
[pretrained pipelines](https://spacy.io/models) and
currently supports tokenization and training for **60+ languages**. It features
currently supports tokenization and training for **70+ languages**. It features
state-of-the-art speed and **neural network models** for tagging,
parsing, **named entity recognition**, **text classification** and more,
multi-task learning with pretrained **transformers** like BERT, as well as a
production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial
open-source software, released under the MIT license.
open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
💫 **Version 3.2 out now!**
💫 **Version 3.5 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@ -46,6 +46,7 @@ open-source software, released under the MIT license.
| 🛠 **[Changelog]** | Changes and version history. |
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-pipelines)** |
| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-analysis)** |
[spacy 101]: https://spacy.io/usage/spacy-101
[new in v3.0]: https://spacy.io/usage/v3
@ -59,6 +60,7 @@ open-source software, released under the MIT license.
[changelog]: https://spacy.io/usage#changelog
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
## 💬 Where to ask questions
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
@ -79,7 +81,7 @@ more people can benefit from it.
## Features
- Support for **60+ languages**
- Support for **70+ languages**
- **Trained pipelines** for different languages and tasks
- Multi-task learning with pretrained **transformers** like BERT
- Support for pretrained **word vectors** and embeddings

View File

@ -31,8 +31,8 @@ jobs:
inputs:
versionSpec: "3.7"
- script: |
pip install flake8==3.9.2
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
pip install flake8==5.0.4
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
displayName: "flake8"
- job: "Test"
@ -41,7 +41,7 @@ jobs:
matrix:
# We're only running one platform per Python version to speed up builds
Python36Linux:
imageName: "ubuntu-latest"
imageName: "ubuntu-20.04"
python.version: "3.6"
# Python36Windows:
# imageName: "windows-latest"
@ -50,7 +50,7 @@ jobs:
# imageName: "macos-latest"
# python.version: "3.6"
# Python37Linux:
# imageName: "ubuntu-latest"
# imageName: "ubuntu-20.04"
# python.version: "3.7"
Python37Windows:
imageName: "windows-latest"
@ -76,15 +76,24 @@ jobs:
# Python39Mac:
# imageName: "macos-latest"
# python.version: "3.9"
Python310Linux:
imageName: "ubuntu-latest"
python.version: "3.10"
# Python310Linux:
# imageName: "ubuntu-latest"
# python.version: "3.10"
Python310Windows:
imageName: "windows-latest"
python.version: "3.10"
Python310Mac:
imageName: "macos-latest"
python.version: "3.10"
# Python310Mac:
# imageName: "macos-latest"
# python.version: "3.10"
Python311Linux:
imageName: 'ubuntu-latest'
python.version: '3.11'
Python311Windows:
imageName: 'windows-latest'
python.version: '3.11'
Python311Mac:
imageName: 'macos-latest'
python.version: '3.11'
maxParallel: 4
pool:
vmImage: $(imageName)
@ -92,20 +101,3 @@ jobs:
- template: .github/azure-steps.yml
parameters:
python_version: '$(python.version)'
architecture: 'x64'
# - job: "TestGPU"
# dependsOn: "Validate"
# strategy:
# matrix:
# Python38LinuxX64_GPU:
# python.version: '3.8'
# pool:
# name: "LinuxX64_GPU"
# steps:
# - template: .github/azure-steps.yml
# parameters:
# python_version: '$(python.version)'
# architecture: 'x64'
# gpu: true
# num_build_jobs: 24

View File

@ -1,6 +1,9 @@
# build version constraints for use with wheelwright + multibuild
numpy==1.15.0; python_version<='3.7'
numpy==1.17.3; python_version=='3.8'
numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
numpy==1.19.3; python_version=='3.9'
numpy==1.21.3; python_version=='3.10'
numpy; python_version>='3.11'
numpy==1.23.2; python_version=='3.11'
numpy; python_version>='3.12'

View File

@ -137,7 +137,7 @@ If any of the TODOs you've added are important and should be fixed soon, you sho
## Type hints
We use Python type hints across the `.py` files wherever possible. This makes it easy to understand what a function expects and returns, and modern editors will be able to show this information to you when you call an annotated function. Type hints are not currently used in the `.pyx` (Cython) code, except for definitions of registered functions and component factories, where they're used for config validation.
We use Python type hints across the `.py` files wherever possible. This makes it easy to understand what a function expects and returns, and modern editors will be able to show this information to you when you call an annotated function. Type hints are not currently used in the `.pyx` (Cython) code, except for definitions of registered functions and component factories, where they're used for config validation. Ideally when developing, run `mypy spacy` on the code base to inspect any issues.
If possible, you should always use the more descriptive type hints like `List[str]` or even `List[Any]` instead of only `list`. We also annotate arguments and return types of `Callable` - although you can simplify this if the type otherwise gets too verbose (e.g. functions that return factories to create callbacks). Remember that `Callable` takes two values: a **list** of the argument type(s) in order, and the return values.
@ -155,6 +155,13 @@ def create_callback(some_arg: bool) -> Callable[[str, int], List[str]]:
return callback
```
For typing variables, we prefer the explicit format.
```diff
- var = value # type: Type
+ var: Type = value
```
For model architectures, Thinc also provides a collection of [custom types](https://thinc.ai/docs/api-types), including more specific types for arrays and model inputs/outputs. Even outside of static type checking, using these types will make the code a lot easier to read and follow, since it's always clear what array types are expected (and what might go wrong if the output is different from the expected type).
```python
@ -184,6 +191,8 @@ def load_model(name: str) -> "Language":
...
```
Note that we typically put the `from typing` import statements on the first line(s) of the Python module.
## Structuring logic
### Positional and keyword arguments
@ -268,6 +277,27 @@ If you have to use `try`/`except`, make sure to only include what's **absolutely
+ return [v.strip() for v in value.split(",")]
```
### Numeric comparisons
For numeric comparisons, as a general rule we always use `<` and `>=` and avoid the usage of `<=` and `>`. This is to ensure we consistently
apply inclusive lower bounds and exclusive upper bounds, helping to prevent off-by-one errors.
One exception to this rule is the ternary case. With a chain like
```python
if value >= 0 and value < max:
...
```
it's fine to rewrite this to the shorter form
```python
if 0 <= value < max:
...
```
even though this requires the usage of the `<=` operator.
### Iteration and comprehensions
We generally avoid using built-in functions like `filter` or `map` in favor of list or generator comprehensions.
@ -448,6 +478,10 @@ Regression tests are tests that refer to bugs reported in specific issues. They
The test suite also provides [fixtures](https://github.com/explosion/spaCy/blob/master/spacy/tests/conftest.py) for different language tokenizers that can be used as function arguments of the same name and will be passed in automatically. Those should only be used for tests related to those specific languages. We also have [test utility functions](https://github.com/explosion/spaCy/blob/master/spacy/tests/util.py) for common operations, like creating a temporary file.
### Testing Cython Code
If you're developing Cython code (`.pyx` files), those extensions will need to be built before the test runner can test that code - otherwise it's going to run the tests with stale code from the last time the extension was built. You can build the extensions locally with `python setup.py build_ext -i`.
### Constructing objects and state
Test functions usually follow the same simple structure: they set up some state, perform the operation you want to test and `assert` conditions that you expect to be true, usually before and after the operation.
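A minimal sketch of that shape, using the `en_tokenizer` fixture mentioned above (the expected tokens assume the default English tokenizer):

```python
# Sketch only: en_tokenizer is one of the conftest.py fixtures referenced above.
def test_en_tokenizer_splits_trailing_punct(en_tokenizer):
    doc = en_tokenizer("This is a sentence.")  # set up state / perform the operation
    assert [t.text for t in doc] == ["This", "is", "a", "sentence", "."]
```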

View File

@ -0,0 +1,56 @@
# Explosion-bot
Explosion-bot is a robot that can be invoked to help with running particular test commands.
## Permissions
Only maintainers have permissions to summon explosion-bot. Each of the open source repos that use explosion-bot has its own team(s) of maintainers, and only github users who are members of those teams can successfully run bot commands.
## Running robot commands
To summon the robot, write a github comment on the issue/PR you wish to test. The comment must be in the following format:
```
@explosion-bot please test_gpu
```
Some things to note:
- The `@explosion-bot please` must be the beginning of the command - you cannot add anything in front of this or else the robot won't know how to parse it. Adding anything at the end aside from the test name will also confuse the robot, so keep it simple!
- The command name (such as `test_gpu`) must be one of the tests that the bot knows how to run. The available commands are documented in the bot's [workflow config](https://github.com/explosion/spaCy/blob/master/.github/workflows/explosionbot.yml#L26) and must match exactly one of the commands listed there.
- The robot can't do multiple things at once, so if you want it to run multiple tests, you'll have to summon it with one comment per test.
### Examples
- Execute spaCy slow GPU tests with a custom thinc branch from a spaCy PR:
```
@explosion-bot please test_slow_gpu --thinc-branch <branch_name>
```
`branch_name` can either be a named branch, e.g: `develop`, or an unmerged PR, e.g: `refs/pull/<pr_number>/head`.
- Execute spaCy Transformers GPU tests from a spaCy PR:
```
@explosion-bot please test_gpu --run-on spacy-transformers --run-on-branch master --spacy-branch current_pr
```
This will launch the GPU pipeline for the `spacy-transformers` repo on its `master` branch, using the current spaCy PR's branch to build spaCy. The name of the repository passed to `--run-on` is case-sensitive, e.g: use `spaCy` instead of `spacy`.
- General info about supported commands.
```
@explosion-bot please info
```
- Help text for a specific command
```
@explosion-bot please <command> --help
```
## Troubleshooting
If the robot isn't responding to commands as expected, you can check its logs in the [Github Action](https://github.com/explosion/spaCy/actions/workflows/explosionbot.yml).
For each command sent to the bot, there should be a run of the `explosion-bot` workflow. In the `Install and run explosion-bot` step, towards the end of the logs you should see info about the configuration that the bot was run with, as well as any errors that the bot encountered.

View File

@ -0,0 +1,82 @@
# spaCy Satellite Packages
This is a list of all the active repos relevant to spaCy besides the main one, with short descriptions, history, and current status. Archived repos will not be covered.
## Always Included in spaCy
These packages are always pulled in when you install spaCy. Most of them are direct dependencies, but some are transitive dependencies through other packages.
- [spacy-legacy](https://github.com/explosion/spacy-legacy): When an architecture in spaCy changes enough to get a new version, the old version is frozen and moved to spacy-legacy. This allows us to keep the core library slim while also preserving backwards compatibility.
- [thinc](https://github.com/explosion/thinc): Thinc is the machine learning library that powers trainable components in spaCy. It wraps backends like NumPy, PyTorch, and TensorFlow to provide a functional interface for specifying architectures.
- [catalogue](https://github.com/explosion/catalogue): Small library for adding function registries, like those used for model architectures in spaCy.
- [confection](https://github.com/explosion/confection): This library contains the functionality for config parsing that was formerly contained directly in Thinc.
- [spacy-loggers](https://github.com/explosion/spacy-loggers): Contains loggers beyond the default logger available in spaCy's core code base. This includes loggers integrated with third-party services, which may differ in release cadence from spaCy itself.
- [wasabi](https://github.com/explosion/wasabi): A command line formatting library, used for terminal output in spaCy.
- [srsly](https://github.com/explosion/srsly): A wrapper that vendors several serialization libraries for spaCy. Includes parsers for JSON, JSONL, MessagePack, (extended) Pickle, and YAML.
- [preshed](https://github.com/explosion/preshed): A Cython library for low-level data structures like hash maps, used for memory efficient data storage.
- [cython-blis](https://github.com/explosion/cython-blis): Fast matrix multiplication using BLIS without depending on system libraries. Required by Thinc, rather than spaCy directly.
- [murmurhash](https://github.com/explosion/murmurhash): A wrapper library for a C++ murmurhash implementation, used for string IDs in spaCy and preshed.
- [cymem](https://github.com/explosion/cymem): A small library for RAII-style memory management in Cython.
## Optional Extensions for spaCy
These are repos that can be used by spaCy but aren't part of a default installation. Many of these are wrappers to integrate various kinds of third-party libraries.
- [spacy-transformers](https://github.com/explosion/spacy-transformers): A wrapper for the [HuggingFace Transformers](https://huggingface.co/docs/transformers/index) library that handles the extensive conversion necessary to coordinate spaCy's powerful `Doc` representation, training pipeline, and the Transformer embeddings. When released, this was known as `spacy-pytorch-transformers`, but it changed to the current name when HuggingFace updated the name of their library as well.
- [spacy-huggingface-hub](https://github.com/explosion/spacy-huggingface-hub): This package has a CLI script for uploading a packaged spaCy pipeline (created with `spacy package`) to the [Hugging Face Hub](https://huggingface.co/models).
- [spacy-alignments](https://github.com/explosion/spacy-alignments): A wrapper for the tokenizations library (mentioned below) with a modified build system to simplify cross-platform wheel creation. Used in spacy-transformers for aligning spaCy and HuggingFace tokenizations.
- [spacy-experimental](https://github.com/explosion/spacy-experimental): Experimental components that are not quite ready for inclusion in the main spaCy library. Usually there are unresolved questions around their APIs, so the experimental library allows us to expose them to the community for feedback before fully integrating them.
- [spacy-lookups-data](https://github.com/explosion/spacy-lookups-data): A repository of linguistic data, such as lemmas, that takes up a lot of disk space. Originally created to reduce the size of the spaCy core library. This is mainly useful if you want the data included but aren't using a pretrained pipeline; for the affected languages, the relevant data is included in pretrained pipelines directly.
- [coreferee](https://github.com/explosion/coreferee): Coreference resolution for English, French, German and Polish, optimised for limited training data and easily extensible for further languages. Used as a spaCy pipeline component.
- [spacy-stanza](https://github.com/explosion/spacy-stanza): This is a wrapper that allows the use of Stanford's Stanza library in spaCy.
- [spacy-streamlit](https://github.com/explosion/spacy-streamlit): A wrapper for the Streamlit dashboard building library to help with integrating [displaCy](https://spacy.io/api/top-level/#displacy).
- [spacymoji](https://github.com/explosion/spacymoji): A library to add extra support for emoji to spaCy, such as including character names.
- [thinc-apple-ops](https://github.com/explosion/thinc-apple-ops): A special backend for OSX that uses Apple's native libraries for improved performance.
- [os-signpost](https://github.com/explosion/os-signpost): A Python package that allows you to use the `OSSignposter` API in OSX for performance analysis.
- [spacy-ray](https://github.com/explosion/spacy-ray): A wrapper to integrate spaCy with Ray, a distributed training framework. Currently a work in progress.
## Prodigy
[Prodigy](https://prodi.gy) is Explosion's easy to use and highly customizable tool for annotating data. Prodigy itself requires a license, but the repos below contain documentation, examples, and editor or notebook integrations.
- [prodigy-recipes](https://github.com/explosion/prodigy-recipes): Sample recipes for Prodigy, along with notebooks and other examples of usage.
- [vscode-prodigy](https://github.com/explosion/vscode-prodigy): A VS Code extension that lets you run Prodigy inside VS Code.
- [jupyterlab-prodigy](https://github.com/explosion/jupyterlab-prodigy): An extension for JupyterLab that lets you run Prodigy inside JupyterLab.
## Independent Tools or Projects
These are tools that may be related to or use spaCy, but are functionally independent projects in their own right as well.
- [floret](https://github.com/explosion/floret): A modification of fastText to use Bloom Embeddings. Can be used to add vectors with subword features to spaCy, and also works independently in the same manner as fastText.
- [sense2vec](https://github.com/explosion/sense2vec): A library to make embeddings of noun phrases or words coupled with their part of speech. This library uses spaCy.
- [spacy-vectors-builder](https://github.com/explosion/spacy-vectors-builder): This is a spaCy project that builds vectors using floret and a lot of input text. It handles downloading the input data as well as the actual building of vectors.
- [holmes-extractor](https://github.com/explosion/holmes-extractor): Information extraction from English and German texts based on predicate logic. Uses spaCy.
- [healthsea](https://github.com/explosion/healthsea): Healthsea is a project to extract information from comments about health supplements. Structurally, it's a self-contained, large spaCy project.
- [spacy-pkuseg](https://github.com/explosion/spacy-pkuseg): A fork of the pkuseg Chinese tokenizer. Used for Chinese support in spaCy, but also works independently.
- [ml-datasets](https://github.com/explosion/ml-datasets): This repo includes loaders for several standard machine learning datasets, like MNIST or WikiNER, and has historically been used in spaCy example code and documentation.
## Documentation and Informational Repos
These repos are used to support the spaCy docs or otherwise present information about spaCy or other Explosion projects.
- [projects](https://github.com/explosion/projects): The projects repo is used to show detailed examples of spaCy usage. Individual projects can be checked out using the spaCy command line tool, rather than checking out the projects repo directly.
- [spacy-course](https://github.com/explosion/spacy-course): Home to the interactive spaCy course for learning about how to use the library and some basic NLP principles.
- [spacy-io-binder](https://github.com/explosion/spacy-io-binder): Home to the notebooks used for interactive examples in the documentation.
## Organizational / Meta
These repos are used for organizing data around spaCy, but are not something an end user would need to install as part of using the library.
- [spacy-models](https://github.com/explosion/spacy-models): This repo contains metadata (but not training data) for all the spaCy models. This includes information about where their training data came from, version compatibility, and performance information. It also includes tests for the model packages, and the built models are hosted as releases of this repo.
- [wheelwright](https://github.com/explosion/wheelwright): A tool for automating our PyPI builds and releases.
- [ec2buildwheel](https://github.com/explosion/ec2buildwheel): A small project that allows you to build Python packages in the manner of cibuildwheel, but on any EC2 image. Used by wheelwright.
## Other
Repos that don't fit in any of the above categories.
- [blis](https://github.com/explosion/blis): A fork of the official BLIS library. The main branch is not updated, but work continues in various branches. This is used for cython-blis.
- [tokenizations](https://github.com/explosion/tokenizations): A library originally by Yohei Tamura to align strings with tolerance to some variations in features like case and diacritics, used for aligning tokens and wordpieces. Adopted and maintained by Explosion, but usually spacy-alignments is used instead.
- [conll-2012](https://github.com/explosion/conll-2012): A repo to hold some slightly cleaned up versions of the official scripts for the CoNLL 2012 shared task involving coreference resolution. Used in the coref project.
- [fastapi-explosion-extras](https://github.com/explosion/fastapi-explosion-extras): Some small tweaks to FastAPI used at Explosion.

View File

@ -127,3 +127,34 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
polyleven
---------
* Files: spacy/matcher/polyleven.c
MIT License
Copyright (c) 2021 Fujimoto Seiji <fujimoto@ceptord.net>
Copyright (c) 2021 Max Bachmann <kontakt@maxbachmann.de>
Copyright (c) 2022 Nick Mazuk
Copyright (c) 2022 Michael Weiss <code@mweiss.ch>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -5,9 +5,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.12,<8.1.0",
"blis>=0.4.0,<0.8.0",
"pathy",
"thinc>=8.1.0,<8.2.0",
"numpy>=1.15.0",
]
build-backend = "setuptools.build_meta"

View File

@ -1,38 +1,40 @@
# Our libraries
spacy-legacy>=3.0.8,<3.1.0
spacy-legacy>=3.0.11,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.12,<8.1.0
blis>=0.4.0,<0.8.0
thinc>=8.1.0,<8.2.0
ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.0,<1.1.0
srsly>=2.4.1,<3.0.0
wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.5.0
pathy>=0.3.5
typer>=0.3.0,<0.8.0
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
# Third party dependencies
numpy>=1.15.0
requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
jinja2
langcodes>=3.2.0,<4.0.0
# Official Python utilities
setuptools
packaging>=20.0
typing_extensions>=3.7.4.1,<4.0.0.0; python_version < "3.8"
typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
# Development dependencies
pre-commit>=2.13.0
cython>=0.25,<3.0
pytest>=5.2.0
pytest>=5.2.0,!=7.1.0
pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.8.0,<3.10.0
flake8>=3.8.0,<6.0.0
hypothesis>=3.27.0,<7.0.0
mypy==0.910
mypy>=0.990,<0.1000; platform_machine != "aarch64" and python_version >= "3.7"
types-dataclasses>=0.1.3; python_version < "3.7"
types-mock>=0.1.1
types-setuptools>=57.0.0
types-requests
types-setuptools>=57.0.0
black>=22.0,<23.0

View File

@ -22,6 +22,7 @@ classifiers =
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10
Programming Language :: Python :: 3.11
Topic :: Scientific/Engineering
project_urls =
Release notes = https://github.com/explosion/spaCy/releases
@ -38,31 +39,31 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=8.0.12,<8.1.0
thinc>=8.1.0,<8.2.0
install_requires =
# Our libraries
spacy-legacy>=3.0.8,<3.1.0
spacy-legacy>=3.0.11,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.12,<8.1.0
blis>=0.4.0,<0.8.0
wasabi>=0.9.0,<1.1.0
srsly>=2.4.1,<3.0.0
thinc>=8.1.0,<8.2.0
wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.5.0
pathy>=0.3.5
# Third-party dependencies
typer>=0.3.0,<0.8.0
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0
requests>=2.13.0,<3.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
jinja2
# Official Python utilities
setuptools
packaging>=20.0
typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
langcodes>=3.2.0,<4.0.0
[options.entry_points]
@ -73,45 +74,53 @@ console_scripts =
lookups =
spacy_lookups_data>=1.0.3,<1.1.0
transformers =
spacy_transformers>=1.1.2,<1.2.0
spacy_transformers>=1.1.2,<1.3.0
ray =
spacy_ray>=0.1.0,<1.0.0
cuda =
cupy>=5.0.0b4,<11.0.0
cupy>=5.0.0b4,<12.0.0
cuda80 =
cupy-cuda80>=5.0.0b4,<11.0.0
cupy-cuda80>=5.0.0b4,<12.0.0
cuda90 =
cupy-cuda90>=5.0.0b4,<11.0.0
cupy-cuda90>=5.0.0b4,<12.0.0
cuda91 =
cupy-cuda91>=5.0.0b4,<11.0.0
cupy-cuda91>=5.0.0b4,<12.0.0
cuda92 =
cupy-cuda92>=5.0.0b4,<11.0.0
cupy-cuda92>=5.0.0b4,<12.0.0
cuda100 =
cupy-cuda100>=5.0.0b4,<11.0.0
cupy-cuda100>=5.0.0b4,<12.0.0
cuda101 =
cupy-cuda101>=5.0.0b4,<11.0.0
cupy-cuda101>=5.0.0b4,<12.0.0
cuda102 =
cupy-cuda102>=5.0.0b4,<11.0.0
cupy-cuda102>=5.0.0b4,<12.0.0
cuda110 =
cupy-cuda110>=5.0.0b4,<11.0.0
cupy-cuda110>=5.0.0b4,<12.0.0
cuda111 =
cupy-cuda111>=5.0.0b4,<11.0.0
cupy-cuda111>=5.0.0b4,<12.0.0
cuda112 =
cupy-cuda112>=5.0.0b4,<11.0.0
cupy-cuda112>=5.0.0b4,<12.0.0
cuda113 =
cupy-cuda113>=5.0.0b4,<11.0.0
cupy-cuda113>=5.0.0b4,<12.0.0
cuda114 =
cupy-cuda114>=5.0.0b4,<11.0.0
cupy-cuda114>=5.0.0b4,<12.0.0
cuda115 =
cupy-cuda115>=5.0.0b4,<11.0.0
cupy-cuda115>=5.0.0b4,<12.0.0
cuda116 =
cupy-cuda116>=5.0.0b4,<12.0.0
cuda117 =
cupy-cuda117>=5.0.0b4,<12.0.0
cuda11x =
cupy-cuda11x>=11.0.0,<12.0.0
cuda-autodetect =
cupy-wheel>=11.0.0,<12.0.0
apple =
thinc-apple-ops>=0.0.4,<1.0.0
thinc-apple-ops>=0.1.0.dev0,<1.0.0
# Language tokenizers with external dependencies
ja =
sudachipy>=0.5.2,!=0.6.1
sudachidict_core>=20211220
ko =
natto-py==0.9.0
natto-py>=0.9.0
th =
pythainlp>=2.0

View File

@ -23,16 +23,20 @@ Options.docstrings = True
PACKAGES = find_packages()
MOD_NAMES = [
"spacy.training.alignment_array",
"spacy.training.example",
"spacy.parts_of_speech",
"spacy.strings",
"spacy.lexeme",
"spacy.vocab",
"spacy.attrs",
"spacy.kb",
"spacy.kb.candidate",
"spacy.kb.kb",
"spacy.kb.kb_in_memory",
"spacy.ml.parser_model",
"spacy.morphology",
"spacy.pipeline.dep_parser",
"spacy.pipeline._edit_tree_internals.edit_trees",
"spacy.pipeline.morphologizer",
"spacy.pipeline.multitask",
"spacy.pipeline.ner",
@ -124,6 +128,8 @@ class build_ext_options:
class build_ext_subclass(build_ext, build_ext_options):
def build_extensions(self):
if self.parallel is None and os.environ.get("SPACY_NUM_BUILD_JOBS") is not None:
self.parallel = int(os.environ.get("SPACY_NUM_BUILD_JOBS"))
build_ext_options.build_options(self)
build_ext.build_extensions(self)
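With this override, installing from the sdist can pick up a `SPACY_NUM_BUILD_JOBS` environment variable (as the CI step above does) whenever no explicit `--parallel`/`-j` value is passed to `build_ext`. A rough standalone sketch of that fallback, using a hypothetical helper name:

```python
import os
from typing import Optional


def resolve_build_jobs(cli_parallel: Optional[int] = None) -> Optional[int]:
    # Hypothetical helper mirroring the override above: an explicit
    # -j/--parallel value wins, then SPACY_NUM_BUILD_JOBS, then the
    # default (a serial build).
    if cli_parallel is not None:
        return cli_parallel
    env_value = os.environ.get("SPACY_NUM_BUILD_JOBS")
    return int(env_value) if env_value is not None else None
```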
@ -201,10 +207,25 @@ def setup_package():
get_python_inc(plat_specific=True),
]
ext_modules = []
ext_modules.append(
Extension(
"spacy.matcher.levenshtein",
[
"spacy/matcher/levenshtein.pyx",
"spacy/matcher/polyleven.c",
],
language="c",
include_dirs=include_dirs,
)
)
for name in MOD_NAMES:
mod_path = name.replace(".", "/") + ".pyx"
ext = Extension(
name, [mod_path], language="c++", include_dirs=include_dirs, extra_compile_args=["-std=c++11"]
name,
[mod_path],
language="c++",
include_dirs=include_dirs,
extra_compile_args=["-std=c++11"],
)
ext_modules.append(ext)
print("Cythonizing sources")

View File

@ -31,25 +31,33 @@ def load(
name: Union[str, Path],
*,
vocab: Union[Vocab, bool] = True,
disable: Iterable[str] = util.SimpleFrozenList(),
exclude: Iterable[str] = util.SimpleFrozenList(),
disable: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
enable: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
exclude: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
) -> Language:
"""Load a spaCy model from an installed package or a local path.
name (str): Package name or model path.
vocab (Vocab): A Vocab object. If True, a vocab is created.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
pipes will be disabled (but can be enabled later using nlp.enable_pipe).
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
"""
return util.load_model(
name, vocab=vocab, disable=disable, exclude=exclude, config=config
name,
vocab=vocab,
disable=disable,
enable=enable,
exclude=exclude,
config=config,
)
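A short usage sketch of the `disable`/`enable`/`exclude` semantics described in the docstring (assumes a pipeline such as `en_core_web_sm` is installed; `enable` is the newly added option here):

```python
import spacy

# disable: components are loaded but switched off, and can be re-enabled later.
nlp = spacy.load("en_core_web_sm", disable=["ner"])
nlp.enable_pipe("ner")

# enable: every component *not* listed is disabled instead.
nlp_lean = spacy.load("en_core_web_sm", enable=["tok2vec", "tagger"])

# exclude: components are not loaded at all and cannot be re-enabled.
nlp_no_ner = spacy.load("en_core_web_sm", exclude=["ner"])
```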

View File

@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.2.2"
__version__ = "3.5.0"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"

View File

@ -4,6 +4,7 @@ from ._util import app, setup_cli # noqa: F401
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
# are registered automatically and won't have to be imported here.
from .benchmark_speed import benchmark_speed_cli # noqa: F401
from .download import download # noqa: F401
from .info import info # noqa: F401
from .package import package # noqa: F401
@ -14,7 +15,9 @@ from .pretrain import pretrain # noqa: F401
from .debug_data import debug_data # noqa: F401
from .debug_config import debug_config # noqa: F401
from .debug_model import debug_model # noqa: F401
from .debug_diff import debug_diff # noqa: F401
from .evaluate import evaluate # noqa: F401
from .apply import apply # noqa: F401
from .convert import convert # noqa: F401
from .init_pipeline import init_pipeline_cli # noqa: F401
from .init_config import init_config, fill_config # noqa: F401
@ -26,6 +29,7 @@ from .project.dvc import project_update_dvc # noqa: F401
from .project.push import project_push # noqa: F401
from .project.pull import project_pull # noqa: F401
from .project.document import project_document # noqa: F401
from .find_threshold import find_threshold # noqa: F401
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)

View File

@ -12,7 +12,7 @@ from click.parser import split_arg_string
from typer.main import get_command
from contextlib import contextmanager
from thinc.api import Config, ConfigValidationError, require_gpu
from thinc.util import has_cupy, gpu_is_available
from thinc.util import gpu_is_available
from configparser import InterpolationError
import os
@ -23,7 +23,7 @@ from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
from .. import about
if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
from pathy import FluidPath # noqa: F401
SDIST_SUFFIX = ".tar.gz"
@ -46,6 +46,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
commands to check and validate your config files, training and evaluation data,
and custom model implementations.
"""
BENCHMARK_HELP = """Commands for benchmarking pipelines."""
INIT_HELP = """Commands for initializing configs and pipeline packages."""
# Wrappers for Typer's annotations. Initially created to set defaults and to
@ -54,12 +55,14 @@ Arg = typer.Argument
Opt = typer.Option
app = typer.Typer(name=NAME, help=HELP)
benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
app.add_typer(project_cli)
app.add_typer(debug_cli)
app.add_typer(benchmark_cli)
app.add_typer(init_cli)
@ -158,15 +161,15 @@ def load_project_config(
sys.exit(1)
validate_project_version(config)
validate_project_commands(config)
if interpolate:
err = f"{PROJECT_FILE} validation error"
with show_validation_error(title=err, hint_fill=False):
config = substitute_project_variables(config, overrides)
# Make sure directories defined in config exist
for subdir in config.get("directories", []):
dir_path = path / subdir
if not dir_path.exists():
dir_path.mkdir(parents=True)
if interpolate:
err = f"{PROJECT_FILE} validation error"
with show_validation_error(title=err, hint_fill=False):
config = substitute_project_variables(config, overrides)
return config
@ -331,7 +334,7 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None:
"""Upload a file.
src (Path): The source path.
@ -339,13 +342,20 @@ def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
"""
import smart_open
# Create parent directories for local paths
if isinstance(dest, Path):
if not dest.parent.exists():
dest.parent.mkdir(parents=True)
dest = str(dest)
with smart_open.open(dest, mode="wb") as output_file:
with src.open(mode="rb") as input_file:
output_file.write(input_file.read())
def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None:
def download_file(
src: Union[str, "FluidPath"], dest: Path, *, force: bool = False
) -> None:
"""Download a file using smart_open.
src (str / FluidPath): The URL or local path of the file.
@ -358,9 +368,9 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
if dest.exists() and not force:
return None
src = str(src)
with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
with smart_open.open(src, mode="rb", compression="disable") as input_file:
with dest.open(mode="wb") as output_file:
output_file.write(input_file.read())
shutil.copyfileobj(input_file, output_file)
def ensure_pathy(path):
@ -368,7 +378,7 @@ def ensure_pathy(path):
slow and annoying Google Cloud warning)."""
from pathy import Pathy # noqa: F811
return Pathy(path)
return Pathy.fluid(path)
def git_checkout(
@ -462,6 +472,23 @@ def git_sparse_checkout(repo, subpath, dest, branch):
shutil.move(str(source_path), str(dest))
def git_repo_branch_exists(repo: str, branch: str) -> bool:
"""Uses 'git ls-remote' to check if a repository and branch exists
repo (str): URL to get repo.
branch (str): Branch on repo to check.
RETURNS (bool): True if repo:branch exists.
"""
get_git_version()
cmd = f"git ls-remote {repo} {branch}"
# We might be tempted to use `--exit-code` with `git ls-remote`, but
# `run_command` handles the `returncode` for us, so we'll rely on
# the fact that stdout returns '' if the requested branch doesn't exist
ret = run_command(cmd, capture=True)
exists = ret.stdout != ""
return exists
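A rough usage sketch of the helper above; the repository URL and branch name are illustrative:

if git_repo_branch_exists("https://github.com/explosion/projects", "v3"):
    print("Repository and branch exist")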
def get_git_version(
error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
) -> Tuple[int, int]:
@ -554,5 +581,41 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
require_gpu(use_gpu)
else:
local_msg.info("Using CPU")
if has_cupy and gpu_is_available():
if gpu_is_available():
local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
"""Given a directory and a suffix, recursively find all files matching the suffix.
Directories or files with names beginning with a . are ignored, but hidden flags on
filesystems are not checked.
If the suffix is `None`, no suffix-based filtering is applied."""
if not path.is_dir():
return [path]
paths = [path]
locs = []
seen = set()
for path in paths:
if str(path) in seen:
continue
seen.add(str(path))
if path.parts[-1].startswith("."):
continue
elif path.is_dir():
paths.extend(path.iterdir())
elif suffix is not None and not path.parts[-1].endswith(suffix):
continue
else:
locs.append(path)
# It's good to sort these, in case the ordering messes up caching.
locs.sort()
return locs
def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
"""Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s,
as happens with `round(number, ndigits)`"""
if isinstance(number, float):
return f"{number:.{ndigits}f}"
else:
return str(number)
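A small sketch of the two new helpers; the directory path is hypothetical:

from pathlib import Path

jsonl_files = walk_directory(Path("corpus/raw"), suffix=".jsonl")  # hidden entries are skipped
print(f"Found {len(jsonl_files)} .jsonl files")
print(_format_number(0.5))  # "0.50" - trailing zeros are kept, unlike round(0.5, 2)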

spacy/cli/apply.py Normal file
View File

@ -0,0 +1,143 @@
import tqdm
import srsly
from itertools import chain
from pathlib import Path
from typing import Optional, List, Iterable, cast, Union
from wasabi import msg
from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
from ..tokens import Doc, DocBin
from ..vocab import Vocab
from ..util import ensure_path, load_model
path_help = """Location of the documents to predict on.
Can be a single file in .spacy format or a .jsonl file.
Files with other extensions are treated as single plain text documents.
If a directory is provided it is traversed recursively to grab
all files to be processed.
The files can be a mixture of .spacy, .jsonl and text files.
If a .jsonl file is provided, the text is read from the
specified field ("text" by default)."""
out_help = "Path to save the resulting .spacy file"
code_help = (
"Path to Python file with additional " "code (registered functions) to be imported"
)
gold_help = "Use gold preprocessing provided in the .spacy files"
force_msg = (
"The provided output file already exists. "
"To force overwriting the output file, set the --force or -F flag."
)
DocOrStrStream = Union[Iterable[str], Iterable[Doc]]
def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
"""
Stream Doc objects from DocBin.
"""
docbin = DocBin().from_disk(path)
for doc in docbin.get_docs(vocab):
yield doc
def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
"""
Stream "text" field from JSONL. If the field "text" is
not found it raises error.
"""
for entry in srsly.read_jsonl(path):
if field not in entry:
msg.fail(f"{path} does not contain the required '{field}' field.", exits=1)
else:
yield entry[field]
def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
"""
Yields strings from text files in paths.
"""
for path in paths:
with open(path, "r") as fin:
text = fin.read()
yield text
@app.command("apply")
def apply_cli(
# fmt: off
model: str = Arg(..., help="Model name or path"),
data_path: Path = Arg(..., help=path_help, exists=True),
output_file: Path = Arg(..., help=out_help, dir_okay=False),
code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"),
force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
n_process: int = Opt(1, "--n-process", "-n", help="Number of processes to use.")
):
"""
Apply a trained pipeline to documents to get predictions.
Expects a loadable spaCy pipeline and a path to the data, which
can be a directory or a file.
The data files can be provided in multiple formats:
1. .spacy files
2. .jsonl files with a specified "field" to read the text from.
3. Files with any other extension are assumed to contain
a single plain-text document.
DOCS: https://spacy.io/api/cli#apply
"""
data_path = ensure_path(data_path)
output_file = ensure_path(output_file)
code_path = ensure_path(code_path)
if output_file.exists() and not force_overwrite:
msg.fail(force_msg, exits=1)
if not data_path.exists():
msg.fail(f"Couldn't find data path: {data_path}", exits=1)
import_code(code_path)
setup_gpu(use_gpu)
apply(data_path, output_file, model, text_key, batch_size, n_process)
def apply(
data_path: Path,
output_file: Path,
model: str,
json_field: str,
batch_size: int,
n_process: int,
):
docbin = DocBin(store_user_data=True)
paths = walk_directory(data_path)
if len(paths) == 0:
docbin.to_disk(output_file)
msg.warn(
"Did not find data to process,"
f" {data_path} seems to be an empty directory."
)
return
nlp = load_model(model)
msg.good(f"Loaded model {model}")
vocab = nlp.vocab
streams: List[DocOrStrStream] = []
text_files = []
for path in paths:
if path.suffix == ".spacy":
streams.append(_stream_docbin(path, vocab))
elif path.suffix == ".jsonl":
streams.append(_stream_jsonl(path, json_field))
else:
text_files.append(path)
if len(text_files) > 0:
streams.append(_stream_texts(text_files))
datagen = cast(DocOrStrStream, chain(*streams))
for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
docbin.add(doc)
if output_file.suffix == "":
output_file = output_file.with_suffix(".spacy")
docbin.to_disk(output_file)
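A rough sketch of calling the helper directly rather than via the CLI; the paths and pipeline name are hypothetical:

from pathlib import Path

apply(
    data_path=Path("texts/"),           # directory mixing .spacy, .jsonl and plain-text files
    output_file=Path("predictions.spacy"),
    model="en_core_web_sm",             # any loadable pipeline package or path
    json_field="text",
    batch_size=16,
    n_process=1,
)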

View File

@ -0,0 +1,174 @@
from typing import Iterable, List, Optional
import random
from itertools import islice
import numpy
from pathlib import Path
import time
from tqdm import tqdm
import typer
from wasabi import msg
from .. import util
from ..language import Language
from ..tokens import Doc
from ..training import Corpus
from ._util import Arg, Opt, benchmark_cli, setup_gpu
@benchmark_cli.command(
"speed",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def benchmark_speed_cli(
# fmt: off
ctx: typer.Context,
model: str = Arg(..., help="Model name or path"),
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
batch_size: Optional[int] = Opt(None, "--batch-size", "-b", min=1, help="Override the pipeline batch size"),
no_shuffle: bool = Opt(False, "--no-shuffle", help="Do not shuffle benchmark data"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
# fmt: on
):
"""
Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
data in the binary .spacy format.
"""
setup_gpu(use_gpu=use_gpu, silent=False)
nlp = util.load_model(model)
batch_size = batch_size if batch_size is not None else nlp.batch_size
corpus = Corpus(data_path)
docs = [eg.predicted for eg in corpus(nlp)]
if len(docs) == 0:
msg.fail("Cannot benchmark speed using an empty corpus.", exits=1)
print(f"Warming up for {warmup_epochs} epochs...")
warmup(nlp, docs, warmup_epochs, batch_size)
print()
print(f"Benchmarking {n_batches} batches...")
wps = benchmark(nlp, docs, n_batches, batch_size, not no_shuffle)
print()
print_outliers(wps)
print_mean_with_ci(wps)
# Lowercased, behaves as a context manager function.
class time_context:
"""Register the running time of a context."""
def __enter__(self):
self.start = time.perf_counter()
return self
def __exit__(self, type, value, traceback):
self.elapsed = time.perf_counter() - self.start
class Quartiles:
"""Calculate the q1, q2, q3 quartiles and the inter-quartile range (iqr)
of a sample."""
q1: float
q2: float
q3: float
iqr: float
def __init__(self, sample: numpy.ndarray) -> None:
self.q1 = numpy.quantile(sample, 0.25)
self.q2 = numpy.quantile(sample, 0.5)
self.q3 = numpy.quantile(sample, 0.75)
self.iqr = self.q3 - self.q1
def annotate(
nlp: Language, docs: List[Doc], batch_size: Optional[int]
) -> numpy.ndarray:
docs = nlp.pipe(tqdm(docs, unit="doc"), batch_size=batch_size)
wps = []
while True:
with time_context() as elapsed:
batch_docs = list(
islice(docs, batch_size if batch_size else nlp.batch_size)
)
if len(batch_docs) == 0:
break
n_tokens = count_tokens(batch_docs)
wps.append(n_tokens / elapsed.elapsed)
return numpy.array(wps)
def benchmark(
nlp: Language,
docs: List[Doc],
n_batches: int,
batch_size: int,
shuffle: bool,
) -> numpy.ndarray:
if shuffle:
bench_docs = [
nlp.make_doc(random.choice(docs).text)
for _ in range(n_batches * batch_size)
]
else:
bench_docs = [
nlp.make_doc(docs[i % len(docs)].text)
for i in range(n_batches * batch_size)
]
return annotate(nlp, bench_docs, batch_size)
def bootstrap(x, statistic=numpy.mean, iterations=10000) -> numpy.ndarray:
"""Apply a statistic to repeated random samples of an array."""
return numpy.fromiter(
(
statistic(numpy.random.choice(x, len(x), replace=True))
for _ in range(iterations)
),
numpy.float64,
)
def count_tokens(docs: Iterable[Doc]) -> int:
return sum(len(doc) for doc in docs)
def print_mean_with_ci(sample: numpy.ndarray):
mean = numpy.mean(sample)
bootstrap_means = bootstrap(sample)
bootstrap_means.sort()
# 95% confidence interval
low = bootstrap_means[int(len(bootstrap_means) * 0.025)]
high = bootstrap_means[int(len(bootstrap_means) * 0.975)]
print(f"Mean: {mean:.1f} words/s (95% CI: {low-mean:.1f} +{high-mean:.1f})")
def print_outliers(sample: numpy.ndarray):
quartiles = Quartiles(sample)
n_outliers = numpy.sum(
(sample < (quartiles.q1 - 1.5 * quartiles.iqr))
| (sample > (quartiles.q3 + 1.5 * quartiles.iqr))
)
n_extreme_outliers = numpy.sum(
(sample < (quartiles.q1 - 3.0 * quartiles.iqr))
| (sample > (quartiles.q3 + 3.0 * quartiles.iqr))
)
print(
f"Outliers: {(100 * n_outliers) / len(sample):.1f}%, extreme outliers: {(100 * n_extreme_outliers) / len(sample)}%"
)
def warmup(
nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
) -> numpy.ndarray:
docs = warmup_epochs * docs
return annotate(nlp, docs, batch_size)
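As a rough illustration of the bootstrap confidence interval used by print_mean_with_ci, with made-up words-per-second samples:

import numpy

sample = numpy.array([950.0, 1020.0, 990.0, 1005.0, 870.0])  # made-up measurements
means = bootstrap(sample, statistic=numpy.mean, iterations=1000)
means.sort()
low, high = means[int(len(means) * 0.025)], means[int(len(means) * 0.975)]
print(f"Mean: {sample.mean():.1f} words/s (95% CI: {low - sample.mean():.1f} +{high - sample.mean():.1f})")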

View File

@ -1,4 +1,4 @@
from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
from typing import Callable, Iterable, Mapping, Optional, Any, Union
from enum import Enum
from pathlib import Path
from wasabi import Printer
@ -7,7 +7,7 @@ import re
import sys
import itertools
from ._util import app, Arg, Opt
from ._util import app, Arg, Opt, walk_directory
from ..training import docs_to_json
from ..tokens import Doc, DocBin
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
@ -28,6 +28,8 @@ CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = {
"json": json_to_docs,
}
AUTO = "auto"
# File types that can be written to stdout
FILE_TYPES_STDOUT = ("json",)
@ -49,7 +51,7 @@ def convert_cli(
model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
@ -70,8 +72,8 @@ def convert_cli(
output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
silent = output_dir == "-"
msg = Printer(no_print=silent)
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
converter = _get_converter(msg, converter, input_path)
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
convert(
input_path,
output_dir,
@ -100,7 +102,7 @@ def convert(
model: Optional[str] = None,
morphology: bool = False,
merge_subtokens: bool = False,
converter: str = "auto",
converter: str,
ner_map: Optional[Path] = None,
lang: Optional[str] = None,
concatenate: bool = False,
@ -189,33 +191,6 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
return None
def walk_directory(path: Path, converter: str) -> List[Path]:
if not path.is_dir():
return [path]
paths = [path]
locs = []
seen = set()
for path in paths:
if str(path) in seen:
continue
seen.add(str(path))
if path.parts[-1].startswith("."):
continue
elif path.is_dir():
paths.extend(path.iterdir())
elif converter == "json" and not path.parts[-1].endswith("json"):
continue
elif converter == "conll" and not path.parts[-1].endswith("conll"):
continue
elif converter == "iob" and not path.parts[-1].endswith("iob"):
continue
else:
locs.append(path)
# It's good to sort these, in case the ordering messes up cache.
locs.sort()
return locs
def verify_cli_args(
msg: Printer,
input_path: Path,
@ -239,18 +214,22 @@ def verify_cli_args(
input_locs = walk_directory(input_path, converter)
if len(input_locs) == 0:
msg.fail("No input files in directory", input_path, exits=1)
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
if converter == "auto" and len(file_types) >= 2:
file_types_str = ",".join(file_types)
msg.fail("All input files must be same type", file_types_str, exits=1)
if converter != "auto" and converter not in CONVERTERS:
if converter not in CONVERTERS:
msg.fail(f"Can't find converter for {converter}", exits=1)
def _get_converter(msg, converter, input_path: Path):
if input_path.is_dir():
input_path = walk_directory(input_path, converter)[0]
if converter == "auto":
if converter == AUTO:
input_locs = walk_directory(input_path, suffix=None)
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
if len(file_types) >= 2:
file_types_str = ",".join(file_types)
msg.fail("All input files must be same type", file_types_str, exits=1)
input_path = input_locs[0]
else:
input_path = walk_directory(input_path, suffix=converter)[0]
if converter == AUTO:
converter = input_path.suffix[1:]
if converter == "ner" or converter == "iob":
with input_path.open(encoding="utf8") as file_:

View File

@ -6,12 +6,14 @@ import sys
import srsly
from wasabi import Printer, MESSAGES, msg
import typer
import math
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
from ._util import import_code, debug_cli
from ..training import Example
from ._util import import_code, debug_cli, _format_number
from ..training import Example, remove_bilu_prefix
from ..training.initialize import get_sourced_components
from ..schemas import ConfigSchemaTraining
from ..pipeline import TrainablePipe
from ..pipeline._parser_internals import nonproj
from ..pipeline._parser_internals.nonproj import DELIMITER
from ..pipeline import Morphologizer, SpanCategorizer
@ -19,6 +21,7 @@ from ..morphology import Morphology
from ..language import Language
from ..util import registry, resolve_dot_names
from ..compat import Literal
from ..vectors import Mode as VectorsMode
from .. import util
@ -29,6 +32,12 @@ DEP_LABEL_THRESHOLD = 20
# Minimum number of expected examples to train a new pipeline
BLANK_MODEL_MIN_THRESHOLD = 100
BLANK_MODEL_THRESHOLD = 2000
# Arbitrary threshold where SpanCat performs well
SPAN_DISTINCT_THRESHOLD = 1
# Arbitrary threshold where SpanCat performs well
BOUNDARY_DISTINCT_THRESHOLD = 1
# Arbitrary threshold for filtering span lengths during reporting (percentage)
SPAN_LENGTH_THRESHOLD_PERCENTAGE = 90
@debug_cli.command(
@ -170,26 +179,34 @@ def debug_data(
show=verbose,
)
if len(nlp.vocab.vectors):
msg.info(
f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
f"unique keys, {nlp.vocab.vectors_length} dimensions)"
)
n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
msg.warn(
"{} words in training data without vectors ({:.0f}%)".format(
n_missing_vectors,
100 * (n_missing_vectors / gold_train_data["n_words"]),
),
)
msg.text(
"10 most common words without vectors: {}".format(
_format_labels(
gold_train_data["words_missing_vectors"].most_common(10),
counts=True,
)
),
show=verbose,
)
if nlp.vocab.vectors.mode == VectorsMode.floret:
msg.info(
f"floret vectors with {len(nlp.vocab.vectors)} vectors, "
f"{nlp.vocab.vectors_length} dimensions, "
f"{nlp.vocab.vectors.minn}-{nlp.vocab.vectors.maxn} char "
f"n-gram subwords"
)
else:
msg.info(
f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
f"unique keys, {nlp.vocab.vectors_length} dimensions)"
)
n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
msg.warn(
"{} words in training data without vectors ({:.0f}%)".format(
n_missing_vectors,
100 * (n_missing_vectors / gold_train_data["n_words"]),
),
)
msg.text(
"10 most common words without vectors: {}".format(
_format_labels(
gold_train_data["words_missing_vectors"].most_common(10),
counts=True,
)
),
show=verbose,
)
else:
msg.info("No word vectors present in the package")
@ -238,6 +255,69 @@ def debug_data(
msg.warn(f"No examples for texts WITHOUT new label '{label}'")
has_no_neg_warning = True
with msg.loading("Obtaining span characteristics..."):
span_characteristics = _get_span_characteristics(
train_dataset, gold_train_data, spans_key
)
msg.info(f"Span characteristics for spans_key '{spans_key}'")
msg.info("SD = Span Distinctiveness, BD = Boundary Distinctiveness")
_print_span_characteristics(span_characteristics)
_span_freqs = _get_spans_length_freq_dist(
gold_train_data["spans_length"][spans_key]
)
_filtered_span_freqs = _filter_spans_length_freq_dist(
_span_freqs, threshold=SPAN_LENGTH_THRESHOLD_PERCENTAGE
)
msg.info(
f"Over {SPAN_LENGTH_THRESHOLD_PERCENTAGE}% of spans have lengths of 1 -- "
f"{max(_filtered_span_freqs.keys())} "
f"(min={span_characteristics['min_length']}, max={span_characteristics['max_length']}). "
f"The most common span lengths are: {_format_freqs(_filtered_span_freqs)}. "
"If you are using the n-gram suggester, note that omitting "
"infrequent n-gram lengths can greatly improve speed and "
"memory usage."
)
msg.text(
f"Full distribution of span lengths: {_format_freqs(_span_freqs)}",
show=verbose,
)
# Add report regarding span characteristics
if span_characteristics["avg_sd"] < SPAN_DISTINCT_THRESHOLD:
msg.warn("Spans may not be distinct from the rest of the corpus")
else:
msg.good("Spans are distinct from the rest of the corpus")
p_spans = span_characteristics["p_spans"].values()
all_span_tokens: Counter = sum(p_spans, Counter())
most_common_spans = [w for w, _ in all_span_tokens.most_common(10)]
msg.text(
"10 most common span tokens: {}".format(
_format_labels(most_common_spans)
),
show=verbose,
)
# Add report regarding span boundary characteristics
if span_characteristics["avg_bd"] < BOUNDARY_DISTINCT_THRESHOLD:
msg.warn("Boundary tokens are not distinct from the rest of the corpus")
else:
msg.good("Boundary tokens are distinct from the rest of the corpus")
p_bounds = span_characteristics["p_bounds"].values()
all_span_bound_tokens: Counter = sum(p_bounds, Counter())
most_common_bounds = [w for w, _ in all_span_bound_tokens.most_common(10)]
msg.text(
"10 most common span boundary tokens: {}".format(
_format_labels(most_common_bounds)
),
show=verbose,
)
if has_low_data_warning:
msg.text(
f"To train a new span type, your data should include at "
@ -282,7 +362,7 @@ def debug_data(
if label != "-"
]
labels_with_counts = _format_labels(labels_with_counts, counts=True)
msg.text(f"Labels in train data: {_format_labels(labels)}", show=verbose)
msg.text(f"Labels in train data: {labels_with_counts}", show=verbose)
missing_labels = model_labels - labels
if missing_labels:
msg.warn(
@ -638,6 +718,9 @@ def _compile_gold(
"words": Counter(),
"roots": Counter(),
"spancat": dict(),
"spans_length": dict(),
"spans_per_type": dict(),
"sb_per_type": dict(),
"ws_ents": 0,
"boundary_cross_ents": 0,
"n_words": 0,
@ -676,21 +759,66 @@ def _compile_gold(
# "Illegal" whitespace entity
data["ws_ents"] += 1
if label.startswith(("B-", "U-")):
combined_label = label.split("-")[1]
combined_label = remove_bilu_prefix(label)
data["ner"][combined_label] += 1
if sent_starts[i] == True and label.startswith(("I-", "L-")):
if sent_starts[i] and label.startswith(("I-", "L-")):
data["boundary_cross_ents"] += 1
elif label == "-":
data["ner"]["-"] += 1
if "spancat" in factory_names:
for span_key in list(eg.reference.spans.keys()):
if span_key not in data["spancat"]:
data["spancat"][span_key] = Counter()
for i, span in enumerate(eg.reference.spans[span_key]):
for spans_key in list(eg.reference.spans.keys()):
# Obtain the span frequency
if spans_key not in data["spancat"]:
data["spancat"][spans_key] = Counter()
for i, span in enumerate(eg.reference.spans[spans_key]):
if span.label_ is None:
continue
else:
data["spancat"][span_key][span.label_] += 1
data["spancat"][spans_key][span.label_] += 1
# Obtain the span length
if spans_key not in data["spans_length"]:
data["spans_length"][spans_key] = dict()
for span in gold.spans[spans_key]:
if span.label_ is None:
continue
if span.label_ not in data["spans_length"][spans_key]:
data["spans_length"][spans_key][span.label_] = []
data["spans_length"][spans_key][span.label_].append(len(span))
# Obtain spans per span type
if spans_key not in data["spans_per_type"]:
data["spans_per_type"][spans_key] = dict()
for span in gold.spans[spans_key]:
if span.label_ not in data["spans_per_type"][spans_key]:
data["spans_per_type"][spans_key][span.label_] = []
data["spans_per_type"][spans_key][span.label_].append(span)
# Obtain boundary tokens per span type
window_size = 1
if spans_key not in data["sb_per_type"]:
data["sb_per_type"][spans_key] = dict()
for span in gold.spans[spans_key]:
if span.label_ not in data["sb_per_type"][spans_key]:
# Creating a data structure that holds the start and
# end tokens for each span type
data["sb_per_type"][spans_key][span.label_] = {
"start": [],
"end": [],
}
for offset in range(window_size):
sb_start_idx = span.start - (offset + 1)
if sb_start_idx >= 0:
data["sb_per_type"][spans_key][span.label_]["start"].append(
gold[sb_start_idx : sb_start_idx + 1]
)
sb_end_idx = span.end + (offset + 1)
if sb_end_idx <= len(gold):
data["sb_per_type"][spans_key][span.label_]["end"].append(
gold[sb_end_idx - 1 : sb_end_idx]
)
if "textcat" in factory_names or "textcat_multilabel" in factory_names:
data["cats"].update(gold.cats)
if any(val not in (0, 1) for val in gold.cats.values()):
@ -761,6 +889,16 @@ def _format_labels(
return ", ".join([f"'{l}'" for l in cast(Iterable[str], labels)])
def _format_freqs(freqs: Dict[int, float], sort: bool = True) -> str:
if sort:
freqs = dict(sorted(freqs.items()))
_freqs = [(str(k), v) for k, v in freqs.items()]
return ", ".join(
[f"{l} ({c}%)" for l, c in cast(Iterable[Tuple[str, float]], _freqs)]
)
def _get_examples_without_label(
data: Sequence[Example],
label: str,
@ -771,7 +909,7 @@ def _get_examples_without_label(
for eg in data:
if component == "ner":
labels = [
label.split("-")[1]
remove_bilu_prefix(label)
for label in eg.get_aligned_ner()
if label not in ("O", "-", None)
]
@ -797,6 +935,7 @@ def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]:
labels: Set[str] = set()
for pipe_name in pipe_names:
pipe = nlp.get_pipe(pipe_name)
assert isinstance(pipe, TrainablePipe)
labels.update(pipe.labels)
return labels
@ -815,3 +954,177 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
labels[pipe.key] = set()
labels[pipe.key].update(pipe.labels)
return labels
def _gmean(l: List) -> float:
"""Compute geometric mean of a list"""
return math.exp(math.fsum(math.log(i) for i in l) / len(l))
def _wgt_average(metric: Dict[str, float], frequencies: Counter) -> float:
total = sum(value * frequencies[span_type] for span_type, value in metric.items())
return total / sum(frequencies.values())
def _get_distribution(docs, normalize: bool = True) -> Counter:
"""Get the frequency distribution given a set of Docs"""
word_counts: Counter = Counter()
for doc in docs:
for token in doc:
# Normalize the text
t = token.text.lower().replace("``", '"').replace("''", '"')
word_counts[t] += 1
if normalize:
total = sum(word_counts.values(), 0.0)
word_counts = Counter({k: v / total for k, v in word_counts.items()})
return word_counts
def _get_kl_divergence(p: Counter, q: Counter) -> float:
"""Compute the Kullback-Leibler divergence from two frequency distributions"""
total = 0.0
for word, p_word in p.items():
total += p_word * math.log(p_word / q[word])
return total
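A toy sketch of the distinctiveness computation, with made-up token distributions:

from collections import Counter

p_span = Counter({"york": 0.5, "new": 0.5})                # token distribution inside spans (made up)
p_corpus = Counter({"the": 0.4, "new": 0.3, "york": 0.3})  # corpus-wide distribution (made up)
print(_get_kl_divergence(p_span, p_corpus))  # larger value = span tokens are more distinct from the corpus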
def _format_span_row(span_data: List[Dict], labels: List[str]) -> List[Any]:
"""Compile into one list for easier reporting"""
d = {
label: [label] + list(_format_number(d[label]) for d in span_data)
for label in labels
}
return list(d.values())
def _get_span_characteristics(
examples: List[Example], compiled_gold: Dict[str, Any], spans_key: str
) -> Dict[str, Any]:
"""Obtain all span characteristics"""
data_labels = compiled_gold["spancat"][spans_key]
# Get lengths
span_length = {
label: _gmean(l)
for label, l in compiled_gold["spans_length"][spans_key].items()
}
spans_per_type = {
label: len(spans)
for label, spans in compiled_gold["spans_per_type"][spans_key].items()
}
min_lengths = [min(l) for l in compiled_gold["spans_length"][spans_key].values()]
max_lengths = [max(l) for l in compiled_gold["spans_length"][spans_key].values()]
# Get relevant distributions: corpus, spans, span boundaries
p_corpus = _get_distribution([eg.reference for eg in examples], normalize=True)
p_spans = {
label: _get_distribution(spans, normalize=True)
for label, spans in compiled_gold["spans_per_type"][spans_key].items()
}
p_bounds = {
label: _get_distribution(sb["start"] + sb["end"], normalize=True)
for label, sb in compiled_gold["sb_per_type"][spans_key].items()
}
# Compute for actual span characteristics
span_distinctiveness = {
label: _get_kl_divergence(freq_dist, p_corpus)
for label, freq_dist in p_spans.items()
}
sb_distinctiveness = {
label: _get_kl_divergence(freq_dist, p_corpus)
for label, freq_dist in p_bounds.items()
}
return {
"sd": span_distinctiveness,
"bd": sb_distinctiveness,
"spans_per_type": spans_per_type,
"lengths": span_length,
"min_length": min(min_lengths),
"max_length": max(max_lengths),
"avg_sd": _wgt_average(span_distinctiveness, data_labels),
"avg_bd": _wgt_average(sb_distinctiveness, data_labels),
"avg_length": _wgt_average(span_length, data_labels),
"labels": list(data_labels.keys()),
"p_spans": p_spans,
"p_bounds": p_bounds,
}
def _print_span_characteristics(span_characteristics: Dict[str, Any]):
"""Print all span characteristics into a table"""
headers = ("Span Type", "Length", "SD", "BD", "N")
# Wasabi has this at 30 by default, but we might have some long labels
max_col = max(30, max(len(label) for label in span_characteristics["labels"]))
# Prepare table data with all span characteristics
table_data = [
span_characteristics["lengths"],
span_characteristics["sd"],
span_characteristics["bd"],
span_characteristics["spans_per_type"],
]
table = _format_span_row(
span_data=table_data, labels=span_characteristics["labels"]
)
# Prepare table footer with weighted averages
footer_data = [
span_characteristics["avg_length"],
span_characteristics["avg_sd"],
span_characteristics["avg_bd"],
]
footer = (
["Wgt. Average"] + ["{:.2f}".format(round(f, 2)) for f in footer_data] + ["-"]
)
msg.table(
table,
footer=footer,
header=headers,
divider=True,
aligns=["l"] + ["r"] * (len(footer_data) + 1),
max_col=max_col,
)
def _get_spans_length_freq_dist(
length_dict: Dict, threshold=SPAN_LENGTH_THRESHOLD_PERCENTAGE
) -> Dict[int, float]:
"""Get frequency distribution of spans length under a certain threshold"""
all_span_lengths = []
for _, lengths in length_dict.items():
all_span_lengths.extend(lengths)
freq_dist: Counter = Counter()
for i in all_span_lengths:
if freq_dist.get(i):
freq_dist[i] += 1
else:
freq_dist[i] = 1
# We will be working with percentages instead of raw counts
freq_dist_percentage = {}
for span_length, count in freq_dist.most_common():
percentage = (count / len(all_span_lengths)) * 100.0
percentage = round(percentage, 2)
freq_dist_percentage[span_length] = percentage
return freq_dist_percentage
def _filter_spans_length_freq_dist(
freq_dist: Dict[int, float], threshold: int
) -> Dict[int, float]:
"""Filter frequency distribution with respect to a threshold
We're going to filter all the span lengths that fall
around a percentage threshold when summed.
"""
total = 0.0
filtered_freq_dist = {}
for span_length, dist in freq_dist.items():
if total >= threshold:
break
else:
filtered_freq_dist[span_length] = dist
total += dist
return filtered_freq_dist
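A small sketch of the two helpers above with a made-up length dictionary:

toy_lengths = {"PERSON": [1] * 8 + [2], "ORG": [5]}         # made-up span lengths per label
freqs = _get_spans_length_freq_dist(toy_lengths)            # {1: 80.0, 2: 10.0, 5: 10.0}
print(_filter_spans_length_freq_dist(freqs, threshold=90))  # {1: 80.0, 2: 10.0} - rare length 5 dropped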

spacy/cli/debug_diff.py Normal file
View File

@ -0,0 +1,89 @@
from typing import Optional
import typer
from wasabi import Printer, diff_strings, MarkdownRenderer
from pathlib import Path
from thinc.api import Config
from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
from ..util import load_config
from .init_config import init_config, Optimizations
@debug_cli.command(
"diff-config",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def debug_diff_cli(
# fmt: off
ctx: typer.Context,
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
compare_to: Optional[Path] = Opt(None, help="Path to a config file to diff against, or `None` to compare against default settings", exists=True, allow_dash=True),
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether the user config was optimized for efficiency or accuracy. Only relevant when comparing against the default config."),
gpu: bool = Opt(False, "--gpu", "-G", help="Whether the original config can run on a GPU. Only relevant when comparing against the default config."),
pretraining: bool = Opt(False, "--pretraining", "--pt", help="Whether to compare on a config with pretraining involved. Only relevant when comparing against the default config."),
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues")
# fmt: on
):
"""Show a diff of a config file with respect to spaCy's defaults or another config file. If
additional settings were used in the creation of the config file, then you
must supply these as extra parameters to the command when comparing to the default settings. The generated diff
can also be used when posting to the discussion forum to provide more
information for the maintainers.
The `optimize`, `gpu`, and `pretraining` options are only relevant when
comparing against the default configuration (or specifically when `compare_to` is None).
DOCS: https://spacy.io/api/cli#debug-diff
"""
debug_diff(
config_path=config_path,
compare_to=compare_to,
gpu=gpu,
optimize=optimize,
pretraining=pretraining,
markdown=markdown,
)
def debug_diff(
config_path: Path,
compare_to: Optional[Path],
gpu: bool,
optimize: Optimizations,
pretraining: bool,
markdown: bool,
):
msg = Printer()
with show_validation_error(hint_fill=False):
user_config = load_config(config_path)
if compare_to:
other_config = load_config(compare_to)
else:
# Recreate a default config based on the user's config
lang = user_config["nlp"]["lang"]
pipeline = list(user_config["nlp"]["pipeline"])
msg.info(f"Found user-defined language: '{lang}'")
msg.info(f"Found user-defined pipelines: {pipeline}")
other_config = init_config(
lang=lang,
pipeline=pipeline,
optimize=optimize.value,
gpu=gpu,
pretraining=pretraining,
silent=True,
)
user = user_config.to_str()
other = other_config.to_str()
if user == other:
msg.warn("No diff to show: configs are identical")
else:
diff_text = diff_strings(other, user, add_symbols=markdown)
if markdown:
md = MarkdownRenderer()
md.add(md.code_block(diff_text, "diff"))
print(md.text)
else:
print(diff_text)
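A rough sketch of calling the function directly; the config path is hypothetical:

from pathlib import Path

debug_diff(
    config_path=Path("config.cfg"),     # hypothetical user config
    compare_to=None,                    # None = diff against a freshly generated default config
    gpu=False,
    optimize=Optimizations.efficiency,
    pretraining=False,
    markdown=False,
)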

View File

@ -7,6 +7,7 @@ import typer
from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
from .. import about
from ..util import is_package, get_minor_version, run_command
from ..util import is_prerelease_version
from ..errors import OLD_MODEL_SHORTCUTS
@ -19,7 +20,7 @@ def download_cli(
ctx: typer.Context,
model: str = Arg(..., help="Name of pipeline package to download"),
direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"),
sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel")
sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel"),
# fmt: on
):
"""
@ -35,7 +36,12 @@ def download_cli(
download(model, direct, sdist, *ctx.args)
def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -> None:
def download(
model: str,
direct: bool = False,
sdist: bool = False,
*pip_args,
) -> None:
if (
not (is_package("spacy") or is_package("spacy-nightly"))
and "--no-deps" not in pip_args
@ -49,13 +55,10 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
"dependencies, you'll have to install them manually."
)
pip_args = pip_args + ("--no-deps",)
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
dl_tpl = "{m}-{v}/{m}-{v}{s}#egg={m}=={v}"
if direct:
components = model.split("-")
model_name = "".join(components[:-1])
version = components[-1]
download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args)
else:
model_name = model
if model in OLD_MODEL_SHORTCUTS:
@ -66,15 +69,31 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
model_name = OLD_MODEL_SHORTCUTS[model]
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args)
filename = get_model_filename(model_name, version, sdist)
download_model(filename, pip_args)
msg.good(
"Download and installation successful",
f"You can now load the package via spacy.load('{model_name}')",
)
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
dl_tpl = "{m}-{v}/{m}-{v}{s}"
egg_tpl = "#egg={m}=={v}"
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
filename = dl_tpl.format(m=model_name, v=version, s=suffix)
if sdist:
filename += egg_tpl.format(m=model_name, v=version)
return filename
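For illustration, assuming WHEEL_SUFFIX is the standard "-py3-none-any.whl" and using a hypothetical pipeline name and version:

print(get_model_filename("en_core_web_sm", "3.5.0"))
# en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl
print(get_model_filename("en_core_web_sm", "3.5.0", sdist=True))
# en_core_web_sm-3.5.0/en_core_web_sm-3.5.0.tar.gz#egg=en_core_web_sm==3.5.0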
def get_compatibility() -> dict:
version = get_minor_version(about.__version__)
if is_prerelease_version(about.__version__):
version: Optional[str] = about.__version__
else:
version = get_minor_version(about.__version__)
r = requests.get(about.__compatibility__)
if r.status_code != 200:
msg.fail(
@ -101,6 +120,11 @@ def get_version(model: str, comp: dict) -> str:
return comp[model][0]
def get_latest_version(model: str) -> str:
comp = get_compatibility()
return get_version(model, comp)
def download_model(
filename: str, user_pip_args: Optional[Sequence[str]] = None
) -> None:

View File

@ -7,12 +7,15 @@ from thinc.api import fix_random_seed
from ..training import Corpus
from ..tokens import Doc
from ._util import app, Arg, Opt, setup_gpu, import_code
from ._util import app, Arg, Opt, setup_gpu, import_code, benchmark_cli
from ..scorer import Scorer
from .. import util
from .. import displacy
@benchmark_cli.command(
"accuracy",
)
@app.command("evaluate")
def evaluate_cli(
# fmt: off
@ -36,7 +39,7 @@ def evaluate_cli(
dependency parses in a HTML file, set as output directory as the
displacy_path argument.
DOCS: https://spacy.io/api/cli#evaluate
DOCS: https://spacy.io/api/cli#benchmark-accuracy
"""
import_code(code_path)
evaluate(

spacy/cli/find_threshold.py Normal file
View File

@ -0,0 +1,233 @@
import functools
import operator
from pathlib import Path
import logging
from typing import Optional, Tuple, Any, Dict, List
import numpy
import wasabi.tables
from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer
from ..errors import Errors
from ..training import Corpus
from ._util import app, Arg, Opt, import_code, setup_gpu
from .. import util
_DEFAULTS = {
"n_trials": 11,
"use_gpu": -1,
"gold_preproc": False,
}
@app.command(
"find-threshold",
context_settings={"allow_extra_args": False, "ignore_unknown_options": True},
)
def find_threshold_cli(
# fmt: off
model: str = Arg(..., help="Model name or path"),
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
pipe_name: str = Arg(..., help="Name of pipe to examine thresholds for"),
threshold_key: str = Arg(..., help="Key of threshold attribute in component's configuration"),
scores_key: str = Arg(..., help="Metric to optimize"),
n_trials: int = Opt(_DEFAULTS["n_trials"], "--n_trials", "-n", help="Number of trials to determine optimal thresholds"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"),
# fmt: on
):
"""
Runs prediction trials for a trained model with varying thresholds to maximize
the specified metric. The search space for the threshold is traversed linearly
from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
(the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
returns all results).
This is applicable only for components whose predictions are influenced by
thresholds - e.g. `textcat_multilabel` and `spancat`, but not `textcat`. Note
that the full path to the corresponding threshold attribute in the config has to
be provided.
DOCS: https://spacy.io/api/cli#find-threshold
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
import_code(code_path)
find_threshold(
model=model,
data_path=data_path,
pipe_name=pipe_name,
threshold_key=threshold_key,
scores_key=scores_key,
n_trials=n_trials,
use_gpu=use_gpu,
gold_preproc=gold_preproc,
silent=False,
)
def find_threshold(
model: str,
data_path: Path,
pipe_name: str,
threshold_key: str,
scores_key: str,
*,
n_trials: int = _DEFAULTS["n_trials"], # type: ignore
use_gpu: int = _DEFAULTS["use_gpu"], # type: ignore
gold_preproc: bool = _DEFAULTS["gold_preproc"], # type: ignore
silent: bool = True,
) -> Tuple[float, float, Dict[float, float]]:
"""
Runs prediction trials for models with varying thresholds to maximize the specified metric.
model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory.
data_path (Path): Path to file with DocBin with docs to use for threshold search.
pipe_name (str): Name of pipe to examine thresholds for.
threshold_key (str): Key of threshold attribute in component's configuration.
scores_key (str): Name of the score metric to optimize.
n_trials (int): Number of trials to determine optimal thresholds.
use_gpu (int): GPU ID or -1 for CPU.
gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the
tokenization, and may result in sequences of more consistent length. However, it may reduce runtime accuracy due
to train/test skew.
silent (bool): Whether to print non-error-related output to stdout.
RETURNS (Tuple[float, float, Dict[float, float]]): Best found threshold, the corresponding score, scores for all
evaluated thresholds.
"""
setup_gpu(use_gpu, silent=silent)
data_path = util.ensure_path(data_path)
if not data_path.exists():
wasabi.msg.fail("Evaluation data not found", data_path, exits=1)
nlp = util.load_model(model)
if pipe_name not in nlp.component_names:
raise AttributeError(
Errors.E001.format(name=pipe_name, opts=nlp.component_names)
)
pipe = nlp.get_pipe(pipe_name)
if not hasattr(pipe, "scorer"):
raise AttributeError(Errors.E1045)
if type(pipe) == TextCategorizer:
wasabi.msg.warn(
"The `textcat` component doesn't use a threshold as it's not applicable to the concept of "
"exclusive classes. All thresholds will yield the same results."
)
if not silent:
wasabi.msg.info(
title=f"Optimizing for {scores_key} for component '{pipe_name}' with {n_trials} "
f"trials."
)
# Load evaluation corpus.
corpus = Corpus(data_path, gold_preproc=gold_preproc)
dev_dataset = list(corpus(nlp))
config_keys = threshold_key.split(".")
def set_nested_item(
config: Dict[str, Any], keys: List[str], value: float
) -> Dict[str, Any]:
"""Set item in nested dictionary. Adapted from https://stackoverflow.com/a/54138200.
config (Dict[str, Any]): Configuration dictionary.
keys (List[Any]): Path to value to set.
value (float): Value to set.
RETURNS (Dict[str, Any]): Updated dictionary.
"""
functools.reduce(operator.getitem, keys[:-1], config)[keys[-1]] = value
return config
def filter_config(
config: Dict[str, Any], keys: List[str], full_key: str
) -> Dict[str, Any]:
"""Filters provided config dictionary so that only the specified keys path remains.
config (Dict[str, Any]): Configuration dictionary.
keys (List[Any]): Path to value to set.
full_key (str): Full user-specified key.
RETURNS (Dict[str, Any]): Filtered dictionary.
"""
if keys[0] not in config:
wasabi.msg.fail(
title=f"Failed to look up `{full_key}` in config: sub-key {[keys[0]]} not found.",
text=f"Make sure you specified {[keys[0]]} correctly. The following sub-keys are available instead: "
f"{list(config.keys())}",
exits=1,
)
return {
keys[0]: filter_config(config[keys[0]], keys[1:], full_key)
if len(keys) > 1
else config[keys[0]]
}
# Evaluate with varying threshold values.
scores: Dict[float, float] = {}
config_keys_full = ["components", pipe_name, *config_keys]
table_col_widths = (10, 10)
thresholds = numpy.linspace(0, 1, n_trials)
print(wasabi.tables.row(["Threshold", f"{scores_key}"], widths=table_col_widths))
for threshold in thresholds:
# Reload pipeline with overrides specifying the new threshold.
nlp = util.load_model(
model,
config=set_nested_item(
filter_config(
nlp.config, config_keys_full, ".".join(config_keys_full)
).copy(),
config_keys_full,
threshold,
),
)
if hasattr(pipe, "cfg"):
setattr(
nlp.get_pipe(pipe_name),
"cfg",
set_nested_item(getattr(pipe, "cfg"), config_keys, threshold),
)
eval_scores = nlp.evaluate(dev_dataset)
if scores_key not in eval_scores:
wasabi.msg.fail(
title=f"Failed to look up score `{scores_key}` in evaluation results.",
text=f"Make sure you specified the correct value for `scores_key`. The following scores are "
f"available: {list(eval_scores.keys())}",
exits=1,
)
scores[threshold] = eval_scores[scores_key]
if not isinstance(scores[threshold], (float, int)):
wasabi.msg.fail(
f"Returned score for key '{scores_key}' is not numeric. Threshold optimization only works for numeric "
f"scores.",
exits=1,
)
print(
wasabi.row(
[round(threshold, 3), round(scores[threshold], 3)],
widths=table_col_widths,
)
)
best_threshold = max(scores.keys(), key=(lambda key: scores[key]))
# If all scores are identical, emit warning.
if len(set(scores.values())) == 1:
wasabi.msg.warn(
title="All scores are identical. Verify that all settings are correct.",
text=""
if (
not isinstance(pipe, MultiLabel_TextCategorizer)
or scores_key in ("cats_macro_f", "cats_micro_f")
)
else "Use `cats_macro_f` or `cats_micro_f` when optimizing the threshold for `textcat_multilabel`.",
)
else:
if not silent:
print(
f"\nBest threshold: {round(best_threshold, ndigits=4)} with {scores_key} value of {scores[best_threshold]}."
)
return best_threshold, scores[best_threshold], scores
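A rough sketch of calling the function directly; the pipeline, data path and keys are hypothetical:

from pathlib import Path

best, best_score, all_scores = find_threshold(
    model="my_textcat_model",           # hypothetical pipeline package or path
    data_path=Path("dev.spacy"),
    pipe_name="textcat_multilabel",
    threshold_key="threshold",
    scores_key="cats_macro_f",
    n_trials=11,
    silent=False,
)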

View File

@ -1,10 +1,13 @@
from typing import Optional, Dict, Any, Union, List
import platform
import pkg_resources
import json
from pathlib import Path
from wasabi import Printer, MarkdownRenderer
import srsly
from ._util import app, Arg, Opt, string_to_list
from .download import get_model_filename, get_latest_version
from .. import util
from .. import about
@ -16,6 +19,7 @@ def info_cli(
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
exclude: str = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"),
url: bool = Opt(False, "--url", "-u", help="Print the URL to download the most recent compatible version of the pipeline"),
# fmt: on
):
"""
@ -23,10 +27,19 @@ def info_cli(
print its meta information. Flag --markdown prints details in Markdown for easy
copy-pasting to GitHub issues.
Flag --url prints only the download URL of the most recent compatible
version of the pipeline.
DOCS: https://spacy.io/api/cli#info
"""
exclude = string_to_list(exclude)
info(model, markdown=markdown, silent=silent, exclude=exclude)
info(
model,
markdown=markdown,
silent=silent,
exclude=exclude,
url=url,
)
def info(
@ -35,11 +48,20 @@ def info(
markdown: bool = False,
silent: bool = True,
exclude: Optional[List[str]] = None,
url: bool = False,
) -> Union[str, dict]:
msg = Printer(no_print=silent, pretty=not silent)
if not exclude:
exclude = []
if model:
if url:
if model is not None:
title = f"Download info for pipeline '{model}'"
data = info_model_url(model)
print(data["download_url"])
return data
else:
msg.fail("--url option requires a pipeline name", exits=1)
elif model:
title = f"Info about pipeline '{model}'"
data = info_model(model, silent=silent)
else:
@ -99,11 +121,44 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
meta["source"] = str(model_path.resolve())
else:
meta["source"] = str(model_path)
download_url = info_installed_model_url(model)
if download_url:
meta["download_url"] = download_url
return {
k: v for k, v in meta.items() if k not in ("accuracy", "performance", "speed")
}
def info_installed_model_url(model: str) -> Optional[str]:
"""Given a pipeline name, get the download URL if available, otherwise
return None.
This is only available for pipelines installed as modules that have
dist-info available.
"""
try:
dist = pkg_resources.get_distribution(model)
data = json.loads(dist.get_metadata("direct_url.json"))
return data["url"]
except pkg_resources.DistributionNotFound:
# no such package
return None
except Exception:
# something else, like no file or invalid JSON
return None
def info_model_url(model: str) -> Dict[str, Any]:
"""Return the download URL for the latest version of a pipeline."""
version = get_latest_version(model)
filename = get_model_filename(model, version)
download_url = about.__download_url__ + "/" + filename
release_tpl = "https://github.com/explosion/spacy-models/releases/tag/{m}-{v}"
release_url = release_tpl.format(m=model, v=version)
return {"download_url": download_url, "release_url": release_url}
def get_markdown(
data: Dict[str, Any],
title: Optional[str] = None,

View File

@ -10,6 +10,7 @@ from jinja2 import Template
from .. import util
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
from ..schemas import RecommendationSchema
from ..util import SimpleFrozenList
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
from ._util import string_to_list, import_code
@ -24,16 +25,30 @@ class Optimizations(str, Enum):
accuracy = "accuracy"
class InitValues:
"""
Default values for initialization. Dedicated class to allow synchronized default values for init_config_cli() and
init_config(), i.e. initialization via the CLI and via Python, respectively.
"""
lang = "en"
pipeline = SimpleFrozenList(["tagger", "parser", "ner"])
optimize = Optimizations.efficiency
gpu = False
pretraining = False
force_overwrite = False
@init_cli.command("config")
def init_config_cli(
# fmt: off
output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
lang: str = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
gpu: bool = Opt(False, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"),
pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
pretraining: bool = Opt(InitValues.pretraining, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
force_overwrite: bool = Opt(InitValues.force_overwrite, "--force", "-F", help="Force overwriting the output file"),
# fmt: on
):
"""
@ -133,11 +148,11 @@ def fill_config(
def init_config(
*,
lang: str,
pipeline: List[str],
optimize: str,
gpu: bool,
pretraining: bool = False,
lang: str = InitValues.lang,
pipeline: List[str] = InitValues.pipeline,
optimize: str = InitValues.optimize,
gpu: bool = InitValues.gpu,
pretraining: bool = InitValues.pretraining,
silent: bool = True,
) -> Config:
msg = Printer(no_print=silent)

View File

@ -299,8 +299,8 @@ def get_meta(
}
nlp = util.load_model_from_path(Path(model_path))
meta.update(nlp.meta)
meta.update(existing_meta)
meta["spacy_version"] = util.get_minor_version_range(about.__version__)
meta.update(existing_meta)
meta["vectors"] = {
"width": nlp.vocab.vectors_length,
"vectors": len(nlp.vocab.vectors),

View File

@ -61,7 +61,7 @@ def pretrain_cli(
# TODO: What's the solution here? How do we handle optional blocks?
msg.fail("The [pretraining] block in your config is empty", exits=1)
if not output_dir.exists():
output_dir.mkdir()
output_dir.mkdir(parents=True)
msg.good(f"Created output directory: {output_dir}")
# Save non-interpolated config
raw_config.to_disk(output_dir / "config.cfg")

View File

@ -12,6 +12,9 @@ from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
from .._util import get_checksum, download_file, git_checkout, get_git_version
from .._util import SimpleFrozenDict, parse_config_overrides
# Default for whether an asset counts as 'extra' when its `extra` field is not set.
EXTRA_DEFAULT = False
@project_cli.command(
"assets",
@ -21,7 +24,8 @@ def project_assets_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v2.22+.")
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v2.22+."),
extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
# fmt: on
):
"""Fetch project assets like datasets and pretrained weights. Assets are
@ -32,7 +36,12 @@ def project_assets_cli(
DOCS: https://spacy.io/api/cli#project-assets
"""
overrides = parse_config_overrides(ctx.args)
project_assets(project_dir, overrides=overrides, sparse_checkout=sparse_checkout)
project_assets(
project_dir,
overrides=overrides,
sparse_checkout=sparse_checkout,
extra=extra,
)
def project_assets(
@ -40,17 +49,29 @@ def project_assets(
*,
overrides: Dict[str, Any] = SimpleFrozenDict(),
sparse_checkout: bool = False,
extra: bool = False,
) -> None:
"""Fetch assets for a project using DVC if possible.
project_dir (Path): Path to project directory.
sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files
needed.
extra (bool): Whether to download all assets, including those marked as 'extra'.
"""
project_path = ensure_path(project_dir)
config = load_project_config(project_path, overrides=overrides)
assets = config.get("assets", {})
assets = [
asset
for asset in config.get("assets", [])
if extra or not asset.get("extra", EXTRA_DEFAULT)
]
if not assets:
msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
msg.warn(
f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
exits=0,
)
msg.info(f"Fetching {len(assets)} asset(s)")
for asset in assets:
dest = (project_dir / asset["dest"]).resolve()
checksum = asset.get("checksum")
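For reference, a minimal sketch of exercising the new flag from Python (the project path is a placeholder; the CLI equivalent is python -m spacy project assets ./my_project --extra):

from pathlib import Path
from spacy.cli.project.assets import project_assets

# Fetch everything, including assets marked with `extra: true` in project.yml
project_assets(Path("./my_project"), extra=True)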
@ -168,7 +189,11 @@ def convert_asset_url(url: str) -> str:
RETURNS (str): The converted URL.
"""
# If the asset URL is a regular GitHub URL it's likely a mistake
if re.match(r"(http(s?)):\/\/github.com", url) and "releases/download" not in url:
if (
re.match(r"(http(s?)):\/\/github.com", url)
and "releases/download" not in url
and "/raw/" not in url
):
converted = url.replace("github.com", "raw.githubusercontent.com")
converted = re.sub(r"/(tree|blob)/", "/", converted)
msg.warn(

View File

@ -7,11 +7,11 @@ import re
from ... import about
from ...util import ensure_path
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
from .._util import git_checkout, get_git_version
from .._util import git_checkout, get_git_version, git_repo_branch_exists
DEFAULT_REPO = about.__projects__
DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
DEFAULT_BRANCH = "master"
DEFAULT_BRANCHES = ["main", "master"]
@project_cli.command("clone")
@ -20,7 +20,7 @@ def project_clone_cli(
name: str = Arg(..., help="The name of the template to clone"),
dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
branch: Optional[str] = Opt(None, "--branch", "-b", help="The branch to clone from"),
branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v2.22+.")
# fmt: on
):
@ -33,9 +33,25 @@ def project_clone_cli(
"""
if dest is None:
dest = Path.cwd() / Path(name).parts[-1]
if repo == DEFAULT_REPO and branch is None:
branch = DEFAULT_PROJECTS_BRANCH
if branch is None:
# If it's a user repo, we want to default to other branch
branch = DEFAULT_PROJECTS_BRANCH if repo == DEFAULT_REPO else DEFAULT_BRANCH
for default_branch in DEFAULT_BRANCHES:
if git_repo_branch_exists(repo, default_branch):
branch = default_branch
break
if branch is None:
default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES)
msg.fail(
"No branch provided and attempted default "
f"branches {default_branches_msg} do not exist.",
exits=1,
)
else:
if not git_repo_branch_exists(repo, branch):
msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1)
assert isinstance(branch, str)
project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout)
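A rough standalone sketch of the kind of check `git_repo_branch_exists` performs (the helper added in `_util` may differ in detail; `git ls-remote <repo> <branch>` only prints a ref line when the branch exists on the remote):

import subprocess

def branch_exists(repo: str, branch: str) -> bool:
    # A non-empty ls-remote result means the branch exists on the remote
    out = subprocess.run(
        ["git", "ls-remote", repo, branch],
        capture_output=True, text=True, check=True,
    )
    return bool(out.stdout.strip())

for candidate in ["main", "master"]:
    if branch_exists("https://github.com/explosion/projects", candidate):
        print(f"Would clone from branch '{candidate}'")
        break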
@ -61,9 +77,9 @@ def project_clone(
try:
git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
except subprocess.CalledProcessError:
err = f"Could not clone '{name}' from repo '{repo_name}'"
err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')"
msg.fail(err, exits=1)
msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir)
if not (project_dir / PROJECT_FILE).exists():
msg.warn(f"No {PROJECT_FILE} found in directory")
else:

View File

@ -25,6 +25,7 @@ def project_update_dvc_cli(
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
# fmt: on
):
@ -36,7 +37,7 @@ def project_update_dvc_cli(
DOCS: https://spacy.io/api/cli#project-dvc
"""
project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
def project_update_dvc(
@ -44,6 +45,7 @@ def project_update_dvc(
workflow: Optional[str] = None,
*,
verbose: bool = False,
quiet: bool = False,
force: bool = False,
) -> None:
"""Update the auto-generated Data Version Control (DVC) config file. A DVC
@ -54,11 +56,12 @@ def project_update_dvc(
workflow (Optional[str]): Optional name of workflow defined in project.yml.
If not set, the first workflow will be used.
verbose (bool): Print more info.
quiet (bool): Print less info.
force (bool): Force update DVC config.
"""
config = load_project_config(project_dir)
updated = update_dvc_config(
project_dir, config, workflow, verbose=verbose, force=force
project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
)
help_msg = "To execute the workflow with DVC, run: dvc repro"
if updated:
@ -72,7 +75,7 @@ def update_dvc_config(
config: Dict[str, Any],
workflow: Optional[str] = None,
verbose: bool = False,
silent: bool = False,
quiet: bool = False,
force: bool = False,
) -> bool:
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
@ -83,7 +86,7 @@ def update_dvc_config(
path (Path): The path to the project directory.
config (Dict[str, Any]): The loaded project.yml.
verbose (bool): Whether to print additional info (via DVC).
silent (bool): Don't output anything (via DVC).
quiet (bool): Don't output anything (via DVC).
force (bool): Force update, even if hashes match.
RETURNS (bool): Whether the DVC config file was updated.
"""
@ -105,6 +108,14 @@ def update_dvc_config(
dvc_config_path.unlink()
dvc_commands = []
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
# some flags that apply to every command
flags = []
if verbose:
flags.append("--verbose")
if quiet:
flags.append("--quiet")
for name in workflows[workflow]:
command = config_commands[name]
deps = command.get("deps", [])
@ -118,14 +129,26 @@ def update_dvc_config(
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
if command.get("no_skip"):
dvc_cmd.append("--always-changed")
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
dvc_commands.append(join_command(full_cmd))
if not dvc_commands:
# If we don't check for this, then there will be an error when reading the
# config, since DVC wouldn't create it.
msg.fail(
"No usable commands for DVC found. This can happen if none of your "
"commands have dependencies or outputs.",
exits=1,
)
with working_dir(path):
dvc_flags = {"--verbose": verbose, "--quiet": silent}
run_dvc_commands(dvc_commands, flags=dvc_flags)
for c in dvc_commands:
dvc_command = "dvc " + c
run_command(dvc_command)
with dvc_config_path.open("r+", encoding="utf8") as f:
content = f.read()
f.seek(0, 0)
@ -133,26 +156,6 @@ def update_dvc_config(
return True
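A small usage sketch of the new flag (the project path is a placeholder); `--quiet` is simply forwarded to each generated `dvc run` call, as shown above:

from pathlib import Path
from spacy.cli.project.dvc import project_update_dvc

# CLI equivalent: python -m spacy project dvc ./my_project --quiet
project_update_dvc(Path("./my_project"), workflow=None, quiet=True)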
def run_dvc_commands(
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
) -> None:
"""Run a sequence of DVC commands in a subprocess, in order.
commands (List[str]): The string commands without the leading "dvc".
flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
easier to pass flags like --quiet that depend on a variable or
command-line setting while avoiding lots of nested conditionals.
"""
for c in commands:
command = split_command(c)
dvc_command = ["dvc", *command]
# Add the flags if they are set to True
for flag, is_active in flags.items():
if is_active:
dvc_command.append(flag)
run_command(dvc_command)
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
"""Validate workflows provided in project.yml and check that a given
workflow can be used to generate a DVC config.

View File

@ -5,14 +5,17 @@ import hashlib
import urllib.parse
import tarfile
from pathlib import Path
from wasabi import msg
from .._util import get_hash, get_checksum, download_file, ensure_pathy
from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
from .._util import get_hash, get_checksum, upload_file, download_file
from .._util import ensure_pathy, make_tempdir
from ...util import get_minor_version, ENV_VARS, check_bool_env_var
from ...git_info import GIT_VERSION
from ... import about
from ...errors import Errors
if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
from pathy import FluidPath # noqa: F401
class RemoteStorage:
@ -27,7 +30,7 @@ class RemoteStorage:
self.url = ensure_pathy(url)
self.compression = compression
def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
"""Compress a file or directory within a project and upload it to a remote
storage. If an object exists at the full URL, nothing is done.
@ -48,9 +51,7 @@ class RemoteStorage:
mode_string = f"w:{self.compression}" if self.compression else "w"
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
tar_file.add(str(loc), arcname=str(path))
with tar_loc.open(mode="rb") as input_file:
with url.open(mode="wb") as output_file:
output_file.write(input_file.read())
upload_file(tar_loc, url)
return url
def pull(
@ -59,7 +60,7 @@ class RemoteStorage:
*,
command_hash: Optional[str] = None,
content_hash: Optional[str] = None,
) -> Optional["Pathy"]:
) -> Optional["FluidPath"]:
"""Retrieve a file from the remote cache. If the file already exists,
nothing is done.
@ -84,7 +85,23 @@ class RemoteStorage:
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
# This requires that the path is added correctly, relative
# to root. This is how we set things up in push()
tar_file.extractall(self.root)
# Disallow paths outside the current directory for the tar
# file (CVE-2007-4559, directory traversal vulnerability)
def is_within_directory(directory, target):
abs_directory = os.path.abspath(directory)
abs_target = os.path.abspath(target)
prefix = os.path.commonprefix([abs_directory, abs_target])
return prefix == abs_directory
def safe_extract(tar, path):
for member in tar.getmembers():
member_path = os.path.join(path, member.name)
if not is_within_directory(path, member_path):
raise ValueError(Errors.E852)
tar.extractall(path)
safe_extract(tar_file, self.root)
return url
def find(
@ -93,25 +110,37 @@ class RemoteStorage:
*,
command_hash: Optional[str] = None,
content_hash: Optional[str] = None,
) -> Optional["Pathy"]:
) -> Optional["FluidPath"]:
"""Find the best matching version of a file within the storage,
or `None` if no match can be found. If both the creation and content hash
are specified, only exact matches will be returned. Otherwise, the most
recent matching file is preferred.
"""
name = self.encode_name(str(path))
urls = []
if command_hash is not None and content_hash is not None:
url = self.make_url(path, command_hash, content_hash)
url = self.url / name / command_hash / content_hash
urls = [url] if url.exists() else []
elif command_hash is not None:
urls = list((self.url / name / command_hash).iterdir())
if (self.url / name / command_hash).exists():
urls = list((self.url / name / command_hash).iterdir())
else:
urls = list((self.url / name).iterdir())
if content_hash is not None:
urls = [url for url in urls if url.parts[-1] == content_hash]
if (self.url / name).exists():
for sub_dir in (self.url / name).iterdir():
urls.extend(sub_dir.iterdir())
if content_hash is not None:
urls = [url for url in urls if url.parts[-1] == content_hash]
if len(urls) >= 2:
try:
urls.sort(key=lambda x: x.stat().last_modified) # type: ignore
except Exception:
msg.warn(
"Unable to sort remote files by last modified. The file(s) "
"pulled from the cache may not be the most recent."
)
return urls[-1] if urls else None
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy":
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
"""Construct a URL from a subpath, a creation hash and a content hash."""
return self.url / self.encode_name(str(path)) / command_hash / content_hash

View File

@ -1,5 +1,8 @@
from typing import Optional, List, Dict, Sequence, Any, Iterable
from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
import os.path
from pathlib import Path
import pkg_resources
from wasabi import msg
from wasabi.util import locale_escape
import sys
@ -50,6 +53,7 @@ def project_run(
force: bool = False,
dry: bool = False,
capture: bool = False,
skip_requirements_check: bool = False,
) -> None:
"""Run a named script defined in the project.yml. If the script is part
of the default pipeline (defined in the "run" section), DVC is used to
@ -66,11 +70,19 @@ def project_run(
sys.exit will be called with the return code. You should use capture=False
when you want to turn over execution to the command, and capture=True
when you want to run the command more like a function.
skip_requirements_check (bool): Whether to skip the requirements check.
"""
config = load_project_config(project_dir, overrides=overrides)
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
workflows = config.get("workflows", {})
validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
req_path = project_dir / "requirements.txt"
if not skip_requirements_check:
if config.get("check_requirements", True) and os.path.exists(req_path):
with req_path.open() as requirements_file:
_check_requirements([req.strip() for req in requirements_file])
if subcommand in workflows:
msg.info(f"Running workflow '{subcommand}'")
for cmd in workflows[subcommand]:
@ -81,6 +93,7 @@ def project_run(
force=force,
dry=dry,
capture=capture,
skip_requirements_check=True,
)
else:
cmd = commands[subcommand]
@ -88,8 +101,8 @@ def project_run(
if not (project_dir / dep).exists():
err = f"Missing dependency specified by command '{subcommand}': {dep}"
err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
err_kwargs = {"exits": 1} if not dry else {}
msg.fail(err, err_help, **err_kwargs)
err_exits = 1 if not dry else None
msg.fail(err, err_help, exits=err_exits)
check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
with working_dir(project_dir) as current_dir:
msg.divider(subcommand)
@ -195,6 +208,8 @@ def validate_subcommand(
msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
if subcommand not in commands and subcommand not in workflows:
help_msg = []
if subcommand in ["assets", "asset"]:
help_msg.append("Did you mean to run: python -m spacy project assets?")
if commands:
help_msg.append(f"Available commands: {', '.join(commands)}")
if workflows:
@ -308,3 +323,38 @@ def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional
md5 = get_checksum(file_path) if file_path.exists() else None
data.append({"path": path, "md5": md5})
return data
def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
"""Checks whether requirements are installed and free of version conflicts.
requirements (List[str]): List of requirements.
RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
exist.
"""
failed_pkgs_msgs: List[str] = []
conflicting_pkgs_msgs: List[str] = []
for req in requirements:
try:
pkg_resources.require(req)
except pkg_resources.DistributionNotFound as dnf:
failed_pkgs_msgs.append(dnf.report())
except pkg_resources.VersionConflict as vc:
conflicting_pkgs_msgs.append(vc.report())
except Exception:
msg.warn(
f"Unable to check requirement: {req} "
"Checks are currently limited to requirement specifiers "
"(PEP 508)"
)
if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs):
msg.warn(
title="Missing requirements or requirement conflicts detected. Make sure your Python environment is set up "
"correctly and you installed all requirements specified in your project's requirements.txt: "
)
for pkg_msg in failed_pkgs_msgs + conflicting_pkgs_msgs:
msg.text(pkg_msg)
return len(failed_pkgs_msgs) > 0, len(conflicting_pkgs_msgs) > 0
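The check boils down to one `pkg_resources.require` call per requirement line; a self-contained illustration (the requirement strings are examples):

import pkg_resources

for req in ["spacy>=3.0.0", "not-a-real-package==1.0"]:
    try:
        pkg_resources.require(req)
        print(f"OK: {req}")
    except pkg_resources.DistributionNotFound as dnf:
        print(f"Missing: {dnf.report()}")
    except pkg_resources.VersionConflict as vc:
        print(f"Conflict: {vc.report()}")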

View File

@ -1,8 +1,9 @@
{# This is a template for training configs used for the quickstart widget in
the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #}
{%- set use_transformer = hardware != "cpu" -%}
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
[paths]
train = null
dev = null
@ -24,10 +25,10 @@ lang = "{{ lang }}"
{%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
{%- set with_accuracy = optimize == "accuracy" -%}
{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "entity_linker" in components or has_accurate_textcat) -%}
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%}
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
{%- else -%}
{%- set full_pipeline = components %}
{%- set full_pipeline = components -%}
{%- endif %}
pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }}
batch_size = {{ 128 if hardware == "gpu" else 1000 }}
@ -54,7 +55,7 @@ stride = 96
factory = "morphologizer"
[components.morphologizer.model]
@architectures = "spacy.Tagger.v1"
@architectures = "spacy.Tagger.v2"
nO = null
[components.morphologizer.model.tok2vec]
@ -70,7 +71,7 @@ grad_factor = 1.0
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v1"
@architectures = "spacy.Tagger.v2"
nO = null
[components.tagger.model.tok2vec]
@ -123,6 +124,60 @@ grad_factor = 1.0
@layers = "reduce_mean.v1"
{% endif -%}
{% if "spancat" in components -%}
[components.spancat]
factory = "spancat"
max_positive = null
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
spans_key = "sc"
threshold = 0.5
[components.spancat.model]
@architectures = "spacy.SpanCategorizer.v1"
[components.spancat.model.reducer]
@layers = "spacy.mean_max_reducer.v1"
hidden_size = 128
[components.spancat.model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = null
nI = null
[components.spancat.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.spancat.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
[components.spancat.suggester]
@misc = "spacy.ngram_suggester.v1"
sizes = [1,2,3]
{% endif -%}
{% if "trainable_lemmatizer" in components -%}
[components.trainable_lemmatizer]
factory = "trainable_lemmatizer"
backoff = "orth"
min_tree_freq = 3
overwrite = false
scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"}
top_k = 1
[components.trainable_lemmatizer.model]
@architectures = "spacy.Tagger.v2"
nO = null
normalize = false
[components.trainable_lemmatizer.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.trainable_lemmatizer.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{% endif -%}
{% if "entity_linker" in components -%}
[components.entity_linker]
factory = "entity_linker"
@ -131,7 +186,7 @@ incl_context = true
incl_prior = true
[components.entity_linker.model]
@architectures = "spacy.EntityLinker.v1"
@architectures = "spacy.EntityLinker.v2"
nO = null
[components.entity_linker.model.tok2vec]
@ -216,13 +271,8 @@ factory = "tok2vec"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
{% if has_letters -%}
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
rows = [5000, 2500, 2500, 2500]
{% else -%}
attrs = ["ORTH", "SHAPE"]
rows = [5000, 2500]
{% endif -%}
rows = [5000, 1000, 2500, 2500]
include_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
[components.tok2vec.model.encode]
@ -238,7 +288,7 @@ maxout_pieces = 3
factory = "morphologizer"
[components.morphologizer.model]
@architectures = "spacy.Tagger.v1"
@architectures = "spacy.Tagger.v2"
nO = null
[components.morphologizer.model.tok2vec]
@ -251,7 +301,7 @@ width = ${components.tok2vec.model.encode.width}
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v1"
@architectures = "spacy.Tagger.v2"
nO = null
[components.tagger.model.tok2vec]
@ -295,6 +345,54 @@ nO = null
width = ${components.tok2vec.model.encode.width}
{% endif %}
{% if "spancat" in components %}
[components.spancat]
factory = "spancat"
max_positive = null
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
spans_key = "sc"
threshold = 0.5
[components.spancat.model]
@architectures = "spacy.SpanCategorizer.v1"
[components.spancat.model.reducer]
@layers = "spacy.mean_max_reducer.v1"
hidden_size = 128
[components.spancat.model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = null
nI = null
[components.spancat.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
[components.spancat.suggester]
@misc = "spacy.ngram_suggester.v1"
sizes = [1,2,3]
{% endif %}
{% if "trainable_lemmatizer" in components -%}
[components.trainable_lemmatizer]
factory = "trainable_lemmatizer"
backoff = "orth"
min_tree_freq = 3
overwrite = false
scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"}
top_k = 1
[components.trainable_lemmatizer.model]
@architectures = "spacy.Tagger.v2"
nO = null
normalize = false
[components.trainable_lemmatizer.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
{% endif -%}
{% if "entity_linker" in components -%}
[components.entity_linker]
factory = "entity_linker"
@ -303,7 +401,7 @@ incl_context = true
incl_prior = true
[components.entity_linker.model]
@architectures = "spacy.EntityLinker.v1"
@architectures = "spacy.EntityLinker.v2"
nO = null
[components.entity_linker.model.tok2vec]
@ -369,7 +467,7 @@ no_output_layer = false
{% endif %}
{% for pipe in components %}
{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker"] %}
{% if pipe not in listener_components %}
{# Other components defined by the user: we just assume they're factories #}
[components.{{ pipe }}]
factory = "{{ pipe }}"

View File

@ -37,6 +37,15 @@ bn:
accuracy:
name: sagorsarker/bangla-bert-base
size_factor: 3
ca:
word_vectors: null
transformer:
efficiency:
name: projecte-aina/roberta-base-ca-v2
size_factor: 3
accuracy:
name: projecte-aina/roberta-base-ca-v2
size_factor: 3
da:
word_vectors: da_core_news_lg
transformer:
@ -271,4 +280,3 @@ zh:
accuracy:
name: bert-base-chinese
size_factor: 3
has_letters: false

View File

@ -90,6 +90,8 @@ dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
# Optional callback before nlp object is saved to disk after training
before_to_disk = null
# Optional callback that is invoked at the start of each training step
before_update = null
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
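For reference, a sketch of how such a callback can be registered (the registry name is made up, and the payload passed to the callback is assumed to be a dict carrying the current step and epoch):

import spacy

@spacy.registry.callbacks("my_before_update.v1")
def create_before_update():
    def before_update(nlp, info):
        # e.g. unfreeze a component once training reaches a certain step
        print("starting step", info.get("step"))
    return before_update

# In the config:
# [training.before_update]
# @callbacks = "my_before_update.v1"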

View File

@ -7,10 +7,11 @@ USAGE: https://spacy.io/usage/visualizers
from typing import Union, Iterable, Optional, Dict, Any, Callable
import warnings
from .render import DependencyRenderer, EntityRenderer
from .render import DependencyRenderer, EntityRenderer, SpanRenderer
from ..tokens import Doc, Span
from ..errors import Errors, Warnings
from ..util import is_in_jupyter
from ..util import find_available_port
_html = {}
@ -36,7 +37,7 @@ def render(
jupyter (bool): Override Jupyter auto-detection.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
RETURNS (str): Rendered HTML markup.
RETURNS (str): Rendered SVG or HTML markup.
DOCS: https://spacy.io/api/top-level#displacy.render
USAGE: https://spacy.io/usage/visualizers
@ -44,6 +45,7 @@ def render(
factories = {
"dep": (DependencyRenderer, parse_deps),
"ent": (EntityRenderer, parse_ents),
"span": (SpanRenderer, parse_spans),
}
if style not in factories:
raise ValueError(Errors.E087.format(style=style))
@ -55,6 +57,10 @@ def render(
renderer_func, converter = factories[style]
renderer = renderer_func(options=options)
parsed = [converter(doc, options) for doc in docs] if not manual else docs # type: ignore
if manual:
for doc in docs:
if isinstance(doc, dict) and "ents" in doc:
doc["ents"] = sorted(doc["ents"], key=lambda x: (x["start"], x["end"]))
_html["parsed"] = renderer.render(parsed, page=page, minify=minify).strip() # type: ignore
html = _html["parsed"]
if RENDER_WRAPPER is not None:
@ -77,6 +83,7 @@ def serve(
manual: bool = False,
port: int = 5000,
host: str = "0.0.0.0",
auto_select_port: bool = False,
) -> None:
"""Serve displaCy visualisation.
@ -88,12 +95,15 @@ def serve(
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
port (int): Port to serve visualisation.
host (str): Host to serve visualisation.
auto_select_port (bool): Automatically select a port if the specified port is in use.
DOCS: https://spacy.io/api/top-level#displacy.serve
USAGE: https://spacy.io/usage/visualizers
"""
from wsgiref import simple_server
port = find_available_port(port, host, auto_select_port)
if is_in_jupyter():
warnings.warn(Warnings.W011)
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
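A usage sketch of the new flag (the pipeline name is a placeholder for any installed model; run this outside Jupyter):

import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("If port 5000 is busy, displaCy picks the next free one.")
displacy.serve(doc, style="ent", port=5000, auto_select_port=True)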
@ -118,7 +128,8 @@ def app(environ, start_response):
def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
"""Generate dependency parse in {'words': [], 'arcs': []} format.
doc (Doc): Document do parse.
orig_doc (Doc): Document to parse.
options (Dict[str, Any]): Dependency parse specific visualisation options.
RETURNS (dict): Generated dependency parse keyed by words and arcs.
"""
doc = Doc(orig_doc.vocab).from_bytes(
@ -203,6 +214,43 @@ def parse_ents(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
return {"text": doc.text, "ents": ents, "title": title, "settings": settings}
def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
"""Generate spans in [{start_token: i, end_token: i, label: 'label'}] format.
doc (Doc): Document to parse.
options (Dict[str, any]): Span-specific visualisation options.
RETURNS (dict): Generated span types keyed by text (original text) and spans.
"""
kb_url_template = options.get("kb_url_template", None)
spans_key = options.get("spans_key", "sc")
spans = [
{
"start": span.start_char,
"end": span.end_char,
"start_token": span.start,
"end_token": span.end,
"label": span.label_,
"kb_id": span.kb_id_ if span.kb_id_ else "",
"kb_url": kb_url_template.format(span.kb_id_) if kb_url_template else "#",
}
for span in doc.spans.get(spans_key, [])
]
tokens = [token.text for token in doc]
if not spans:
keys = list(doc.spans.keys())
warnings.warn(Warnings.W117.format(spans_key=spans_key, keys=keys))
title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
settings = get_doc_settings(doc)
return {
"text": doc.text,
"spans": spans,
"title": title,
"settings": settings,
"tokens": tokens,
}
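A short sketch of both entry points for the new span style (the example text and spans are made up; the manual dict mirrors the structure `parse_spans` produces):

import spacy
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Welcome to the Bank of China.")
doc.spans["sc"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")]
html = displacy.render(doc, style="span")

# Manual input skips parsing and uses the same keys as parse_spans:
manual = {
    "text": "Welcome to the Bank of China.",
    "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."],
    "spans": [{"start_token": 3, "end_token": 6, "start": 15, "end": 28, "label": "ORG"}],
}
html_manual = displacy.render(manual, style="span", manual=True)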
def set_render_wrapper(func: Callable[[str], str]) -> None:
"""Set an optional wrapper function that is called around the generated
HTML markup on displacy.render. This can be used to allow integration into

View File

@ -1,12 +1,15 @@
from typing import Dict, Any, List, Optional, Union
from typing import Any, Dict, List, Optional, Tuple, Union
import uuid
import itertools
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
from .templates import TPL_ENTS, TPL_KB_LINK
from ..util import minify_html, escape_html, registry
from ..errors import Errors
from ..util import escape_html, minify_html, registry
from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS
from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS
from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN
from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL
from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS
from .templates import TPL_TITLE
DEFAULT_LANG = "en"
DEFAULT_DIR = "ltr"
@ -33,6 +36,224 @@ DEFAULT_LABEL_COLORS = {
}
class SpanRenderer:
"""Render Spans as SVGs."""
style = "span"
def __init__(self, options: Dict[str, Any] = {}) -> None:
"""Initialise span renderer
options (dict): Visualiser-specific options (colors, spans)
"""
# Set up the colors and overall look
colors = dict(DEFAULT_LABEL_COLORS)
user_colors = registry.displacy_colors.get_all()
for user_color in user_colors.values():
if callable(user_color):
# Since this comes from the function registry, we want to make
# sure we support functions that *return* a dict of colors
user_color = user_color()
if not isinstance(user_color, dict):
raise ValueError(Errors.E925.format(obj=type(user_color)))
colors.update(user_color)
colors.update(options.get("colors", {}))
self.default_color = DEFAULT_ENTITY_COLOR
self.colors = {label.upper(): color for label, color in colors.items()}
# Set up how the text and labels will be rendered
self.direction = DEFAULT_DIR
self.lang = DEFAULT_LANG
# These values are in px
self.top_offset = options.get("top_offset", 40)
# This is how far under the top offset the span labels appear
self.span_label_offset = options.get("span_label_offset", 20)
self.offset_step = options.get("top_offset_step", 17)
# Set up which templates will be used
template = options.get("template")
if template:
self.span_template = template["span"]
self.span_slice_template = template["slice"]
self.span_start_template = template["start"]
else:
if self.direction == "rtl":
self.span_template = TPL_SPAN_RTL
self.span_slice_template = TPL_SPAN_SLICE_RTL
self.span_start_template = TPL_SPAN_START_RTL
else:
self.span_template = TPL_SPAN
self.span_slice_template = TPL_SPAN_SLICE
self.span_start_template = TPL_SPAN_START
def render(
self, parsed: List[Dict[str, Any]], page: bool = False, minify: bool = False
) -> str:
"""Render complete markup.
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
RETURNS (str): Rendered SVG or HTML markup.
"""
rendered = []
for i, p in enumerate(parsed):
if i == 0:
settings = p.get("settings", {})
self.direction = settings.get("direction", DEFAULT_DIR)
self.lang = settings.get("lang", DEFAULT_LANG)
rendered.append(self.render_spans(p["tokens"], p["spans"], p.get("title")))
if page:
docs = "".join([TPL_FIGURE.format(content=doc) for doc in rendered])
markup = TPL_PAGE.format(content=docs, lang=self.lang, dir=self.direction)
else:
markup = "".join(rendered)
if minify:
return minify_html(markup)
return markup
def render_spans(
self,
tokens: List[str],
spans: List[Dict[str, Any]],
title: Optional[str],
) -> str:
"""Render span types in text.
Spans are rendered per token: for each token, we check whether it is part
of a span slice (a member of a span type) or a span start (the starting token of a
given span type).
tokens (list): Individual tokens in the text
spans (list): Individual entity spans and their start, end, label, kb_id and kb_url.
title (str / None): Document title set in Doc.user_data['title'].
"""
per_token_info = []
# we must sort so that we can correctly describe when spans need to "stack"
# which is determined by their start token, then span length (longer spans on top),
# then break any remaining ties with the span label
spans = sorted(
spans,
key=lambda s: (
s["start_token"],
-(s["end_token"] - s["start_token"]),
s["label"],
),
)
for s in spans:
# this is the vertical 'slot' that the span will be rendered in
# vertical_position = span_label_offset + (offset_step * (slot - 1))
s["render_slot"] = 0
for idx, token in enumerate(tokens):
# Identify if a token belongs to a Span (and which) and if it's a
# start token of said Span. We'll use this for the final HTML render
token_markup: Dict[str, Any] = {}
token_markup["text"] = token
concurrent_spans = 0
entities = []
for span in spans:
ent = {}
if span["start_token"] <= idx < span["end_token"]:
concurrent_spans += 1
span_start = idx == span["start_token"]
ent["label"] = span["label"]
ent["is_start"] = span_start
if span_start:
# When the span starts, we need to know how many other
# spans are on the 'span stack' and will be rendered.
# This value becomes the vertical render slot for this entire span
span["render_slot"] = concurrent_spans
ent["render_slot"] = span["render_slot"]
kb_id = span.get("kb_id", "")
kb_url = span.get("kb_url", "#")
ent["kb_link"] = (
TPL_KB_LINK.format(kb_id=kb_id, kb_url=kb_url) if kb_id else ""
)
entities.append(ent)
else:
# We don't specifically need to do this since we loop
# over tokens and spans sorted by their start_token,
# so we'll never use a span again after the last token it appears in,
# but if we were to use these spans again we'd want to make sure
# this value was reset correctly.
span["render_slot"] = 0
token_markup["entities"] = entities
per_token_info.append(token_markup)
markup = self._render_markup(per_token_info)
markup = TPL_SPANS.format(content=markup, dir=self.direction)
if title:
markup = TPL_TITLE.format(title=title) + markup
return markup
def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str:
"""Render the markup from per-token information"""
markup = ""
for token in per_token_info:
entities = sorted(token["entities"], key=lambda d: d["render_slot"])
# Whitespace tokens disrupt the vertical space (no line height) so that the
# span indicators get misaligned. We don't render them as individual
# tokens anyway, so we'll just not display a span indicator either.
is_whitespace = token["text"].strip() == ""
if entities and not is_whitespace:
slices = self._get_span_slices(token["entities"])
starts = self._get_span_starts(token["entities"])
total_height = (
self.top_offset
+ self.span_label_offset
+ (self.offset_step * (len(entities) - 1))
)
markup += self.span_template.format(
text=token["text"],
span_slices=slices,
span_starts=starts,
total_height=total_height,
)
else:
markup += escape_html(token["text"] + " ")
return markup
def _get_span_slices(self, entities: List[Dict]) -> str:
"""Get the rendered markup of all Span slices"""
span_slices = []
for entity in entities:
# rather than iterate over multiples of offset_step, we use entity['render_slot']
# to determine the vertical position, since that tells where
# the span starts vertically so we can extend it horizontally,
# past other spans that might have already ended
color = self.colors.get(entity["label"].upper(), self.default_color)
top_offset = self.top_offset + (
self.offset_step * (entity["render_slot"] - 1)
)
span_slice = self.span_slice_template.format(
bg=color,
top_offset=top_offset,
)
span_slices.append(span_slice)
return "".join(span_slices)
def _get_span_starts(self, entities: List[Dict]) -> str:
"""Get the rendered markup of all Span start tokens"""
span_starts = []
for entity in entities:
color = self.colors.get(entity["label"].upper(), self.default_color)
top_offset = self.top_offset + (
self.offset_step * (entity["render_slot"] - 1)
)
span_start = (
self.span_start_template.format(
bg=color,
top_offset=top_offset,
label=entity["label"],
kb_link=entity["kb_link"],
)
if entity["is_start"]
else ""
)
span_starts.append(span_start)
return "".join(span_starts)
class DependencyRenderer:
"""Render dependency parses as SVGs."""
@ -105,7 +326,7 @@ class DependencyRenderer:
RETURNS (str): Rendered SVG markup.
"""
self.levels = self.get_levels(arcs)
self.highest_level = len(self.levels)
self.highest_level = max(self.levels.values(), default=0)
self.offset_y = self.distance / 2 * self.highest_level + self.arrow_stroke
self.width = self.offset_x + len(words) * self.distance
self.height = self.offset_y + 3 * self.word_spacing
@ -165,7 +386,7 @@ class DependencyRenderer:
if start < 0 or end < 0:
error_args = dict(start=start, end=end, label=label, dir=direction)
raise ValueError(Errors.E157.format(**error_args))
level = self.levels.index(end - start) + 1
level = self.levels[(start, end, label)]
x_start = self.offset_x + start * self.distance + self.arrow_spacing
if self.direction == "rtl":
x_start = self.width - x_start
@ -181,7 +402,7 @@ class DependencyRenderer:
y_curve = self.offset_y - level * self.distance / 2
if self.compact:
y_curve = self.offset_y - level * self.distance / 6
if y_curve == 0 and len(self.levels) > 5:
if y_curve == 0 and max(self.levels.values(), default=0) > 5:
y_curve = -self.distance
arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
arc = self.get_arc(x_start, y, y_curve, x_end)
@ -225,15 +446,23 @@ class DependencyRenderer:
p1, p2, p3 = (end, end + self.arrow_width - 2, end - self.arrow_width + 2)
return f"M{p1},{y + 2} L{p2},{y - self.arrow_width} {p3},{y - self.arrow_width}"
def get_levels(self, arcs: List[Dict[str, Any]]) -> List[int]:
def get_levels(self, arcs: List[Dict[str, Any]]) -> Dict[Tuple[int, int, str], int]:
"""Calculate available arc height "levels".
Used to calculate arrow heights dynamically and without wasting space.
arcs (list): Individual arcs and their start, end, direction and label.
RETURNS (list): Arc levels sorted from lowest to highest.
RETURNS (dict): Arc levels keyed by (start, end, label).
"""
levels = set(map(lambda arc: arc["end"] - arc["start"], arcs))
return sorted(list(levels))
arcs = [dict(t) for t in {tuple(sorted(arc.items())) for arc in arcs}]
length = max([arc["end"] for arc in arcs], default=0)
max_level = [0] * length
levels = {}
for arc in sorted(arcs, key=lambda arc: arc["end"] - arc["start"]):
level = max(max_level[arc["start"] : arc["end"]]) + 1
for i in range(arc["start"], arc["end"]):
max_level[i] = level
levels[(arc["start"], arc["end"], arc["label"])] = level
return levels
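A worked example of the assignment above (standalone, reusing the same logic): shorter arcs claim the lowest free level over their token range, and an enclosing arc is pushed one level higher.

arcs = [
    {"start": 0, "end": 1, "label": "det"},
    {"start": 0, "end": 2, "label": "nsubj"},
    {"start": 2, "end": 3, "label": "dobj"},
]
length = max(arc["end"] for arc in arcs)
max_level = [0] * length
levels = {}
for arc in sorted(arcs, key=lambda a: a["end"] - a["start"]):
    level = max(max_level[arc["start"] : arc["end"]]) + 1
    for i in range(arc["start"], arc["end"]):
        max_level[i] = level
    levels[(arc["start"], arc["end"], arc["label"])] = level
print(levels)
# {(0, 1, 'det'): 1, (2, 3, 'dobj'): 1, (0, 2, 'nsubj'): 2}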
class EntityRenderer:
@ -242,7 +471,7 @@ class EntityRenderer:
style = "ent"
def __init__(self, options: Dict[str, Any] = {}) -> None:
"""Initialise dependency renderer.
"""Initialise entity renderer.
options (dict): Visualiser-specific options (colors, ents)
"""
@ -281,7 +510,7 @@ class EntityRenderer:
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
RETURNS (str): Rendered HTML markup.
RETURNS (str): Rendered SVG or HTML markup.
"""
rendered = []
for i, p in enumerate(parsed):

View File

@ -62,6 +62,55 @@ TPL_ENT_RTL = """
</mark>
"""
TPL_SPANS = """
<div class="spans" style="line-height: 2.5; direction: {dir}">{content}</div>
"""
TPL_SPAN = """
<span style="font-weight: bold; display: inline-block; position: relative; height: {total_height}px;">
{text}
{span_slices}
{span_starts}
</span>
"""
TPL_SPAN_SLICE = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
</span>
"""
TPL_SPAN_START = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
<span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
{label}{kb_link}
</span>
</span>
"""
TPL_SPAN_RTL = """
<span style="font-weight: bold; display: inline-block; position: relative;">
{text}
{span_slices}
{span_starts}
</span>
"""
TPL_SPAN_SLICE_RTL = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
</span>
"""
TPL_SPAN_START_RTL = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
<span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
{label}{kb_link}
</span>
</span>
"""
# Important: this needs to start with a space!
TPL_KB_LINK = """
<a style="text-decoration: none; color: inherit; font-weight: normal" href="{kb_url}">{kb_id}</a>

View File

@ -1,4 +1,5 @@
import warnings
from .compat import Literal
class ErrorsWithCodes(type):
@ -15,8 +16,8 @@ def setup_default_warnings():
filter_warning("ignore", error_msg="numpy.dtype size changed") # noqa
filter_warning("ignore", error_msg="numpy.ufunc size changed") # noqa
# warn about entity_ruler & matcher having no patterns only once
for pipe in ["matcher", "entity_ruler"]:
# warn about entity_ruler, span_ruler & matcher having no patterns only once
for pipe in ["matcher", "entity_ruler", "span_ruler"]:
filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
# warn once about lemmatizer without required POS
@ -26,7 +27,10 @@ def setup_default_warnings():
filter_warning("once", error_msg="[W114]")
def filter_warning(action: str, error_msg: str):
def filter_warning(
action: Literal["default", "error", "ignore", "always", "module", "once"],
error_msg: str,
):
"""Customize how spaCy should handle a certain warning.
error_msg (str): e.g. "W006", or a full error message
@ -192,6 +196,25 @@ class Warnings(metaclass=ErrorsWithCodes):
W115 = ("Skipping {method}: the floret vector table cannot be modified. "
"Vectors are calculated from character ngrams.")
W116 = ("Unable to clean attribute '{attr}'.")
W117 = ("No spans to visualize found in Doc object with spans_key: '{spans_key}'. If this is "
"surprising to you, make sure the Doc was processed using a model "
"that supports span categorization, and check the `doc.spans[spans_key]` "
"property manually if necessary.\n\nAvailable keys: {keys}")
W118 = ("Term '{term}' not found in glossary. It may however be explained in documentation "
"for the corpora used to train the language. Please check "
"`nlp.meta[\"sources\"]` for any relevant links.")
W119 = ("Overriding pipe name in `config` is not supported. Ignoring override '{name_in_config}'.")
W120 = ("Unable to load all spans in Doc.spans: more than one span group "
"with the name '{group_name}' was found in the saved spans data. "
"Only the last span group will be loaded under "
"Doc.spans['{group_name}']. Skipping span group with values: "
"{group_values}")
W121 = ("Attempting to trace non-existent method '{method}' in pipe '{pipe}'")
W122 = ("Couldn't trace method '{method}' in pipe '{pipe}'. This can happen if the pipe class "
"is a Cython extension type.")
W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
class Errors(metaclass=ErrorsWithCodes):
@ -210,8 +233,9 @@ class Errors(metaclass=ErrorsWithCodes):
"initialized component.")
E004 = ("Can't set up pipeline component: a factory for '{name}' already "
"exists. Existing factory: {func}. New factory: {new_func}")
E005 = ("Pipeline component '{name}' returned None. If you're using a "
"custom component, maybe you forgot to return the processed Doc?")
E005 = ("Pipeline component '{name}' returned {returned_type} instead of a "
"Doc. If you're using a custom component, maybe you forgot to "
"return the processed Doc?")
E006 = ("Invalid constraints for adding pipeline component. You can only "
"set one of the following: before (component name or index), "
"after (component name or index), first (True) or last (True). "
@ -322,6 +346,11 @@ class Errors(metaclass=ErrorsWithCodes):
"clear the existing vectors and resize the table.")
E074 = ("Error interpreting compiled match pattern: patterns are expected "
"to end with the attribute {attr}. Got: {bad_attr}.")
E079 = ("Error computing states in beam: number of predicted beams "
"({pbeams}) does not equal number of gold beams ({gbeams}).")
E080 = ("Duplicate state found in beam: {key}.")
E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
"does not equal number of losses ({losses}).")
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
"match.")
@ -369,7 +398,7 @@ class Errors(metaclass=ErrorsWithCodes):
"consider using doc.spans instead.")
E106 = ("Can't find `doc._.{attr}` attribute specified in the underscore "
"settings: {opts}")
E107 = ("Value of `doc._.{attr}` is not JSON-serializable: {value}")
E107 = ("Value of custom attribute `{attr}` is not JSON-serializable: {value}")
E109 = ("Component '{name}' could not be run. Did you forget to "
"call `initialize()`?")
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
@ -437,10 +466,10 @@ class Errors(metaclass=ErrorsWithCodes):
"same, but found '{nlp}' and '{vocab}' respectively.")
E152 = ("The attribute {attr} is not supported for token patterns. "
"Please use the option `validate=True` with the Matcher, PhraseMatcher, "
"or EntityRuler for more details.")
"EntityRuler or AttributeRuler for more details.")
E153 = ("The value type {vtype} is not supported for token patterns. "
"Please use the option validate=True with Matcher, PhraseMatcher, "
"or EntityRuler for more details.")
"EntityRuler or AttributeRuler for more details.")
E154 = ("One of the attributes or values is not supported for token "
"patterns. Please use the option `validate=True` with the Matcher, "
"PhraseMatcher, or EntityRuler for more details.")
@ -515,15 +544,28 @@ class Errors(metaclass=ErrorsWithCodes):
E198 = ("Unable to return {n} most similar vectors for the current vectors "
"table, which contains {n_rows} vectors.")
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
E200 = ("Can't yet set {attr} from Span. Vote for this feature on the "
"issue tracker: http://github.com/explosion/spaCy/issues")
E200 = ("Can't set {attr} from Span.")
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
E203 = ("If the {name} embedding layer is not updated "
"during training, make sure to include it in 'annotating components'")
# New errors added in v3.x
E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
"but found value of '{val}'.")
E852 = ("The tar file pulled from the remote attempted an unsafe path "
"traversal.")
E853 = ("Unsupported component factory name '{name}'. The character '.' is "
"not permitted in factory names.")
E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
"permit overlapping spans.")
E855 = ("Invalid {obj}: {obj} is not from the same doc.")
E856 = ("Error accessing span at position {i}: out of bounds in span group "
"of length {length}.")
E857 = ("Entry '{name}' not found in edit tree lemmatizer labels.")
E858 = ("The {mode} vector table does not support this operation. "
"{alternative}")
E859 = ("The floret vector table cannot be modified.")
E860 = ("Can't truncate fasttext-bloom vectors.")
E860 = ("Can't truncate floret vectors.")
E861 = ("No 'keys' should be provided when initializing floret vectors "
"with 'minn' and 'maxn'.")
E862 = ("'hash_count' must be between 1-4 for floret vectors.")
@ -679,11 +721,11 @@ class Errors(metaclass=ErrorsWithCodes):
"need to modify the pipeline, use the built-in methods like "
"`nlp.add_pipe`, `nlp.remove_pipe`, `nlp.disable_pipe` or "
"`nlp.enable_pipe` instead.")
E927 = ("Can't write to frozen list Maybe you're trying to modify a computed "
E927 = ("Can't write to frozen list. Maybe you're trying to modify a computed "
"property or default function argument?")
E928 = ("A KnowledgeBase can only be serialized to/from from a directory, "
E928 = ("An InMemoryLookupKB can only be serialized to/from from a directory, "
"but the provided argument {loc} points to a file.")
E929 = ("Couldn't read KnowledgeBase from {loc}. The path does not seem to exist.")
E929 = ("Couldn't read InMemoryLookupKB from {loc}. The path does not seem to exist.")
E930 = ("Received invalid get_examples callback in `{method}`. "
"Expected function that returns an iterable of Example objects but "
"got: {obj}")
@ -887,10 +929,45 @@ class Errors(metaclass=ErrorsWithCodes):
E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
"exist.")
E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler "
"patterns.")
E1024 = ("A pattern with {attr_type} '{label}' is not present in "
"'{component}' patterns.")
E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
"supported values are: 'I', 'O', 'B' and ''")
E1026 = ("Edit tree has an invalid format:\n{errors}")
E1027 = ("AlignmentArray only supports slicing with a step of 1.")
E1028 = ("AlignmentArray only supports indexing using an int or a slice.")
E1029 = ("Edit tree cannot be applied to form.")
E1030 = ("Edit tree identifier out of range.")
E1031 = ("Could not find gold transition - see logs above.")
E1032 = ("`{var}` should not be {forbidden}, but received {value}.")
E1033 = ("Dimension {name} invalid -- only nO, nF, nP")
E1034 = ("Node index {i} out of bounds ({length})")
E1035 = ("Token index {i} out of bounds ({length})")
E1036 = ("Cannot index into NoneNode")
E1037 = ("Invalid attribute value '{attr}'.")
E1038 = ("Invalid JSON input: {message}")
E1039 = ("The {obj} start or end annotations (start: {start}, end: {end}) "
"could not be aligned to token boundaries.")
E1040 = ("Doc.from_json requires all tokens to have the same attributes. "
"Some tokens do not contain annotation for: {partial_attrs}")
E1041 = ("Expected a string, Doc, or bytes as input, but got: {type}")
E1042 = ("`enable={enable}` and `disable={disable}` are inconsistent with each other.\nIf you only passed "
"one of `enable` or `disable`, the other argument is specified in your pipeline's configuration.\nIn that "
"case pass an empty list for the previously not specified argument to avoid this error.")
E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
"{value}.")
E1044 = ("Expected `candidates_batch_size` to be >= 1, but got: {value}")
E1045 = ("Encountered {parent} subclass without `{parent}.{method}` "
"method in '{name}'. If you want to use this method, make "
"sure it's overwritten on the subclass.")
E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
"knowledge base, use `InMemoryLookupKB`.")
E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.")
E1048 = ("Got '{unexpected}' as console progress bar type, but expected one of the following: {expected}")
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
"with `displacy.serve(doc, port=port)`")
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
"or use `auto_switch_port=True` to pick an available port automatically.")
# Deprecated model shortcuts, only used in errors and warnings

View File

@ -1,3 +1,7 @@
import warnings
from .errors import Warnings
def explain(term):
"""Get a description for a given POS tag, dependency label or entity type.
@ -11,6 +15,8 @@ def explain(term):
"""
if term in GLOSSARY:
return GLOSSARY[term]
else:
warnings.warn(Warnings.W118.format(term=term))
GLOSSARY = {
@ -267,6 +273,7 @@ GLOSSARY = {
"relcl": "relative clause modifier",
"reparandum": "overridden disfluency",
"root": "root",
"ROOT": "root",
"vocative": "vocative",
"xcomp": "open clausal complement",
# Dependency labels (German)
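Usage is unchanged; unknown terms now emit W118 instead of silently returning None (the terms below are examples):

import spacy

print(spacy.explain("nsubj"))      # nominal subject
print(spacy.explain("ROOT"))       # root, covered by the added entry
spacy.explain("not-a-real-label")  # warns with W118 and returns None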

3
spacy/kb/__init__.py Normal file
View File

@ -0,0 +1,3 @@
from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB
from .candidate import Candidate, get_candidates, get_candidates_batch

12
spacy/kb/candidate.pxd Normal file
View File

@ -0,0 +1,12 @@
from .kb cimport KnowledgeBase
from libcpp.vector cimport vector
from ..typedefs cimport hash_t
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
cdef class Candidate:
cdef readonly KnowledgeBase kb
cdef hash_t entity_hash
cdef float entity_freq
cdef vector[float] entity_vector
cdef hash_t alias_hash
cdef float prior_prob

74
spacy/kb/candidate.pyx Normal file
View File

@ -0,0 +1,74 @@
# cython: infer_types=True, profile=True
from typing import Iterable
from .kb cimport KnowledgeBase
from ..tokens import Span
cdef class Candidate:
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
algorithm which will disambiguate the various candidates to the correct one.
Each candidate (alias, entity) pair is assigned a certain prior probability.
DOCS: https://spacy.io/api/kb/#candidate-init
"""
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
self.kb = kb
self.entity_hash = entity_hash
self.entity_freq = entity_freq
self.entity_vector = entity_vector
self.alias_hash = alias_hash
self.prior_prob = prior_prob
@property
def entity(self) -> int:
"""RETURNS (uint64): hash of the entity's KB ID/name"""
return self.entity_hash
@property
def entity_(self) -> str:
"""RETURNS (str): ID/name of this entity in the KB"""
return self.kb.vocab.strings[self.entity_hash]
@property
def alias(self) -> int:
"""RETURNS (uint64): hash of the alias"""
return self.alias_hash
@property
def alias_(self) -> str:
"""RETURNS (str): ID of the original alias"""
return self.kb.vocab.strings[self.alias_hash]
@property
def entity_freq(self) -> float:
return self.entity_freq
@property
def entity_vector(self) -> Iterable[float]:
return self.entity_vector
@property
def prior_prob(self) -> float:
return self.prior_prob
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
"""
Return candidate entities for a given mention, fetching appropriate entries from the index.
kb (KnowledgeBase): Knowledge base to query.
mention (Span): Entity mention for which to identify candidates.
RETURNS (Iterable[Candidate]): Identified candidates.
"""
return kb.get_candidates(mention)
def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
"""
Return candidate entities for the given mentions by fetching appropriate entries from the index.
kb (KnowledgeBase): Knowledge base to query.
mentions (Iterable[Span]): Entity mentions for which to identify candidates.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
"""
return kb.get_candidates_batch(mentions)
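
For illustration (not part of the diff), a minimal sketch of how the new helpers and the Candidate properties fit together with the refactored in-memory KB (spacy/kb/kb_in_memory.pyx, further down in this diff); the entity ID, frequency and prior probability are invented:

    import spacy
    from spacy.kb import InMemoryLookupKB, get_candidates

    nlp = spacy.blank("en")
    kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)
    kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
    kb.add_alias(alias="Adams", entities=["Q42"], probabilities=[0.9])

    mention = nlp("Douglas Adams wrote books.")[1:2]        # the Span "Adams"
    for cand in get_candidates(kb, mention):
        print(cand.entity_, cand.alias_, cand.prior_prob)   # Q42 Adams (prior ~0.9)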

spacy/kb/kb.pxd

@ -0,0 +1,10 @@
"""Knowledge-base for entity or concept linking."""
from cymem.cymem cimport Pool
from libc.stdint cimport int64_t
from ..vocab cimport Vocab
cdef class KnowledgeBase:
cdef Pool mem
cdef readonly Vocab vocab
cdef readonly int64_t entity_vector_length

spacy/kb/kb.pyx

@ -0,0 +1,108 @@
# cython: infer_types=True, profile=True
from pathlib import Path
from typing import Iterable, Tuple, Union
from cymem.cymem cimport Pool
from .candidate import Candidate
from ..tokens import Span
from ..util import SimpleFrozenList
from ..errors import Errors
cdef class KnowledgeBase:
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
to support entity linking of named entities to real-world concepts.
This is an abstract class and requires its operations to be implemented.
DOCS: https://spacy.io/api/kb
"""
def __init__(self, vocab: Vocab, entity_vector_length: int):
"""Create a KnowledgeBase."""
# Make sure abstract KB is not instantiated.
if self.__class__ == KnowledgeBase:
raise TypeError(
Errors.E1046.format(cls_name=self.__class__.__name__)
)
self.vocab = vocab
self.entity_vector_length = entity_vector_length
self.mem = Pool()
def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
"""
Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity.
If no candidate is found for a given text, an empty list is returned.
mentions (Iterable[Span]): Mentions for which to get candidates.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
"""
return [self.get_candidates(span) for span in mentions]
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
"""
Return candidate entities for specified text. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity.
If no candidate is found for a given text, an empty list is returned.
mention (Span): Mention for which to get candidates.
RETURNS (Iterable[Candidate]): Identified candidates.
"""
raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
)
def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
"""
Return vectors for entities.
entities (Iterable[str]): Entity names/IDs.
RETURNS (Iterable[Iterable[float]]): Vectors for specified entities.
"""
return [self.get_vector(entity) for entity in entities]
def get_vector(self, str entity) -> Iterable[float]:
"""
Return vector for entity.
entity (str): Entity name/ID.
RETURNS (Iterable[float]): Vector for specified entity.
"""
raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__)
)
def to_bytes(self, **kwargs) -> bytes:
"""Serialize the current state to a binary string.
RETURNS (bytes): Current state as binary string.
"""
raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__)
)
def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
"""Load state from a binary string.
bytes_data (bytes): KB state.
exclude (Tuple[str]): Properties to exclude when restoring KB.
"""
raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__)
)
def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
"""
Write KnowledgeBase content to disk.
path (Union[str, Path]): Target file path.
exclude (Iterable[str]): List of components to exclude.
"""
raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__)
)
def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
"""
Load KnowledgeBase content from disk.
path (Union[str, Path]): Target file path.
exclude (Iterable[str]): List of components to exclude.
"""
raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
)
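
For illustration (not part of the diff), a toy subclass of the new abstract base, assuming this branch; a real implementation would also override the (de)serialization hooks, which otherwise keep raising E1045:

    from typing import Iterable
    import spacy
    from spacy.kb import Candidate, KnowledgeBase
    from spacy.tokens import Span

    class EmptyKB(KnowledgeBase):
        """Hypothetical KB that knows no entities at all."""

        def get_candidates(self, mention: Span) -> Iterable[Candidate]:
            return []

        def get_vector(self, entity: str) -> Iterable[float]:
            return [0.0] * self.entity_vector_length

    nlp = spacy.blank("en")
    kb = EmptyKB(nlp.vocab, entity_vector_length=4)
    print(list(kb.get_candidates(nlp("Berlin")[0:1])))   # []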


@ -1,14 +1,12 @@
"""Knowledge-base for entity or concept linking."""
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE
from .vocab cimport Vocab
from .typedefs cimport hash_t
from .structs cimport KBEntryC, AliasC
from ..typedefs cimport hash_t
from ..structs cimport KBEntryC, AliasC
from .kb cimport KnowledgeBase
ctypedef vector[KBEntryC] entry_vec
ctypedef vector[AliasC] alias_vec
@ -16,21 +14,7 @@ ctypedef vector[float] float_vec
ctypedef vector[float_vec] float_matrix
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
cdef class Candidate:
cdef readonly KnowledgeBase kb
cdef hash_t entity_hash
cdef float entity_freq
cdef vector[float] entity_vector
cdef hash_t alias_hash
cdef float prior_prob
cdef class KnowledgeBase:
cdef Pool mem
cdef readonly Vocab vocab
cdef int64_t entity_vector_length
cdef class InMemoryLookupKB(KnowledgeBase):
# This maps 64bit keys (hash of unique entity string)
# to 64bit values (position of the _KBEntryC struct in the _entries vector).
# The PreshMap is pretty space efficient, as it uses open addressing. So


@ -1,8 +1,7 @@
# cython: infer_types=True, profile=True
from typing import Iterator, Iterable, Callable, Dict, Any
from typing import Iterable, Callable, Dict, Any, Union
import srsly
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from cpython.exc cimport PyErr_SetFromErrno
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
@ -12,103 +11,41 @@ from libcpp.vector cimport vector
from pathlib import Path
import warnings
from .typedefs cimport hash_t
from .errors import Errors, Warnings
from . import util
from .util import SimpleFrozenList, ensure_path
cdef class Candidate:
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
algorithm which will disambiguate the various candidates to the correct one.
Each candidate (alias, entity) pair is assigned to a certain prior probability.
DOCS: https://spacy.io/api/kb/#candidate_init
"""
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
self.kb = kb
self.entity_hash = entity_hash
self.entity_freq = entity_freq
self.entity_vector = entity_vector
self.alias_hash = alias_hash
self.prior_prob = prior_prob
@property
def entity(self):
"""RETURNS (uint64): hash of the entity's KB ID/name"""
return self.entity_hash
@property
def entity_(self):
"""RETURNS (str): ID/name of this entity in the KB"""
return self.kb.vocab.strings[self.entity_hash]
@property
def alias(self):
"""RETURNS (uint64): hash of the alias"""
return self.alias_hash
@property
def alias_(self):
"""RETURNS (str): ID of the original alias"""
return self.kb.vocab.strings[self.alias_hash]
@property
def entity_freq(self):
return self.entity_freq
@property
def entity_vector(self):
return self.entity_vector
@property
def prior_prob(self):
return self.prior_prob
from ..tokens import Span
from ..typedefs cimport hash_t
from ..errors import Errors, Warnings
from .. import util
from ..util import SimpleFrozenList, ensure_path
from ..vocab cimport Vocab
from .kb cimport KnowledgeBase
from .candidate import Candidate as Candidate
def get_candidates(KnowledgeBase kb, span) -> Iterator[Candidate]:
"""
Return candidate entities for a given span by using the text of the span as the alias
and fetching appropriate entries from the index.
This particular function is optimized to work with the built-in KB functionality,
but any other custom candidate generation method can be used in combination with the KB as well.
"""
return kb.get_alias_candidates(span.text)
cdef class KnowledgeBase:
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
cdef class InMemoryLookupKB(KnowledgeBase):
"""An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
to support entity linking of named entities to real-world concepts.
DOCS: https://spacy.io/api/kb
DOCS: https://spacy.io/api/inmemorylookupkb
"""
def __init__(self, Vocab vocab, entity_vector_length):
"""Create a KnowledgeBase."""
self.mem = Pool()
self.entity_vector_length = entity_vector_length
"""Create an InMemoryLookupKB."""
super().__init__(vocab, entity_vector_length)
self._entry_index = PreshMap()
self._alias_index = PreshMap()
self.vocab = vocab
self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
def initialize_entities(self, int64_t nr_entities):
def _initialize_entities(self, int64_t nr_entities):
self._entry_index = PreshMap(nr_entities + 1)
self._entries = entry_vec(nr_entities + 1)
def initialize_vectors(self, int64_t nr_entities):
def _initialize_vectors(self, int64_t nr_entities):
self._vectors_table = float_matrix(nr_entities + 1)
def initialize_aliases(self, int64_t nr_aliases):
def _initialize_aliases(self, int64_t nr_aliases):
self._alias_index = PreshMap(nr_aliases + 1)
self._aliases_table = alias_vec(nr_aliases + 1)
@property
def entity_vector_length(self):
"""RETURNS (uint64): length of the entity vectors"""
return self.entity_vector_length
def __len__(self):
return self.get_size_entities()
@ -155,8 +92,8 @@ cdef class KnowledgeBase:
raise ValueError(Errors.E140)
nr_entities = len(set(entity_list))
self.initialize_entities(nr_entities)
self.initialize_vectors(nr_entities)
self._initialize_entities(nr_entities)
self._initialize_vectors(nr_entities)
i = 0
cdef KBEntryC entry
@ -286,7 +223,10 @@ cdef class KnowledgeBase:
alias_entry.probs = probs
self._aliases_table[alias_index] = alias_entry
def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
return self.get_alias_candidates(mention.text) # type: ignore
def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
"""
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity.
@ -388,9 +328,9 @@ cdef class KnowledgeBase:
nr_entities = header[0]
nr_aliases = header[1]
entity_vector_length = header[2]
self.initialize_entities(nr_entities)
self.initialize_vectors(nr_entities)
self.initialize_aliases(nr_aliases)
self._initialize_entities(nr_entities)
self._initialize_vectors(nr_entities)
self._initialize_aliases(nr_aliases)
self.entity_vector_length = entity_vector_length
def deserialize_vectors(b):
@ -512,8 +452,8 @@ cdef class KnowledgeBase:
cdef int64_t entity_vector_length
reader.read_header(&nr_entities, &entity_vector_length)
self.initialize_entities(nr_entities)
self.initialize_vectors(nr_entities)
self._initialize_entities(nr_entities)
self._initialize_vectors(nr_entities)
self.entity_vector_length = entity_vector_length
# STEP 1: load entity vectors
@ -552,7 +492,7 @@ cdef class KnowledgeBase:
# STEP 3: load aliases
cdef int64_t nr_aliases
reader.read_alias_length(&nr_aliases)
self.initialize_aliases(nr_aliases)
self._initialize_aliases(nr_aliases)
cdef int64_t nr_candidates
cdef vector[int64_t] entry_indices
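
To illustrate the renamed class and its new Span-based entry point (not part of the diff; data invented), get_candidates() on InMemoryLookupKB simply delegates to the older string-based get_alias_candidates():

    import spacy
    from spacy.kb import InMemoryLookupKB

    nlp = spacy.blank("en")
    kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=1)
    kb.add_entity(entity="Q64", freq=5, entity_vector=[0.0])
    kb.add_alias(alias="Berlin", entities=["Q64"], probabilities=[1.0])

    span = nlp("Berlin is big.")[0:1]
    assert [c.entity_ for c in kb.get_candidates(span)] == \
           [c.entity_ for c in kb.get_alias_candidates("Berlin")] == ["Q64"]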


@ -2,7 +2,8 @@ from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
from ...language import Language, BaseDefaults
from ...attrs import LANG
from ...util import update_exc
@ -16,6 +17,8 @@ class BulgarianDefaults(BaseDefaults):
stop_words = STOP_WORDS
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES
class Bulgarian(Language):


@ -72,10 +72,10 @@ class CatalanLemmatizer(Lemmatizer):
oov_forms.append(form)
if not forms:
forms.extend(oov_forms)
if not forms and string in lookup_table.keys():
forms.append(self.lookup_lemmatize(token)[0])
# use lookups, and fall back to the token itself
if not forms:
forms.append(string)
forms.append(lookup_table.get(string, [string])[0])
forms = list(dict.fromkeys(forms))
self.cache[cache_key] = forms
return forms


@ -258,6 +258,10 @@ ALPHA = group_chars(
ALPHA_LOWER = group_chars(_lower + _uncased)
ALPHA_UPPER = group_chars(_upper + _uncased)
_combining_diacritics = r"\u0300-\u036f"
COMBINING_DIACRITICS = _combining_diacritics
_units = (
"km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
"kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb "
@ -276,7 +280,7 @@ _currency = (
_punct = (
r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 · । ، ۔ ؛ ٪"
)
_quotes = r'\' " ” “ ` ´ , „ » « 「 」 『 』 【 】 《 》 〈 〉'
_quotes = r'\' " ” “ ` ´ , „ » « 「 」 『 』 【 】 《 》 〈 〉 〈 〉 ⟦ ⟧'
_hyphens = "- — -- --- —— ~"
# Various symbols like dingbats, but also emoji


@ -0,0 +1,16 @@
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults
class LowerSorbianDefaults(BaseDefaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class LowerSorbian(Language):
lang = "dsb"
Defaults = LowerSorbianDefaults
__all__ = ["LowerSorbian"]


@ -0,0 +1,15 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.dsb.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Z tym stwori so wuměnjenje a zakład za dalše wobdźěłanje přez analyzu tekstoweje struktury a semantisku anotaciju a z tym tež za tu předstajenu digitalnu online-wersiju.",
"Mi so tu jara derje spodoba.",
"Kotre nowniny chceće měć?",
"Tak ako w slědnem lěśe jo teke lětosa jano doma zapustowaś móžno.",
"Zwóstanjo pótakem hyšći wjele źěła.",
]

spacy/lang/dsb/lex_attrs.py

@ -0,0 +1,113 @@
from ...attrs import LIKE_NUM
_num_words = [
"nul",
"jaden",
"jadna",
"jadno",
"dwa",
"dwě",
"tśi",
"tśo",
"styri",
"styrjo",
"pěś",
"pěśo",
"šesć",
"šesćo",
"sedym",
"sedymjo",
"wósym",
"wósymjo",
"źewjeś",
"źewjeśo",
"źaseś",
"źaseśo",
"jadnassćo",
"dwanassćo",
"tśinasćo",
"styrnasćo",
"pěśnasćo",
"šesnasćo",
"sedymnasćo",
"wósymnasćo",
"źewjeśnasćo",
"dwanasćo",
"dwaźasća",
"tśiźasća",
"styrźasća",
"pěśźaset",
"šesćźaset",
"sedymźaset",
"wósymźaset",
"źewjeśźaset",
"sto",
"tysac",
"milion",
"miliarda",
"bilion",
"biliarda",
"trilion",
"triliarda",
]
_ordinal_words = [
"prědny",
"prědna",
"prědne",
"drugi",
"druga",
"druge",
"tśeśi",
"tśeśa",
"tśeśe",
"stwórty",
"stwórta",
"stwórte",
"pêty",
"pěta",
"pête",
"šesty",
"šesta",
"šeste",
"sedymy",
"sedyma",
"sedyme",
"wósymy",
"wósyma",
"wósyme",
"źewjety",
"źewjeta",
"źewjete",
"źasety",
"źaseta",
"źasete",
"jadnasty",
"jadnasta",
"jadnaste",
"dwanasty",
"dwanasta",
"dwanaste",
]
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
text_lower = text.lower()
if text_lower in _num_words:
return True
# Check ordinal number
if text_lower in _ordinal_words:
return True
return False
LEX_ATTRS = {LIKE_NUM: like_num}


@ -0,0 +1,15 @@
STOP_WORDS = set(
"""
a abo aby ako ale
daniž dokulaž
gaž
jolic
pak pótom
teke togodla
""".split()
)


@ -35,7 +35,7 @@ for pron in ["i"]:
_exc[orth + "m"] = [
{ORTH: orth, NORM: pron},
{ORTH: "m", "tenspect": 1, "number": 1},
{ORTH: "m"},
]
_exc[orth + "'ma"] = [
@ -139,26 +139,27 @@ for pron in ["he", "she", "it"]:
# W-words, relative pronouns, prepositions etc.
for word in [
"who",
"what",
"when",
"where",
"why",
"how",
"there",
"that",
"this",
"these",
"those",
for word, morph in [
("who", None),
("what", None),
("when", None),
("where", None),
("why", None),
("how", None),
("there", None),
("that", "Number=Sing|Person=3"),
("this", "Number=Sing|Person=3"),
("these", "Number=Plur|Person=3"),
("those", "Number=Plur|Person=3"),
]:
for orth in [word, word.title()]:
_exc[orth + "'s"] = [
{ORTH: orth, NORM: word},
{ORTH: "'s", NORM: "'s"},
]
if morph != "Number=Plur|Person=3":
_exc[orth + "'s"] = [
{ORTH: orth, NORM: word},
{ORTH: "'s", NORM: "'s"},
]
_exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
_exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
_exc[orth + "'ll"] = [
{ORTH: orth, NORM: word},
@ -182,25 +183,26 @@ for word in [
{ORTH: "ve", NORM: "have"},
]
_exc[orth + "'re"] = [
{ORTH: orth, NORM: word},
{ORTH: "'re", NORM: "are"},
]
if morph != "Number=Sing|Person=3":
_exc[orth + "'re"] = [
{ORTH: orth, NORM: word},
{ORTH: "'re", NORM: "are"},
]
_exc[orth + "re"] = [
{ORTH: orth, NORM: word},
{ORTH: "re", NORM: "are"},
]
_exc[orth + "re"] = [
{ORTH: orth, NORM: word},
{ORTH: "re", NORM: "are"},
]
_exc[orth + "'ve"] = [
{ORTH: orth, NORM: word},
{ORTH: "'ve"},
]
_exc[orth + "'ve"] = [
{ORTH: orth, NORM: word},
{ORTH: "'ve"},
]
_exc[orth + "ve"] = [
{ORTH: orth},
{ORTH: "ve", NORM: "have"},
]
_exc[orth + "ve"] = [
{ORTH: orth},
{ORTH: "ve", NORM: "have"},
]
_exc[orth + "'d"] = [
{ORTH: orth, NORM: word},
@ -447,7 +449,6 @@ for exc_data in [
{ORTH: "La.", NORM: "Louisiana"},
{ORTH: "Mar.", NORM: "March"},
{ORTH: "Mass.", NORM: "Massachusetts"},
{ORTH: "May.", NORM: "May"},
{ORTH: "Mich.", NORM: "Michigan"},
{ORTH: "Minn.", NORM: "Minnesota"},
{ORTH: "Miss.", NORM: "Mississippi"},


@ -9,14 +9,14 @@ Example sentences to test spaCy and its language models.
sentences = [
"Apple está buscando comprar una startup del Reino Unido por mil millones de dólares.",
"Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes.",
"San Francisco analiza prohibir los robots delivery.",
"San Francisco analiza prohibir los robots de reparto.",
"Londres es una gran ciudad del Reino Unido.",
"El gato come pescado.",
"Veo al hombre con el telescopio.",
"La araña come moscas.",
"El pingüino incuba en su nido sobre el hielo.",
"¿Dónde estais?",
"¿Quién es el presidente Francés?",
"¿Dónde está encuentra la capital de Argentina?",
"¿Dónde estáis?",
"¿Quién es el presidente francés?",
"¿Dónde se encuentra la capital de Argentina?",
"¿Cuándo nació José de San Martín?",
]


@ -1,82 +1,80 @@
STOP_WORDS = set(
"""
actualmente acuerdo adelante ademas además adrede afirmó agregó ahi ahora ahí
al algo alguna algunas alguno algunos algún alli allí alrededor ambos ampleamos
antano antaño ante anterior antes apenas aproximadamente aquel aquella aquellas
aquello aquellos aqui aquél aquélla aquéllas aquéllos aquí arriba arribaabajo
aseguró asi así atras aun aunque ayer añadió aún
a acuerdo adelante ademas además afirmó agregó ahi ahora ahí al algo alguna
algunas alguno algunos algún alli allí alrededor ambos ante anterior antes
apenas aproximadamente aquel aquella aquellas aquello aquellos aqui aquél
aquélla aquéllas aquéllos aquí arriba aseguró asi así atras aun aunque añadió
aún
bajo bastante bien breve buen buena buenas bueno buenos
cada casi cerca cierta ciertas cierto ciertos cinco claro comentó como con
conmigo conocer conseguimos conseguir considera consideró consigo consigue
consiguen consigues contigo contra cosas creo cual cuales cualquier cuando
cuanta cuantas cuanto cuantos cuatro cuenta cuál cuáles cuándo cuánta cuántas
cuánto cuántos cómo
cada casi cierta ciertas cierto ciertos cinco claro comentó como con conmigo
conocer conseguimos conseguir considera consideró consigo consigue consiguen
consigues contigo contra creo cual cuales cualquier cuando cuanta cuantas
cuanto cuantos cuatro cuenta cuál cuáles cuándo cuánta cuántas cuánto cuántos
cómo
da dado dan dar de debajo debe deben debido decir dejó del delante demasiado
demás dentro deprisa desde despacio despues después detras detrás dia dias dice
dicen dicho dieron diferente diferentes dijeron dijo dio donde dos durante día
días dónde
dicen dicho dieron diez diferente diferentes dijeron dijo dio doce donde dos
durante día días dónde
ejemplo el ella ellas ello ellos embargo empleais emplean emplear empleas
empleo en encima encuentra enfrente enseguida entonces entre era eramos eran
eras eres es esa esas ese eso esos esta estaba estaban estado estados estais
estamos estan estar estará estas este esto estos estoy estuvo está están ex
excepto existe existen explicó expresó él ésa ésas ése ésos ésta éstas éste
éstos
e el ella ellas ello ellos embargo en encima encuentra enfrente enseguida
entonces entre era eramos eran eras eres es esa esas ese eso esos esta estaba
estaban estado estados estais estamos estan estar estará estas este esto estos
estoy estuvo está están excepto existe existen explicó expresó él ésa ésas ése
ésos ésta éstas éste éstos
fin final fue fuera fueron fui fuimos
general gran grandes gueno
gran grande grandes
ha haber habia habla hablan habrá había habían hace haceis hacemos hacen hacer
hacerlo haces hacia haciendo hago han hasta hay haya he hecho hemos hicieron
hizo horas hoy hubo
hizo hoy hubo
igual incluso indicó informo informó intenta intentais intentamos intentan
intentar intentas intento ir
igual incluso indicó informo informó ir
junto
la lado largo las le lejos les llegó lleva llevar lo los luego lugar
la lado largo las le les llegó lleva llevar lo los luego
mal manera manifestó mas mayor me mediante medio mejor mencionó menos menudo mi
mia mias mientras mio mios mis misma mismas mismo mismos modo momento mucha
muchas mucho muchos muy más mía mías mío míos
mia mias mientras mio mios mis misma mismas mismo mismos modo mucha muchas
mucho muchos muy más mía mías mío míos
nada nadie ni ninguna ningunas ninguno ningunos ningún no nos nosotras nosotros
nuestra nuestras nuestro nuestros nueva nuevas nuevo nuevos nunca
nuestra nuestras nuestro nuestros nueva nuevas nueve nuevo nuevos nunca
ocho os otra otras otro otros
o ocho once os otra otras otro otros
pais para parece parte partir pasada pasado paìs peor pero pesar poca pocas
poco pocos podeis podemos poder podria podriais podriamos podrian podrias podrá
para parece parte partir pasada pasado paìs peor pero pesar poca pocas poco
pocos podeis podemos poder podria podriais podriamos podrian podrias podrá
podrán podría podrían poner por porque posible primer primera primero primeros
principalmente pronto propia propias propio propios proximo próximo próximos
pudo pueda puede pueden puedo pues
pronto propia propias propio propios proximo próximo próximos pudo pueda puede
pueden puedo pues
qeu que quedó queremos quien quienes quiere quiza quizas quizá quizás quién quiénes qué
qeu que quedó queremos quien quienes quiere quiza quizas quizá quizás quién
quiénes qué
raras realizado realizar realizó repente respecto
realizado realizar realizó repente respecto
sabe sabeis sabemos saben saber sabes salvo se sea sean segun segunda segundo
según seis ser sera será serán sería señaló si sido siempre siendo siete sigue
siguiente sin sino sobre sois sola solamente solas solo solos somos son soy
soyos su supuesto sus suya suyas suyo sólo
siguiente sin sino sobre sois sola solamente solas solo solos somos son soy su
supuesto sus suya suyas suyo suyos sólo
tal tambien también tampoco tan tanto tarde te temprano tendrá tendrán teneis
tenemos tener tenga tengo tenido tenía tercera ti tiempo tiene tienen toda
todas todavia todavía todo todos total trabaja trabajais trabajamos trabajan
trabajar trabajas trabajo tras trata través tres tu tus tuvo tuya tuyas tuyo
tuyos
tenemos tener tenga tengo tenido tenía tercera tercero ti tiene tienen toda
todas todavia todavía todo todos total tras trata través tres tu tus tuvo tuya
tuyas tuyo tuyos
ultimo un una unas uno unos usa usais usamos usan usar usas uso usted ustedes
u ultimo un una unas uno unos usa usais usamos usan usar usas uso usted ustedes
última últimas último últimos
va vais valor vamos van varias varios vaya veces ver verdad verdadera verdadero
vez vosotras vosotros voy vuestra vuestras vuestro vuestros
va vais vamos van varias varios vaya veces ver verdad verdadera verdadero vez
vosotras vosotros voy vuestra vuestras vuestro vuestros
ya yo
y ya yo
""".split()
)


@ -53,11 +53,16 @@ class FrenchLemmatizer(Lemmatizer):
rules = rules_table.get(univ_pos, [])
string = string.lower()
forms = []
# first try lookup in table based on upos
if string in index:
forms.append(string)
self.cache[cache_key] = forms
return forms
# then add anything in the exceptions table
forms.extend(exceptions.get(string, []))
# if nothing found yet, use the rules
oov_forms = []
if not forms:
for old, new in rules:
@ -69,12 +74,14 @@ class FrenchLemmatizer(Lemmatizer):
forms.append(form)
else:
oov_forms.append(form)
# if still nothing, add the oov forms from rules
if not forms:
forms.extend(oov_forms)
if not forms and string in lookup_table.keys():
forms.append(self.lookup_lemmatize(token)[0])
# use lookups, which fall back to the token itself
if not forms:
forms.append(string)
forms.append(lookup_table.get(string, [string])[0])
forms = list(dict.fromkeys(forms))
self.cache[cache_key] = forms
return forms
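
The new fallback step boils down to the following plain-Python logic (illustrative, with an invented one-entry lookup table): the first lemma from the lookup table is used, and the surface form itself is the final default.

    lookup_table = {"yeux": ["œil"]}
    for string in ("yeux", "zzz"):
        lemma = lookup_table.get(string, [string])[0]
        print(string, "->", lemma)   # yeux -> œil, zzz -> zzz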


@ -3,7 +3,7 @@ from ...attrs import LIKE_NUM
_num_words = set(
"""
zero un deux trois quatre cinq six sept huit neuf dix
zero un une deux trois quatre cinq six sept huit neuf dix
onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf
vingt trente quarante cinquante soixante soixante-dix septante quatre-vingt huitante quatre-vingt-dix nonante
cent mille mil million milliard billion quadrillion quintillion
@ -13,7 +13,7 @@ sextillion septillion octillion nonillion decillion
_ordinal_words = set(
"""
premier deuxième second troisième quatrième cinquième sixième septième huitième neuvième dixième
premier première deuxième second seconde troisième quatrième cinquième sixième septième huitième neuvième dixième
onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neuvième
vingtième trentième quarantième cinquantième soixantième soixante-dixième septantième quatre-vingtième huitantième quatre-vingt-dixième nonantième
centième millième millionnième milliardième billionnième quadrillionnième quintillionnième


@ -1,11 +1,15 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ...language import Language, BaseDefaults
class AncientGreekDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS


@ -0,0 +1,46 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
from ..char_classes import CONCAT_QUOTES
_prefixes = (
[
"",
"",
]
+ LIST_PUNCT
+ LIST_ELLIPSES
+ LIST_QUOTES
+ LIST_CURRENCY
+ LIST_ICONS
)
_suffixes = (
LIST_PUNCT
+ LIST_ELLIPSES
+ LIST_QUOTES
+ LIST_ICONS
+ [
"",
"",
r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])[\-\.⸏]",
]
)
_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
+ [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])—",
]
)
TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes


@ -0,0 +1,18 @@
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language, BaseDefaults
class UpperSorbianDefaults(BaseDefaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
class UpperSorbian(Language):
lang = "hsb"
Defaults = UpperSorbianDefaults
__all__ = ["UpperSorbian"]


@ -0,0 +1,15 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.hsb.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"To běšo wjelgin raźone a jo se wót luźi derje pśiwzeło. Tak som dožywiła wjelgin",
"Jogo pśewóźowarce stej groniłej, až how w serbskich stronach njama Santa Claus nic pytaś.",
"A ten sobuźěłaśeŕ Statneje biblioteki w Barlinju jo pśimjeł drogotne knigły bźez rukajcowu z nagima rukoma!",
"Take wobchadanje z našym kulturnym derbstwom zewšym njejźo.",
"Wopśimjeśe drugich pśinoskow jo było na wusokem niwowje, ako pśecej.",
]

spacy/lang/hsb/lex_attrs.py

@ -0,0 +1,106 @@
from ...attrs import LIKE_NUM
_num_words = [
"nul",
"jedyn",
"jedna",
"jedne",
"dwaj",
"dwě",
"tři",
"třo",
"štyri",
"štyrjo",
"pjeć",
"šěsć",
"sydom",
"wosom",
"dźewjeć",
"dźesać",
"jědnaće",
"dwanaće",
"třinaće",
"štyrnaće",
"pjatnaće",
"šěsnaće",
"sydomnaće",
"wosomnaće",
"dźewjatnaće",
"dwaceći",
"třiceći",
"štyrceći",
"pjećdźesat",
"šěsćdźesat",
"sydomdźesat",
"wosomdźesat",
"dźewjećdźesat",
"sto",
"tysac",
"milion",
"miliarda",
"bilion",
"biliarda",
"trilion",
"triliarda",
]
_ordinal_words = [
"prěni",
"prěnja",
"prěnje",
"druhi",
"druha",
"druhe",
"třeći",
"třeća",
"třeće",
"štwórty",
"štwórta",
"štwórte",
"pjaty",
"pjata",
"pjate",
"šěsty",
"šěsta",
"šěste",
"sydmy",
"sydma",
"sydme",
"wosmy",
"wosma",
"wosme",
"dźewjaty",
"dźewjata",
"dźewjate",
"dźesaty",
"dźesata",
"dźesate",
"jědnaty",
"jědnata",
"jědnate",
"dwanaty",
"dwanata",
"dwanate",
]
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
text_lower = text.lower()
if text_lower in _num_words:
return True
# Check ordinal number
if text_lower in _ordinal_words:
return True
return False
LEX_ATTRS = {LIKE_NUM: like_num}


@ -0,0 +1,19 @@
STOP_WORDS = set(
"""
a abo ale ani
dokelž
hdyž
jeli jelizo
kaž
pak potom
tež tohodla
zo zoby
""".split()
)


@ -0,0 +1,18 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
_exc = dict()
for exc_data in [
{ORTH: "mil.", NORM: "milion"},
{ORTH: "wob.", NORM: "wobydler"},
]:
_exc[exc_data[ORTH]] = [exc_data]
for orth in [
"resp.",
]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)


@ -7,7 +7,7 @@ from .lex_attrs import LEX_ATTRS
from ...language import Language, BaseDefaults
from ...tokens import Doc
from ...scorer import Scorer
from ...symbols import POS
from ...symbols import POS, X
from ...training import validate_examples
from ...util import DummyTokenizer, registry, load_config_from_str
from ...vocab import Vocab
@ -57,7 +57,10 @@ class KoreanTokenizer(DummyTokenizer):
for token, dtoken in zip(doc, dtokens):
first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
token.tag_ = first_tag # stem(어간) or pre-final(선어말 어미)
token.pos = TAG_MAP[token.tag_][POS]
if token.tag_ in TAG_MAP:
token.pos = TAG_MAP[token.tag_][POS]
else:
token.pos = X
token.lemma_ = dtoken["lemma"]
doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
return doc


@ -3,7 +3,7 @@ from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
_infixes = (
["·", "ㆍ", "\(", "\)"]
["·", "ㆍ", r"\(", r"\)"]
+ [r"(?<=[0-9])~(?=[0-9-])"]
+ LIST_QUOTES
+ BASE_TOKENIZER_INFIXES

spacy/lang/la/__init__.py

@ -0,0 +1,18 @@
from ...language import Language, BaseDefaults
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
class LatinDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
class Latin(Language):
lang = "la"
Defaults = LatinDefaults
__all__ = ["Latin"]


@ -0,0 +1,34 @@
from ...attrs import LIKE_NUM
import re
# cf. Goyvaerts/Levithan 2009; case-insensitive, allow 4
roman_numerals_compile = re.compile(
r"(?i)^(?=[MDCLXVI])M*(C[MD]|D?C{0,4})(X[CL]|L?X{0,4})(I[XV]|V?I{0,4})$"
)
_num_words = set(
"""
unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem
""".split()
)
_ordinal_words = set(
"""
primus prima primum secundus secunda secundum tertius tertia tertium
""".split()
)
def like_num(text):
if text.isdigit():
return True
if roman_numerals_compile.match(text):
return True
if text.lower() in _num_words:
return True
if text.lower() in _ordinal_words:
return True
return False
LEX_ATTRS = {LIKE_NUM: like_num}
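
For illustration (not part of the diff), the Latin LIKE_NUM getter accepts digits, Roman numerals and the listed cardinal/ordinal words, assuming this branch is installed:

    from spacy.lang.la.lex_attrs import like_num

    print(like_num("XVII"))    # True  - Roman numeral matched by the regex
    print(like_num("septem"))  # True  - listed number word
    print(like_num("tertia"))  # True  - listed ordinal
    print(like_num("lupus"))   # False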


@ -0,0 +1,37 @@
# Corrected Perseus list, cf. https://wiki.digitalclassicist.org/Stopwords_for_Greek_and_Latin
STOP_WORDS = set(
"""
ab ac ad adhuc aliqui aliquis an ante apud at atque aut autem
cum cur
de deinde dum
ego enim ergo es est et etiam etsi ex
fio
haud hic
iam idem igitur ille in infra inter interim ipse is ita
magis modo mox
nam ne nec necque neque nisi non nos
o ob
per possum post pro
quae quam quare qui quia quicumque quidem quilibet quis quisnam quisquam quisque quisquis quo quoniam
sed si sic sive sub sui sum super suus
tam tamen trans tu tum
ubi uel uero
vel vero
""".split()
)


@ -0,0 +1,76 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
## TODO: Look into systematically handling u/v
_exc = {
"mecum": [{ORTH: "me"}, {ORTH: "cum"}],
"tecum": [{ORTH: "te"}, {ORTH: "cum"}],
"nobiscum": [{ORTH: "nobis"}, {ORTH: "cum"}],
"vobiscum": [{ORTH: "vobis"}, {ORTH: "cum"}],
"uobiscum": [{ORTH: "uobis"}, {ORTH: "cum"}],
}
for orth in [
"A.",
"Agr.",
"Ap.",
"C.",
"Cn.",
"D.",
"F.",
"K.",
"L.",
"M'.",
"M.",
"Mam.",
"N.",
"Oct.",
"Opet.",
"P.",
"Paul.",
"Post.",
"Pro.",
"Q.",
"S.",
"Ser.",
"Sert.",
"Sex.",
"St.",
"Sta.",
"T.",
"Ti.",
"V.",
"Vol.",
"Vop.",
"U.",
"Uol.",
"Uop.",
"Ian.",
"Febr.",
"Mart.",
"Apr.",
"Mai.",
"Iun.",
"Iul.",
"Aug.",
"Sept.",
"Oct.",
"Nov.",
"Nou.",
"Dec.",
"Non.",
"Id.",
"A.D.",
"Coll.",
"Cos.",
"Ord.",
"Pl.",
"S.C.",
"Suff.",
"Trib.",
]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

spacy/lang/lg/__init__.py

@ -0,0 +1,18 @@
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from ...language import Language, BaseDefaults
class LugandaDefaults(BaseDefaults):
lex_attr_getters = LEX_ATTRS
infixes = TOKENIZER_INFIXES
stop_words = STOP_WORDS
class Luganda(Language):
lang = "lg"
Defaults = LugandaDefaults
__all__ = ["Luganda"]

spacy/lang/lg/examples.py

@ -0,0 +1,17 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.lg.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Mpa ebyafaayo ku byalo Nakatu ne Nkajja",
"Okuyita Ttembo kitegeeza kugwa ddalu",
"Ekifumu kino kyali kya mulimu ki?",
"Ekkovu we liyise wayitibwa mukululo",
"Akola mulimu ki oguvaamu ssente?",
"Emisumaali egikomerera embaawo giyitibwa nninga",
"Abooluganda abemmamba ababiri",
"Ekisaawe ky'ebyenjigiriza kya mugaso nnyo",
]


@ -0,0 +1,95 @@
from ...attrs import LIKE_NUM
_num_words = [
"nnooti", # Zero
"zeero", # zero
"emu", # one
"bbiri", # two
"ssatu", # three
"nnya", # four
"ttaano", # five
"mukaaga", # six
"musanvu", # seven
"munaana", # eight
"mwenda", # nine
"kkumi", # ten
"kkumi n'emu", # eleven
"kkumi na bbiri", # twelve
"kkumi na ssatu", # thirteen
"kkumi na nnya", # forteen
"kkumi na ttaano", # fifteen
"kkumi na mukaaga", # sixteen
"kkumi na musanvu", # seventeen
"kkumi na munaana", # eighteen
"kkumi na mwenda", # nineteen
"amakumi abiri", # twenty
"amakumi asatu", # thirty
"amakumi ana", # forty
"amakumi ataano", # fifty
"nkaaga", # sixty
"nsanvu", # seventy
"kinaana", # eighty
"kyenda", # ninety
"kikumi", # hundred
"lukumi", # thousand
"kakadde", # million
"kawumbi", # billion
"kase", # trillion
"katabalika", # quadrillion
"keesedde", # gajillion
"kafukunya", # bazillion
"ekisooka", # first
"ekyokubiri", # second
"ekyokusatu", # third
"ekyokuna", # fourth
"ekyokutaano", # fifith
"ekyomukaaga", # sixth
"ekyomusanvu", # seventh
"eky'omunaana", # eighth
"ekyomwenda", # nineth
"ekyekkumi", # tenth
"ekyekkumi n'ekimu", # eleventh
"ekyekkumi n'ebibiri", # twelveth
"ekyekkumi n'ebisatu", # thirteenth
"ekyekkumi n'ebina", # fourteenth
"ekyekkumi n'ebitaano", # fifteenth
"ekyekkumi n'omukaaga", # sixteenth
"ekyekkumi n'omusanvu", # seventeenth
"ekyekkumi n'omunaana", # eigteenth
"ekyekkumi n'omwenda", # nineteenth
"ekyamakumi abiri", # twentieth
"ekyamakumi asatu", # thirtieth
"ekyamakumi ana", # fortieth
"ekyamakumi ataano", # fiftieth
"ekyenkaaga", # sixtieth
"ekyensanvu", # seventieth
"ekyekinaana", # eightieth
"ekyekyenda", # ninetieth
"ekyekikumi", # hundredth
"ekyolukumi", # thousandth
"ekyakakadde", # millionth
"ekyakawumbi", # billionth
"ekyakase", # trillionth
"ekyakatabalika", # quadrillionth
"ekyakeesedde", # gajillionth
"ekyakafukunya", # bazillionth
]
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
text_lower = text.lower()
if text_lower in _num_words:
return True
return False
LEX_ATTRS = {LIKE_NUM: like_num}


@ -0,0 +1,19 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
+ [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
)
TOKENIZER_INFIXES = _infixes


@ -0,0 +1,19 @@
STOP_WORDS = set(
"""
abadde abalala abamu abangi abava ajja ali alina ani anti ateekeddwa atewamu
atya awamu aweebwa ayinza ba baali babadde babalina bajja
bajjanewankubade bali balina bandi bangi bano bateekeddwa baweebwa bayina bebombi beera bibye
bimu bingi bino bo bokka bonna buli bulijjo bulungi bwabwe bwaffe bwayo bwe bwonna bya byabwe
byaffe byebimu byonna ddaa ddala ddi e ebimu ebiri ebweruobulungi ebyo edda ejja ekirala ekyo
endala engeri ennyo era erimu erina ffe ffenna ga gujja gumu gunno guno gwa gwe kaseera kati
kennyini ki kiki kikino kikye kikyo kino kirungi kki ku kubangabyombi kubangaolwokuba kudda
kuva kuwa kwegamba kyaffe kye kyekimuoyo kyekyo kyonna leero liryo lwa lwaki lyabwezaabwe
lyaffe lyange mbadde mingi mpozzi mu mulinaoyina munda mwegyabwe nolwekyo nabadde nabo nandiyagadde
nandiye nanti naye ne nedda neera nga nnyingi nnyini nnyinza nnyo nti nyinza nze oba ojja okudda
okugenda okuggyako okutuusa okuva okuwa oli olina oluvannyuma olwekyobuva omuli ono osobola otya
oyina oyo seetaaga si sinakindi singa talina tayina tebaali tebaalina tebayina terina tetulina
tetuteekeddwa tewali teyalina teyayina tolina tu tuyina tulina tuyina twafuna twetaaga wa wabula
wabweru wadde waggulunnina wakati waliwobangi waliyo wandi wange wano wansi weebwa yabadde yaffe
ye yenna yennyini yina yonna ziba zijja zonna
""".split()
)


@ -15,7 +15,7 @@
STOP_WORDS = set(
"""
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaangde aangezien achter achterna
aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaande aangezien achter achterna
afgelopen aldus alhoewel anderzijds
ben bij bijna bijvoorbeeld behalve beide beiden beneden bent bepaald beter betere betreffende binnen binnenin boven


@ -40,6 +40,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
span_label = doc.vocab.strings.add("NP")
# Only NOUNS and PRONOUNS matter
end_span = -1
for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)):
# For NOUNS
# Pick children from syntactic parse (only those with certain dependencies)
@ -58,15 +59,17 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
children_i = [c.i for c in children] + [word.i]
start_span = min(children_i)
end_span = max(children_i) + 1
yield start_span, end_span, span_label
if start_span >= end_span:
end_span = max(children_i) + 1
yield start_span, end_span, span_label
# PRONOUNS only if it is the subject of a verb
elif word.pos == PRON:
if word.dep in pronoun_deps:
start_span = word.i
end_span = word.i + 1
yield start_span, end_span, span_label
if start_span >= end_span:
end_span = word.i + 1
yield start_span, end_span, span_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}


@ -1,5 +1,5 @@
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS, COMBINING_DIACRITICS
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
@ -44,3 +44,23 @@ TOKENIZER_INFIXES = (
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
)
# Some languages e.g. written with the Cyrillic alphabet permit the use of diacritics
# to mark stressed syllables in words where stress is distinctive. Such languages
# should use the COMBINING_DIACRITICS... suffix and infix regex lists in
# place of the standard ones.
COMBINING_DIACRITICS_TOKENIZER_SUFFIXES = list(TOKENIZER_SUFFIXES) + [
r"(?<=[{a}][{d}])\.".format(a=ALPHA, d=COMBINING_DIACRITICS),
]
COMBINING_DIACRITICS_TOKENIZER_INFIXES = list(TOKENIZER_INFIXES) + [
r"(?<=[{al}][{d}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES, d=COMBINING_DIACRITICS
),
r"(?<=[{a}][{d}]),(?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS),
r"(?<=[{a}][{d}])(?:{h})(?=[{a}])".format(
a=ALPHA, d=COMBINING_DIACRITICS, h=HYPHENS
),
r"(?<=[{a}][{d}])[:<>=/](?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS),
]
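
A small self-contained check of the extra suffix pattern (illustrative; ALPHA is narrowed to Cyrillic letters here only to keep the snippet short, whereas spaCy's real constant covers all alphabetic characters):

    import re

    COMBINING_DIACRITICS = r"\u0300-\u036f"
    ALPHA = r"а-яА-ЯёЁ"   # simplified stand-in for spaCy's ALPHA
    suffix = re.compile(r"(?<=[{a}][{d}])\.".format(a=ALPHA, d=COMBINING_DIACRITICS))

    text = "дома" + "\u0301" + "."       # stress mark (U+0301) on the final vowel
    print(bool(suffix.search(text)))     # True: the trailing "." can be split off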


@ -5,6 +5,8 @@ from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import RussianLemmatizer
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
from ...language import Language, BaseDefaults
@ -12,6 +14,8 @@ class RussianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES
class Russian(Language):
@ -24,7 +28,7 @@ class Russian(Language):
assigns=["token.lemma"],
default_config={
"model": None,
"mode": "pymorphy2",
"mode": "pymorphy3",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
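
On this branch, a blank Russian pipeline should therefore keep a trailing period separate even after a combining stress mark (illustrative; no trained model required):

    import spacy

    nlp = spacy.blank("ru")            # picks up the diacritics-aware suffix list
    text = "дома" + "\u0301" + "."     # "дома́." with an explicit combining accent
    print([t.text for t in nlp(text)]) # expected: ["дома́", "."]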


@ -19,33 +19,48 @@ class RussianLemmatizer(Lemmatizer):
model: Optional[Model],
name: str = "lemmatizer",
*,
mode: str = "pymorphy2",
mode: str = "pymorphy3",
overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None:
if mode == "pymorphy2":
if mode in {"pymorphy2", "pymorphy2_lookup"}:
try:
from pymorphy2 import MorphAnalyzer
except ImportError:
raise ImportError(
"The Russian lemmatizer mode 'pymorphy2' requires the "
"pymorphy2 library. Install it with: pip install pymorphy2"
"The lemmatizer mode 'pymorphy2' requires the "
"pymorphy2 library and dictionaries. Install them with: "
"pip install pymorphy2"
"# for Ukrainian dictionaries:"
"pip install pymorphy2-dicts-uk"
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer()
self._morph = MorphAnalyzer(lang="ru")
elif mode in {"pymorphy3", "pymorphy3_lookup"}:
try:
from pymorphy3 import MorphAnalyzer
except ImportError:
raise ImportError(
"The lemmatizer mode 'pymorphy3' requires the "
"pymorphy3 library and dictionaries. Install them with: "
"pip install pymorphy3"
"# for Ukrainian dictionaries:"
"pip install pymorphy3-dicts-uk"
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer(lang="ru")
super().__init__(
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
def pymorphy2_lemmatize(self, token: Token) -> List[str]:
def _pymorphy_lemmatize(self, token: Token) -> List[str]:
string = token.text
univ_pos = token.pos_
morphology = token.morph.to_dict()
if univ_pos == "PUNCT":
return [PUNCT_RULES.get(string, string)]
if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
# Skip unchangeable pos
return [string.lower()]
return self._pymorphy_lookup_lemmatize(token)
analyses = self._morph.parse(string)
filtered_analyses = []
for analysis in analyses:
@ -53,8 +68,10 @@ class RussianLemmatizer(Lemmatizer):
# Skip suggested parse variant for unknown word for pymorphy
continue
analysis_pos, _ = oc2ud(str(analysis.tag))
if analysis_pos == univ_pos or (
analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
if (
analysis_pos == univ_pos
or (analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN"))
or ((analysis_pos == "PRON") and (univ_pos == "DET"))
):
filtered_analyses.append(analysis)
if not len(filtered_analyses):
@ -97,13 +114,28 @@ class RussianLemmatizer(Lemmatizer):
dict.fromkeys([analysis.normal_form for analysis in filtered_analyses])
)
def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
def _pymorphy_lookup_lemmatize(self, token: Token) -> List[str]:
string = token.text
analyses = self._morph.parse(string)
if len(analyses) == 1:
return [analyses[0].normal_form]
# often multiple forms would derive from the same normal form
# thus check _unique_ normal forms
normal_forms = set([an.normal_form for an in analyses])
if len(normal_forms) == 1:
return [next(iter(normal_forms))]
return [string]
def pymorphy2_lemmatize(self, token: Token) -> List[str]:
return self._pymorphy_lemmatize(token)
def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
return self._pymorphy_lookup_lemmatize(token)
def pymorphy3_lemmatize(self, token: Token) -> List[str]:
return self._pymorphy_lemmatize(token)
def pymorphy3_lookup_lemmatize(self, token: Token) -> List[str]:
return self._pymorphy_lookup_lemmatize(token)
def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
gram_map = {

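A minimal sketch of opting into the new lemmatizer mode (assumes this branch plus `pip install pymorphy3`); without a tagger in the pipeline, the lemmas come straight from pymorphy3's lookup:

    import spacy

    nlp = spacy.blank("ru")
    nlp.add_pipe("lemmatizer", config={"mode": "pymorphy3"})
    doc = nlp("он видел кошек")
    print([t.lemma_ for t in doc])   # lemmas provided by pymorphy3
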
Some files were not shown because too many files have changed in this diff.