Mirror of https://github.com/explosion/spaCy.git

Commit 683f470852: Merge branch 'master' into feature/coref
.github/ISSUE_TEMPLATE/01_bugs.md (2 changes)

@@ -4,6 +4,8 @@ about: Use this template if you came across a bug or unexpected behaviour differ
 
 ---
 
+<!-- NOTE: For questions or install related issues, please open a Discussion instead. -->
+
 ## How to reproduce the behaviour
 <!-- Include a code example or the steps that led to the problem. Please try to be as specific as possible. -->
 
.github/ISSUE_TEMPLATE/config.yml (3 changes)

@@ -1,8 +1,5 @@
 blank_issues_enabled: false
 contact_links:
-  - name: ⚠️ Python 3.10 Support
-    url: https://github.com/explosion/spaCy/discussions/9418
-    about: Python 3.10 wheels haven't been released yet, see the link for details.
  - name: 🗯 Discussions Forum
    url: https://github.com/explosion/spaCy/discussions
    about: Install issues, usage questions, general discussion and anything else that isn't a bug report.
.github/azure-steps.yml (34 changes)

@@ -64,12 +64,12 @@ steps:
    displayName: "Run GPU tests"
    condition: eq(${{ parameters.gpu }}, true)
 
-  - script: |
-      python -m spacy download ca_core_news_sm
-      python -m spacy download ca_core_news_md
-      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-    displayName: 'Test download CLI'
-    condition: eq(variables['python_version'], '3.8')
+  # - script: |
+  #     python -m spacy download ca_core_news_sm
+  #     python -m spacy download ca_core_news_md
+  #     python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+  #   displayName: 'Test download CLI'
+  #   condition: eq(variables['python_version'], '3.8')
 
  - script: |
      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .

@@ -93,17 +93,17 @@ steps:
    displayName: 'Test train CLI'
    condition: eq(variables['python_version'], '3.8')
 
-  - script: |
-      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-    displayName: 'Test assemble CLI'
-    condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-    displayName: 'Test assemble CLI vectors warning'
-    condition: eq(variables['python_version'], '3.8')
+  # - script: |
+  #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+  #     PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+  #   displayName: 'Test assemble CLI'
+  #   condition: eq(variables['python_version'], '3.8')
+  #
+  # - script: |
+  #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+  #     python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+  #   displayName: 'Test assemble CLI vectors warning'
+  #   condition: eq(variables['python_version'], '3.8')
 
  - script: |
      python .github/validate_universe_json.py website/meta/universe.json
.github/contributors/fonfonx.md (new file, 106 lines)

@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry           |
+| ------------------------------ | --------------- |
+| Name                           | Xavier Fontaine |
+| Company name (if applicable)   |                 |
+| Title or role (if applicable)  |                 |
+| Date                           | 2022-04-13      |
+| GitHub username                | fonfonx         |
+| Website (optional)             |                 |
.github/workflows/gputests.yml (new file, 21 lines)

@@ -0,0 +1,21 @@
+name: Weekly GPU tests
+
+on:
+  schedule:
+    - cron: '0 1 * * MON'
+
+jobs:
+  weekly-gputests:
+    strategy:
+      fail-fast: false
+      matrix:
+        branch: [master, v4]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Trigger buildkite build
+        uses: buildkite/trigger-pipeline-action@v1.2.0
+        env:
+          PIPELINE: explosion-ai/spacy-slow-gpu-tests
+          BRANCH: ${{ matrix.branch }}
+          MESSAGE: ":github: Weekly GPU + slow tests - triggered from a GitHub Action"
+          BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
.github/workflows/slowtests.yml (new file, 37 lines)

@@ -0,0 +1,37 @@
+name: Daily slow tests
+
+on:
+  schedule:
+    - cron: '0 0 * * *'
+
+jobs:
+  daily-slowtests:
+    strategy:
+      fail-fast: false
+      matrix:
+        branch: [master, v4]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v1
+        with:
+          ref: ${{ matrix.branch }}
+      - name: Get commits from past 24 hours
+        id: check_commits
+        run: |
+          today=$(date '+%Y-%m-%d %H:%M:%S')
+          yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
+          if git log --after="$yesterday" --before="$today" | grep commit ; then
+            echo "::set-output name=run_tests::true"
+          else
+            echo "::set-output name=run_tests::false"
+          fi
+
+      - name: Trigger buildkite build
+        if: steps.check_commits.outputs.run_tests == 'true'
+        uses: buildkite/trigger-pipeline-action@v1.2.0
+        env:
+          PIPELINE: explosion-ai/spacy-slow-tests
+          BRANCH: ${{ matrix.branch }}
+          MESSAGE: ":github: Daily slow tests - triggered from a GitHub Action"
+          BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
.gitignore (1 change)

@@ -9,7 +9,6 @@ keys/
 spacy/tests/package/setup.cfg
 spacy/tests/package/pyproject.toml
 spacy/tests/package/requirements.txt
-spacy/tests/universe/universe.json
 
 # Website
 website/.cache/
CONTRIBUTING.md

@@ -144,7 +144,7 @@ Changes to `.py` files will be effective immediately.
 
 When fixing a bug, first create an
 [issue](https://github.com/explosion/spaCy/issues) if one does not already
 exist. The description text can be very short – we don't want to make this too
 bureaucratic.
 
 Next, add a test to the relevant file in the

@@ -233,7 +233,7 @@ also want to keep an eye on unused declared variables or repeated
 (i.e. overwritten) dictionary keys. If your code was formatted with `black`
 (see above), you shouldn't see any formatting-related warnings.
 
-The [`.flake8`](.flake8) config defines the configuration we use for this
+The `flake8` section in [`setup.cfg`](setup.cfg) defines the configuration we use for this
 codebase. For example, we're not super strict about the line length, and we're
 excluding very large files like lemmatization and tokenizer exception tables.
 
README.md (31 changes)

@@ -32,19 +32,20 @@ open-source software, released under the MIT license.
 
 ## 📖 Documentation
 
 | Documentation |  |
-| -------------------------- | -------------------------------------------------------------- |
+| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
 | 📚 **[Usage Guides]** | How to use spaCy and its features. |
 | 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
 | 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
 | 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
 | 📦 **[Models]** | Download trained pipelines for spaCy. |
 | 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
 | 👩🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
 | 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
 | 🛠 **[Changelog]** | Changes and version history. |
 | 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
+| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
 
 [spacy 101]: https://spacy.io/usage/spacy-101
 [new in v3.0]: https://spacy.io/usage/v3

@@ -60,9 +61,7 @@ open-source software, released under the MIT license.
 
 ## 💬 Where to ask questions
 
-The spaCy project is maintained by **[@honnibal](https://github.com/honnibal)**,
-**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)**,
-**[@adrianeboyd](https://github.com/adrianeboyd)** and **[@polm](https://github.com/polm)**.
+The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
 Please understand that we won't be able to provide individual support via email.
 We also believe that help is much more valuable if it's shared publicly, so that
 more people can benefit from it.
azure-pipelines.yml

@@ -11,12 +11,14 @@ trigger:
    exclude:
      - "website/*"
      - "*.md"
+      - ".github/workflows/*"
 pr:
   paths:
     exclude:
      - "*.md"
      - "website/docs/*"
      - "website/src/*"
+      - ".github/workflows/*"
 
 jobs:
   # Perform basic checks for most important errors (syntax etc.) Uses the config
@@ -137,7 +137,7 @@ If any of the TODOs you've added are important and should be fixed soon, you sho
 
 ## Type hints
 
-We use Python type hints across the `.py` files wherever possible. This makes it easy to understand what a function expects and returns, and modern editors will be able to show this information to you when you call an annotated function. Type hints are not currently used in the `.pyx` (Cython) code, except for definitions of registered functions and component factories, where they're used for config validation.
+We use Python type hints across the `.py` files wherever possible. This makes it easy to understand what a function expects and returns, and modern editors will be able to show this information to you when you call an annotated function. Type hints are not currently used in the `.pyx` (Cython) code, except for definitions of registered functions and component factories, where they're used for config validation. Ideally when developing, run `mypy spacy` on the code base to inspect any issues.
 
 If possible, you should always use the more descriptive type hints like `List[str]` or even `List[Any]` instead of only `list`. We also annotate arguments and return types of `Callable` – although, you can simplify this if the type otherwise gets too verbose (e.g. functions that return factories to create callbacks). Remember that `Callable` takes two values: a **list** of the argument type(s) in order, and the return values.
 

@@ -155,6 +155,13 @@ def create_callback(some_arg: bool) -> Callable[[str, int], List[str]]:
     return callback
 ```
 
+For typing variables, we prefer the explicit format.
+
+```diff
+- var = value  # type: Type
++ var: Type = value
+```
+
 For model architectures, Thinc also provides a collection of [custom types](https://thinc.ai/docs/api-types), including more specific types for arrays and model inputs/outputs. Even outside of static type checking, using these types will make the code a lot easier to read and follow, since it's always clear what array types are expected (and what might go wrong if the output is different from the expected type).
 
 ```python
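For illustration only (not part of this commit), a minimal sketch of the two conventions described above: `Callable[...]` taking the argument types as a list followed by the return type, and the explicit `var: Type = value` annotation format.

```python
from typing import Callable, List


def truncate(text: str, n: int) -> List[str]:
    """Return at most the first n whitespace-separated tokens."""
    return text.split()[:n]


# Explicit variable annotation: the argument types go in a list, then the return type.
callback: Callable[[str, int], List[str]] = truncate
print(callback("spaCy type hints example", 2))  # ['spaCy', 'type']
```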
pyproject.toml

@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.12,<8.1.0",
+    "thinc>=8.0.14,<8.1.0",
     "blis>=0.4.0,<0.8.0",
     "pathy",
     "numpy>=1.15.0",
requirements.txt

@@ -1,14 +1,14 @@
 # Our libraries
-spacy-legacy>=3.0.8,<3.1.0
+spacy-legacy>=3.0.9,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.12,<8.1.0
+thinc>=8.0.14,<8.1.0
 blis>=0.4.0,<0.8.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.8.1,<1.1.0
-srsly>=2.4.1,<3.0.0
+wasabi>=0.9.1,<1.1.0
+srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
 typer>=0.3.0,<0.5.0
 pathy>=0.3.5

@@ -26,7 +26,7 @@ typing_extensions>=3.7.4.1,<4.0.0.0; python_version < "3.8"
 # Development dependencies
 pre-commit>=2.13.0
 cython>=0.25,<3.0
-pytest>=5.2.0
+pytest>=5.2.0,!=7.1.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<3.10.0

@@ -35,3 +35,4 @@ mypy==0.910
 types-dataclasses>=0.1.3; python_version < "3.7"
 types-mock>=0.1.1
 types-requests
+black>=22.0,<23.0
setup.cfg (10 changes)

@@ -38,18 +38,18 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.12,<8.1.0
+    thinc>=8.0.14,<8.1.0
 install_requires =
     # Our libraries
-    spacy-legacy>=3.0.8,<3.1.0
+    spacy-legacy>=3.0.9,<3.1.0
     spacy-loggers>=1.0.0,<2.0.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.12,<8.1.0
+    thinc>=8.0.14,<8.1.0
     blis>=0.4.0,<0.8.0
-    wasabi>=0.8.1,<1.1.0
-    srsly>=2.4.1,<3.0.0
+    wasabi>=0.9.1,<1.1.0
+    srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     typer>=0.3.0,<0.5.0
     pathy>=0.3.5
setup.py (3 changes)

@@ -23,6 +23,7 @@ Options.docstrings = True
 
 PACKAGES = find_packages()
 MOD_NAMES = [
+    "spacy.training.alignment_array",
     "spacy.training.example",
     "spacy.parts_of_speech",
     "spacy.strings",

@@ -33,6 +34,7 @@ MOD_NAMES = [
     "spacy.ml.parser_model",
     "spacy.morphology",
     "spacy.pipeline.dep_parser",
+    "spacy.pipeline._edit_tree_internals.edit_trees",
     "spacy.pipeline.morphologizer",
     "spacy.pipeline.multitask",
     "spacy.pipeline.ner",

@@ -81,7 +83,6 @@ COPY_FILES = {
     ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package",
     ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package",
     ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package",
-    ROOT / "website" / "meta" / "universe.json": PACKAGE_ROOT / "tests" / "universe",
 }
spacy/about.py

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.2.1"
+__version__ = "3.3.0.dev0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
spacy/cli/__init__.py

@@ -14,6 +14,7 @@ from .pretrain import pretrain  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
 from .debug_config import debug_config  # noqa: F401
 from .debug_model import debug_model  # noqa: F401
+from .debug_diff import debug_diff  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_pipeline import init_pipeline_cli  # noqa: F401
@@ -360,7 +360,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
     src = str(src)
     with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
         with dest.open(mode="wb") as output_file:
-            output_file.write(input_file.read())
+            shutil.copyfileobj(input_file, output_file)
 
 
 def ensure_pathy(path):
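For illustration only (not part of this commit): the one-line change above switches from reading the whole download into memory to a chunked copy. A minimal standalone sketch of the same pattern, with placeholder file names:

```python
import shutil

# shutil.copyfileobj streams the source to the destination in chunks,
# so large downloads no longer have to fit in memory all at once.
with open("source.bin", "rb") as input_file:
    with open("dest.bin", "wb") as output_file:
        shutil.copyfileobj(input_file, output_file)
```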
spacy/cli/debug_data.py

@@ -19,6 +19,7 @@ from ..morphology import Morphology
 from ..language import Language
 from ..util import registry, resolve_dot_names
 from ..compat import Literal
+from ..vectors import Mode as VectorsMode
 from .. import util
 
 

@@ -170,29 +171,101 @@ def debug_data(
         show=verbose,
     )
     if len(nlp.vocab.vectors):
-        msg.info(
-            f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
-            f"unique keys, {nlp.vocab.vectors_length} dimensions)"
-        )
-        n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
-        msg.warn(
-            "{} words in training data without vectors ({:.0f}%)".format(
-                n_missing_vectors,
-                100 * (n_missing_vectors / gold_train_data["n_words"]),
-            ),
-        )
-        msg.text(
-            "10 most common words without vectors: {}".format(
-                _format_labels(
-                    gold_train_data["words_missing_vectors"].most_common(10),
-                    counts=True,
-                )
-            ),
-            show=verbose,
-        )
+        if nlp.vocab.vectors.mode == VectorsMode.floret:
+            msg.info(
+                f"floret vectors with {len(nlp.vocab.vectors)} vectors, "
+                f"{nlp.vocab.vectors_length} dimensions, "
+                f"{nlp.vocab.vectors.minn}-{nlp.vocab.vectors.maxn} char "
+                f"n-gram subwords"
+            )
+        else:
+            msg.info(
+                f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
+                f"unique keys, {nlp.vocab.vectors_length} dimensions)"
+            )
+            n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
+            msg.warn(
+                "{} words in training data without vectors ({:.0f}%)".format(
+                    n_missing_vectors,
+                    100 * (n_missing_vectors / gold_train_data["n_words"]),
+                ),
+            )
+            msg.text(
+                "10 most common words without vectors: {}".format(
+                    _format_labels(
+                        gold_train_data["words_missing_vectors"].most_common(10),
+                        counts=True,
+                    )
+                ),
+                show=verbose,
+            )
     else:
         msg.info("No word vectors present in the package")
 
+    if "spancat" in factory_names:
+        model_labels_spancat = _get_labels_from_spancat(nlp)
+        has_low_data_warning = False
+        has_no_neg_warning = False
+
+        msg.divider("Span Categorization")
+        msg.table(model_labels_spancat, header=["Spans Key", "Labels"], divider=True)
+
+        msg.text("Label counts in train data: ", show=verbose)
+        for spans_key, data_labels in gold_train_data["spancat"].items():
+            msg.text(
+                f"Key: {spans_key}, {_format_labels(data_labels.items(), counts=True)}",
+                show=verbose,
+            )
+        # Data checks: only take the spans keys in the actual spancat components
+        data_labels_in_component = {
+            spans_key: gold_train_data["spancat"][spans_key]
+            for spans_key in model_labels_spancat.keys()
+        }
+        for spans_key, data_labels in data_labels_in_component.items():
+            for label, count in data_labels.items():
+                # Check for missing labels
+                spans_key_in_model = spans_key in model_labels_spancat.keys()
+                if (spans_key_in_model) and (
+                    label not in model_labels_spancat[spans_key]
+                ):
+                    msg.warn(
+                        f"Label '{label}' is not present in the model labels of key '{spans_key}'. "
+                        "Performance may degrade after training."
+                    )
+                # Check for low number of examples per label
+                if count <= NEW_LABEL_THRESHOLD:
+                    msg.warn(
+                        f"Low number of examples for label '{label}' in key '{spans_key}' ({count})"
+                    )
+                    has_low_data_warning = True
+                # Check for negative examples
+                with msg.loading("Analyzing label distribution..."):
+                    neg_docs = _get_examples_without_label(
+                        train_dataset, label, "spancat", spans_key
+                    )
+                if neg_docs == 0:
+                    msg.warn(f"No examples for texts WITHOUT new label '{label}'")
+                    has_no_neg_warning = True
+
+        if has_low_data_warning:
+            msg.text(
+                f"To train a new span type, your data should include at "
+                f"least {NEW_LABEL_THRESHOLD} instances of the new label",
+                show=verbose,
+            )
+        else:
+            msg.good("Good amount of examples for all labels")
+
+        if has_no_neg_warning:
+            msg.text(
+                "Training data should always include examples of spans "
+                "in context, as well as examples without a given span "
+                "type.",
+                show=verbose,
+            )
+        else:
+            msg.good("Examples without ocurrences available for all labels")
+
     if "ner" in factory_names:
         # Get all unique NER labels present in the data
         labels = set(

@@ -238,7 +311,7 @@ def debug_data(
                 has_low_data_warning = True
 
             with msg.loading("Analyzing label distribution..."):
-                neg_docs = _get_examples_without_label(train_dataset, label)
+                neg_docs = _get_examples_without_label(train_dataset, label, "ner")
             if neg_docs == 0:
                 msg.warn(f"No examples for texts WITHOUT new label '{label}'")
                 has_no_neg_warning = True

@@ -573,6 +646,7 @@ def _compile_gold(
         "deps": Counter(),
         "words": Counter(),
         "roots": Counter(),
+        "spancat": dict(),
         "ws_ents": 0,
         "boundary_cross_ents": 0,
         "n_words": 0,

@@ -603,6 +677,7 @@ def _compile_gold(
             if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                 data["words_missing_vectors"].update([word])
         if "ner" in factory_names:
+            sent_starts = eg.get_aligned_sent_starts()
             for i, label in enumerate(eg.get_aligned_ner()):
                 if label is None:
                     continue

@@ -612,10 +687,19 @@ def _compile_gold(
                 if label.startswith(("B-", "U-")):
                     combined_label = label.split("-")[1]
                     data["ner"][combined_label] += 1
-                if gold[i].is_sent_start and label.startswith(("I-", "L-")):
+                if sent_starts[i] == True and label.startswith(("I-", "L-")):
                     data["boundary_cross_ents"] += 1
                 elif label == "-":
                     data["ner"]["-"] += 1
+        if "spancat" in factory_names:
+            for span_key in list(eg.reference.spans.keys()):
+                if span_key not in data["spancat"]:
+                    data["spancat"][span_key] = Counter()
+                for i, span in enumerate(eg.reference.spans[span_key]):
+                    if span.label_ is None:
+                        continue
+                    else:
+                        data["spancat"][span_key][span.label_] += 1
         if "textcat" in factory_names or "textcat_multilabel" in factory_names:
             data["cats"].update(gold.cats)
             if any(val not in (0, 1) for val in gold.cats.values()):

@@ -686,14 +770,28 @@ def _format_labels(
     return ", ".join([f"'{l}'" for l in cast(Iterable[str], labels)])
 
 
-def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
+def _get_examples_without_label(
+    data: Sequence[Example],
+    label: str,
+    component: Literal["ner", "spancat"] = "ner",
+    spans_key: Optional[str] = "sc",
+) -> int:
     count = 0
     for eg in data:
-        labels = [
-            label.split("-")[1]
-            for label in eg.get_aligned_ner()
-            if label not in ("O", "-", None)
-        ]
+        if component == "ner":
+            labels = [
+                label.split("-")[1]
+                for label in eg.get_aligned_ner()
+                if label not in ("O", "-", None)
+            ]
+
+        if component == "spancat":
+            labels = (
+                [span.label_ for span in eg.reference.spans[spans_key]]
+                if spans_key in eg.reference.spans
+                else []
+            )
+
         if label not in labels:
             count += 1
     return count
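For illustration only (not part of this commit), a minimal standalone sketch of the per-key label counting that the new spancat branch in `_compile_gold` performs, with plain tuples standing in for spaCy `Span` objects:

```python
from collections import Counter

# Hypothetical gold spans grouped by spans key, as (label, start, end) tuples.
reference_spans = {"sc": [("PERSON", 0, 2), ("ORG", 5, 7), ("PERSON", 9, 11)]}

spancat_counts = {}
for span_key, spans in reference_spans.items():
    counts = spancat_counts.setdefault(span_key, Counter())
    for label, _start, _end in spans:
        counts[label] += 1

print(spancat_counts)  # {'sc': Counter({'PERSON': 2, 'ORG': 1})}
```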
spacy/cli/debug_diff.py (new file, 89 lines)

@@ -0,0 +1,89 @@
+from typing import Optional
+
+import typer
+from wasabi import Printer, diff_strings, MarkdownRenderer
+from pathlib import Path
+from thinc.api import Config
+
+from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
+from ..util import load_config
+from .init_config import init_config, Optimizations
+
+
+@debug_cli.command(
+    "diff-config",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def debug_diff_cli(
+    # fmt: off
+    ctx: typer.Context,
+    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
+    compare_to: Optional[Path] = Opt(None, help="Path to a config file to diff against, or `None` to compare against default settings", exists=True, allow_dash=True),
+    optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether the user config was optimized for efficiency or accuracy. Only relevant when comparing against the default config."),
+    gpu: bool = Opt(False, "--gpu", "-G", help="Whether the original config can run on a GPU. Only relevant when comparing against the default config."),
+    pretraining: bool = Opt(False, "--pretraining", "--pt", help="Whether to compare on a config with pretraining involved. Only relevant when comparing against the default config."),
+    markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues")
+    # fmt: on
+):
+    """Show a diff of a config file with respect to spaCy's defaults or another config file. If
+    additional settings were used in the creation of the config file, then you
+    must supply these as extra parameters to the command when comparing to the default settings. The generated diff
+    can also be used when posting to the discussion forum to provide more
+    information for the maintainers.
+
+    The `optimize`, `gpu`, and `pretraining` options are only relevant when
+    comparing against the default configuration (or specifically when `compare_to` is None).
+
+    DOCS: https://spacy.io/api/cli#debug-diff
+    """
+    debug_diff(
+        config_path=config_path,
+        compare_to=compare_to,
+        gpu=gpu,
+        optimize=optimize,
+        pretraining=pretraining,
+        markdown=markdown,
+    )
+
+
+def debug_diff(
+    config_path: Path,
+    compare_to: Optional[Path],
+    gpu: bool,
+    optimize: Optimizations,
+    pretraining: bool,
+    markdown: bool,
+):
+    msg = Printer()
+    with show_validation_error(hint_fill=False):
+        user_config = load_config(config_path)
+        if compare_to:
+            other_config = load_config(compare_to)
+        else:
+            # Recreate a default config based from user's config
+            lang = user_config["nlp"]["lang"]
+            pipeline = list(user_config["nlp"]["pipeline"])
+            msg.info(f"Found user-defined language: '{lang}'")
+            msg.info(f"Found user-defined pipelines: {pipeline}")
+            other_config = init_config(
+                lang=lang,
+                pipeline=pipeline,
+                optimize=optimize.value,
+                gpu=gpu,
+                pretraining=pretraining,
+                silent=True,
+            )
+
+    user = user_config.to_str()
+    other = other_config.to_str()
+
+    if user == other:
+        msg.warn("No diff to show: configs are identical")
+    else:
+        diff_text = diff_strings(other, user, add_symbols=markdown)
+        if markdown:
+            md = MarkdownRenderer()
+            md.add(md.code_block(diff_text, "diff"))
+            print(md.text)
+        else:
+            print(diff_text)
spacy/cli/package.py

@@ -7,6 +7,7 @@ from collections import defaultdict
 from catalogue import RegistryError
 import srsly
 import sys
+import re
 
 from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
 from ..schemas import validate, ModelMetaSchema

@@ -109,6 +110,24 @@ def package(
             ", ".join(meta["requirements"]),
         )
     if name is not None:
+        if not name.isidentifier():
+            msg.fail(
+                f"Model name ('{name}') is not a valid module name. "
+                "This is required so it can be imported as a module.",
+                "We recommend names that use ASCII A-Z, a-z, _ (underscore), "
+                "and 0-9. "
+                "For specific details see: https://docs.python.org/3/reference/lexical_analysis.html#identifiers",
+                exits=1,
+            )
+        if not _is_permitted_package_name(name):
+            msg.fail(
+                f"Model name ('{name}') is not a permitted package name. "
+                "This is required to correctly load the model with spacy.load.",
+                "We recommend names that use ASCII A-Z, a-z, _ (underscore), "
+                "and 0-9. "
+                "For specific details see: https://www.python.org/dev/peps/pep-0426/#name",
+                exits=1,
+            )
         meta["name"] = name
     if version is not None:
         meta["version"] = version

@@ -162,7 +181,7 @@ def package(
         imports="\n".join(f"from . import {m}" for m in imports)
     )
     create_file(package_path / "__init__.py", init_py)
-    msg.good(f"Successfully created package '{model_name_v}'", main_path)
+    msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
     if create_sdist:
         with util.working_dir(main_path):
             util.run_command([sys.executable, "setup.py", "sdist"], capture=False)

@@ -171,8 +190,14 @@ def package(
     if create_wheel:
         with util.working_dir(main_path):
             util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False)
-        wheel = main_path / "dist" / f"{model_name_v}{WHEEL_SUFFIX}"
+        wheel_name_squashed = re.sub("_+", "_", model_name_v)
+        wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
         msg.good(f"Successfully created binary wheel", wheel)
+    if "__" in model_name:
+        msg.warn(
+            f"Model name ('{model_name}') contains a run of underscores. "
+            "Runs of underscores are not significant in installed package names.",
+        )
 
 
 def has_wheel() -> bool:

@@ -422,6 +447,14 @@ def _format_label_scheme(data: Dict[str, Any]) -> str:
     return md.text
 
 
+def _is_permitted_package_name(package_name: str) -> bool:
+    # regex from: https://www.python.org/dev/peps/pep-0426/#name
+    permitted_match = re.search(
+        r"^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$", package_name, re.IGNORECASE
+    )
+    return permitted_match is not None
+
+
 TEMPLATE_SETUP = """
 #!/usr/bin/env python
 import io
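For illustration only (not part of this commit): to see what the new package-name check accepts, here is a small standalone sketch that reuses the same PEP 426 regex. The sample names are made up for the example.

```python
import re


def is_permitted_package_name(package_name: str) -> bool:
    # Same regex as the new _is_permitted_package_name helper above (PEP 426 name rule).
    return (
        re.search(
            r"^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$", package_name, re.IGNORECASE
        )
        is not None
    )


for name in ["en_core_web_sm", "my-model", "_private", "trailing_"]:
    print(name, is_permitted_package_name(name))
# en_core_web_sm True, my-model True, _private False, trailing_ False
```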
@ -3,9 +3,15 @@ the docs and the init config command. It encodes various best practices and
|
||||||
can help generate the best possible configuration, given a user's requirements. #}
|
can help generate the best possible configuration, given a user's requirements. #}
|
||||||
{%- set use_transformer = hardware != "cpu" -%}
|
{%- set use_transformer = hardware != "cpu" -%}
|
||||||
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
||||||
|
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
|
||||||
[paths]
|
[paths]
|
||||||
train = null
|
train = null
|
||||||
dev = null
|
dev = null
|
||||||
|
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
|
||||||
|
vectors = null
|
||||||
|
{% else -%}
|
||||||
|
vectors = "{{ word_vectors }}"
|
||||||
|
{% endif -%}
|
||||||
|
|
||||||
[system]
|
[system]
|
||||||
{% if use_transformer -%}
|
{% if use_transformer -%}
|
||||||
|
@ -19,10 +25,10 @@ lang = "{{ lang }}"
|
||||||
{%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
|
{%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
|
||||||
{%- set with_accuracy = optimize == "accuracy" -%}
|
{%- set with_accuracy = optimize == "accuracy" -%}
|
||||||
{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
|
{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
|
||||||
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "entity_linker" in components or has_accurate_textcat) -%}
|
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%}
|
||||||
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
|
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
|
||||||
{%- else -%}
|
{%- else -%}
|
||||||
{%- set full_pipeline = components %}
|
{%- set full_pipeline = components -%}
|
||||||
{%- endif %}
|
{%- endif %}
|
||||||
pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }}
|
pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }}
|
||||||
batch_size = {{ 128 if hardware == "gpu" else 1000 }}
|
batch_size = {{ 128 if hardware == "gpu" else 1000 }}
|
||||||
|
@ -49,7 +55,7 @@ stride = 96
|
||||||
factory = "morphologizer"
|
factory = "morphologizer"
|
||||||
|
|
||||||
[components.morphologizer.model]
|
[components.morphologizer.model]
|
||||||
@architectures = "spacy.Tagger.v1"
|
@architectures = "spacy.Tagger.v2"
|
||||||
nO = null
|
nO = null
|
||||||
|
|
||||||
[components.morphologizer.model.tok2vec]
|
[components.morphologizer.model.tok2vec]
|
||||||
|
@ -65,7 +71,7 @@ grad_factor = 1.0
|
||||||
factory = "tagger"
|
factory = "tagger"
|
||||||
|
|
||||||
[components.tagger.model]
|
[components.tagger.model]
|
||||||
@architectures = "spacy.Tagger.v1"
|
@architectures = "spacy.Tagger.v2"
|
||||||
nO = null
|
nO = null
|
||||||
|
|
||||||
[components.tagger.model.tok2vec]
|
[components.tagger.model.tok2vec]
|
||||||
|
@ -118,6 +124,60 @@ grad_factor = 1.0
|
||||||
@layers = "reduce_mean.v1"
|
@layers = "reduce_mean.v1"
|
||||||
{% endif -%}
|
{% endif -%}
|
||||||
|
|
||||||
|
{% if "spancat" in components -%}
|
||||||
|
[components.spancat]
|
||||||
|
factory = "spancat"
|
||||||
|
max_positive = null
|
||||||
|
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
|
||||||
|
spans_key = "sc"
|
||||||
|
threshold = 0.5
|
||||||
|
|
||||||
|
[components.spancat.model]
|
||||||
|
@architectures = "spacy.SpanCategorizer.v1"
|
||||||
|
|
||||||
|
[components.spancat.model.reducer]
|
||||||
|
@layers = "spacy.mean_max_reducer.v1"
|
||||||
|
hidden_size = 128
|
||||||
|
|
||||||
|
[components.spancat.model.scorer]
|
||||||
|
@layers = "spacy.LinearLogistic.v1"
|
||||||
|
nO = null
|
||||||
|
nI = null
|
||||||
|
|
||||||
|
[components.spancat.model.tok2vec]
|
||||||
|
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||||
|
grad_factor = 1.0
|
||||||
|
|
||||||
|
[components.spancat.model.tok2vec.pooling]
|
||||||
|
@layers = "reduce_mean.v1"
|
||||||
|
|
||||||
|
[components.spancat.suggester]
|
||||||
|
@misc = "spacy.ngram_suggester.v1"
|
||||||
|
sizes = [1,2,3]
|
||||||
|
{% endif -%}
|
||||||
|
|
||||||
|
{% if "trainable_lemmatizer" in components -%}
|
||||||
|
[components.trainable_lemmatizer]
|
||||||
|
factory = "trainable_lemmatizer"
|
||||||
|
backoff = "orth"
|
||||||
|
min_tree_freq = 3
|
||||||
|
overwrite = false
|
||||||
|
scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"}
|
||||||
|
top_k = 1
|
||||||
|
|
||||||
|
[components.trainable_lemmatizer.model]
|
||||||
|
@architectures = "spacy.Tagger.v2"
|
||||||
|
nO = null
|
||||||
|
normalize = false
|
||||||
|
|
||||||
|
[components.trainable_lemmatizer.model.tok2vec]
|
||||||
|
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||||
|
grad_factor = 1.0
|
||||||
|
|
||||||
|
[components.trainable_lemmatizer.model.tok2vec.pooling]
|
||||||
|
@layers = "reduce_mean.v1"
|
||||||
|
{% endif -%}
|
||||||
|
|
||||||
{% if "entity_linker" in components -%}
|
{% if "entity_linker" in components -%}
|
||||||
[components.entity_linker]
|
[components.entity_linker]
|
||||||
factory = "entity_linker"
|
factory = "entity_linker"
|
||||||
|
@ -126,7 +186,7 @@ incl_context = true
|
||||||
incl_prior = true
|
incl_prior = true
|
||||||
|
|
||||||
[components.entity_linker.model]
|
[components.entity_linker.model]
|
||||||
@architectures = "spacy.EntityLinker.v1"
|
@architectures = "spacy.EntityLinker.v2"
|
||||||
nO = null
|
nO = null
|
||||||
|
|
||||||
[components.entity_linker.model.tok2vec]
|
[components.entity_linker.model.tok2vec]
|
||||||
|
@ -233,7 +293,7 @@ maxout_pieces = 3
|
||||||
factory = "morphologizer"
|
factory = "morphologizer"
|
||||||
|
|
||||||
[components.morphologizer.model]
|
[components.morphologizer.model]
|
||||||
@architectures = "spacy.Tagger.v1"
|
@architectures = "spacy.Tagger.v2"
|
||||||
nO = null
|
nO = null
|
||||||
|
|
||||||
[components.morphologizer.model.tok2vec]
|
[components.morphologizer.model.tok2vec]
|
||||||
|
@ -246,7 +306,7 @@ width = ${components.tok2vec.model.encode.width}
|
||||||
factory = "tagger"
|
factory = "tagger"
|
||||||
|
|
||||||
[components.tagger.model]
|
[components.tagger.model]
|
||||||
@architectures = "spacy.Tagger.v1"
|
@architectures = "spacy.Tagger.v2"
|
||||||
nO = null
|
nO = null
|
||||||
|
|
||||||
[components.tagger.model.tok2vec]
|
[components.tagger.model.tok2vec]
|
||||||
|
@ -290,6 +350,54 @@ nO = null
|
||||||
width = ${components.tok2vec.model.encode.width}
|
width = ${components.tok2vec.model.encode.width}
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
|
{% if "spancat" in components %}
|
||||||
|
[components.spancat]
|
||||||
|
factory = "spancat"
|
||||||
|
max_positive = null
|
||||||
|
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
|
||||||
|
spans_key = "sc"
|
||||||
|
threshold = 0.5
|
||||||
|
|
||||||
|
[components.spancat.model]
|
||||||
|
@architectures = "spacy.SpanCategorizer.v1"
|
||||||
|
|
||||||
|
[components.spancat.model.reducer]
|
||||||
|
@layers = "spacy.mean_max_reducer.v1"
|
||||||
|
hidden_size = 128
|
||||||
|
|
||||||
|
[components.spancat.model.scorer]
|
||||||
|
@layers = "spacy.LinearLogistic.v1"
|
||||||
|
nO = null
|
||||||
|
nI = null
|
||||||
|
|
||||||
|
[components.spancat.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
|
||||||
|
[components.spancat.suggester]
|
||||||
|
@misc = "spacy.ngram_suggester.v1"
|
||||||
|
sizes = [1,2,3]
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
{% if "trainable_lemmatizer" in components -%}
|
||||||
|
[components.trainable_lemmatizer]
|
||||||
|
factory = "trainable_lemmatizer"
|
||||||
|
backoff = "orth"
|
||||||
|
min_tree_freq = 3
|
||||||
|
overwrite = false
|
||||||
|
scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"}
|
||||||
|
top_k = 1
|
||||||
|
|
||||||
|
[components.trainable_lemmatizer.model]
|
||||||
|
@architectures = "spacy.Tagger.v2"
|
||||||
|
nO = null
|
||||||
|
normalize = false
|
||||||
|
|
||||||
|
[components.trainable_lemmatizer.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
{% endif -%}
|
||||||
|
|
||||||
{% if "entity_linker" in components -%}
|
{% if "entity_linker" in components -%}
|
||||||
[components.entity_linker]
|
[components.entity_linker]
|
||||||
factory = "entity_linker"
|
factory = "entity_linker"
|
||||||
|
@ -298,7 +406,7 @@ incl_context = true
|
||||||
incl_prior = true
|
incl_prior = true
|
||||||
|
|
||||||
[components.entity_linker.model]
|
[components.entity_linker.model]
|
||||||
@architectures = "spacy.EntityLinker.v1"
|
@architectures = "spacy.EntityLinker.v2"
|
||||||
nO = null
|
nO = null
|
||||||
|
|
||||||
[components.entity_linker.model.tok2vec]
|
[components.entity_linker.model.tok2vec]
|
||||||
|
@ -364,7 +472,7 @@ no_output_layer = false
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
{% for pipe in components %}
|
{% for pipe in components %}
|
||||||
{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker"] %}
|
{% if pipe not in listener_components %}
|
||||||
{# Other components defined by the user: we just assume they're factories #}
|
{# Other components defined by the user: we just assume they're factories #}
|
||||||
[components.{{ pipe }}]
|
[components.{{ pipe }}]
|
||||||
factory = "{{ pipe }}"
|
factory = "{{ pipe }}"
|
||||||
|
@ -421,8 +529,4 @@ compound = 1.001
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
[initialize]
|
[initialize]
|
||||||
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
|
|
||||||
vectors = ${paths.vectors}
|
vectors = ${paths.vectors}
|
||||||
{% else -%}
|
|
||||||
vectors = "{{ word_vectors }}"
|
|
||||||
{% endif -%}
|
|
||||||
|
|
|
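Note: the hunks above bump the template's tagger-style models to spacy.Tagger.v2 / spacy.EntityLinker.v2 and add spancat and trainable_lemmatizer blocks. As a minimal sketch (not part of this diff, and assuming a config generated from this template was saved as the hypothetical file "config.cfg"), the resulting architectures can be inspected like this:

import spacy

config = spacy.util.load_config("config.cfg")  # hypothetical path
for name, section in config["components"].items():
    arch = section.get("model", {}).get("@architectures")
    if arch:
        print(name, arch)  # e.g. "tagger spacy.Tagger.v2"
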
@ -4,10 +4,10 @@ spaCy's built in visualization suite for dependencies and named entities.
DOCS: https://spacy.io/api/top-level#displacy
USAGE: https://spacy.io/usage/visualizers
"""
from typing import Union, Iterable, Optional, Dict, Any, Callable
from typing import List, Union, Iterable, Optional, Dict, Any, Callable
import warnings

from .render import DependencyRenderer, EntityRenderer
from .render import DependencyRenderer, EntityRenderer, SpanRenderer
from ..tokens import Doc, Span
from ..errors import Errors, Warnings
from ..util import is_in_jupyter

@ -44,6 +44,7 @@ def render(
    factories = {
        "dep": (DependencyRenderer, parse_deps),
        "ent": (EntityRenderer, parse_ents),
        "span": (SpanRenderer, parse_spans),
    }
    if style not in factories:
        raise ValueError(Errors.E087.format(style=style))

@ -203,6 +204,42 @@ def parse_ents(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
    return {"text": doc.text, "ents": ents, "title": title, "settings": settings}


def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
    """Generate spans in [{start: i, end: i, label: 'label'}] format.

    doc (Doc): Document to parse.
    options (Dict[str, any]): Span-specific visualisation options.
    RETURNS (dict): Generated span types keyed by text (original text) and spans.
    """
    kb_url_template = options.get("kb_url_template", None)
    spans_key = options.get("spans_key", "sc")
    spans = [
        {
            "start": span.start_char,
            "end": span.end_char,
            "start_token": span.start,
            "end_token": span.end,
            "label": span.label_,
            "kb_id": span.kb_id_ if span.kb_id_ else "",
            "kb_url": kb_url_template.format(span.kb_id_) if kb_url_template else "#",
        }
        for span in doc.spans[spans_key]
    ]
    tokens = [token.text for token in doc]

    if not spans:
        warnings.warn(Warnings.W117.format(spans_key=spans_key))
    title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
    settings = get_doc_settings(doc)
    return {
        "text": doc.text,
        "spans": spans,
        "title": title,
        "settings": settings,
        "tokens": tokens,
    }

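The parse_spans() helper above feeds the new "span" style registered in the factories dict. A minimal usage sketch (not part of this diff); the pipeline, text and span labels are made up for illustration:

import spacy
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Welcome to the Bank of China")
# parse_spans reads doc.spans under the default spans_key "sc"
doc.spans["sc"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")]
html = displacy.render(doc, style="span")  # dispatches to SpanRenderer + parse_spans
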
def set_render_wrapper(func: Callable[[str], str]) -> None:
    """Set an optional wrapper function that is called around the generated
    HTML markup on displacy.render. This can be used to allow integration into

@ -1,12 +1,15 @@
from typing import Dict, Any, List, Optional, Union
from typing import Any, Dict, List, Optional, Tuple, Union
import uuid
import itertools

from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
from .templates import TPL_ENTS, TPL_KB_LINK
from ..util import minify_html, escape_html, registry
from ..errors import Errors
from ..util import escape_html, minify_html, registry
from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS
from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS
from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN
from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL
from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS
from .templates import TPL_TITLE

DEFAULT_LANG = "en"
DEFAULT_DIR = "ltr"

@ -33,6 +36,168 @@ DEFAULT_LABEL_COLORS = {
}


class SpanRenderer:
    """Render Spans as SVGs."""

    style = "span"

    def __init__(self, options: Dict[str, Any] = {}) -> None:
        """Initialise span renderer

        options (dict): Visualiser-specific options (colors, spans)
        """
        # Set up the colors and overall look
        colors = dict(DEFAULT_LABEL_COLORS)
        user_colors = registry.displacy_colors.get_all()
        for user_color in user_colors.values():
            if callable(user_color):
                # Since this comes from the function registry, we want to make
                # sure we support functions that *return* a dict of colors
                user_color = user_color()
            if not isinstance(user_color, dict):
                raise ValueError(Errors.E925.format(obj=type(user_color)))
            colors.update(user_color)
        colors.update(options.get("colors", {}))
        self.default_color = DEFAULT_ENTITY_COLOR
        self.colors = {label.upper(): color for label, color in colors.items()}

        # Set up how the text and labels will be rendered
        self.direction = DEFAULT_DIR
        self.lang = DEFAULT_LANG
        self.top_offset = options.get("top_offset", 40)
        self.top_offset_step = options.get("top_offset_step", 17)

        # Set up which templates will be used
        template = options.get("template")
        if template:
            self.span_template = template["span"]
            self.span_slice_template = template["slice"]
            self.span_start_template = template["start"]
        else:
            if self.direction == "rtl":
                self.span_template = TPL_SPAN_RTL
                self.span_slice_template = TPL_SPAN_SLICE_RTL
                self.span_start_template = TPL_SPAN_START_RTL
            else:
                self.span_template = TPL_SPAN
                self.span_slice_template = TPL_SPAN_SLICE
                self.span_start_template = TPL_SPAN_START

    def render(
        self, parsed: List[Dict[str, Any]], page: bool = False, minify: bool = False
    ) -> str:
        """Render complete markup.

        parsed (list): Dependency parses to render.
        page (bool): Render parses wrapped as full HTML page.
        minify (bool): Minify HTML markup.
        RETURNS (str): Rendered HTML markup.
        """
        rendered = []
        for i, p in enumerate(parsed):
            if i == 0:
                settings = p.get("settings", {})
                self.direction = settings.get("direction", DEFAULT_DIR)
                self.lang = settings.get("lang", DEFAULT_LANG)
            rendered.append(self.render_spans(p["tokens"], p["spans"], p.get("title")))

        if page:
            docs = "".join([TPL_FIGURE.format(content=doc) for doc in rendered])
            markup = TPL_PAGE.format(content=docs, lang=self.lang, dir=self.direction)
        else:
            markup = "".join(rendered)
        if minify:
            return minify_html(markup)
        return markup

    def render_spans(
        self,
        tokens: List[str],
        spans: List[Dict[str, Any]],
        title: Optional[str],
    ) -> str:
        """Render span types in text.

        Spans are rendered per-token, this means that for each token, we check if it's part
        of a span slice (a member of a span type) or a span start (the starting token of a
        given span type).

        tokens (list): Individual tokens in the text
        spans (list): Individual entity spans and their start, end, label, kb_id and kb_url.
        title (str / None): Document title set in Doc.user_data['title'].
        """
        per_token_info = []
        for idx, token in enumerate(tokens):
            # Identify if a token belongs to a Span (and which) and if it's a
            # start token of said Span. We'll use this for the final HTML render
            token_markup: Dict[str, Any] = {}
            token_markup["text"] = token
            entities = []
            for span in spans:
                ent = {}
                if span["start_token"] <= idx < span["end_token"]:
                    ent["label"] = span["label"]
                    ent["is_start"] = True if idx == span["start_token"] else False
                    kb_id = span.get("kb_id", "")
                    kb_url = span.get("kb_url", "#")
                    ent["kb_link"] = (
                        TPL_KB_LINK.format(kb_id=kb_id, kb_url=kb_url) if kb_id else ""
                    )
                    entities.append(ent)
            token_markup["entities"] = entities
            per_token_info.append(token_markup)

        markup = self._render_markup(per_token_info)
        markup = TPL_SPANS.format(content=markup, dir=self.direction)
        if title:
            markup = TPL_TITLE.format(title=title) + markup
        return markup

    def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str:
        """Render the markup from per-token information"""
        markup = ""
        for token in per_token_info:
            entities = sorted(token["entities"], key=lambda d: d["label"])
            if entities:
                slices = self._get_span_slices(token["entities"])
                starts = self._get_span_starts(token["entities"])
                markup += self.span_template.format(
                    text=token["text"], span_slices=slices, span_starts=starts
                )
            else:
                markup += escape_html(token["text"] + " ")
        return markup

    def _get_span_slices(self, entities: List[Dict]) -> str:
        """Get the rendered markup of all Span slices"""
        span_slices = []
        for entity, step in zip(entities, itertools.count(step=self.top_offset_step)):
            color = self.colors.get(entity["label"].upper(), self.default_color)
            span_slice = self.span_slice_template.format(
                bg=color, top_offset=self.top_offset + step
            )
            span_slices.append(span_slice)
        return "".join(span_slices)

    def _get_span_starts(self, entities: List[Dict]) -> str:
        """Get the rendered markup of all Span start tokens"""
        span_starts = []
        for entity, step in zip(entities, itertools.count(step=self.top_offset_step)):
            color = self.colors.get(entity["label"].upper(), self.default_color)
            span_start = (
                self.span_start_template.format(
                    bg=color,
                    top_offset=self.top_offset + step,
                    label=entity["label"],
                    kb_link=entity["kb_link"],
                )
                if entity["is_start"]
                else ""
            )
            span_starts.append(span_start)
        return "".join(span_starts)

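A hedged sketch of the "template" option handled in __init__ above: the three templates are plain format strings, and their placeholder names have to match what render_spans(), _get_span_slices() and _get_span_starts() pass in ({text}/{span_slices}/{span_starts}, {bg}/{top_offset}, and {bg}/{top_offset}/{label}/{kb_link}). The pipeline, text and labels below are made up for illustration.

import spacy
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("The Bank of England cut rates")
doc.spans["sc"] = [Span(doc, 0, 4, "ORG")]
custom = {
    "span": "<span>{text}{span_slices}{span_starts}</span>",
    "slice": "<span style='background:{bg};top:{top_offset}px'></span>",
    "start": "<span style='background:{bg};top:{top_offset}px'>{label}{kb_link}</span>",
}
html = displacy.render(doc, style="span", options={"template": custom})
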
class DependencyRenderer:
    """Render dependency parses as SVGs."""

@ -105,7 +270,7 @@ class DependencyRenderer:
        RETURNS (str): Rendered SVG markup.
        """
        self.levels = self.get_levels(arcs)
        self.highest_level = len(self.levels)
        self.highest_level = max(self.levels.values(), default=0)
        self.offset_y = self.distance / 2 * self.highest_level + self.arrow_stroke
        self.width = self.offset_x + len(words) * self.distance
        self.height = self.offset_y + 3 * self.word_spacing

@ -165,7 +330,7 @@ class DependencyRenderer:
        if start < 0 or end < 0:
            error_args = dict(start=start, end=end, label=label, dir=direction)
            raise ValueError(Errors.E157.format(**error_args))
        level = self.levels.index(end - start) + 1
        level = self.levels[(start, end, label)]
        x_start = self.offset_x + start * self.distance + self.arrow_spacing
        if self.direction == "rtl":
            x_start = self.width - x_start

@ -181,7 +346,7 @@ class DependencyRenderer:
        y_curve = self.offset_y - level * self.distance / 2
        if self.compact:
            y_curve = self.offset_y - level * self.distance / 6
        if y_curve == 0 and len(self.levels) > 5:
        if y_curve == 0 and max(self.levels.values(), default=0) > 5:
            y_curve = -self.distance
        arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
        arc = self.get_arc(x_start, y, y_curve, x_end)

@ -225,15 +390,23 @@ class DependencyRenderer:
        p1, p2, p3 = (end, end + self.arrow_width - 2, end - self.arrow_width + 2)
        return f"M{p1},{y + 2} L{p2},{y - self.arrow_width} {p3},{y - self.arrow_width}"

    def get_levels(self, arcs: List[Dict[str, Any]]) -> List[int]:
    def get_levels(self, arcs: List[Dict[str, Any]]) -> Dict[Tuple[int, int, str], int]:
        """Calculate available arc height "levels".
        Used to calculate arrow heights dynamically and without wasting space.

        args (list): Individual arcs and their start, end, direction and label.
        RETURNS (list): Arc levels sorted from lowest to highest.
        RETURNS (dict): Arc levels keyed by (start, end, label).
        """
        levels = set(map(lambda arc: arc["end"] - arc["start"], arcs))
        return sorted(list(levels))
        arcs = [dict(t) for t in {tuple(sorted(arc.items())) for arc in arcs}]
        length = max([arc["end"] for arc in arcs], default=0)
        max_level = [0] * length
        levels = {}
        for arc in sorted(arcs, key=lambda arc: arc["end"] - arc["start"]):
            level = max(max_level[arc["start"] : arc["end"]]) + 1
            for i in range(arc["start"], arc["end"]):
                max_level[i] = level
            levels[(arc["start"], arc["end"], arc["label"])] = level
        return levels

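The rewritten get_levels() above assigns each arc the lowest height level that clears every shorter arc it overlaps, keyed by (start, end, label). A standalone sketch of the same logic on three illustrative arcs (the body is copied from the method above, minus self):

def get_levels(arcs):
    # deduplicate arcs, then place shorter arcs first so longer arcs stack above them
    arcs = [dict(t) for t in {tuple(sorted(arc.items())) for arc in arcs}]
    length = max([arc["end"] for arc in arcs], default=0)
    max_level = [0] * length
    levels = {}
    for arc in sorted(arcs, key=lambda arc: arc["end"] - arc["start"]):
        level = max(max_level[arc["start"] : arc["end"]]) + 1
        for i in range(arc["start"], arc["end"]):
            max_level[i] = level
        levels[(arc["start"], arc["end"], arc["label"])] = level
    return levels

arcs = [
    {"start": 0, "end": 1, "label": "det", "dir": "left"},
    {"start": 1, "end": 2, "label": "nsubj", "dir": "left"},
    {"start": 0, "end": 2, "label": "dep", "dir": "left"},
]
levels = get_levels(arcs)
for key in sorted(levels):
    print(key, levels[key])
# (0, 1, 'det') 1
# (0, 2, 'dep') 2
# (1, 2, 'nsubj') 1
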
class EntityRenderer:

@ -242,7 +415,7 @@ class EntityRenderer:
    style = "ent"

    def __init__(self, options: Dict[str, Any] = {}) -> None:
        """Initialise dependency renderer.
        """Initialise entity renderer.

        options (dict): Visualiser-specific options (colors, ents)
        """

@ -62,6 +62,55 @@ TPL_ENT_RTL = """
</mark>
"""

TPL_SPANS = """
<div class="spans" style="line-height: 2.5; direction: {dir}">{content}</div>
"""

TPL_SPAN = """
<span style="font-weight: bold; display: inline-block; position: relative;">
    {text}
    {span_slices}
    {span_starts}
</span>
"""

TPL_SPAN_SLICE = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
</span>
"""


TPL_SPAN_START = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
    <span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
        {label}{kb_link}
    </span>
</span>

"""

TPL_SPAN_RTL = """
<span style="font-weight: bold; display: inline-block; position: relative;">
    {text}
    {span_slices}
    {span_starts}
</span>
"""

TPL_SPAN_SLICE_RTL = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
</span>
"""

TPL_SPAN_START_RTL = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
    <span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
        {label}{kb_link}
    </span>
</span>
"""

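These templates are ordinary str.format strings that the SpanRenderer fills per token. A tiny sketch (the 40 and 17 mirror the default top_offset and top_offset_step read in SpanRenderer.__init__; the color is arbitrary):

from spacy.displacy.templates import TPL_SPAN_SLICE, TPL_SPAN_START

print(TPL_SPAN_SLICE.format(bg="#7aecec", top_offset=40))
print(TPL_SPAN_START.format(bg="#7aecec", top_offset=40, label="ORG", kb_link=""))
print(TPL_SPAN_SLICE.format(bg="#7aecec", top_offset=40 + 17))  # next stacked span
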
# Important: this needs to start with a space!
TPL_KB_LINK = """
 <a style="text-decoration: none; color: inherit; font-weight: normal" href="{kb_url}">{kb_id}</a>

@ -192,6 +192,13 @@ class Warnings(metaclass=ErrorsWithCodes):
    W115 = ("Skipping {method}: the floret vector table cannot be modified. "
            "Vectors are calculated from character ngrams.")
    W116 = ("Unable to clean attribute '{attr}'.")
    W117 = ("No spans to visualize found in Doc object with spans_key: '{spans_key}'. If this is "
            "surprising to you, make sure the Doc was processed using a model "
            "that supports span categorization, and check the `doc.spans[spans_key]` "
            "property manually if necessary.")
    W118 = ("Term '{term}' not found in glossary. It may however be explained in documentation "
            "for the corpora used to train the language. Please check "
            "`nlp.meta[\"sources\"]` for any relevant links.")


class Errors(metaclass=ErrorsWithCodes):

@ -483,7 +490,7 @@ class Errors(metaclass=ErrorsWithCodes):
            "components, since spans are only views of the Doc. Use Doc and "
            "Token attributes (or custom extension attributes) only and remove "
            "the following: {attrs}")
    E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. "
    E181 = ("Received invalid attributes for unknown object {obj}: {attrs}. "
            "Only Doc and Token attributes are supported.")
    E182 = ("Received invalid attribute declaration: {attr}\nDid you forget "
            "to define the attribute? For example: `{attr}.???`")

@ -520,10 +527,14 @@ class Errors(metaclass=ErrorsWithCodes):
    E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")

    # New errors added in v3.x
    E855 = ("Invalid {obj}: {obj} is not from the same doc.")
    E856 = ("Error accessing span at position {i}: out of bounds in span group "
            "of length {length}.")
    E857 = ("Entry '{name}' not found in edit tree lemmatizer labels.")
    E858 = ("The {mode} vector table does not support this operation. "
            "{alternative}")
    E859 = ("The floret vector table cannot be modified.")
    E860 = ("Can't truncate fasttext-bloom vectors.")
    E860 = ("Can't truncate floret vectors.")
    E861 = ("No 'keys' should be provided when initializing floret vectors "
            "with 'minn' and 'maxn'.")
    E862 = ("'hash_count' must be between 1-4 for floret vectors.")

@ -566,9 +577,6 @@ class Errors(metaclass=ErrorsWithCodes):
    E879 = ("Unexpected type for 'spans' data. Provide a dictionary mapping keys to "
            "a list of spans, with each span represented by a tuple (start_char, end_char). "
            "The tuple can be optionally extended with a label and a KB ID.")
    E880 = ("The 'wandb' library could not be found - did you install it? "
            "Alternatively, specify the 'ConsoleLogger' in the 'training.logger' "
            "config section, instead of the 'WandbLogger'.")
    E884 = ("The pipeline could not be initialized because the vectors "
            "could not be found at '{vectors}'. If your pipeline was already "
            "initialized/trained before, call 'resume_training' instead of 'initialize', "

@ -894,6 +902,9 @@ class Errors(metaclass=ErrorsWithCodes):
            "patterns.")
    E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
             "supported values are: 'I', 'O', 'B' and ''")
    E1026 = ("Edit tree has an invalid format:\n{errors}")
    E1027 = ("AlignmentArray only supports slicing with a step of 1.")
    E1028 = ("AlignmentArray only supports indexing using an int or a slice.")


# Deprecated model shortcuts, only used in errors and warnings

@ -1,3 +1,7 @@
import warnings
from .errors import Warnings


def explain(term):
    """Get a description for a given POS tag, dependency label or entity type.

@ -11,6 +15,8 @@ def explain(term):
    """
    if term in GLOSSARY:
        return GLOSSARY[term]
    else:
        warnings.warn(Warnings.W118.format(term=term))


GLOSSARY = {

@ -310,7 +316,6 @@ GLOSSARY = {
    "re": "repeated element",
    "rs": "reported speech",
    "sb": "subject",
    "sb": "subject",
    "sbp": "passivized subject (PP)",
    "sp": "subject or predicate",
    "svp": "separable verb prefix",

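With the change above, explain() warns (W118) instead of silently returning None for unknown terms. Minimal sketch:

import spacy

print(spacy.explain("nsubj"))       # "nominal subject"
print(spacy.explain("not-a-term"))  # None, and now emits warning W118
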
@ -45,6 +45,10 @@ _hangul_syllables = r"\uAC00-\uD7AF"
_hangul_jamo = r"\u1100-\u11FF"
_hangul = _hangul_syllables + _hangul_jamo

_hiragana = r"\u3040-\u309F"
_katakana = r"\u30A0-\u30FFー"
_kana = _hiragana + _katakana

# letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh
_latin_u_extendedA = (
    r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C"

@ -244,6 +248,7 @@ _uncased = (
    + _tamil
    + _telugu
    + _hangul
    + _kana
    + _cjk
)

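The new _kana range is added to _uncased above. A small sketch using the two new module constants directly (private names, used here only for illustration):

import re
from spacy.lang.char_classes import _hiragana, _katakana

kana_re = re.compile(f"[{_hiragana}{_katakana}]+")
print(kana_re.findall("spaCy はすごい ライブラリ です"))  # ['はすごい', 'ライブラリ', 'です']
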
16 spacy/lang/dsb/__init__.py Normal file
@ -0,0 +1,16 @@
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults


class LowerSorbianDefaults(BaseDefaults):
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS


class LowerSorbian(Language):
    lang = "dsb"
    Defaults = LowerSorbianDefaults


__all__ = ["LowerSorbian"]

15 spacy/lang/dsb/examples.py Normal file
@ -0,0 +1,15 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.dsb.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Z tym stwori so wuměnjenje a zakład za dalše wobdźěłanje přez analyzu tekstoweje struktury a semantisku anotaciju a z tym tež za tu předstajenu digitalnu online-wersiju.",
    "Mi so tu jara derje spodoba.",
    "Kotre nowniny chceće měć?",
    "Tak ako w slědnem lěśe jo teke lětosa jano doma zapustowaś móžno.",
    "Zwóstanjo pótakem hyšći wjele źěła.",
]

113 spacy/lang/dsb/lex_attrs.py Normal file
@ -0,0 +1,113 @@
from ...attrs import LIKE_NUM

_num_words = [
    "nul",
    "jaden",
    "jadna",
    "jadno",
    "dwa",
    "dwě",
    "tśi",
    "tśo",
    "styri",
    "styrjo",
    "pěś",
    "pěśo",
    "šesć",
    "šesćo",
    "sedym",
    "sedymjo",
    "wósym",
    "wósymjo",
    "źewjeś",
    "źewjeśo",
    "źaseś",
    "źaseśo",
    "jadnassćo",
    "dwanassćo",
    "tśinasćo",
    "styrnasćo",
    "pěśnasćo",
    "šesnasćo",
    "sedymnasćo",
    "wósymnasćo",
    "źewjeśnasćo",
    "dwanasćo",
    "dwaźasća",
    "tśiźasća",
    "styrźasća",
    "pěśźaset",
    "šesćźaset",
    "sedymźaset",
    "wósymźaset",
    "źewjeśźaset",
    "sto",
    "tysac",
    "milion",
    "miliarda",
    "bilion",
    "biliarda",
    "trilion",
    "triliarda",
]

_ordinal_words = [
    "prědny",
    "prědna",
    "prědne",
    "drugi",
    "druga",
    "druge",
    "tśeśi",
    "tśeśa",
    "tśeśe",
    "stwórty",
    "stwórta",
    "stwórte",
    "pêty",
    "pěta",
    "pête",
    "šesty",
    "šesta",
    "šeste",
    "sedymy",
    "sedyma",
    "sedyme",
    "wósymy",
    "wósyma",
    "wósyme",
    "źewjety",
    "źewjeta",
    "źewjete",
    "źasety",
    "źaseta",
    "źasete",
    "jadnasty",
    "jadnasta",
    "jadnaste",
    "dwanasty",
    "dwanasta",
    "dwanaste",
]


def like_num(text):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    text_lower = text.lower()
    if text_lower in _num_words:
        return True
    # Check ordinal number
    if text_lower in _ordinal_words:
        return True
    return False


LEX_ATTRS = {LIKE_NUM: like_num}

15 spacy/lang/dsb/stop_words.py Normal file
@ -0,0 +1,15 @@
STOP_WORDS = set(
    """
a abo aby ako ale až

daniž dokulaž

gaž

jolic

pak pótom

teke togodla
""".split()
)

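A minimal sketch of the new Lower Sorbian data in use (assumes a spaCy build that contains the files above, so that "dsb" is a registered language code):

import spacy

nlp = spacy.blank("dsb")
print(nlp("pěśo")[0].like_num)  # True, via the LIKE_NUM getter defined above
print(nlp("teke")[0].is_stop)   # True, from the new STOP_WORDS
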
@ -447,7 +447,6 @@ for exc_data in [
    {ORTH: "La.", NORM: "Louisiana"},
    {ORTH: "Mar.", NORM: "March"},
    {ORTH: "Mass.", NORM: "Massachusetts"},
    {ORTH: "May.", NORM: "May"},
    {ORTH: "Mich.", NORM: "Michigan"},
    {ORTH: "Minn.", NORM: "Minnesota"},
    {ORTH: "Miss.", NORM: "Mississippi"},

@ -47,6 +47,41 @@ _num_words = [
]


_ordinal_words = [
    "primero",
    "segundo",
    "tercero",
    "cuarto",
    "quinto",
    "sexto",
    "séptimo",
    "octavo",
    "noveno",
    "décimo",
    "undécimo",
    "duodécimo",
    "decimotercero",
    "decimocuarto",
    "decimoquinto",
    "decimosexto",
    "decimoséptimo",
    "decimoctavo",
    "decimonoveno",
    "vigésimo",
    "trigésimo",
    "cuadragésimo",
    "quincuagésimo",
    "sexagésimo",
    "septuagésimo",
    "octogésima",
    "nonagésima",
    "centésima",
    "milésima",
    "millonésima",
    "billonésima",
]


def like_num(text):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]

@ -57,7 +92,11 @@ def like_num(text):
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text.lower() in _num_words:
    text_lower = text.lower()
    if text_lower in _num_words:
        return True
    # Check ordinal number
    if text_lower in _ordinal_words:
        return True
    return False

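The Spanish like_num() now also recognizes ordinal words. Small sketch, importing the module shown above:

from spacy.lang.es.lex_attrs import like_num

print(like_num("décimo"))   # True, via the new _ordinal_words check
print(like_num("123"))      # True, plain digits
print(like_num("palabra"))  # False
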
@ -2,6 +2,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language, BaseDefaults

@ -11,6 +12,7 @@ class FinnishDefaults(BaseDefaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS
    syntax_iterators = SYNTAX_ITERATORS


class Finnish(Language):

79 spacy/lang/fi/syntax_iterators.py Normal file
@ -0,0 +1,79 @@
from typing import Iterator, Tuple, Union
from ...tokens import Doc, Span
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors


def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """Detect base noun phrases from a dependency parse. Works on both Doc and Span."""
    labels = [
        "appos",
        "nsubj",
        "nsubj:cop",
        "obj",
        "obl",
        "ROOT",
    ]
    extend_labels = [
        "amod",
        "compound",
        "compound:nn",
        "flat:name",
        "nmod",
        "nmod:gobj",
        "nmod:gsubj",
        "nmod:poss",
        "nummod",
    ]

    def potential_np_head(word):
        return word.pos in (NOUN, PROPN) and (
            word.dep in np_deps or word.head.pos == PRON
        )

    doc = doclike.doc  # Ensure works on both Doc and Span.
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)

    np_deps = [doc.vocab.strings[label] for label in labels]
    extend_deps = [doc.vocab.strings[label] for label in extend_labels]
    np_label = doc.vocab.strings.add("NP")
    conj_label = doc.vocab.strings.add("conj")

    rbracket = 0
    prev_end = -1
    for i, word in enumerate(doclike):
        if i < rbracket:
            continue

        # Is this a potential independent NP head or coordinated with
        # a NOUN that is itself an independent NP head?
        #
        # e.g. "Terveyden ja hyvinvoinnin laitos"
        if potential_np_head(word) or (
            word.dep == conj_label and potential_np_head(word.head)
        ):
            # Try to extend to the left to include adjective/num
            # modifiers, compound words etc.
            lbracket = word.i
            for ldep in word.lefts:
                if ldep.dep in extend_deps:
                    lbracket = ldep.left_edge.i
                    break

            # Prevent nested chunks from being produced
            if lbracket <= prev_end:
                continue

            rbracket = word.i
            # Try to extend the span to the right to capture
            # appositions and noun modifiers
            for rdep in word.rights:
                if rdep.dep in extend_deps:
                    rbracket = rdep.i
            prev_end = rbracket

            yield lbracket, rbracket + 1, np_label


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}

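A hedged usage sketch for the new Finnish noun chunk iterator; the pipeline name is hypothetical and only stands in for any Finnish pipeline with a dependency parser (so that doc.has_annotation("DEP") holds):

import spacy

nlp = spacy.load("fi_example_pipeline")  # hypothetical package name
doc = nlp("Terveyden ja hyvinvoinnin laitos sijaitsee Helsingissä.")
for chunk in doc.noun_chunks:
    print(chunk.text)
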
@ -3,7 +3,7 @@ from ...attrs import LIKE_NUM

_num_words = set(
    """
zero un deux trois quatre cinq six sept huit neuf dix
zero un une deux trois quatre cinq six sept huit neuf dix
onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf
vingt trente quarante cinquante soixante soixante-dix septante quatre-vingt huitante quatre-vingt-dix nonante
cent mille mil million milliard billion quadrillion quintillion

@ -13,7 +13,7 @@ sextillion septillion octillion nonillion decillion

_ordinal_words = set(
    """
premier deuxième second troisième quatrième cinquième sixième septième huitième neuvième dixième
premier première deuxième second seconde troisième quatrième cinquième sixième septième huitième neuvième dixième
onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neuvième
vingtième trentième quarantième cinquantième soixantième soixante-dixième septantième quatre-vingtième huitantième quatre-vingt-dixième nonantième
centième millième millionnième milliardième billionnième quadrillionnième quintillionnième

@ -6,16 +6,35 @@ from ...tokens import Doc, Span


def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
    # fmt: off
    labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
    # fmt: on
    """
    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
    """
    labels = [
        "nsubj",
        "nsubj:pass",
        "obj",
        "obl",
        "obl:agent",
        "obl:arg",
        "obl:mod",
        "nmod",
        "pcomp",
        "appos",
        "ROOT",
    ]
    post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"]
    doc = doclike.doc  # Ensure works on both Doc and Span.
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = [doc.vocab.strings[label] for label in labels]
    conj = doc.vocab.strings.add("conj")
    np_deps = {doc.vocab.strings.add(label) for label in labels}
    np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
    np_label = doc.vocab.strings.add("NP")
    adj_label = doc.vocab.strings.add("amod")
    det_label = doc.vocab.strings.add("det")
    det_pos = doc.vocab.strings.add("DET")
    adp_pos = doc.vocab.strings.add("ADP")
    conj_label = doc.vocab.strings.add("conj")
    conj_pos = doc.vocab.strings.add("CCONJ")
    prev_end = -1
    for i, word in enumerate(doclike):
        if word.pos not in (NOUN, PROPN, PRON):

@ -24,16 +43,43 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
        if word.left_edge.i <= prev_end:
            continue
        if word.dep in np_deps:
            prev_end = word.right_edge.i
            yield word.left_edge.i, word.right_edge.i + 1, np_label
        elif word.dep == conj:
            right_childs = list(word.rights)
            right_child = right_childs[0] if right_childs else None

            if right_child:
                if (
                    right_child.dep == adj_label
                ):  # allow chain of adjectives by expanding to right
                    right_end = right_child.right_edge
                elif (
                    right_child.dep == det_label and right_child.pos == det_pos
                ):  # cut relative pronouns here
                    right_end = right_child
                elif right_child.dep in np_modifs:  # Check if we can expand to right
                    right_end = word.right_edge
                else:
                    right_end = word
            else:
                right_end = word
            prev_end = right_end.i

            left_index = word.left_edge.i
            left_index = left_index + 1 if word.left_edge.pos == adp_pos else left_index

            yield left_index, right_end.i + 1, np_label
        elif word.dep == conj_label:
            head = word.head
            while head.dep == conj and head.head.i < head.i:
            while head.dep == conj_label and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            if head.dep in np_deps:
                prev_end = word.right_edge.i
                yield word.left_edge.i, word.right_edge.i + 1, np_label
                prev_end = word.i

                left_index = word.left_edge.i  # eliminate left attached conjunction
                left_index = (
                    left_index + 1 if word.left_edge.pos == conj_pos else left_index
                )
                yield left_index, word.i + 1, np_label


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}

18 spacy/lang/hsb/__init__.py Normal file
@ -0,0 +1,18 @@
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language, BaseDefaults


class UpperSorbianDefaults(BaseDefaults):
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS


class UpperSorbian(Language):
    lang = "hsb"
    Defaults = UpperSorbianDefaults


__all__ = ["UpperSorbian"]

15 spacy/lang/hsb/examples.py Normal file
@ -0,0 +1,15 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.hsb.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "To běšo wjelgin raźone a jo se wót luźi derje pśiwzeło. Tak som dožywiła wjelgin",
    "Jogo pśewóźowarce stej groniłej, až how w serbskich stronach njama Santa Claus nic pytaś.",
    "A ten sobuźěłaśeŕ Statneje biblioteki w Barlinju jo pśimjeł drogotne knigły bźez rukajcowu z nagima rukoma!",
    "Take wobchadanje z našym kulturnym derbstwom zewšym njejźo.",
    "Wopśimjeśe drugich pśinoskow jo było na wusokem niwowje, ako pśecej.",
]

106 spacy/lang/hsb/lex_attrs.py Normal file
@ -0,0 +1,106 @@
from ...attrs import LIKE_NUM

_num_words = [
    "nul",
    "jedyn",
    "jedna",
    "jedne",
    "dwaj",
    "dwě",
    "tři",
    "třo",
    "štyri",
    "štyrjo",
    "pjeć",
    "šěsć",
    "sydom",
    "wosom",
    "dźewjeć",
    "dźesać",
    "jědnaće",
    "dwanaće",
    "třinaće",
    "štyrnaće",
    "pjatnaće",
    "šěsnaće",
    "sydomnaće",
    "wosomnaće",
    "dźewjatnaće",
    "dwaceći",
    "třiceći",
    "štyrceći",
    "pjećdźesat",
    "šěsćdźesat",
    "sydomdźesat",
    "wosomdźesat",
    "dźewjećdźesat",
    "sto",
    "tysac",
    "milion",
    "miliarda",
    "bilion",
    "biliarda",
    "trilion",
    "triliarda",
]

_ordinal_words = [
    "prěni",
    "prěnja",
    "prěnje",
    "druhi",
    "druha",
    "druhe",
    "třeći",
    "třeća",
    "třeće",
    "štwórty",
    "štwórta",
    "štwórte",
    "pjaty",
    "pjata",
    "pjate",
    "šěsty",
    "šěsta",
    "šěste",
    "sydmy",
    "sydma",
    "sydme",
    "wosmy",
    "wosma",
    "wosme",
    "dźewjaty",
    "dźewjata",
    "dźewjate",
    "dźesaty",
    "dźesata",
    "dźesate",
    "jědnaty",
    "jědnata",
    "jědnate",
    "dwanaty",
    "dwanata",
    "dwanate",
]


def like_num(text):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    text_lower = text.lower()
    if text_lower in _num_words:
        return True
    # Check ordinal number
    if text_lower in _ordinal_words:
        return True
    return False


LEX_ATTRS = {LIKE_NUM: like_num}

19 spacy/lang/hsb/stop_words.py Normal file
@ -0,0 +1,19 @@
STOP_WORDS = set(
    """
a abo ale ani

dokelž

hdyž

jeli jelizo

kaž

pak potom

tež tohodla

zo zoby
""".split()
)

18 spacy/lang/hsb/tokenizer_exceptions.py Normal file
@ -0,0 +1,18 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc

_exc = dict()
for exc_data in [
    {ORTH: "mil.", NORM: "milion"},
    {ORTH: "wob.", NORM: "wobydler"},
]:
    _exc[exc_data[ORTH]] = [exc_data]

for orth in [
    "resp.",
]:
    _exc[orth] = [{ORTH: orth}]


TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

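A minimal sketch of the new Upper Sorbian tokenizer exceptions (assumes a spaCy build that contains the files above, so that "hsb" is a registered language code; the input string is artificial and only exercises the exceptions):

import spacy

nlp = spacy.blank("hsb")
doc = nlp("mil. resp. wob.")
print([t.text for t in doc])   # ['mil.', 'resp.', 'wob.'] - kept as single tokens
print([t.norm_ for t in doc])  # ['milion', 'resp.', 'wobydler']
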
@ -6,13 +6,15 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from ...language import Language, BaseDefaults
from .lemmatizer import ItalianLemmatizer
from .syntax_iterators import SYNTAX_ITERATORS


class ItalianDefaults(BaseDefaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    stop_words = STOP_WORDS
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    stop_words = STOP_WORDS
    syntax_iterators = SYNTAX_ITERATORS


class Italian(Language):

|
@ -10,18 +10,18 @@ avresti avrete avrà avrò avuta avute avuti avuto
|
||||||
|
|
||||||
basta bene benissimo brava bravo
|
basta bene benissimo brava bravo
|
||||||
|
|
||||||
casa caso cento certa certe certi certo che chi chicchessia chiunque ci
|
casa caso cento certa certe certi certo che chi chicchessia chiunque ci c'
|
||||||
ciascuna ciascuno cima cio cioe circa citta città co codesta codesti codesto
|
ciascuna ciascuno cima cio cioe circa citta città co codesta codesti codesto
|
||||||
cogli coi col colei coll coloro colui come cominci comunque con concernente
|
cogli coi col colei coll coloro colui come cominci comunque con concernente
|
||||||
conciliarsi conclusione consiglio contro cortesia cos cosa cosi così cui
|
conciliarsi conclusione consiglio contro cortesia cos cosa cosi così cui
|
||||||
|
|
||||||
da dagl dagli dai dal dall dalla dalle dallo dappertutto davanti degl degli
|
d' da dagl dagli dai dal dall dall' dalla dalle dallo dappertutto davanti degl degli
|
||||||
dei del dell della delle dello dentro detto deve di dice dietro dire
|
dei del dell dell' della delle dello dentro detto deve di dice dietro dire
|
||||||
dirimpetto diventa diventare diventato dopo dov dove dovra dovrà dovunque due
|
dirimpetto diventa diventare diventato dopo dov dove dovra dovrà dovunque due
|
||||||
dunque durante
|
dunque durante
|
||||||
|
|
||||||
ebbe ebbero ebbi ecc ecco ed effettivamente egli ella entrambi eppure era
|
e ebbe ebbero ebbi ecc ecco ed effettivamente egli ella entrambi eppure era
|
||||||
erano eravamo eravate eri ero esempio esse essendo esser essere essi ex
|
erano eravamo eravate eri ero esempio esse essendo esser essere essi ex è
|
||||||
|
|
||||||
fa faccia facciamo facciano facciate faccio facemmo facendo facesse facessero
|
fa faccia facciamo facciano facciate faccio facemmo facendo facesse facessero
|
||||||
facessi facessimo faceste facesti faceva facevamo facevano facevate facevi
|
facessi facessimo faceste facesti faceva facevamo facevano facevate facevi
|
||||||
|
@ -30,21 +30,21 @@ fareste faresti farete farà farò fatto favore fece fecero feci fin finalmente
|
||||||
finche fine fino forse forza fosse fossero fossi fossimo foste fosti fra
|
finche fine fino forse forza fosse fossero fossi fossimo foste fosti fra
|
||||||
frattempo fu fui fummo fuori furono futuro generale
|
frattempo fu fui fummo fuori furono futuro generale
|
||||||
|
|
||||||
gia già giacche giorni giorno gli gliela gliele glieli glielo gliene governo
|
gia già giacche giorni giorno gli gl' gliela gliele glieli glielo gliene governo
|
||||||
grande grazie gruppo
|
grande grazie gruppo
|
||||||
|
|
||||||
ha haha hai hanno ho
|
ha haha hai hanno ho
|
||||||
|
|
||||||
ieri il improvviso in inc infatti inoltre insieme intanto intorno invece io
|
ieri il improvviso in inc infatti inoltre insieme intanto intorno invece io
|
||||||
|
|
||||||
la là lasciato lato lavoro le lei li lo lontano loro lui lungo luogo
|
l' la là lasciato lato lavoro le lei li lo lontano loro lui lungo luogo
|
||||||
|
|
||||||
ma macche magari maggior mai male malgrado malissimo mancanza marche me
|
m' ma macche magari maggior mai male malgrado malissimo mancanza marche me
|
||||||
medesimo mediante meglio meno mentre mesi mezzo mi mia mie miei mila miliardi
|
medesimo mediante meglio meno mentre mesi mezzo mi mia mie miei mila miliardi
|
||||||
milioni minimi ministro mio modo molti moltissimo molto momento mondo mosto
|
milioni minimi ministro mio modo molti moltissimo molto momento mondo mosto
|
||||||
|
|
||||||
nazionale ne negl negli nei nel nell nella nelle nello nemmeno neppure nessun
|
nazionale ne negl negli nei nel nell nella nelle nello nemmeno neppure nessun nessun'
|
||||||
nessuna nessuno niente no noi non nondimeno nonostante nonsia nostra nostre
|
nessuna nessuno nient' niente no noi non nondimeno nonostante nonsia nostra nostre
|
||||||
nostri nostro novanta nove nulla nuovo
|
nostri nostro novanta nove nulla nuovo
|
||||||
|
|
||||||
od oggi ogni ognuna ognuno oltre oppure ora ore osi ossia ottanta otto
|
od oggi ogni ognuna ognuno oltre oppure ora ore osi ossia ottanta otto
|
||||||
|
@ -56,12 +56,12 @@ potrebbe preferibilmente presa press prima primo principalmente probabilmente
|
||||||
proprio puo può pure purtroppo
|
proprio puo può pure purtroppo
|
||||||
|
|
||||||
qualche qualcosa qualcuna qualcuno quale quali qualunque quando quanta quante
|
qualche qualcosa qualcuna qualcuno quale quali qualunque quando quanta quante
|
||||||
quanti quanto quantunque quasi quattro quel quella quelle quelli quello quest
|
quanti quanto quantunque quasi quattro quel quel' quella quelle quelli quello quest quest'
|
||||||
questa queste questi questo qui quindi
|
questa queste questi questo qui quindi
|
||||||
|
|
||||||
realmente recente recentemente registrazione relativo riecco salvo
|
realmente recente recentemente registrazione relativo riecco salvo
|
||||||
|
|
||||||
sara sarà sarai saranno sarebbe sarebbero sarei saremmo saremo sareste
|
s' sara sarà sarai saranno sarebbe sarebbero sarei saremmo saremo sareste
|
||||||
saresti sarete saro sarò scola scopo scorso se secondo seguente seguito sei
|
saresti sarete saro sarò scola scopo scorso se secondo seguente seguito sei
|
||||||
sembra sembrare sembrato sembri sempre senza sette si sia siamo siano siate
|
sembra sembrare sembrato sembri sempre senza sette si sia siamo siano siate
|
||||||
siete sig solito solo soltanto sono sopra sotto spesso srl sta stai stando
|
siete sig solito solo soltanto sono sopra sotto spesso srl sta stai stando
|
||||||
|
@ -72,12 +72,12 @@ steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua
|
||||||
subito successivamente successivo sue sugl sugli sui sul sull sulla sulle
|
subito successivamente successivo sue sugl sugli sui sul sull sulla sulle
|
||||||
sullo suo suoi
|
sullo suo suoi
|
||||||
|
|
||||||
tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta
|
t' tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta
|
||||||
troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto
|
troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto
|
||||||
|
|
||||||
uguali ulteriore ultimo un una uno uomo
|
uguali ulteriore ultimo un un' una uno uomo
|
||||||
|
|
||||||
va vale vari varia varie vario verso vi via vicino visto vita voi volta volte
|
v' va vale vari varia varie vario verso vi via vicino visto vita voi volta volte
|
||||||
vostra vostre vostri vostro
|
vostra vostre vostri vostro
|
||||||
""".split()
|
""".split()
|
||||||
)
|
)
|
||||||
|
|

86  spacy/lang/it/syntax_iterators.py  Normal file
@ -0,0 +1,86 @@
from typing import Union, Iterator, Tuple

from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
from ...tokens import Doc, Span


def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """
    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
    """
    labels = [
        "nsubj",
        "nsubj:pass",
        "obj",
        "obl",
        "obl:agent",
        "nmod",
        "pcomp",
        "appos",
        "ROOT",
    ]
    post_modifiers = ["flat", "flat:name", "fixed", "compound"]
    dets = ["det", "det:poss"]
    doc = doclike.doc  # Ensure works on both Doc and Span.
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = {doc.vocab.strings.add(label) for label in labels}
    np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
    np_label = doc.vocab.strings.add("NP")
    adj_label = doc.vocab.strings.add("amod")
    det_labels = {doc.vocab.strings.add(det) for det in dets}
    det_pos = doc.vocab.strings.add("DET")
    adp_label = doc.vocab.strings.add("ADP")
    conj = doc.vocab.strings.add("conj")
    conj_pos = doc.vocab.strings.add("CCONJ")
    prev_end = -1
    for i, word in enumerate(doclike):
        if word.pos not in (NOUN, PROPN, PRON):
            continue
        # Prevent nested chunks from being produced
        if word.left_edge.i <= prev_end:
            continue
        if word.dep in np_deps:
            right_childs = list(word.rights)
            right_child = right_childs[0] if right_childs else None

            if right_child:
                if (
                    right_child.dep == adj_label
                ):  # allow chain of adjectives by expanding to right
                    right_end = right_child.right_edge
                elif (
                    right_child.dep in det_labels and right_child.pos == det_pos
                ):  # cut relative pronouns here
                    right_end = right_child
                elif right_child.dep in np_modifs:  # Check if we can expand to right
                    right_end = word.right_edge
                else:
                    right_end = word
            else:
                right_end = word
            prev_end = right_end.i

            left_index = word.left_edge.i
            left_index = (
                left_index + 1 if word.left_edge.pos == adp_label else left_index
            )

            yield left_index, right_end.i + 1, np_label
        elif word.dep == conj:
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            if head.dep in np_deps:
                prev_end = word.i

                left_index = word.left_edge.i  # eliminate left attached conjunction
                left_index = (
                    left_index + 1 if word.left_edge.pos == conj_pos else left_index
                )
                yield left_index, word.i + 1, np_label


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
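The iterator is registered through SYNTAX_ITERATORS, so it is reached through the normal Doc.noun_chunks API once a dependency parse is available. A minimal, hedged usage sketch: the it_core_news_sm model name and the sample sentence are illustrative assumptions, not part of this diff.

import spacy

# Assumes an Italian pipeline with a parser is installed; any parsed Italian
# Doc would work the same way.
nlp = spacy.load("it_core_news_sm")
doc = nlp("La squadra italiana ha vinto la partita di ieri.")

# noun_chunks is wired in via SYNTAX_ITERATORS, so the usual Doc API applies.
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.dep_)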

spacy/lang/ko/__init__.py
@ -1,12 +1,13 @@
 from typing import Iterator, Any, Dict

+from .punctuation import TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from .lex_attrs import LEX_ATTRS
 from ...language import Language, BaseDefaults
 from ...tokens import Doc
 from ...scorer import Scorer
-from ...symbols import POS
+from ...symbols import POS, X
 from ...training import validate_examples
 from ...util import DummyTokenizer, registry, load_config_from_str
 from ...vocab import Vocab

@ -31,15 +32,24 @@ def create_tokenizer():
 class KoreanTokenizer(DummyTokenizer):
     def __init__(self, vocab: Vocab):
         self.vocab = vocab
-        MeCab = try_mecab_import()  # type: ignore[func-returns-value]
-        self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
+        self._mecab = try_mecab_import()  # type: ignore[func-returns-value]
+        self._mecab_tokenizer = None
+
+    @property
+    def mecab_tokenizer(self):
+        # This is a property so that initializing a pipeline with blank:ko is
+        # possible without actually requiring mecab-ko, e.g. to run
+        # `spacy init vectors ko` for a pipeline that will have a different
+        # tokenizer in the end. The languages need to match for the vectors
+        # to be imported and there's no way to pass a custom config to
+        # `init vectors`.
+        if self._mecab_tokenizer is None:
+            self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
+        return self._mecab_tokenizer

     def __reduce__(self):
         return KoreanTokenizer, (self.vocab,)

-    def __del__(self):
-        self.mecab_tokenizer.__del__()
-
     def __call__(self, text: str) -> Doc:
         dtokens = list(self.detailed_tokens(text))
         surfaces = [dt["surface"] for dt in dtokens]

@ -47,7 +57,10 @@ class KoreanTokenizer(DummyTokenizer):
         for token, dtoken in zip(doc, dtokens):
             first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
             token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
-            token.pos = TAG_MAP[token.tag_][POS]
+            if token.tag_ in TAG_MAP:
+                token.pos = TAG_MAP[token.tag_][POS]
+            else:
+                token.pos = X
             token.lemma_ = dtoken["lemma"]
         doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
         return doc

@ -76,6 +89,7 @@ class KoreanDefaults(BaseDefaults):
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
+    infixes = TOKENIZER_INFIXES


 class Korean(Language):

@ -90,7 +104,8 @@ def try_mecab_import() -> None:
         return MeCab
     except ImportError:
         raise ImportError(
-            "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
+            'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
+            "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
             "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
             "and [natto-py](https://github.com/buruzaemon/natto-py)"
         ) from None
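The new property defers creating the MeCab instance until the tokenizer is actually called, so a blank:ko pipeline can be constructed (for example by `spacy init vectors ko`) on a machine without mecab-ko installed. A stripped-down sketch of the same lazy-initialization pattern, using made-up class and backend names rather than the real spaCy/MeCab objects:

# Generic sketch of the lazy-initialization pattern used above: the heavy,
# optional dependency is only touched when the attribute is first accessed.
class LazyTokenizer:
    def __init__(self):
        self._backend = None  # nothing created yet, so construction stays cheap

    @property
    def backend(self):
        if self._backend is None:
            # Imagine an expensive or optional import/initialization here.
            self._backend = object()
        return self._backend


t = LazyTokenizer()            # no backend created
_ = t.backend                  # first access triggers creation
assert t.backend is t.backend  # later accesses reuse the same instance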

12  spacy/lang/ko/punctuation.py  Normal file
@ -0,0 +1,12 @@
from ..char_classes import LIST_QUOTES
from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES


_infixes = (
    ["·", "ㆍ", "\(", "\)"]
    + [r"(?<=[0-9])~(?=[0-9-])"]
    + LIST_QUOTES
    + BASE_TOKENIZER_INFIXES
)

TOKENIZER_INFIXES = _infixes
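The list adds the middle dots ·/ㆍ, parentheses, quotes and a tilde between digits to the shared base infixes, so number ranges such as 1~10 get split. A standalone check of just the tilde rule, using re directly instead of the spaCy tokenizer so it runs without mecab-ko installed:

import re

# Mirrors the single new range pattern from the infix list above.
infix_range = re.compile(r"(?<=[0-9])~(?=[0-9-])")

print(infix_range.search("1~10"))    # matches, so "1~10" can be split at the tilde
print(infix_range.search("물결~표"))  # no match: the rule only fires between digits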
@ -4,46 +4,42 @@ alle allerede alt and andre annen annet at av

 bak bare bedre beste blant ble bli blir blitt bris by både

-da dag de del dem den denne der dermed det dette disse drept du
+da dag de del dem den denne der dermed det dette disse du

 eller en enn er et ett etter

-fem fikk fire fjor flere folk for fortsatt fotball fra fram frankrike fredag
+fem fikk fire fjor flere folk for fortsatt fra fram
 funnet få får fått før først første

 gang gi gikk gjennom gjorde gjort gjør gjøre god godt grunn gå går

-ha hadde ham han hans har hele helt henne hennes her hun hva hvor hvordan
-hvorfor
+ha hadde ham han hans har hele helt henne hennes her hun

 i ifølge igjen ikke ingen inn

 ja jeg

 kamp kampen kan kl klart kom komme kommer kontakt kort kroner kunne kveld
-kvinner

-la laget land landet langt leder ligger like litt løpet lørdag
+la laget land landet langt leder ligger like litt løpet

-man mandag mange mannen mars med meg mellom men mener menn mennesker mens mer
-millioner minutter mot msci mye må mål måtte
+man mange med meg mellom men mener mennesker mens mer mot mye må mål måtte

-ned neste noe noen nok norge norsk norske ntb ny nye nå når
+ned neste noe noen nok ny nye nå når

-og også om onsdag opp opplyser oslo oss over
+og også om opp opplyser oss over

-personer plass poeng politidistrikt politiet president prosent på
+personer plass poeng på

-regjeringen runde rundt russland
+runde rundt

-sa saken samme sammen samtidig satt se seg seks selv senere september ser sett
+sa saken samme sammen samtidig satt se seg seks selv senere ser sett
 siden sier sin sine siste sitt skal skriver skulle slik som sted stedet stor
-store står sverige svært så søndag
+store står svært så

-ta tatt tid tidligere til tilbake tillegg tirsdag to tok torsdag tre tror
-tyskland
+ta tatt tid tidligere til tilbake tillegg tok tror

-under usa ut uten utenfor
+under ut uten utenfor

 vant var ved veldig vi videre viktig vil ville viser vår være vært

@ -1,56 +1,219 @@
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
_num_words = [
|
_num_words = list(
|
||||||
"ноль",
|
set(
|
||||||
"один",
|
"""
|
||||||
"два",
|
ноль ноля нолю нолём ноле нулевой нулевого нулевому нулевым нулевом нулевая нулевую нулевое нулевые нулевых нулевыми
|
||||||
"три",
|
|
||||||
"четыре",
|
четверть четверти четвертью четвертей четвертям четвертями четвертях
|
||||||
"пять",
|
|
||||||
"шесть",
|
треть трети третью третей третям третями третях
|
||||||
"семь",
|
|
||||||
"восемь",
|
половина половины половине половину половиной половин половинам половинами половинах половиною
|
||||||
"девять",
|
|
||||||
"десять",
|
один одного одному одним одном
|
||||||
"одиннадцать",
|
первой первого первому первом первый первым первых
|
||||||
"двенадцать",
|
во-первых
|
||||||
"тринадцать",
|
единица единицы единице единицу единицей единиц единицам единицами единицах единицею
|
||||||
"четырнадцать",
|
|
||||||
"пятнадцать",
|
два двумя двум двух двоих двое две
|
||||||
"шестнадцать",
|
второго второму второй втором вторым вторых
|
||||||
"семнадцать",
|
двойка двойки двойке двойку двойкой двоек двойкам двойками двойках двойкою
|
||||||
"восемнадцать",
|
во-вторых
|
||||||
"девятнадцать",
|
оба обе обеим обеими обеих обоим обоими обоих
|
||||||
"двадцать",
|
|
||||||
"тридцать",
|
полтора полторы полутора
|
||||||
"сорок",
|
|
||||||
"пятьдесят",
|
три третьего третьему третьем третьим третий тремя трем трех трое троих трёх
|
||||||
"шестьдесят",
|
тройка тройки тройке тройку тройкою троек тройкам тройками тройках тройкой
|
||||||
"семьдесят",
|
троечка троечки троечке троечку троечкой троечек троечкам троечками троечках троечкой
|
||||||
"восемьдесят",
|
трешка трешки трешке трешку трешкой трешек трешкам трешками трешках трешкою
|
||||||
"девяносто",
|
трёшка трёшки трёшке трёшку трёшкой трёшек трёшкам трёшками трёшках трёшкою
|
||||||
"сто",
|
трояк трояка трояку трояком трояке трояки трояков троякам трояками трояках
|
||||||
"двести",
|
треха треху трехой
|
||||||
"триста",
|
трёха трёху трёхой
|
||||||
"четыреста",
|
втроем втроём
|
||||||
"пятьсот",
|
|
||||||
"шестьсот",
|
четыре четвертого четвертому четвертом четвертый четвертым четверка четырьмя четырем четырех четверо четырёх четверым
|
||||||
"семьсот",
|
четверых
|
||||||
"восемьсот",
|
вчетвером
|
||||||
"девятьсот",
|
|
||||||
"тысяча",
|
пять пятого пятому пятом пятый пятым пятью пяти пятеро пятерых пятерыми
|
||||||
"миллион",
|
впятером
|
||||||
"миллиард",
|
пятерочка пятерочки пятерочке пятерочками пятерочкой пятерочку пятерочкой пятерочками
|
||||||
"триллион",
|
пятёрочка пятёрочки пятёрочке пятёрочками пятёрочкой пятёрочку пятёрочкой пятёрочками
|
||||||
"квадриллион",
|
пятерка пятерки пятерке пятерками пятеркой пятерку пятерками
|
||||||
"квинтиллион",
|
пятёрка пятёрки пятёрке пятёрками пятёркой пятёрку пятёрками
|
||||||
]
|
пятёра пятёры пятёре пятёрами пятёрой пятёру пятёрами
|
||||||
|
пятера пятеры пятере пятерами пятерой пятеру пятерами
|
||||||
|
пятак пятаки пятаке пятаками пятаком пятаку пятаками
|
||||||
|
|
||||||
|
шесть шестерка шестого шестому шестой шестом шестым шестью шести шестеро шестерых
|
||||||
|
вшестером
|
||||||
|
|
||||||
|
семь семерка седьмого седьмому седьмой седьмом седьмым семью семи семеро седьмых
|
||||||
|
всемером
|
||||||
|
|
||||||
|
восемь восьмерка восьмого восьмому восемью восьмой восьмом восьмым восеми восьмером восьми восьмью
|
||||||
|
восьмерых
|
||||||
|
ввосьмером
|
||||||
|
|
||||||
|
девять девятого девятому девятка девятом девятый девятым девятью девяти девятером вдевятером девятерых
|
||||||
|
вдевятером
|
||||||
|
|
||||||
|
десять десятого десятому десятка десятом десятый десятым десятью десяти десятером десятых
|
||||||
|
вдесятером
|
||||||
|
|
||||||
|
одиннадцать одиннадцатого одиннадцатому одиннадцатом одиннадцатый одиннадцатым одиннадцатью одиннадцати
|
||||||
|
одиннадцатых
|
||||||
|
|
||||||
|
двенадцать двенадцатого двенадцатому двенадцатом двенадцатый двенадцатым двенадцатью двенадцати
|
||||||
|
двенадцатых
|
||||||
|
|
||||||
|
тринадцать тринадцатого тринадцатому тринадцатом тринадцатый тринадцатым тринадцатью тринадцати
|
||||||
|
тринадцатых
|
||||||
|
|
||||||
|
четырнадцать четырнадцатого четырнадцатому четырнадцатом четырнадцатый четырнадцатым четырнадцатью четырнадцати
|
||||||
|
четырнадцатых
|
||||||
|
|
||||||
|
пятнадцать пятнадцатого пятнадцатому пятнадцатом пятнадцатый пятнадцатым пятнадцатью пятнадцати
|
||||||
|
пятнадцатых
|
||||||
|
пятнарик пятнарику пятнариком пятнарики
|
||||||
|
|
||||||
|
шестнадцать шестнадцатого шестнадцатому шестнадцатом шестнадцатый шестнадцатым шестнадцатью шестнадцати
|
||||||
|
шестнадцатых
|
||||||
|
|
||||||
|
семнадцать семнадцатого семнадцатому семнадцатом семнадцатый семнадцатым семнадцатью семнадцати семнадцатых
|
||||||
|
|
||||||
|
восемнадцать восемнадцатого восемнадцатому восемнадцатом восемнадцатый восемнадцатым восемнадцатью восемнадцати
|
||||||
|
восемнадцатых
|
||||||
|
|
||||||
|
девятнадцать девятнадцатого девятнадцатому девятнадцатом девятнадцатый девятнадцатым девятнадцатью девятнадцати
|
||||||
|
девятнадцатых
|
||||||
|
|
||||||
|
двадцать двадцатого двадцатому двадцатом двадцатый двадцатым двадцатью двадцати двадцатых
|
||||||
|
|
||||||
|
четвертак четвертака четвертаке четвертаку четвертаки четвертаком четвертаками
|
||||||
|
|
||||||
|
тридцать тридцатого тридцатому тридцатом тридцатый тридцатым тридцатью тридцати тридцатых
|
||||||
|
тридцадка тридцадку тридцадке тридцадки тридцадкой тридцадкою тридцадками
|
||||||
|
|
||||||
|
тридевять тридевяти тридевятью
|
||||||
|
|
||||||
|
сорок сорокового сороковому сороковом сороковым сороковой сороковых
|
||||||
|
сорокет сорокета сорокету сорокете сорокеты сорокетом сорокетами сорокетам
|
||||||
|
|
||||||
|
пятьдесят пятьдесятого пятьдесятому пятьюдесятью пятьдесятом пятьдесятый пятьдесятым пятидесяти пятьдесятых
|
||||||
|
полтинник полтинника полтиннике полтиннику полтинники полтинником полтинниками полтинникам полтинниках
|
||||||
|
пятидесятка пятидесятке пятидесятку пятидесятки пятидесяткой пятидесятками пятидесяткам пятидесятках
|
||||||
|
полтос полтоса полтосе полтосу полтосы полтосом полтосами полтосам полтосах
|
||||||
|
|
||||||
|
шестьдесят шестьдесятого шестьдесятому шестьюдесятью шестьдесятом шестьдесятый шестьдесятым шестидесятые шестидесяти
|
||||||
|
шестьдесятых
|
||||||
|
|
||||||
|
семьдесят семьдесятого семьдесятому семьюдесятью семьдесятом семьдесятый семьдесятым семидесяти семьдесятых
|
||||||
|
|
||||||
|
восемьдесят восемьдесятого восемьдесятому восемьюдесятью восемьдесятом восемьдесятый восемьдесятым восемидесяти
|
||||||
|
восьмидесяти восьмидесятых
|
||||||
|
|
||||||
|
девяносто девяностого девяностому девяностом девяностый девяностым девяноста девяностых
|
||||||
|
|
||||||
|
сто сотого сотому сотом сотен сотый сотым ста
|
||||||
|
стольник стольника стольнику стольнике стольники стольником стольниками
|
||||||
|
сотка сотки сотке соткой сотками соткам сотках
|
||||||
|
сотня сотни сотне сотней сотнями сотням сотнях
|
||||||
|
|
||||||
|
двести двумястами двухсотого двухсотому двухсотом двухсотый двухсотым двумстам двухстах двухсот
|
||||||
|
|
||||||
|
триста тремястами трехсотого трехсотому трехсотом трехсотый трехсотым тремстам трехстах трехсот
|
||||||
|
|
||||||
|
четыреста четырехсотого четырехсотому четырьмястами четырехсотом четырехсотый четырехсотым четыремстам четырехстах
|
||||||
|
четырехсот
|
||||||
|
|
||||||
|
пятьсот пятисотого пятисотому пятьюстами пятисотом пятисотый пятисотым пятистам пятистах пятисот
|
||||||
|
пятисотка пятисотки пятисотке пятисоткой пятисотками пятисоткам пятисоткою пятисотках
|
||||||
|
пятихатка пятихатки пятихатке пятихаткой пятихатками пятихаткам пятихаткою пятихатках
|
||||||
|
пятифан пятифаны пятифане пятифаном пятифанами пятифанах
|
||||||
|
|
||||||
|
шестьсот шестисотого шестисотому шестьюстами шестисотом шестисотый шестисотым шестистам шестистах шестисот
|
||||||
|
|
||||||
|
семьсот семисотого семисотому семьюстами семисотом семисотый семисотым семистам семистах семисот
|
||||||
|
|
||||||
|
восемьсот восемисотого восемисотому восемисотом восемисотый восемисотым восьмистами восьмистам восьмистах восьмисот
|
||||||
|
|
||||||
|
девятьсот девятисотого девятисотому девятьюстами девятисотом девятисотый девятисотым девятистам девятистах девятисот
|
||||||
|
|
||||||
|
тысяча тысячного тысячному тысячном тысячный тысячным тысячам тысячах тысячей тысяч тысячи тыс
|
||||||
|
косарь косаря косару косарем косарями косарях косарям косарей
|
||||||
|
|
||||||
|
десятитысячный десятитысячного десятитысячному десятитысячным десятитысячном десятитысячная десятитысячной
|
||||||
|
десятитысячную десятитысячною десятитысячное десятитысячные десятитысячных десятитысячными
|
||||||
|
|
||||||
|
двадцатитысячный двадцатитысячного двадцатитысячному двадцатитысячным двадцатитысячном двадцатитысячная
|
||||||
|
двадцатитысячной двадцатитысячную двадцатитысячною двадцатитысячное двадцатитысячные двадцатитысячных
|
||||||
|
двадцатитысячными
|
||||||
|
|
||||||
|
тридцатитысячный тридцатитысячного тридцатитысячному тридцатитысячным тридцатитысячном тридцатитысячная
|
||||||
|
тридцатитысячной тридцатитысячную тридцатитысячною тридцатитысячное тридцатитысячные тридцатитысячных
|
||||||
|
тридцатитысячными
|
||||||
|
|
||||||
|
сорокатысячный сорокатысячного сорокатысячному сорокатысячным сорокатысячном сорокатысячная
|
||||||
|
сорокатысячной сорокатысячную сорокатысячною сорокатысячное сорокатысячные сорокатысячных
|
||||||
|
сорокатысячными
|
||||||
|
|
||||||
|
пятидесятитысячный пятидесятитысячного пятидесятитысячному пятидесятитысячным пятидесятитысячном пятидесятитысячная
|
||||||
|
пятидесятитысячной пятидесятитысячную пятидесятитысячною пятидесятитысячное пятидесятитысячные пятидесятитысячных
|
||||||
|
пятидесятитысячными
|
||||||
|
|
||||||
|
шестидесятитысячный шестидесятитысячного шестидесятитысячному шестидесятитысячным шестидесятитысячном шестидесятитысячная
|
||||||
|
шестидесятитысячной шестидесятитысячную шестидесятитысячною шестидесятитысячное шестидесятитысячные шестидесятитысячных
|
||||||
|
шестидесятитысячными
|
||||||
|
|
||||||
|
семидесятитысячный семидесятитысячного семидесятитысячному семидесятитысячным семидесятитысячном семидесятитысячная
|
||||||
|
семидесятитысячной семидесятитысячную семидесятитысячною семидесятитысячное семидесятитысячные семидесятитысячных
|
||||||
|
семидесятитысячными
|
||||||
|
|
||||||
|
восьмидесятитысячный восьмидесятитысячного восьмидесятитысячному восьмидесятитысячным восьмидесятитысячном восьмидесятитысячная
|
||||||
|
восьмидесятитысячной восьмидесятитысячную восьмидесятитысячною восьмидесятитысячное восьмидесятитысячные восьмидесятитысячных
|
||||||
|
восьмидесятитысячными
|
||||||
|
|
||||||
|
стотысячный стотысячного стотысячному стотысячным стотысячном стотысячная стотысячной стотысячную стотысячное
|
||||||
|
стотысячные стотысячных стотысячными стотысячною
|
||||||
|
|
||||||
|
миллион миллионного миллионов миллионному миллионном миллионный миллионным миллионом миллиона миллионе миллиону
|
||||||
|
миллионов
|
||||||
|
лям ляма лямы лямом лямами лямах лямов
|
||||||
|
млн
|
||||||
|
|
||||||
|
десятимиллионная десятимиллионной десятимиллионными десятимиллионный десятимиллионным десятимиллионному
|
||||||
|
десятимиллионными десятимиллионную десятимиллионное десятимиллионные десятимиллионных десятимиллионною
|
||||||
|
|
||||||
|
миллиард миллиардного миллиардному миллиардном миллиардный миллиардным миллиардом миллиарда миллиарде миллиарду
|
||||||
|
миллиардов
|
||||||
|
лярд лярда лярды лярдом лярдами лярдах лярдов
|
||||||
|
млрд
|
||||||
|
|
||||||
|
триллион триллионного триллионному триллионном триллионный триллионным триллионом триллиона триллионе триллиону
|
||||||
|
триллионов трлн
|
||||||
|
|
||||||
|
квадриллион квадриллионного квадриллионному квадриллионный квадриллионным квадриллионом квадриллиона квадриллионе
|
||||||
|
квадриллиону квадриллионов квадрлн
|
||||||
|
|
||||||
|
квинтиллион квинтиллионного квинтиллионному квинтиллионный квинтиллионным квинтиллионом квинтиллиона квинтиллионе
|
||||||
|
квинтиллиону квинтиллионов квинтлн
|
||||||
|
|
||||||
|
i ii iii iv v vi vii viii ix x xi xii xiii xiv xv xvi xvii xviii xix xx xxi xxii xxiii xxiv xxv xxvi xxvii xxvii xxix
|
||||||
|
""".split()
|
||||||
|
)
|
||||||
|
)
|
||||||

 def like_num(text):
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
+    if text.endswith("%"):
+        text = text[:-1]
     text = text.replace(",", "").replace(".", "")
     if text.isdigit():
         return True
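The only functional change to like_num here is the new "%" stripping before the digit check. A standalone copy of that digit path follows; the rest of the real lex attribute, which also matches the spelled-out forms in _num_words above, is unchanged and not reproduced.

def like_num(text: str) -> bool:
    # Standalone copy of the digit path shown in the diff above; the full spaCy
    # version additionally falls back to the _num_words list.
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    if text.endswith("%"):  # new in this change: "15%" now counts as number-like
        text = text[:-1]
    text = text.replace(",", "").replace(".", "")
    return text.isdigit()


print(like_num("15%"))   # True
print(like_num("-3,5"))  # True
print(like_num("пять"))  # False here; the full version returns True via _num_words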
|
@ -1,52 +1,111 @@
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
а
|
а авось ага агу аж ай али алло ау ах ая
|
||||||
|
|
||||||
будем будет будете будешь буду будут будучи будь будьте бы был была были было
|
б будем будет будете будешь буду будут будучи будь будьте бы был была были было
|
||||||
быть
|
быть бац без безусловно бишь благо благодаря ближайшие близко более больше
|
||||||
|
будто бывает бывала бывали бываю бывают бытует
|
||||||
|
|
||||||
в вам вами вас весь во вот все всё всего всей всем всём всеми всему всех всею
|
в вам вами вас весь во вот все всё всего всей всем всём всеми всему всех всею
|
||||||
всея всю вся вы
|
всея всю вся вы ваш ваша ваше ваши вдали вдобавок вдруг ведь везде вернее
|
||||||
|
взаимно взаправду видно вишь включая вместо внакладе вначале вне вниз внизу
|
||||||
|
вновь вовсе возможно воистину вокруг вон вообще вопреки вперекор вплоть
|
||||||
|
вполне вправду вправе впрочем впрямь вресноту вроде вряд всегда всюду
|
||||||
|
всякий всякого всякой всячески вчеред
|
||||||
|
|
||||||
да для до
|
г го где гораздо гав
|
||||||
|
|
||||||
его едим едят ее её ей ел ела ем ему емъ если ест есть ешь еще ещё ею
|
д да для до дабы давайте давно давным даже далее далеко дальше данная
|
||||||
|
данного данное данной данном данному данные данный данных дану данунах
|
||||||
|
даром де действительно довольно доколе доколь долго должен должна
|
||||||
|
должно должны должный дополнительно другая другие другим другими
|
||||||
|
других другое другой
|
||||||
|
|
||||||
же
|
е его едим едят ее её ей ел ела ем ему емъ если ест есть ешь еще ещё ею едва
|
||||||
|
ежели еле
|
||||||
|
|
||||||
за
|
ж же
|
||||||
|
|
||||||
и из или им ими имъ их
|
з за затем зато зачем здесь значит зря
|
||||||
|
|
||||||
|
и из или им ими имъ их ибо иль имеет имел имела имело именно иметь иначе
|
||||||
|
иногда иным иными итак ишь
|
||||||
|
|
||||||
|
й
|
||||||
|
|
||||||
к как кем ко когда кого ком кому комья которая которого которое которой котором
|
к как кем ко когда кого ком кому комья которая которого которое которой котором
|
||||||
которому которою которую которые который которым которыми которых кто
|
которому которою которую которые который которым которыми которых кто ка кабы
|
||||||
|
каждая каждое каждые каждый кажется казалась казались казалось казался казаться
|
||||||
|
какая какие каким какими каков какого какой какому какою касательно кой коли
|
||||||
|
коль конечно короче кроме кстати ку куда
|
||||||
|
|
||||||
меня мне мной мною мог моги могите могла могли могло могу могут мое моё моего
|
л ли либо лишь любая любого любое любой любом любую любыми любых
|
||||||
|
|
||||||
|
м меня мне мной мною мог моги могите могла могли могло могу могут мое моё моего
|
||||||
моей моем моём моему моею можем может можете можешь мои мой моим моими моих
|
моей моем моём моему моею можем может можете можешь мои мой моим моими моих
|
||||||
мочь мою моя мы
|
мочь мою моя мы мало меж между менее меньше мимо многие много многого многое
|
||||||
|
многом многому можно мол му
|
||||||
|
|
||||||
на нам нами нас наса наш наша наше нашего нашей нашем нашему нашею наши нашим
|
н на нам нами нас наса наш наша наше нашего нашей нашем нашему нашею наши нашим
|
||||||
нашими наших нашу не него нее неё ней нем нём нему нет нею ним ними них но
|
нашими наших нашу не него нее неё ней нем нём нему нет нею ним ними них но
|
||||||
|
наверняка наверху навряд навыворот над надо назад наиболее наизворот
|
||||||
|
наизнанку наипаче накануне наконец наоборот наперед наперекор наподобие
|
||||||
|
например напротив напрямую насилу настоящая настоящее настоящие настоящий
|
||||||
|
насчет нате находиться начала начале неважно негде недавно недалеко незачем
|
||||||
|
некем некогда некому некоторая некоторые некоторый некоторых некто некуда
|
||||||
|
нельзя немногие немногим немного необходимо необходимости необходимые
|
||||||
|
необходимым неоткуда непрерывно нередко несколько нету неужели нечего
|
||||||
|
нечем нечему нечто нешто нибудь нигде ниже низко никак никакой никем
|
||||||
|
никогда никого никому никто никуда ниоткуда нипочем ничего ничем ничему
|
||||||
|
ничто ну нужная нужно нужного нужные нужный нужных ныне нынешнее нынешней
|
||||||
|
нынешних нынче
|
||||||
|
|
||||||
о об один одна одни одним одними одних одно одного одной одном одному одною
|
о об один одна одни одним одними одних одно одного одной одном одному одною
|
||||||
одну он она оне они оно от
|
одну он она оне они оно от оба общую обычно ого однажды однако ой около оный
|
||||||
|
оп опять особенно особо особую особые откуда отнелижа отнелиже отовсюду
|
||||||
|
отсюда оттого оттот оттуда отчего отчему ох очевидно очень ом
|
||||||
|
|
||||||
по при
|
п по при паче перед под подавно поди подобная подобно подобного подобные
|
||||||
|
подобный подобным подобных поелику пожалуй пожалуйста позже поистине
|
||||||
|
пока покамест поколе поколь покуда покудова помимо понеже поприще пор
|
||||||
|
пора посему поскольку после посреди посредством потом потому потомушта
|
||||||
|
похожем почему почти поэтому прежде притом причем про просто прочего
|
||||||
|
прочее прочему прочими проще прям пусть
|
||||||
|
|
||||||
|
р ради разве ранее рано раньше рядом
|
||||||
|
|
||||||
с сам сама сами самим самими самих само самого самом самому саму свое своё
|
с сам сама сами самим самими самих само самого самом самому саму свое своё
|
||||||
своего своей своем своём своему своею свои свой своим своими своих свою своя
|
своего своей своем своём своему своею свои свой своим своими своих свою своя
|
||||||
себе себя собой собою
|
себе себя собой собою самая самое самой самый самых сверх свыше се сего сей
|
||||||
|
сейчас сие сих сквозь сколько скорее скоро следует слишком смогут сможет
|
||||||
|
сначала снова со собственно совсем сперва спокону спустя сразу среди сродни
|
||||||
|
стал стала стали стало стать суть сызнова
|
||||||
|
|
||||||
та так такая такие таким такими таких такого такое такой таком такому такою
|
та то ту ты ти так такая такие таким такими таких такого такое такой таком такому такою
|
||||||
такую те тебе тебя тем теми тех то тобой тобою того той только том томах тому
|
такую те тебе тебя тем теми тех тобой тобою того той только том томах тому
|
||||||
тот тою ту ты
|
тот тою также таки таков такова там твои твоим твоих твой твоя твоё
|
||||||
|
теперь тогда тоже тотчас точно туда тут тьфу тая
|
||||||
|
|
||||||
у уже
|
у уже увы уж ура ух ую
|
||||||
|
|
||||||
чего чем чём чему что чтобы
|
ф фу
|
||||||
|
|
||||||
эта эти этим этими этих это этого этой этом этому этот этою эту
|
х ха хе хорошо хотел хотела хотелось хотеть хоть хотя хочешь хочу хуже
|
||||||
|
|
||||||
я
|
ч чего чем чём чему что чтобы часто чаще чей через чтоб чуть чхать чьим
|
||||||
|
чьих чьё чё
|
||||||
|
|
||||||
|
ш ша
|
||||||
|
|
||||||
|
щ ща щас
|
||||||
|
|
||||||
|
ы ых ые ый
|
||||||
|
|
||||||
|
э эта эти этим этими этих это этого этой этом этому этот этою эту эдак эдакий
|
||||||
|
эй эка экий этак этакий эх
|
||||||
|
|
||||||
|
ю
|
||||||
|
|
||||||
|
я явно явных яко якобы якоже
|
||||||
""".split()
|
""".split()
|
||||||
)
|
)
|
||||||
|
|
|
@ -2,7 +2,6 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ...symbols import ORTH, NORM
|
from ...symbols import ORTH, NORM
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
_abbrev_exc = [
|
_abbrev_exc = [
|
||||||
|
@ -42,7 +41,6 @@ _abbrev_exc = [
|
||||||
{ORTH: "дек", NORM: "декабрь"},
|
{ORTH: "дек", NORM: "декабрь"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
for abbrev_desc in _abbrev_exc:
|
for abbrev_desc in _abbrev_exc:
|
||||||
abbrev = abbrev_desc[ORTH]
|
abbrev = abbrev_desc[ORTH]
|
||||||
for orth in (abbrev, abbrev.capitalize(), abbrev.upper()):
|
for orth in (abbrev, abbrev.capitalize(), abbrev.upper()):
|
||||||
|
@ -50,17 +48,354 @@ for abbrev_desc in _abbrev_exc:
|
||||||
_exc[orth + "."] = [{ORTH: orth + ".", NORM: abbrev_desc[NORM]}]
|
_exc[orth + "."] = [{ORTH: orth + ".", NORM: abbrev_desc[NORM]}]
|
||||||
|
|
||||||
|
|
||||||
_slang_exc = [
|
for abbr in [
|
||||||
|
# Year slang abbreviations
|
||||||
{ORTH: "2к15", NORM: "2015"},
|
{ORTH: "2к15", NORM: "2015"},
|
||||||
{ORTH: "2к16", NORM: "2016"},
|
{ORTH: "2к16", NORM: "2016"},
|
||||||
{ORTH: "2к17", NORM: "2017"},
|
{ORTH: "2к17", NORM: "2017"},
|
||||||
{ORTH: "2к18", NORM: "2018"},
|
{ORTH: "2к18", NORM: "2018"},
|
||||||
{ORTH: "2к19", NORM: "2019"},
|
{ORTH: "2к19", NORM: "2019"},
|
||||||
{ORTH: "2к20", NORM: "2020"},
|
{ORTH: "2к20", NORM: "2020"},
|
||||||
]
|
{ORTH: "2к21", NORM: "2021"},
|
||||||
|
{ORTH: "2к22", NORM: "2022"},
|
||||||
|
{ORTH: "2к23", NORM: "2023"},
|
||||||
|
{ORTH: "2к24", NORM: "2024"},
|
||||||
|
{ORTH: "2к25", NORM: "2025"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
for slang_desc in _slang_exc:
|
for abbr in [
|
||||||
_exc[slang_desc[ORTH]] = [slang_desc]
|
# Profession and academic titles abbreviations
|
||||||
|
{ORTH: "ак.", NORM: "академик"},
|
||||||
|
{ORTH: "акад.", NORM: "академик"},
|
||||||
|
{ORTH: "д-р архитектуры", NORM: "доктор архитектуры"},
|
||||||
|
{ORTH: "д-р биол. наук", NORM: "доктор биологических наук"},
|
||||||
|
{ORTH: "д-р ветеринар. наук", NORM: "доктор ветеринарных наук"},
|
||||||
|
{ORTH: "д-р воен. наук", NORM: "доктор военных наук"},
|
||||||
|
{ORTH: "д-р геогр. наук", NORM: "доктор географических наук"},
|
||||||
|
{ORTH: "д-р геол.-минерал. наук", NORM: "доктор геолого-минералогических наук"},
|
||||||
|
{ORTH: "д-р искусствоведения", NORM: "доктор искусствоведения"},
|
||||||
|
{ORTH: "д-р ист. наук", NORM: "доктор исторических наук"},
|
||||||
|
{ORTH: "д-р культурологии", NORM: "доктор культурологии"},
|
||||||
|
{ORTH: "д-р мед. наук", NORM: "доктор медицинских наук"},
|
||||||
|
{ORTH: "д-р пед. наук", NORM: "доктор педагогических наук"},
|
||||||
|
{ORTH: "д-р полит. наук", NORM: "доктор политических наук"},
|
||||||
|
{ORTH: "д-р психол. наук", NORM: "доктор психологических наук"},
|
||||||
|
{ORTH: "д-р с.-х. наук", NORM: "доктор сельскохозяйственных наук"},
|
||||||
|
{ORTH: "д-р социол. наук", NORM: "доктор социологических наук"},
|
||||||
|
{ORTH: "д-р техн. наук", NORM: "доктор технических наук"},
|
||||||
|
{ORTH: "д-р фармацевт. наук", NORM: "доктор фармацевтических наук"},
|
||||||
|
{ORTH: "д-р физ.-мат. наук", NORM: "доктор физико-математических наук"},
|
||||||
|
{ORTH: "д-р филол. наук", NORM: "доктор филологических наук"},
|
||||||
|
{ORTH: "д-р филос. наук", NORM: "доктор философских наук"},
|
||||||
|
{ORTH: "д-р хим. наук", NORM: "доктор химических наук"},
|
||||||
|
{ORTH: "д-р экон. наук", NORM: "доктор экономических наук"},
|
||||||
|
{ORTH: "д-р юрид. наук", NORM: "доктор юридических наук"},
|
||||||
|
{ORTH: "д-р", NORM: "доктор"},
|
||||||
|
{ORTH: "д.б.н.", NORM: "доктор биологических наук"},
|
||||||
|
{ORTH: "д.г.-м.н.", NORM: "доктор геолого-минералогических наук"},
|
||||||
|
{ORTH: "д.г.н.", NORM: "доктор географических наук"},
|
||||||
|
{ORTH: "д.и.н.", NORM: "доктор исторических наук"},
|
||||||
|
{ORTH: "д.иск.", NORM: "доктор искусствоведения"},
|
||||||
|
{ORTH: "д.м.н.", NORM: "доктор медицинских наук"},
|
||||||
|
{ORTH: "д.п.н.", NORM: "доктор психологических наук"},
|
||||||
|
{ORTH: "д.пед.н.", NORM: "доктор педагогических наук"},
|
||||||
|
{ORTH: "д.полит.н.", NORM: "доктор политических наук"},
|
||||||
|
{ORTH: "д.с.-х.н.", NORM: "доктор сельскохозяйственных наук"},
|
||||||
|
{ORTH: "д.социол.н.", NORM: "доктор социологических наук"},
|
||||||
|
{ORTH: "д.т.н.", NORM: "доктор технических наук"},
|
||||||
|
{ORTH: "д.т.н", NORM: "доктор технических наук"},
|
||||||
|
{ORTH: "д.ф.-м.н.", NORM: "доктор физико-математических наук"},
|
||||||
|
{ORTH: "д.ф.н.", NORM: "доктор филологических наук"},
|
||||||
|
{ORTH: "д.филос.н.", NORM: "доктор философских наук"},
|
||||||
|
{ORTH: "д.фил.н.", NORM: "доктор филологических наук"},
|
||||||
|
{ORTH: "д.х.н.", NORM: "доктор химических наук"},
|
||||||
|
{ORTH: "д.э.н.", NORM: "доктор экономических наук"},
|
||||||
|
{ORTH: "д.э.н", NORM: "доктор экономических наук"},
|
||||||
|
{ORTH: "д.ю.н.", NORM: "доктор юридических наук"},
|
||||||
|
{ORTH: "доц.", NORM: "доцент"},
|
||||||
|
{ORTH: "и.о.", NORM: "исполняющий обязанности"},
|
||||||
|
{ORTH: "к.б.н.", NORM: "кандидат биологических наук"},
|
||||||
|
{ORTH: "к.воен.н.", NORM: "кандидат военных наук"},
|
||||||
|
{ORTH: "к.г.-м.н.", NORM: "кандидат геолого-минералогических наук"},
|
||||||
|
{ORTH: "к.г.н.", NORM: "кандидат географических наук"},
|
||||||
|
{ORTH: "к.геогр.н", NORM: "кандидат географических наук"},
|
||||||
|
{ORTH: "к.геогр.наук", NORM: "кандидат географических наук"},
|
||||||
|
{ORTH: "к.и.н.", NORM: "кандидат исторических наук"},
|
||||||
|
{ORTH: "к.иск.", NORM: "кандидат искусствоведения"},
|
||||||
|
{ORTH: "к.м.н.", NORM: "кандидат медицинских наук"},
|
||||||
|
{ORTH: "к.п.н.", NORM: "кандидат психологических наук"},
|
||||||
|
{ORTH: "к.псх.н.", NORM: "кандидат психологических наук"},
|
||||||
|
{ORTH: "к.пед.н.", NORM: "кандидат педагогических наук"},
|
||||||
|
{ORTH: "канд.пед.наук", NORM: "кандидат педагогических наук"},
|
||||||
|
{ORTH: "к.полит.н.", NORM: "кандидат политических наук"},
|
||||||
|
{ORTH: "к.с.-х.н.", NORM: "кандидат сельскохозяйственных наук"},
|
||||||
|
{ORTH: "к.социол.н.", NORM: "кандидат социологических наук"},
|
||||||
|
{ORTH: "к.с.н.", NORM: "кандидат социологических наук"},
|
||||||
|
{ORTH: "к.т.н.", NORM: "кандидат технических наук"},
|
||||||
|
{ORTH: "к.ф.-м.н.", NORM: "кандидат физико-математических наук"},
|
||||||
|
{ORTH: "к.ф.н.", NORM: "кандидат филологических наук"},
|
||||||
|
{ORTH: "к.фил.н.", NORM: "кандидат филологических наук"},
|
||||||
|
{ORTH: "к.филол.н", NORM: "кандидат филологических наук"},
|
||||||
|
{ORTH: "к.фарм.наук", NORM: "кандидат фармакологических наук"},
|
||||||
|
{ORTH: "к.фарм.н.", NORM: "кандидат фармакологических наук"},
|
||||||
|
{ORTH: "к.фарм.н", NORM: "кандидат фармакологических наук"},
|
||||||
|
{ORTH: "к.филос.наук", NORM: "кандидат философских наук"},
|
||||||
|
{ORTH: "к.филос.н.", NORM: "кандидат философских наук"},
|
||||||
|
{ORTH: "к.филос.н", NORM: "кандидат философских наук"},
|
||||||
|
{ORTH: "к.х.н.", NORM: "кандидат химических наук"},
|
||||||
|
{ORTH: "к.х.н", NORM: "кандидат химических наук"},
|
||||||
|
{ORTH: "к.э.н.", NORM: "кандидат экономических наук"},
|
||||||
|
{ORTH: "к.э.н", NORM: "кандидат экономических наук"},
|
||||||
|
{ORTH: "к.ю.н.", NORM: "кандидат юридических наук"},
|
||||||
|
{ORTH: "к.ю.н", NORM: "кандидат юридических наук"},
|
||||||
|
{ORTH: "канд. архитектуры", NORM: "кандидат архитектуры"},
|
||||||
|
{ORTH: "канд. биол. наук", NORM: "кандидат биологических наук"},
|
||||||
|
{ORTH: "канд. ветеринар. наук", NORM: "кандидат ветеринарных наук"},
|
||||||
|
{ORTH: "канд. воен. наук", NORM: "кандидат военных наук"},
|
||||||
|
{ORTH: "канд. геогр. наук", NORM: "кандидат географических наук"},
|
||||||
|
{ORTH: "канд. геол.-минерал. наук", NORM: "кандидат геолого-минералогических наук"},
|
||||||
|
{ORTH: "канд. искусствоведения", NORM: "кандидат искусствоведения"},
|
||||||
|
{ORTH: "канд. ист. наук", NORM: "кандидат исторических наук"},
|
||||||
|
{ORTH: "к.ист.н.", NORM: "кандидат исторических наук"},
|
||||||
|
{ORTH: "канд. культурологии", NORM: "кандидат культурологии"},
|
||||||
|
{ORTH: "канд. мед. наук", NORM: "кандидат медицинских наук"},
|
||||||
|
{ORTH: "канд. пед. наук", NORM: "кандидат педагогических наук"},
|
||||||
|
{ORTH: "канд. полит. наук", NORM: "кандидат политических наук"},
|
||||||
|
{ORTH: "канд. психол. наук", NORM: "кандидат психологических наук"},
|
||||||
|
{ORTH: "канд. с.-х. наук", NORM: "кандидат сельскохозяйственных наук"},
|
||||||
|
{ORTH: "канд. социол. наук", NORM: "кандидат социологических наук"},
|
||||||
|
{ORTH: "к.соц.наук", NORM: "кандидат социологических наук"},
|
||||||
|
{ORTH: "к.соц.н.", NORM: "кандидат социологических наук"},
|
||||||
|
{ORTH: "к.соц.н", NORM: "кандидат социологических наук"},
|
||||||
|
{ORTH: "канд. техн. наук", NORM: "кандидат технических наук"},
|
||||||
|
{ORTH: "канд. фармацевт. наук", NORM: "кандидат фармацевтических наук"},
|
||||||
|
{ORTH: "канд. физ.-мат. наук", NORM: "кандидат физико-математических наук"},
|
||||||
|
{ORTH: "канд. филол. наук", NORM: "кандидат филологических наук"},
|
||||||
|
{ORTH: "канд. филос. наук", NORM: "кандидат философских наук"},
|
||||||
|
{ORTH: "канд. хим. наук", NORM: "кандидат химических наук"},
|
||||||
|
{ORTH: "канд. экон. наук", NORM: "кандидат экономических наук"},
|
||||||
|
{ORTH: "канд. юрид. наук", NORM: "кандидат юридических наук"},
|
||||||
|
{ORTH: "в.н.с.", NORM: "ведущий научный сотрудник"},
|
||||||
|
{ORTH: "мл. науч. сотр.", NORM: "младший научный сотрудник"},
|
||||||
|
{ORTH: "м.н.с.", NORM: "младший научный сотрудник"},
|
||||||
|
{ORTH: "проф.", NORM: "профессор"},
|
||||||
|
{ORTH: "профессор.кафедры", NORM: "профессор кафедры"},
|
||||||
|
{ORTH: "ст. науч. сотр.", NORM: "старший научный сотрудник"},
|
||||||
|
{ORTH: "чл.-к.", NORM: "член корреспондент"},
|
||||||
|
{ORTH: "чл.-корр.", NORM: "член-корреспондент"},
|
||||||
|
{ORTH: "чл.-кор.", NORM: "член-корреспондент"},
|
||||||
|
{ORTH: "дир.", NORM: "директор"},
|
||||||
|
{ORTH: "зам. дир.", NORM: "заместитель директора"},
|
||||||
|
{ORTH: "зав. каф.", NORM: "заведующий кафедрой"},
|
||||||
|
{ORTH: "зав.кафедрой", NORM: "заведующий кафедрой"},
|
||||||
|
{ORTH: "зав. кафедрой", NORM: "заведующий кафедрой"},
|
||||||
|
{ORTH: "асп.", NORM: "аспирант"},
|
||||||
|
{ORTH: "гл. науч. сотр.", NORM: "главный научный сотрудник"},
|
||||||
|
{ORTH: "вед. науч. сотр.", NORM: "ведущий научный сотрудник"},
|
||||||
|
{ORTH: "науч. сотр.", NORM: "научный сотрудник"},
|
||||||
|
{ORTH: "к.м.с.", NORM: "кандидат в мастера спорта"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
|
||||||
|
for abbr in [
|
||||||
|
# Literary phrases abbreviations
|
||||||
|
{ORTH: "и т.д.", NORM: "и так далее"},
|
||||||
|
{ORTH: "и т.п.", NORM: "и тому подобное"},
|
||||||
|
{ORTH: "т.д.", NORM: "так далее"},
|
||||||
|
{ORTH: "т.п.", NORM: "тому подобное"},
|
||||||
|
{ORTH: "т.е.", NORM: "то есть"},
|
||||||
|
{ORTH: "т.к.", NORM: "так как"},
|
||||||
|
{ORTH: "в т.ч.", NORM: "в том числе"},
|
||||||
|
{ORTH: "и пр.", NORM: "и прочие"},
|
||||||
|
{ORTH: "и др.", NORM: "и другие"},
|
||||||
|
{ORTH: "т.н.", NORM: "так называемый"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
|
||||||
|
for abbr in [
|
||||||
|
# Appeal to a person abbreviations
|
||||||
|
{ORTH: "г-н", NORM: "господин"},
|
||||||
|
{ORTH: "г-да", NORM: "господа"},
|
||||||
|
{ORTH: "г-жа", NORM: "госпожа"},
|
||||||
|
{ORTH: "тов.", NORM: "товарищ"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
|
||||||
|
for abbr in [
|
||||||
|
# Time periods abbreviations
|
||||||
|
{ORTH: "до н.э.", NORM: "до нашей эры"},
|
||||||
|
{ORTH: "по н.в.", NORM: "по настоящее время"},
|
||||||
|
{ORTH: "в н.в.", NORM: "в настоящее время"},
|
||||||
|
{ORTH: "наст.", NORM: "настоящий"},
|
||||||
|
{ORTH: "наст. время", NORM: "настоящее время"},
|
||||||
|
{ORTH: "г.г.", NORM: "годы"},
|
||||||
|
{ORTH: "гг.", NORM: "годы"},
|
||||||
|
{ORTH: "т.г.", NORM: "текущий год"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
|
||||||
|
for abbr in [
|
||||||
|
# Address forming elements abbreviations
|
||||||
|
{ORTH: "респ.", NORM: "республика"},
|
||||||
|
{ORTH: "обл.", NORM: "область"},
|
||||||
|
{ORTH: "г.ф.з.", NORM: "город федерального значения"},
|
||||||
|
{ORTH: "а.обл.", NORM: "автономная область"},
|
||||||
|
{ORTH: "а.окр.", NORM: "автономный округ"},
|
||||||
|
{ORTH: "м.р-н", NORM: "муниципальный район"},
|
||||||
|
{ORTH: "г.о.", NORM: "городской округ"},
|
||||||
|
{ORTH: "г.п.", NORM: "городское поселение"},
|
||||||
|
{ORTH: "с.п.", NORM: "сельское поселение"},
|
||||||
|
{ORTH: "вн.р-н", NORM: "внутригородской район"},
|
||||||
|
{ORTH: "вн.тер.г.", NORM: "внутригородская территория города"},
|
||||||
|
{ORTH: "пос.", NORM: "поселение"},
|
||||||
|
{ORTH: "р-н", NORM: "район"},
|
||||||
|
{ORTH: "с/с", NORM: "сельсовет"},
|
||||||
|
{ORTH: "г.", NORM: "город"},
|
||||||
|
{ORTH: "п.г.т.", NORM: "поселок городского типа"},
|
||||||
|
{ORTH: "пгт.", NORM: "поселок городского типа"},
|
||||||
|
{ORTH: "р.п.", NORM: "рабочий поселок"},
|
||||||
|
{ORTH: "рп.", NORM: "рабочий поселок"},
|
||||||
|
{ORTH: "кп.", NORM: "курортный поселок"},
|
||||||
|
{ORTH: "гп.", NORM: "городской поселок"},
|
||||||
|
{ORTH: "п.", NORM: "поселок"},
|
||||||
|
{ORTH: "в-ки", NORM: "выселки"},
|
||||||
|
{ORTH: "г-к", NORM: "городок"},
|
||||||
|
{ORTH: "з-ка", NORM: "заимка"},
|
||||||
|
{ORTH: "п-к", NORM: "починок"},
|
||||||
|
{ORTH: "киш.", NORM: "кишлак"},
|
||||||
|
{ORTH: "п. ст. ", NORM: "поселок станция"},
|
||||||
|
{ORTH: "п. ж/д ст. ", NORM: "поселок при железнодорожной станции"},
|
||||||
|
{ORTH: "ж/д бл-ст", NORM: "железнодорожный блокпост"},
|
||||||
|
{ORTH: "ж/д б-ка", NORM: "железнодорожная будка"},
|
||||||
|
{ORTH: "ж/д в-ка", NORM: "железнодорожная ветка"},
|
||||||
|
{ORTH: "ж/д к-ма", NORM: "железнодорожная казарма"},
|
||||||
|
{ORTH: "ж/д к-т", NORM: "железнодорожный комбинат"},
|
||||||
|
{ORTH: "ж/д пл-ма", NORM: "железнодорожная платформа"},
|
||||||
|
{ORTH: "ж/д пл-ка", NORM: "железнодорожная площадка"},
|
||||||
|
{ORTH: "ж/д п.п.", NORM: "железнодорожный путевой пост"},
|
||||||
|
{ORTH: "ж/д о.п.", NORM: "железнодорожный остановочный пункт"},
|
||||||
|
{ORTH: "ж/д рзд.", NORM: "железнодорожный разъезд"},
|
||||||
|
{ORTH: "ж/д ст. ", NORM: "железнодорожная станция"},
|
||||||
|
{ORTH: "м-ко", NORM: "местечко"},
|
||||||
|
{ORTH: "д.", NORM: "деревня"},
|
||||||
|
{ORTH: "с.", NORM: "село"},
|
||||||
|
{ORTH: "сл.", NORM: "слобода"},
|
||||||
|
{ORTH: "ст. ", NORM: "станция"},
|
||||||
|
{ORTH: "ст-ца", NORM: "станица"},
|
||||||
|
{ORTH: "у.", NORM: "улус"},
|
||||||
|
{ORTH: "х.", NORM: "хутор"},
|
||||||
|
{ORTH: "рзд.", NORM: "разъезд"},
|
||||||
|
{ORTH: "зим.", NORM: "зимовье"},
|
||||||
|
{ORTH: "б-г", NORM: "берег"},
|
||||||
|
{ORTH: "ж/р", NORM: "жилой район"},
|
||||||
|
{ORTH: "кв-л", NORM: "квартал"},
|
||||||
|
{ORTH: "мкр.", NORM: "микрорайон"},
|
||||||
|
{ORTH: "ост-в", NORM: "остров"},
|
||||||
|
{ORTH: "платф.", NORM: "платформа"},
|
||||||
|
{ORTH: "п/р", NORM: "промышленный район"},
|
||||||
|
{ORTH: "р-н", NORM: "район"},
|
||||||
|
{ORTH: "тер.", NORM: "территория"},
|
||||||
|
{
|
||||||
|
ORTH: "тер. СНО",
|
||||||
|
NORM: "территория садоводческих некоммерческих объединений граждан",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
ORTH: "тер. ОНО",
|
||||||
|
NORM: "территория огороднических некоммерческих объединений граждан",
|
||||||
|
},
|
||||||
|
{ORTH: "тер. ДНО", NORM: "территория дачных некоммерческих объединений граждан"},
|
||||||
|
{ORTH: "тер. СНТ", NORM: "территория садоводческих некоммерческих товариществ"},
|
||||||
|
{ORTH: "тер. ОНТ", NORM: "территория огороднических некоммерческих товариществ"},
|
||||||
|
{ORTH: "тер. ДНТ", NORM: "территория дачных некоммерческих товариществ"},
|
||||||
|
{ORTH: "тер. СПК", NORM: "территория садоводческих потребительских кооперативов"},
|
||||||
|
{ORTH: "тер. ОПК", NORM: "территория огороднических потребительских кооперативов"},
|
||||||
|
{ORTH: "тер. ДПК", NORM: "территория дачных потребительских кооперативов"},
|
||||||
|
{ORTH: "тер. СНП", NORM: "территория садоводческих некоммерческих партнерств"},
|
||||||
|
{ORTH: "тер. ОНП", NORM: "территория огороднических некоммерческих партнерств"},
|
||||||
|
{ORTH: "тер. ДНП", NORM: "территория дачных некоммерческих партнерств"},
|
||||||
|
{ORTH: "тер. ТСН", NORM: "территория товарищества собственников недвижимости"},
|
||||||
|
{ORTH: "тер. ГСК", NORM: "территория гаражно-строительного кооператива"},
|
||||||
|
{ORTH: "ус.", NORM: "усадьба"},
|
||||||
|
{ORTH: "тер.ф.х.", NORM: "территория фермерского хозяйства"},
|
||||||
|
{ORTH: "ю.", NORM: "юрты"},
|
||||||
|
{ORTH: "ал.", NORM: "аллея"},
|
||||||
|
{ORTH: "б-р", NORM: "бульвар"},
|
||||||
|
{ORTH: "взв.", NORM: "взвоз"},
|
||||||
|
{ORTH: "взд.", NORM: "въезд"},
|
||||||
|
{ORTH: "дор.", NORM: "дорога"},
|
||||||
|
{ORTH: "ззд.", NORM: "заезд"},
|
||||||
|
{ORTH: "км", NORM: "километр"},
|
||||||
|
{ORTH: "к-цо", NORM: "кольцо"},
|
||||||
|
{ORTH: "лн.", NORM: "линия"},
|
||||||
|
{ORTH: "мгстр.", NORM: "магистраль"},
|
||||||
|
{ORTH: "наб.", NORM: "набережная"},
|
||||||
|
{ORTH: "пер-д", NORM: "переезд"},
|
||||||
|
{ORTH: "пер.", NORM: "переулок"},
|
||||||
|
{ORTH: "пл-ка", NORM: "площадка"},
|
||||||
|
{ORTH: "пл.", NORM: "площадь"},
|
||||||
|
{ORTH: "пр-д", NORM: "проезд"},
|
||||||
|
{ORTH: "пр-к", NORM: "просек"},
|
||||||
|
{ORTH: "пр-ка", NORM: "просека"},
|
||||||
|
{ORTH: "пр-лок", NORM: "проселок"},
|
||||||
|
{ORTH: "пр-кт", NORM: "проспект"},
|
||||||
|
{ORTH: "проул.", NORM: "проулок"},
|
||||||
|
{ORTH: "рзд.", NORM: "разъезд"},
|
||||||
|
{ORTH: "ряд", NORM: "ряд(ы)"},
|
||||||
|
{ORTH: "с-р", NORM: "сквер"},
|
||||||
|
{ORTH: "с-к", NORM: "спуск"},
|
||||||
|
{ORTH: "сзд.", NORM: "съезд"},
|
||||||
|
{ORTH: "туп.", NORM: "тупик"},
|
||||||
|
{ORTH: "ул.", NORM: "улица"},
|
||||||
|
{ORTH: "ш.", NORM: "шоссе"},
|
||||||
|
{ORTH: "влд.", NORM: "владение"},
|
||||||
|
{ORTH: "г-ж", NORM: "гараж"},
|
||||||
|
{ORTH: "д.", NORM: "дом"},
|
||||||
|
{ORTH: "двлд.", NORM: "домовладение"},
|
||||||
|
{ORTH: "зд.", NORM: "здание"},
|
||||||
|
{ORTH: "з/у", NORM: "земельный участок"},
|
||||||
|
{ORTH: "кв.", NORM: "квартира"},
|
||||||
|
{ORTH: "ком.", NORM: "комната"},
|
||||||
|
{ORTH: "подв.", NORM: "подвал"},
|
||||||
|
{ORTH: "кот.", NORM: "котельная"},
|
||||||
|
{ORTH: "п-б", NORM: "погреб"},
|
||||||
|
{ORTH: "к.", NORM: "корпус"},
|
||||||
|
{ORTH: "ОНС", NORM: "объект незавершенного строительства"},
|
||||||
|
{ORTH: "оф.", NORM: "офис"},
|
||||||
|
{ORTH: "пав.", NORM: "павильон"},
|
||||||
|
{ORTH: "помещ.", NORM: "помещение"},
|
||||||
|
{ORTH: "раб.уч.", NORM: "рабочий участок"},
|
||||||
|
{ORTH: "скл.", NORM: "склад"},
|
||||||
|
{ORTH: "coop.", NORM: "сооружение"},
|
||||||
|
{ORTH: "стр.", NORM: "строение"},
|
||||||
|
{ORTH: "торг.зал", NORM: "торговый зал"},
|
||||||
|
{ORTH: "а/п", NORM: "аэропорт"},
|
||||||
|
{ORTH: "им.", NORM: "имени"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
|
||||||
|
for abbr in [
|
||||||
|
# Others abbreviations
|
||||||
|
{ORTH: "тыс.руб.", NORM: "тысяч рублей"},
|
||||||
|
{ORTH: "тыс.", NORM: "тысяч"},
|
||||||
|
{ORTH: "руб.", NORM: "рубль"},
|
||||||
|
{ORTH: "долл.", NORM: "доллар"},
|
||||||
|
{ORTH: "прим.", NORM: "примечание"},
|
||||||
|
{ORTH: "прим.ред.", NORM: "примечание редакции"},
|
||||||
|
{ORTH: "см. также", NORM: "смотри также"},
|
||||||
|
{ORTH: "кв.м.", NORM: "квадрантный метр"},
|
||||||
|
{ORTH: "м2", NORM: "квадрантный метр"},
|
||||||
|
{ORTH: "б/у", NORM: "бывший в употреблении"},
|
||||||
|
{ORTH: "сокр.", NORM: "сокращение"},
|
||||||
|
{ORTH: "чел.", NORM: "человек"},
|
||||||
|
{ORTH: "б.п.", NORM: "базисный пункт"},
|
||||||
|
]:
|
||||||
|
_exc[abbr[ORTH]] = [abbr]
|
||||||
|
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||||
|
|
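The Russian exception entries above all have the same shape: an ORTH string that must survive as a single token plus a NORM carrying its expansion, registered via `_exc[abbr[ORTH]] = [abbr]` and merged into TOKENIZER_EXCEPTIONS. A small, hedged check of the intended effect; the sample text is an assumption and the exact NORM values depend on the entries shipped in the installed spaCy version.

import spacy

# A blank Russian pipeline is enough: tokenizer exceptions are part of the
# language defaults, no trained model required.
nlp = spacy.blank("ru")
doc = nlp("г-н Иванов вернулся в 2к20 году")
for token in doc:
    print(repr(token.text), repr(token.norm_))
# Expected (roughly): "г-н" stays one token with norm "господин", "2к20" gets
# the norm "2020", and the remaining tokens keep their own text as norm.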

18  spacy/lang/sl/examples.py  Normal file
@ -0,0 +1,18 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.sl.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple načrtuje nakup britanskega startupa za 1 bilijon dolarjev",
    "France Prešeren je umrl 8. februarja 1849 v Kranju",
    "Staro ljubljansko letališče Moste bo obnovila družba BTC",
    "London je največje mesto v Združenem kraljestvu.",
    "Kje se skrivaš?",
    "Kdo je predsednik Francije?",
    "Katero je glavno mesto Združenih držav Amerike?",
    "Kdaj je bil rojen Milan Kučan?",
]
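A quick way to exercise the new examples without assuming a trained Slovenian model is to run them through a blank pipeline, which only tokenizes; this assumes an installed spaCy version that already ships this file.

import spacy
from spacy.lang.sl.examples import sentences

nlp = spacy.blank("sl")
for doc in nlp.pipe(sentences):
    print(len(doc), doc[:4].text)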

spacy/lang/sl/stop_words.py
@ -1,13 +1,10 @@
 # Source: https://github.com/stopwords-iso/stopwords-sl
-# TODO: probably needs to be tidied up – the list seems to have month names in
-# it, which shouldn't be considered stop words.
+# Removed various words that are not normally considered stop words, such as months.

 STOP_WORDS = set(
 """
 a
 ali
-april
-avgust
 b
 bi
 bil
@ -19,7 +16,6 @@ biti
 blizu
 bo
 bodo
-bojo
 bolj
 bom
 bomo
@ -37,16 +33,6 @@ da
 daleč
 dan
 danes
-datum
-december
-deset
-deseta
-deseti
-deseto
-devet
-deveta
-deveti
-deveto
 do
 dober
 dobra
@ -54,16 +40,7 @@ dobri
 dobro
 dokler
 dol
-dolg
-dolga
-dolgi
 dovolj
-drug
-druga
-drugi
-drugo
-dva
-dve
 e
 eden
 en
@ -74,7 +51,6 @@ enkrat
 eno
 etc.
 f
-februar
 g
 g.
 ga
@ -93,16 +69,12 @@ iv
 ix
 iz
 j
-januar
 jaz
 je
 ji
 jih
 jim
 jo
-julij
-junij
-jutri
 k
 kadarkoli
 kaj
@ -123,41 +95,23 @@ kje
 kjer
 kjerkoli
 ko
-koder
 koderkoli
 koga
 komu
 kot
-kratek
-kratka
-kratke
-kratki
 l
-lahka
-lahke
-lahki
-lahko
 le
 lep
 lepa
 lepe
 lepi
 lepo
-leto
 m
-maj
-majhen
-majhna
-majhni
-malce
-malo
 manj
-marec
 me
 med
 medtem
 mene
-mesec
 mi
 midva
 midve
@ -183,7 +137,6 @@ najmanj
 naju
 največ
 nam
-narobe
 nas
 nato
 nazaj
@ -192,7 +145,6 @@ naša
 naše
 ne
 nedavno
-nedelja
 nek
 neka
 nekaj
@ -236,7 +188,6 @@ njuna
 njuno
 no
 nocoj
-november
 npr.
 o
 ob
@ -244,51 +195,23 @@ oba
 obe
 oboje
 od
-odprt
-odprta
-odprti
 okoli
-oktober
 on
 onadva
 one
 oni
 onidve
-osem
-osma
-osmi
-osmo
 oz.
 p
 pa
-pet
-peta
-petek
-peti
-peto
 po
 pod
 pogosto
 poleg
-poln
-polna
-polni
-polno
 ponavadi
-ponedeljek
 ponovno
 potem
 povsod
-pozdravljen
-pozdravljeni
-prav
-prava
-prave
-pravi
-pravo
-prazen
-prazna
-prazno
 prbl.
 precej
 pred
@ -297,19 +220,10 @@ preko
 pri
 pribl.
 približno
-primer
-pripravljen
-pripravljena
-pripravljeni
 proti
-prva
-prvi
-prvo
 r
-ravno
 redko
 res
-reč
 s
 saj
 sam
@ -321,29 +235,17 @@ se
 sebe
 sebi
 sedaj
-sedem
-sedma
-sedmi
-sedmo
 sem
-september
 seveda
 si
 sicer
 skoraj
 skozi
-slab
 smo
 so
-sobota
 spet
-sreda
-srednja
-srednji
 sta
 ste
-stran
-stvar
 sva
 t
 ta
@ -358,10 +260,6 @@ te
 tebe
 tebi
 tega
-težak
-težka
-težki
-težko
 ti
 tista
 tiste
@ -371,11 +269,6 @@ tj.
 tja
 to
 toda
-torek
-tretja
-tretje
-tretji
-tri
 tu
 tudi
 tukaj
@ -392,10 +285,6 @@ vaša
 vaše
 ve
 vedno
-velik
-velika
-veliki
-veliko
 vendar
 ves
 več
@ -403,10 +292,6 @@ vi
 vidva
 vii
 viii
-visok
-visoka
-visoke
-visoki
 vsa
 vsaj
 vsak
@ -420,34 +305,21 @@ vsega
 vsi
 vso
 včasih
-včeraj
 x
 z
 za
 zadaj
 zadnji
 zakaj
-zaprta
-zaprti
-zaprto
 zdaj
 zelo
 zunaj
 č
 če
 često
-četrta
-četrtek
-četrti
-četrto
 čez
 čigav
 š
-šest
-šesta
-šesti
-šesto
-štiri
 ž
 že
 """.split()
@ -53,7 +53,7 @@ _ordinal_words = [
|
||||||
"doksanıncı",
|
"doksanıncı",
|
||||||
"yüzüncü",
|
"yüzüncü",
|
||||||
"bininci",
|
"bininci",
|
||||||
"mliyonuncu",
|
"milyonuncu",
|
||||||
"milyarıncı",
|
"milyarıncı",
|
||||||
"trilyonuncu",
|
"trilyonuncu",
|
||||||
"katrilyonuncu",
|
"katrilyonuncu",
|
||||||
|
|
|
@@ -6,19 +6,30 @@ from ...util import update_exc
 _exc = {}

 for exc_data in [
+    {ORTH: "обл.", NORM: "область"},
+    {ORTH: "р-н.", NORM: "район"},
+    {ORTH: "р-н", NORM: "район"},
+    {ORTH: "м.", NORM: "місто"},
     {ORTH: "вул.", NORM: "вулиця"},
-    {ORTH: "ім.", NORM: "імені"},
     {ORTH: "просп.", NORM: "проспект"},
+    {ORTH: "пр-кт", NORM: "проспект"},
     {ORTH: "бул.", NORM: "бульвар"},
     {ORTH: "пров.", NORM: "провулок"},
     {ORTH: "пл.", NORM: "площа"},
+    {ORTH: "майд.", NORM: "майдан"},
+    {ORTH: "мкр.", NORM: "мікрорайон"},
+    {ORTH: "ст.", NORM: "станція"},
+    {ORTH: "ж/м", NORM: "житловий масив"},
+    {ORTH: "наб.", NORM: "набережна"},
+    {ORTH: "в/ч", NORM: "військова частина"},
+    {ORTH: "в/м", NORM: "військове містечко"},
+    {ORTH: "оз.", NORM: "озеро"},
+    {ORTH: "ім.", NORM: "імені"},
     {ORTH: "г.", NORM: "гора"},
     {ORTH: "п.", NORM: "пан"},
-    {ORTH: "м.", NORM: "місто"},
     {ORTH: "проф.", NORM: "професор"},
     {ORTH: "акад.", NORM: "академік"},
     {ORTH: "доц.", NORM: "доцент"},
-    {ORTH: "оз.", NORM: "озеро"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]

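For orientation, each entry above keeps the ORTH string as a single token and sets its norm to the NORM value. A minimal, hedged sketch of the intended effect (assuming a blank Ukrainian pipeline can be created and these exceptions are part of its tokenizer data):

# Sketch only, not part of the diff: how an ORTH/NORM tokenizer exception behaves.
import spacy

nlp = spacy.blank("uk")
doc = nlp("просп. Перемоги, 12")
# "просп." is expected to stay one token with norm "проспект"
print([(token.text, token.norm_) for token in doc])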
@@ -131,7 +131,7 @@ class Language:
         self,
         vocab: Union[Vocab, bool] = True,
         *,
-        max_length: int = 10 ** 6,
+        max_length: int = 10**6,
         meta: Dict[str, Any] = {},
         create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
         batch_size: int = 1000,
@@ -354,12 +354,15 @@ class Language:
     @property
     def pipe_labels(self) -> Dict[str, List[str]]:
         """Get the labels set by the pipeline components, if available (if
-        the component exposes a labels property).
+        the component exposes a labels property and the labels are not
+        hidden).

         RETURNS (Dict[str, List[str]]): Labels keyed by component name.
         """
         labels = {}
         for name, pipe in self._components:
+            if hasattr(pipe, "hide_labels") and pipe.hide_labels is True:
+                continue
             if hasattr(pipe, "labels"):
                 labels[name] = list(pipe.labels)
         return SimpleFrozenDict(labels)
@@ -1219,8 +1222,9 @@ class Language:
             component_cfg = {}
         grads = {}

-        def get_grads(W, dW, key=None):
+        def get_grads(key, W, dW):
             grads[key] = (W, dW)
+            return W, dW

         get_grads.learn_rate = sgd.learn_rate  # type: ignore[attr-defined, union-attr]
         get_grads.b1 = sgd.b1  # type: ignore[attr-defined, union-attr]
@@ -1233,7 +1237,7 @@ class Language:
                     examples, sgd=get_grads, losses=losses, **component_cfg.get(name, {})
                 )
             for key, (W, dW) in grads.items():
-                sgd(W, dW, key=key)  # type: ignore[call-arg, misc]
+                sgd(key, W, dW)  # type: ignore[call-arg, misc]
         return losses

     def begin_training(
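The new hide_labels check means a pipeline component can opt out of nlp.pipe_labels. A small illustrative sketch with a made-up component (the factory name and labels are hypothetical, only the hide_labels flag matters):

# Sketch: a component whose labels are skipped by Language.pipe_labels.
import spacy
from spacy.language import Language


class ToyLabeller:
    hide_labels = True        # picked up by the check added above
    labels = ("A", "B")       # would otherwise be listed

    def __call__(self, doc):
        return doc


@Language.factory("toy_labeller")
def create_toy_labeller(nlp, name):
    return ToyLabeller()


nlp = spacy.blank("en")
nlp.add_pipe("toy_labeller")
print(nlp.pipe_labels)  # expected: no "toy_labeller" entry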
@@ -244,8 +244,12 @@ cdef class Matcher:
                 pipe = "parser"
             error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
             raise ValueError(error_msg)
-        matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
-                               extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments)
+        if self.patterns.empty():
+            matches = []
+        else:
+            matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
+                                   extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments)
         final_matches = []
         pairs_by_id = {}
         # For each key, either add all matches, or only the filtered,
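A quick sketch of the behaviour this guard is meant to give (assuming a spaCy build that includes the change): calling a Matcher with no patterns added simply yields no matches instead of dereferencing an empty pattern array.

# Sketch: an empty Matcher should return an empty list of matches.
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
doc = nlp("nothing to match here")
print(matcher(doc))  # expected: []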
@@ -14,7 +14,7 @@ class PhraseMatcher:
     def add(
         self,
         key: str,
-        docs: List[List[Dict[str, Any]]],
+        docs: List[Doc],
         *,
         on_match: Optional[
             Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
@@ -63,4 +63,4 @@ def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:


 def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:
-    return (Ragged(to_numpy(spans.dataXd), to_numpy(spans.lengths)), to_numpy(lengths))
+    return Ragged(to_numpy(spans.dataXd), to_numpy(spans.lengths)), to_numpy(lengths)
@@ -1,34 +1,82 @@
 from pathlib import Path
-from typing import Optional, Callable, Iterable, List
+from typing import Optional, Callable, Iterable, List, Tuple
 from thinc.types import Floats2d
 from thinc.api import chain, clone, list2ragged, reduce_mean, residual
-from thinc.api import Model, Maxout, Linear
+from thinc.api import Model, Maxout, Linear, noop, tuplify, Ragged

 from ...util import registry
 from ...kb import KnowledgeBase, Candidate, get_candidates
 from ...vocab import Vocab
 from ...tokens import Span, Doc
+from ..extract_spans import extract_spans
+from ...errors import Errors


-@registry.architectures("spacy.EntityLinker.v1")
+@registry.architectures("spacy.EntityLinker.v2")
 def build_nel_encoder(
     tok2vec: Model, nO: Optional[int] = None
 ) -> Model[List[Doc], Floats2d]:
-    with Model.define_operators({">>": chain, "**": clone}):
+    with Model.define_operators({">>": chain, "&": tuplify}):
         token_width = tok2vec.maybe_get_dim("nO")
         output_layer = Linear(nO=nO, nI=token_width)
         model = (
-            tok2vec
-            >> list2ragged()
+            ((tok2vec >> list2ragged()) & build_span_maker())
+            >> extract_spans()
             >> reduce_mean()
             >> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0))  # type: ignore[arg-type]
             >> output_layer
         )
         model.set_ref("output_layer", output_layer)
         model.set_ref("tok2vec", tok2vec)
+        # flag to show this isn't legacy
+        model.attrs["include_span_maker"] = True
         return model


+def build_span_maker(n_sents: int = 0) -> Model:
+    model: Model = Model("span_maker", forward=span_maker_forward)
+    model.attrs["n_sents"] = n_sents
+    return model
+
+
+def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callable]:
+    ops = model.ops
+    n_sents = model.attrs["n_sents"]
+    candidates = []
+    for doc in docs:
+        cands = []
+        try:
+            sentences = [s for s in doc.sents]
+        except ValueError:
+            # no sentence info, normal in initialization
+            for tok in doc:
+                tok.is_sent_start = tok.i == 0
+            sentences = [doc[:]]
+        for ent in doc.ents:
+            try:
+                # find the sentence in the list of sentences.
+                sent_index = sentences.index(ent.sent)
+            except AttributeError:
+                # Catch the exception when ent.sent is None and provide a user-friendly warning
+                raise RuntimeError(Errors.E030) from None
+            # get n previous sentences, if there are any
+            start_sentence = max(0, sent_index - n_sents)
+            # get n posterior sentences, or as many < n as there are
+            end_sentence = min(len(sentences) - 1, sent_index + n_sents)
+            # get token positions
+            start_token = sentences[start_sentence].start
+            end_token = sentences[end_sentence].end
+            # save positions for extraction
+            cands.append((start_token, end_token))
+
+        candidates.append(ops.asarray2i(cands))
+    candlens = ops.asarray1i([len(cands) for cands in candidates])
+    candidates = ops.xp.concatenate(candidates)
+    outputs = Ragged(candidates, candlens)
+    # because this is just rearranging docs, the backprop does nothing
+    return outputs, lambda x: []
+
+
 @registry.misc("spacy.KBFromFile.v1")
 def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
     def kb_from_file(vocab):
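The windowing in span_maker_forward boils down to: for every entity, take the token span covering its sentence plus n_sents sentences of context on either side. A dependency-free sketch of just that computation, using plain (start, end) tuples in place of spaCy spans:

# Pure-Python sketch of the context-window logic in span_maker_forward.
from typing import List, Tuple


def entity_window(
    sentences: List[Tuple[int, int]], ent_sent_index: int, n_sents: int = 0
) -> Tuple[int, int]:
    # clamp the window of sentences around the entity's sentence
    start_sentence = max(0, ent_sent_index - n_sents)
    end_sentence = min(len(sentences) - 1, ent_sent_index + n_sents)
    start_token = sentences[start_sentence][0]
    end_token = sentences[end_sentence][1]
    return start_token, end_token


sents = [(0, 5), (5, 12), (12, 20)]
print(entity_window(sents, ent_sent_index=1, n_sents=0))  # (5, 12)
print(entity_window(sents, ent_sent_index=1, n_sents=1))  # (0, 20)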
@@ -85,7 +85,7 @@ def get_characters_loss(ops, docs, prediction, nr_char):
     target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
     target = target.reshape((-1, 256 * nr_char))
     diff = prediction - target
-    loss = (diff ** 2).sum()
+    loss = (diff**2).sum()
     d_target = diff / float(prediction.shape[0])
     return loss, d_target

@@ -1,14 +1,14 @@
 from typing import Optional, List
-from thinc.api import zero_init, with_array, Softmax, chain, Model
+from thinc.api import zero_init, with_array, Softmax_v2, chain, Model
 from thinc.types import Floats2d

 from ...util import registry
 from ...tokens import Doc


-@registry.architectures("spacy.Tagger.v1")
+@registry.architectures("spacy.Tagger.v2")
 def build_tagger_model(
-    tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None
+    tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None, normalize=False
 ) -> Model[List[Doc], List[Floats2d]]:
     """Build a tagger model, using a provided token-to-vector component. The tagger
     model simply adds a linear layer with softmax activation to predict scores
@@ -19,7 +19,9 @@ def build_tagger_model(
     """
     # TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
     t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-    output_layer = Softmax(nO, t2v_width, init_W=zero_init)
+    output_layer = Softmax_v2(
+        nO, t2v_width, init_W=zero_init, normalize_outputs=normalize
+    )
     softmax = with_array(output_layer)  # type: ignore
     model = chain(tok2vec, softmax)
     model.set_ref("tok2vec", tok2vec)
@@ -1,6 +1,7 @@
 from .attributeruler import AttributeRuler
 from .coref import CoreferenceResolver
 from .dep_parser import DependencyParser
+from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
 from .ner import EntityRecognizer
 from .entityruler import EntityRuler
||||||
|
|
0 spacy/pipeline/_edit_tree_internals/__init__.py (new file)
93 spacy/pipeline/_edit_tree_internals/edit_trees.pxd (new file)
|
@@ -0,0 +1,93 @@
from libc.stdint cimport uint32_t, uint64_t
from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector

from ...typedefs cimport attr_t, hash_t, len_t
from ...strings cimport StringStore

cdef extern from "<algorithm>" namespace "std" nogil:
    void swap[T](T& a, T& b) except +  # Only available in Cython 3.

# An edit tree (Müller et al., 2015) is a tree structure that consists of
# edit operations. The two types of operations are string matches
# and string substitutions. Given an input string s and an output string t,
# substitution and match nodes should be interpreted as follows:
#
# * Substitution node: consists of an original string and substitute string.
#   If s matches the original string, then t is the substitute. Otherwise,
#   the node does not apply.
# * Match node: consists of a prefix length, suffix length, prefix edit tree,
#   and suffix edit tree. If s is composed of a prefix, middle part, and suffix
#   with the given suffix and prefix lengths, then t is the concatenation
#   prefix_tree(prefix) + middle + suffix_tree(suffix).
#
# For efficiency, we represent strings in substitution nodes as integers, with
# the actual strings stored in a StringStore. Subtrees in match nodes are stored
# as tree identifiers (rather than pointers) to simplify serialization.

cdef uint32_t NULL_TREE_ID

cdef struct MatchNodeC:
    len_t prefix_len
    len_t suffix_len
    uint32_t prefix_tree
    uint32_t suffix_tree

cdef struct SubstNodeC:
    attr_t orig
    attr_t subst

cdef union NodeC:
    MatchNodeC match_node
    SubstNodeC subst_node

cdef struct EditTreeC:
    bint is_match_node
    NodeC inner

cdef inline EditTreeC edittree_new_match(len_t prefix_len, len_t suffix_len,
        uint32_t prefix_tree, uint32_t suffix_tree):
    cdef MatchNodeC match_node = MatchNodeC(prefix_len=prefix_len,
            suffix_len=suffix_len, prefix_tree=prefix_tree,
            suffix_tree=suffix_tree)
    cdef NodeC inner = NodeC(match_node=match_node)
    return EditTreeC(is_match_node=True, inner=inner)

cdef inline EditTreeC edittree_new_subst(attr_t orig, attr_t subst):
    cdef EditTreeC node
    cdef SubstNodeC subst_node = SubstNodeC(orig=orig, subst=subst)
    cdef NodeC inner = NodeC(subst_node=subst_node)
    return EditTreeC(is_match_node=False, inner=inner)

cdef inline uint64_t edittree_hash(EditTreeC tree):
    cdef MatchNodeC match_node
    cdef SubstNodeC subst_node

    if tree.is_match_node:
        match_node = tree.inner.match_node
        return hash((match_node.prefix_len, match_node.suffix_len, match_node.prefix_tree, match_node.suffix_tree))
    else:
        subst_node = tree.inner.subst_node
        return hash((subst_node.orig, subst_node.subst))

cdef struct LCS:
    int source_begin
    int source_end
    int target_begin
    int target_end

cdef inline bint lcs_is_empty(LCS lcs):
    return lcs.source_begin == 0 and lcs.source_end == 0 and lcs.target_begin == 0 and lcs.target_end == 0

cdef class EditTrees:
    cdef vector[EditTreeC] trees
    cdef unordered_map[hash_t, uint32_t] map
    cdef StringStore strings

    cpdef uint32_t add(self, str form, str lemma)
    cpdef str apply(self, uint32_t tree_id, str form)
    cpdef unicode tree_to_str(self, uint32_t tree_id)

    cdef uint32_t _add(self, str form, str lemma)
    cdef _apply(self, uint32_t tree_id, str form_part, list lemma_pieces)
    cdef uint32_t _tree_id(self, EditTreeC tree)
305 spacy/pipeline/_edit_tree_internals/edit_trees.pyx (new file)
|
@ -0,0 +1,305 @@
|
||||||
|
# cython: infer_types=True, binding=True
|
||||||
|
from cython.operator cimport dereference as deref
|
||||||
|
from libc.stdint cimport uint32_t
|
||||||
|
from libc.stdint cimport UINT32_MAX
|
||||||
|
from libc.string cimport memset
|
||||||
|
from libcpp.pair cimport pair
|
||||||
|
from libcpp.vector cimport vector
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from ...typedefs cimport hash_t
|
||||||
|
|
||||||
|
from ... import util
|
||||||
|
from ...errors import Errors
|
||||||
|
from ...strings import StringStore
|
||||||
|
from .schemas import validate_edit_tree
|
||||||
|
|
||||||
|
|
||||||
|
NULL_TREE_ID = UINT32_MAX
|
||||||
|
|
||||||
|
cdef LCS find_lcs(str source, str target):
|
||||||
|
"""
|
||||||
|
Find the longest common subsequence (LCS) between two strings. If there are
|
||||||
|
multiple LCSes, only one of them is returned.
|
||||||
|
|
||||||
|
source (str): The first string.
|
||||||
|
target (str): The second string.
|
||||||
|
RETURNS (LCS): The spans of the longest common subsequences.
|
||||||
|
"""
|
||||||
|
cdef Py_ssize_t source_len = len(source)
|
||||||
|
cdef Py_ssize_t target_len = len(target)
|
||||||
|
cdef size_t longest_align = 0;
|
||||||
|
cdef int source_idx, target_idx
|
||||||
|
cdef LCS lcs
|
||||||
|
cdef Py_UCS4 source_cp, target_cp
|
||||||
|
|
||||||
|
memset(&lcs, 0, sizeof(lcs))
|
||||||
|
|
||||||
|
cdef vector[size_t] prev_aligns = vector[size_t](target_len);
|
||||||
|
cdef vector[size_t] cur_aligns = vector[size_t](target_len);
|
||||||
|
|
||||||
|
for (source_idx, source_cp) in enumerate(source):
|
||||||
|
for (target_idx, target_cp) in enumerate(target):
|
||||||
|
if source_cp == target_cp:
|
||||||
|
if source_idx == 0 or target_idx == 0:
|
||||||
|
cur_aligns[target_idx] = 1
|
||||||
|
else:
|
||||||
|
cur_aligns[target_idx] = prev_aligns[target_idx - 1] + 1
|
||||||
|
|
||||||
|
# Check if this is the longest alignment and replace previous
|
||||||
|
# best alignment when this is the case.
|
||||||
|
if cur_aligns[target_idx] > longest_align:
|
||||||
|
longest_align = cur_aligns[target_idx]
|
||||||
|
lcs.source_begin = source_idx - longest_align + 1
|
||||||
|
lcs.source_end = source_idx + 1
|
||||||
|
lcs.target_begin = target_idx - longest_align + 1
|
||||||
|
lcs.target_end = target_idx + 1
|
||||||
|
else:
|
||||||
|
# No match, we start with a zero-length alignment.
|
||||||
|
cur_aligns[target_idx] = 0
|
||||||
|
swap(prev_aligns, cur_aligns)
|
||||||
|
|
||||||
|
return lcs
|
||||||
|
|
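find_lcs above is a rolling-row dynamic program; the recurrence it uses is that of the longest common contiguous piece of the two strings. A hedged pure-Python sketch of the same idea:

# Sketch of the rolling-row DP in find_lcs: track, for each target position,
# the length of the common run ending there, and remember the best span seen.
def find_lcs_spans(source: str, target: str):
    """Return (source_begin, source_end, target_begin, target_end) of one
    longest common contiguous piece, or (0, 0, 0, 0) if there is none."""
    best = (0, 0, 0, 0)
    longest = 0
    prev = [0] * len(target)
    for i, s_cp in enumerate(source):
        cur = [0] * len(target)
        for j, t_cp in enumerate(target):
            if s_cp == t_cp:
                cur[j] = 1 if i == 0 or j == 0 else prev[j - 1] + 1
                if cur[j] > longest:
                    longest = cur[j]
                    best = (i - longest + 1, i + 1, j - longest + 1, j + 1)
        prev = cur
    return best


print(find_lcs_spans("gegooid", "gooien"))  # (2, 6, 0, 4): the shared piece "gooi"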
||||||
|
cdef class EditTrees:
|
||||||
|
"""Container for constructing and storing edit trees."""
|
||||||
|
def __init__(self, strings: StringStore):
|
||||||
|
"""Create a container for edit trees.
|
||||||
|
|
||||||
|
strings (StringStore): the string store to use."""
|
||||||
|
self.strings = strings
|
||||||
|
|
||||||
|
cpdef uint32_t add(self, str form, str lemma):
|
||||||
|
"""Add an edit tree that rewrites the given string into the given lemma.
|
||||||
|
|
||||||
|
RETURNS (int): identifier of the edit tree in the container.
|
||||||
|
"""
|
||||||
|
# Treat two empty strings as a special case. Generating an edit
|
||||||
|
# tree for identical strings results in a match node. However,
|
||||||
|
# since two empty strings have a zero-length LCS, a substitution
|
||||||
|
# node would be created. Since we do not want to clutter the
|
||||||
|
# recursive tree construction with logic for this case, handle
|
||||||
|
# it in this wrapper method.
|
||||||
|
if len(form) == 0 and len(lemma) == 0:
|
||||||
|
tree = edittree_new_match(0, 0, NULL_TREE_ID, NULL_TREE_ID)
|
||||||
|
return self._tree_id(tree)
|
||||||
|
|
||||||
|
return self._add(form, lemma)
|
||||||
|
|
||||||
|
cdef uint32_t _add(self, str form, str lemma):
|
||||||
|
cdef LCS lcs = find_lcs(form, lemma)
|
||||||
|
|
||||||
|
cdef EditTreeC tree
|
||||||
|
cdef uint32_t tree_id, prefix_tree, suffix_tree
|
||||||
|
if lcs_is_empty(lcs):
|
||||||
|
tree = edittree_new_subst(self.strings.add(form), self.strings.add(lemma))
|
||||||
|
else:
|
||||||
|
# If we have a non-empty LCS, such as "gooi" in "ge[gooi]d" and "[gooi]en",
|
||||||
|
# create edit trees for the prefix pair ("ge"/"") and the suffix pair ("d"/"en").
|
||||||
|
prefix_tree = NULL_TREE_ID
|
||||||
|
if lcs.source_begin != 0 or lcs.target_begin != 0:
|
||||||
|
prefix_tree = self.add(form[:lcs.source_begin], lemma[:lcs.target_begin])
|
||||||
|
|
||||||
|
suffix_tree = NULL_TREE_ID
|
||||||
|
if lcs.source_end != len(form) or lcs.target_end != len(lemma):
|
||||||
|
suffix_tree = self.add(form[lcs.source_end:], lemma[lcs.target_end:])
|
||||||
|
|
||||||
|
tree = edittree_new_match(lcs.source_begin, len(form) - lcs.source_end, prefix_tree, suffix_tree)
|
||||||
|
|
||||||
|
return self._tree_id(tree)
|
||||||
|
|
||||||
|
cdef uint32_t _tree_id(self, EditTreeC tree):
|
||||||
|
# If this tree has been constructed before, return its identifier.
|
||||||
|
cdef hash_t hash = edittree_hash(tree)
|
||||||
|
cdef unordered_map[hash_t, uint32_t].iterator iter = self.map.find(hash)
|
||||||
|
if iter != self.map.end():
|
||||||
|
return deref(iter).second
|
||||||
|
|
||||||
|
# The tree hasn't been seen before, store it.
|
||||||
|
cdef uint32_t tree_id = self.trees.size()
|
||||||
|
self.trees.push_back(tree)
|
||||||
|
self.map.insert(pair[hash_t, uint32_t](hash, tree_id))
|
||||||
|
|
||||||
|
return tree_id
|
||||||
|
|
||||||
|
cpdef str apply(self, uint32_t tree_id, str form):
|
||||||
|
"""Apply an edit tree to a form.
|
||||||
|
|
||||||
|
tree_id (uint32_t): the identifier of the edit tree to apply.
|
||||||
|
form (str): the form to apply the edit tree to.
|
||||||
|
RETURNS (str): the transformer form or None if the edit tree
|
||||||
|
could not be applied to the form.
|
||||||
|
"""
|
||||||
|
if tree_id >= self.trees.size():
|
||||||
|
raise IndexError("Edit tree identifier out of range")
|
||||||
|
|
||||||
|
lemma_pieces = []
|
||||||
|
try:
|
||||||
|
self._apply(tree_id, form, lemma_pieces)
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
return "".join(lemma_pieces)
|
||||||
|
|
||||||
|
cdef _apply(self, uint32_t tree_id, str form_part, list lemma_pieces):
|
||||||
|
"""Recursively apply an edit tree to a form, adding pieces to
|
||||||
|
the lemma_pieces list."""
|
||||||
|
assert tree_id <= self.trees.size()
|
||||||
|
|
||||||
|
cdef EditTreeC tree = self.trees[tree_id]
|
||||||
|
cdef MatchNodeC match_node
|
||||||
|
cdef int suffix_start
|
||||||
|
|
||||||
|
if tree.is_match_node:
|
||||||
|
match_node = tree.inner.match_node
|
||||||
|
|
||||||
|
if match_node.prefix_len + match_node.suffix_len > len(form_part):
|
||||||
|
raise ValueError("Edit tree cannot be applied to form")
|
||||||
|
|
||||||
|
suffix_start = len(form_part) - match_node.suffix_len
|
||||||
|
|
||||||
|
if match_node.prefix_tree != NULL_TREE_ID:
|
||||||
|
self._apply(match_node.prefix_tree, form_part[:match_node.prefix_len], lemma_pieces)
|
||||||
|
|
||||||
|
lemma_pieces.append(form_part[match_node.prefix_len:suffix_start])
|
||||||
|
|
||||||
|
if match_node.suffix_tree != NULL_TREE_ID:
|
||||||
|
self._apply(match_node.suffix_tree, form_part[suffix_start:], lemma_pieces)
|
||||||
|
else:
|
||||||
|
if form_part == self.strings[tree.inner.subst_node.orig]:
|
||||||
|
lemma_pieces.append(self.strings[tree.inner.subst_node.subst])
|
||||||
|
else:
|
||||||
|
raise ValueError("Edit tree cannot be applied to form")
|
||||||
|
|
||||||
|
cpdef unicode tree_to_str(self, uint32_t tree_id):
|
||||||
|
"""Return the tree as a string. The tree tree string is formatted
|
||||||
|
like an S-expression. This is primarily useful for debugging. Match
|
||||||
|
nodes have the following format:
|
||||||
|
|
||||||
|
(m prefix_len suffix_len prefix_tree suffix_tree)
|
||||||
|
|
||||||
|
Substitution nodes have the following format:
|
||||||
|
|
||||||
|
(s original substitute)
|
||||||
|
|
||||||
|
tree_id (uint32_t): the identifier of the edit tree.
|
||||||
|
RETURNS (str): the tree as an S-expression.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if tree_id >= self.trees.size():
|
||||||
|
raise IndexError("Edit tree identifier out of range")
|
||||||
|
|
||||||
|
cdef EditTreeC tree = self.trees[tree_id]
|
||||||
|
cdef SubstNodeC subst_node
|
||||||
|
|
||||||
|
if not tree.is_match_node:
|
||||||
|
subst_node = tree.inner.subst_node
|
||||||
|
return f"(s '{self.strings[subst_node.orig]}' '{self.strings[subst_node.subst]}')"
|
||||||
|
|
||||||
|
cdef MatchNodeC match_node = tree.inner.match_node
|
||||||
|
|
||||||
|
prefix_tree = "()"
|
||||||
|
if match_node.prefix_tree != NULL_TREE_ID:
|
||||||
|
prefix_tree = self.tree_to_str(match_node.prefix_tree)
|
||||||
|
|
||||||
|
suffix_tree = "()"
|
||||||
|
if match_node.suffix_tree != NULL_TREE_ID:
|
||||||
|
suffix_tree = self.tree_to_str(match_node.suffix_tree)
|
||||||
|
|
||||||
|
return f"(m {match_node.prefix_len} {match_node.suffix_len} {prefix_tree} {suffix_tree})"
|
||||||
|
|
||||||
|
def from_json(self, trees: list) -> "EditTrees":
|
||||||
|
self.trees.clear()
|
||||||
|
|
||||||
|
for tree in trees:
|
||||||
|
tree = _dict2tree(tree)
|
||||||
|
self.trees.push_back(tree)
|
||||||
|
|
||||||
|
self._rebuild_tree_map()
|
||||||
|
|
||||||
|
def from_bytes(self, bytes_data: bytes, *) -> "EditTrees":
|
||||||
|
def deserialize_trees(tree_dicts):
|
||||||
|
cdef EditTreeC c_tree
|
||||||
|
for tree_dict in tree_dicts:
|
||||||
|
c_tree = _dict2tree(tree_dict)
|
||||||
|
self.trees.push_back(c_tree)
|
||||||
|
|
||||||
|
deserializers = {}
|
||||||
|
deserializers["trees"] = lambda n: deserialize_trees(n)
|
||||||
|
util.from_bytes(bytes_data, deserializers, [])
|
||||||
|
|
||||||
|
self._rebuild_tree_map()
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
def to_bytes(self, **kwargs) -> bytes:
|
||||||
|
tree_dicts = []
|
||||||
|
for tree in self.trees:
|
||||||
|
tree = _tree2dict(tree)
|
||||||
|
tree_dicts.append(tree)
|
||||||
|
|
||||||
|
serializers = {}
|
||||||
|
serializers["trees"] = lambda: tree_dicts
|
||||||
|
|
||||||
|
return util.to_bytes(serializers, [])
|
||||||
|
|
||||||
|
def to_disk(self, path, **kwargs) -> "EditTrees":
|
||||||
|
path = util.ensure_path(path)
|
||||||
|
with path.open("wb") as file_:
|
||||||
|
file_.write(self.to_bytes())
|
||||||
|
|
||||||
|
def from_disk(self, path, **kwargs) -> "EditTrees":
|
||||||
|
path = util.ensure_path(path)
|
||||||
|
if path.exists():
|
||||||
|
with path.open("rb") as file_:
|
||||||
|
data = file_.read()
|
||||||
|
return self.from_bytes(data)
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __getitem__(self, idx):
|
||||||
|
return _tree2dict(self.trees[idx])
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return self.trees.size()
|
||||||
|
|
||||||
|
def _rebuild_tree_map(self):
|
||||||
|
"""Rebuild the tree hash -> tree id mapping"""
|
||||||
|
cdef EditTreeC c_tree
|
||||||
|
cdef uint32_t tree_id
|
||||||
|
cdef hash_t tree_hash
|
||||||
|
|
||||||
|
self.map.clear()
|
||||||
|
|
||||||
|
for tree_id in range(self.trees.size()):
|
||||||
|
c_tree = self.trees[tree_id]
|
||||||
|
tree_hash = edittree_hash(c_tree)
|
||||||
|
self.map.insert(pair[hash_t, uint32_t](tree_hash, tree_id))
|
||||||
|
|
||||||
|
def __reduce__(self):
|
||||||
|
return (unpickle_edittrees, (self.strings, self.to_bytes()))
|
||||||
|
|
||||||
|
|
||||||
|
def unpickle_edittrees(strings, trees_data):
|
||||||
|
return EditTrees(strings).from_bytes(trees_data)
|
||||||
|
|
||||||
|
|
||||||
|
def _tree2dict(tree):
|
||||||
|
if tree["is_match_node"]:
|
||||||
|
tree = tree["inner"]["match_node"]
|
||||||
|
else:
|
||||||
|
tree = tree["inner"]["subst_node"]
|
||||||
|
return(dict(tree))
|
||||||
|
|
||||||
|
def _dict2tree(tree):
|
||||||
|
errors = validate_edit_tree(tree)
|
||||||
|
if errors:
|
||||||
|
raise ValueError(Errors.E1026.format(errors="\n".join(errors)))
|
||||||
|
|
||||||
|
tree = dict(tree)
|
||||||
|
if "prefix_len" in tree:
|
||||||
|
tree = {"is_match_node": True, "inner": {"match_node": tree}}
|
||||||
|
else:
|
||||||
|
tree = {"is_match_node": False, "inner": {"subst_node": tree}}
|
||||||
|
|
||||||
|
return tree
|
44
spacy/pipeline/_edit_tree_internals/schemas.py
Normal file
44
spacy/pipeline/_edit_tree_internals/schemas.py
Normal file
|
@ -0,0 +1,44 @@
|
||||||
|
from typing import Any, Dict, List, Union
|
||||||
|
from collections import defaultdict
|
||||||
|
from pydantic import BaseModel, Field, ValidationError
|
||||||
|
from pydantic.types import StrictBool, StrictInt, StrictStr
|
||||||
|
|
||||||
|
|
||||||
|
class MatchNodeSchema(BaseModel):
|
||||||
|
prefix_len: StrictInt = Field(..., title="Prefix length")
|
||||||
|
suffix_len: StrictInt = Field(..., title="Suffix length")
|
||||||
|
prefix_tree: StrictInt = Field(..., title="Prefix tree")
|
||||||
|
suffix_tree: StrictInt = Field(..., title="Suffix tree")
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
extra = "forbid"
|
||||||
|
|
||||||
|
|
||||||
|
class SubstNodeSchema(BaseModel):
|
||||||
|
orig: Union[int, StrictStr] = Field(..., title="Original substring")
|
||||||
|
subst: Union[int, StrictStr] = Field(..., title="Replacement substring")
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
extra = "forbid"
|
||||||
|
|
||||||
|
|
||||||
|
class EditTreeSchema(BaseModel):
|
||||||
|
__root__: Union[MatchNodeSchema, SubstNodeSchema]
|
||||||
|
|
||||||
|
|
||||||
|
def validate_edit_tree(obj: Dict[str, Any]) -> List[str]:
|
||||||
|
"""Validate edit tree.
|
||||||
|
|
||||||
|
obj (Dict[str, Any]): JSON-serializable data to validate.
|
||||||
|
RETURNS (List[str]): A list of error messages, if available.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
EditTreeSchema.parse_obj(obj)
|
||||||
|
return []
|
||||||
|
except ValidationError as e:
|
||||||
|
errors = e.errors()
|
||||||
|
data = defaultdict(list)
|
||||||
|
for error in errors:
|
||||||
|
err_loc = " -> ".join([str(p) for p in error.get("loc", [])])
|
||||||
|
data[err_loc].append(error.get("msg"))
|
||||||
|
return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()] # type: ignore[arg-type]
|
|
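A short usage sketch for the validator defined above (the module path is the one added in this commit); valid trees yield an empty error list, anything else yields messages:

# Sketch: validating serialized edit-tree dicts with validate_edit_tree.
from spacy.pipeline._edit_tree_internals.schemas import validate_edit_tree

print(validate_edit_tree({"orig": "d", "subst": "en"}))        # [] -> valid substitution node
print(validate_edit_tree({"orig": "d", "unexpected_key": 1}))  # list of error messages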
@ -3,6 +3,7 @@ from libc.string cimport memcpy, memset
|
||||||
from libc.stdlib cimport calloc, free
|
from libc.stdlib cimport calloc, free
|
||||||
from libc.stdint cimport uint32_t, uint64_t
|
from libc.stdint cimport uint32_t, uint64_t
|
||||||
cimport libcpp
|
cimport libcpp
|
||||||
|
from libcpp.unordered_map cimport unordered_map
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
from libcpp.set cimport set
|
from libcpp.set cimport set
|
||||||
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
||||||
|
@ -30,8 +31,8 @@ cdef cppclass StateC:
|
||||||
vector[int] _stack
|
vector[int] _stack
|
||||||
vector[int] _rebuffer
|
vector[int] _rebuffer
|
||||||
vector[SpanC] _ents
|
vector[SpanC] _ents
|
||||||
vector[ArcC] _left_arcs
|
unordered_map[int, vector[ArcC]] _left_arcs
|
||||||
vector[ArcC] _right_arcs
|
unordered_map[int, vector[ArcC]] _right_arcs
|
||||||
vector[libcpp.bool] _unshiftable
|
vector[libcpp.bool] _unshiftable
|
||||||
set[int] _sent_starts
|
set[int] _sent_starts
|
||||||
TokenC _empty_token
|
TokenC _empty_token
|
||||||
|
@ -160,15 +161,22 @@ cdef cppclass StateC:
|
||||||
else:
|
else:
|
||||||
return &this._sent[i]
|
return &this._sent[i]
|
||||||
|
|
||||||
void get_arcs(vector[ArcC]* arcs) nogil const:
|
void map_get_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, vector[ArcC]* out) nogil const:
|
||||||
for i in range(this._left_arcs.size()):
|
cdef const vector[ArcC]* arcs
|
||||||
arc = this._left_arcs.at(i)
|
head_arcs_it = heads_arcs.const_begin()
|
||||||
if arc.head != -1 and arc.child != -1:
|
while head_arcs_it != heads_arcs.const_end():
|
||||||
arcs.push_back(arc)
|
arcs = &deref(head_arcs_it).second
|
||||||
for i in range(this._right_arcs.size()):
|
arcs_it = arcs.const_begin()
|
||||||
arc = this._right_arcs.at(i)
|
while arcs_it != arcs.const_end():
|
||||||
if arc.head != -1 and arc.child != -1:
|
arc = deref(arcs_it)
|
||||||
arcs.push_back(arc)
|
if arc.head != -1 and arc.child != -1:
|
||||||
|
out.push_back(arc)
|
||||||
|
incr(arcs_it)
|
||||||
|
incr(head_arcs_it)
|
||||||
|
|
||||||
|
void get_arcs(vector[ArcC]* out) nogil const:
|
||||||
|
this.map_get_arcs(this._left_arcs, out)
|
||||||
|
this.map_get_arcs(this._right_arcs, out)
|
||||||
|
|
||||||
int H(int child) nogil const:
|
int H(int child) nogil const:
|
||||||
if child >= this.length or child < 0:
|
if child >= this.length or child < 0:
|
||||||
|
@ -182,37 +190,35 @@ cdef cppclass StateC:
|
||||||
else:
|
else:
|
||||||
return this._ents.back().start
|
return this._ents.back().start
|
||||||
|
|
||||||
int L(int head, int idx) nogil const:
|
int nth_child(const unordered_map[int, vector[ArcC]]& heads_arcs, int head, int idx) nogil const:
|
||||||
if idx < 1 or this._left_arcs.size() == 0:
|
if idx < 1:
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
# Work backwards through left-arcs to find the arc at the
|
head_arcs_it = heads_arcs.const_find(head)
|
||||||
|
if head_arcs_it == heads_arcs.const_end():
|
||||||
|
return -1
|
||||||
|
|
||||||
|
cdef const vector[ArcC]* arcs = &deref(head_arcs_it).second
|
||||||
|
|
||||||
|
# Work backwards through arcs to find the arc at the
|
||||||
# requested index more quickly.
|
# requested index more quickly.
|
||||||
cdef size_t child_index = 0
|
cdef size_t child_index = 0
|
||||||
it = this._left_arcs.const_rbegin()
|
arcs_it = arcs.const_rbegin()
|
||||||
while it != this._left_arcs.rend():
|
while arcs_it != arcs.const_rend() and child_index != idx:
|
||||||
arc = deref(it)
|
arc = deref(arcs_it)
|
||||||
if arc.head == head and arc.child != -1 and arc.child < head:
|
if arc.child != -1:
|
||||||
child_index += 1
|
child_index += 1
|
||||||
if child_index == idx:
|
if child_index == idx:
|
||||||
return arc.child
|
return arc.child
|
||||||
incr(it)
|
incr(arcs_it)
|
||||||
|
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
|
int L(int head, int idx) nogil const:
|
||||||
|
return this.nth_child(this._left_arcs, head, idx)
|
||||||
|
|
||||||
int R(int head, int idx) nogil const:
|
int R(int head, int idx) nogil const:
|
||||||
if idx < 1 or this._right_arcs.size() == 0:
|
return this.nth_child(this._right_arcs, head, idx)
|
||||||
return -1
|
|
||||||
cdef vector[int] rights
|
|
||||||
for i in range(this._right_arcs.size()):
|
|
||||||
arc = this._right_arcs.at(i)
|
|
||||||
if arc.head == head and arc.child != -1 and arc.child > head:
|
|
||||||
rights.push_back(arc.child)
|
|
||||||
idx = (<int>rights.size()) - idx
|
|
||||||
if idx < 0:
|
|
||||||
return -1
|
|
||||||
else:
|
|
||||||
return rights.at(idx)
|
|
||||||
|
|
||||||
bint empty() nogil const:
|
bint empty() nogil const:
|
||||||
return this._stack.size() == 0
|
return this._stack.size() == 0
|
||||||
|
@ -253,22 +259,29 @@ cdef cppclass StateC:
|
||||||
|
|
||||||
int r_edge(int word) nogil const:
|
int r_edge(int word) nogil const:
|
||||||
return word
|
return word
|
||||||
|
|
||||||
int n_L(int head) nogil const:
|
int n_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, int head) nogil const:
|
||||||
cdef int n = 0
|
cdef int n = 0
|
||||||
for i in range(this._left_arcs.size()):
|
head_arcs_it = heads_arcs.const_find(head)
|
||||||
arc = this._left_arcs.at(i)
|
if head_arcs_it == heads_arcs.const_end():
|
||||||
if arc.head == head and arc.child != -1 and arc.child < arc.head:
|
return n
|
||||||
|
|
||||||
|
cdef const vector[ArcC]* arcs = &deref(head_arcs_it).second
|
||||||
|
arcs_it = arcs.const_begin()
|
||||||
|
while arcs_it != arcs.end():
|
||||||
|
arc = deref(arcs_it)
|
||||||
|
if arc.child != -1:
|
||||||
n += 1
|
n += 1
|
||||||
|
incr(arcs_it)
|
||||||
|
|
||||||
return n
|
return n
|
||||||
|
|
||||||
|
|
||||||
|
int n_L(int head) nogil const:
|
||||||
|
return n_arcs(this._left_arcs, head)
|
||||||
|
|
||||||
int n_R(int head) nogil const:
|
int n_R(int head) nogil const:
|
||||||
cdef int n = 0
|
return n_arcs(this._right_arcs, head)
|
||||||
for i in range(this._right_arcs.size()):
|
|
||||||
arc = this._right_arcs.at(i)
|
|
||||||
if arc.head == head and arc.child != -1 and arc.child > arc.head:
|
|
||||||
n += 1
|
|
||||||
return n
|
|
||||||
|
|
||||||
bint stack_is_connected() nogil const:
|
bint stack_is_connected() nogil const:
|
||||||
return False
|
return False
|
||||||
|
@ -328,19 +341,20 @@ cdef cppclass StateC:
|
||||||
arc.child = child
|
arc.child = child
|
||||||
arc.label = label
|
arc.label = label
|
||||||
if head > child:
|
if head > child:
|
||||||
this._left_arcs.push_back(arc)
|
this._left_arcs[arc.head].push_back(arc)
|
||||||
else:
|
else:
|
||||||
this._right_arcs.push_back(arc)
|
this._right_arcs[arc.head].push_back(arc)
|
||||||
this._heads[child] = head
|
this._heads[child] = head
|
||||||
|
|
||||||
void del_arc(int h_i, int c_i) nogil:
|
void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
|
||||||
cdef vector[ArcC]* arcs
|
arcs_it = heads_arcs.find(h_i)
|
||||||
if h_i > c_i:
|
if arcs_it == heads_arcs.end():
|
||||||
arcs = &this._left_arcs
|
return
|
||||||
else:
|
|
||||||
arcs = &this._right_arcs
|
arcs = &deref(arcs_it).second
|
||||||
if arcs.size() == 0:
|
if arcs.size() == 0:
|
||||||
return
|
return
|
||||||
|
|
||||||
arc = arcs.back()
|
arc = arcs.back()
|
||||||
if arc.head == h_i and arc.child == c_i:
|
if arc.head == h_i and arc.child == c_i:
|
||||||
arcs.pop_back()
|
arcs.pop_back()
|
||||||
|
@ -353,6 +367,12 @@ cdef cppclass StateC:
|
||||||
arc.label = 0
|
arc.label = 0
|
||||||
break
|
break
|
||||||
|
|
||||||
|
void del_arc(int h_i, int c_i) nogil:
|
||||||
|
if h_i > c_i:
|
||||||
|
this.map_del_arc(&this._left_arcs, h_i, c_i)
|
||||||
|
else:
|
||||||
|
this.map_del_arc(&this._right_arcs, h_i, c_i)
|
||||||
|
|
||||||
SpanC get_ent() nogil const:
|
SpanC get_ent() nogil const:
|
||||||
cdef SpanC ent
|
cdef SpanC ent
|
||||||
if this._ents.size() == 0:
|
if this._ents.size() == 0:
|
||||||
|
|
|
@@ -218,7 +218,7 @@ def _get_aligned_sent_starts(example):
         sent_starts = [False] * len(example.x)
         seen_words = set()
         for y_sent in example.y.sents:
-            x_indices = list(align[y_sent.start : y_sent.end].dataXd)
+            x_indices = list(align[y_sent.start : y_sent.end])
             if any(x_idx in seen_words for x_idx in x_indices):
                 # If there are any tokens in X that align across two sentences,
                 # regard the sentence annotations as missing, as we can't
|
@@ -4,6 +4,10 @@ for doing pseudo-projective parsing implementation uses the HEAD decoration
 scheme.
 """
 from copy import copy
+from libc.limits cimport INT_MAX
+from libc.stdlib cimport abs
+from libcpp cimport bool
+from libcpp.vector cimport vector

 from ...tokens.doc cimport Doc, set_children_from_heads
||||||
|
|
||||||
|
@ -41,13 +45,18 @@ def contains_cycle(heads):
|
||||||
|
|
||||||
|
|
||||||
def is_nonproj_arc(tokenid, heads):
|
def is_nonproj_arc(tokenid, heads):
|
||||||
|
cdef vector[int] c_heads = _heads_to_c(heads)
|
||||||
|
return _is_nonproj_arc(tokenid, c_heads)
|
||||||
|
|
||||||
|
|
||||||
|
cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil:
|
||||||
# definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
|
# definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
|
||||||
# if there is a token k, h < k < d such that h is not
|
# if there is a token k, h < k < d such that h is not
|
||||||
# an ancestor of k. Same for h -> d, h > d
|
# an ancestor of k. Same for h -> d, h > d
|
||||||
head = heads[tokenid]
|
head = heads[tokenid]
|
||||||
if head == tokenid: # root arcs cannot be non-projective
|
if head == tokenid: # root arcs cannot be non-projective
|
||||||
return False
|
return False
|
||||||
elif head is None: # unattached tokens cannot be non-projective
|
elif head < 0: # unattached tokens cannot be non-projective
|
||||||
return False
|
return False
|
||||||
|
|
||||||
cdef int start, end
|
cdef int start, end
|
||||||
|
@ -56,19 +65,29 @@ def is_nonproj_arc(tokenid, heads):
|
||||||
else:
|
else:
|
||||||
start, end = (tokenid+1, head)
|
start, end = (tokenid+1, head)
|
||||||
for k in range(start, end):
|
for k in range(start, end):
|
||||||
for ancestor in ancestors(k, heads):
|
if _has_head_as_ancestor(k, head, heads):
|
||||||
if ancestor is None: # for unattached tokens/subtrees
|
continue
|
||||||
break
|
|
||||||
elif ancestor == head: # normal case: k dominated by h
|
|
||||||
break
|
|
||||||
else: # head not in ancestors: d -> h is non-projective
|
else: # head not in ancestors: d -> h is non-projective
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
cdef bool _has_head_as_ancestor(int tokenid, int head, const vector[int]& heads) nogil:
|
||||||
|
ancestor = tokenid
|
||||||
|
cnt = 0
|
||||||
|
while cnt < heads.size():
|
||||||
|
if heads[ancestor] == head or heads[ancestor] < 0:
|
||||||
|
return True
|
||||||
|
ancestor = heads[ancestor]
|
||||||
|
cnt += 1
|
||||||
|
|
||||||
|
return False
|
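The definition in the comment above (Havelka 2007) translates directly into a small pure-Python check; this is a sketch, not the Cython code, with heads[i] giving the head index of token i and heads[i] == i marking the root:

# Sketch of the non-projectivity test: the arc head -> tokenid is
# non-projective if some token strictly between them is not dominated by head.
def is_nonproj_arc(tokenid: int, heads: list) -> bool:
    head = heads[tokenid]
    if head == tokenid or head is None:
        return False
    start, end = (head + 1, tokenid) if head < tokenid else (tokenid + 1, head)
    for k in range(start, end):
        if not has_head_as_ancestor(k, head, heads):
            return True
    return False


def has_head_as_ancestor(tokenid: int, head: int, heads: list) -> bool:
    ancestor = tokenid
    for _ in range(len(heads)):  # bounded walk up the tree, as in the Cython version
        if heads[ancestor] == head or heads[ancestor] is None:
            return True
        ancestor = heads[ancestor]
    return False


# Token 2 attaches to 0 while token 1 attaches to 3, so the arc 3 -> 1 crosses it.
heads = [0, 3, 0, 0]
print([is_nonproj_arc(i, heads) for i in range(len(heads))])  # [False, True, False, False]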
||||||
|
|
||||||
|
|
||||||
def is_nonproj_tree(heads):
|
def is_nonproj_tree(heads):
|
||||||
|
cdef vector[int] c_heads = _heads_to_c(heads)
|
||||||
# a tree is non-projective if at least one arc is non-projective
|
# a tree is non-projective if at least one arc is non-projective
|
||||||
return any(is_nonproj_arc(word, heads) for word in range(len(heads)))
|
return any(_is_nonproj_arc(word, c_heads) for word in range(len(heads)))
|
||||||
|
|
||||||
|
|
||||||
def decompose(label):
|
def decompose(label):
|
||||||
|
@ -98,16 +117,31 @@ def projectivize(heads, labels):
|
||||||
# tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
|
# tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
|
||||||
# which encode a projective and decorated tree.
|
# which encode a projective and decorated tree.
|
||||||
proj_heads = copy(heads)
|
proj_heads = copy(heads)
|
||||||
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
|
|
||||||
if smallest_np_arc is None: # this sentence is already projective
|
cdef int new_head
|
||||||
|
cdef vector[int] c_proj_heads = _heads_to_c(proj_heads)
|
||||||
|
cdef int smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
|
||||||
|
if smallest_np_arc == -1: # this sentence is already projective
|
||||||
return proj_heads, copy(labels)
|
return proj_heads, copy(labels)
|
||||||
while smallest_np_arc is not None:
|
while smallest_np_arc != -1:
|
||||||
_lift(smallest_np_arc, proj_heads)
|
new_head = _lift(smallest_np_arc, proj_heads)
|
||||||
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
|
c_proj_heads[smallest_np_arc] = new_head
|
||||||
|
smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
|
||||||
deco_labels = _decorate(heads, proj_heads, labels)
|
deco_labels = _decorate(heads, proj_heads, labels)
|
||||||
return proj_heads, deco_labels
|
return proj_heads, deco_labels
|
||||||
|
|
||||||
|
|
||||||
|
cdef vector[int] _heads_to_c(heads):
|
||||||
|
cdef vector[int] c_heads;
|
||||||
|
for head in heads:
|
||||||
|
if head == None:
|
||||||
|
c_heads.push_back(-1)
|
||||||
|
else:
|
||||||
|
assert head < len(heads)
|
||||||
|
c_heads.push_back(head)
|
||||||
|
return c_heads
|
||||||
|
|
||||||
|
|
||||||
cpdef deprojectivize(Doc doc):
|
cpdef deprojectivize(Doc doc):
|
||||||
# Reattach arcs with decorated labels (following HEAD scheme). For each
|
# Reattach arcs with decorated labels (following HEAD scheme). For each
|
||||||
# decorated arc X||Y, search top-down, left-to-right, breadth-first until
|
# decorated arc X||Y, search top-down, left-to-right, breadth-first until
|
||||||
|
@ -137,27 +171,38 @@ def _decorate(heads, proj_heads, labels):
|
||||||
deco_labels.append(labels[tokenid])
|
deco_labels.append(labels[tokenid])
|
||||||
return deco_labels
|
return deco_labels
|
||||||
|
|
||||||
|
def get_smallest_nonproj_arc_slow(heads):
|
||||||
|
cdef vector[int] c_heads = _heads_to_c(heads)
|
||||||
|
return _get_smallest_nonproj_arc(c_heads)
|
||||||
|
|
||||||
def _get_smallest_nonproj_arc(heads):
|
|
||||||
|
cdef int _get_smallest_nonproj_arc(const vector[int]& heads) nogil:
|
||||||
# return the smallest non-proj arc or None
|
# return the smallest non-proj arc or None
|
||||||
# where size is defined as the distance between dep and head
|
# where size is defined as the distance between dep and head
|
||||||
# and ties are broken left to right
|
# and ties are broken left to right
|
||||||
smallest_size = float('inf')
|
cdef int smallest_size = INT_MAX
|
||||||
smallest_np_arc = None
|
cdef int smallest_np_arc = -1
|
||||||
for tokenid, head in enumerate(heads):
|
cdef int size
|
||||||
|
cdef int tokenid
|
||||||
|
cdef int head
|
||||||
|
|
||||||
|
for tokenid in range(heads.size()):
|
||||||
|
head = heads[tokenid]
|
||||||
size = abs(tokenid-head)
|
size = abs(tokenid-head)
|
||||||
if size < smallest_size and is_nonproj_arc(tokenid, heads):
|
if size < smallest_size and _is_nonproj_arc(tokenid, heads):
|
||||||
smallest_size = size
|
smallest_size = size
|
||||||
smallest_np_arc = tokenid
|
smallest_np_arc = tokenid
|
||||||
return smallest_np_arc
|
return smallest_np_arc
|
||||||
|
|
||||||
|
|
||||||
def _lift(tokenid, heads):
|
cpdef int _lift(tokenid, heads):
|
||||||
# reattaches a word to it's grandfather
|
# reattaches a word to it's grandfather
|
||||||
head = heads[tokenid]
|
head = heads[tokenid]
|
||||||
ghead = heads[head]
|
ghead = heads[head]
|
||||||
|
cdef int new_head = ghead if head != ghead else tokenid
|
||||||
# attach to ghead if head isn't attached to root else attach to root
|
# attach to ghead if head isn't attached to root else attach to root
|
||||||
heads[tokenid] = ghead if head != ghead else tokenid
|
heads[tokenid] = new_head
|
||||||
|
return new_head
|
||||||
|
|
||||||
|
|
||||||
def _find_new_head(token, headlabel):
|
def _find_new_head(token, headlabel):
|
||||||
|
|
379
spacy/pipeline/edit_tree_lemmatizer.py
Normal file
379
spacy/pipeline/edit_tree_lemmatizer.py
Normal file
|
@ -0,0 +1,379 @@
from typing import cast, Any, Callable, Dict, Iterable, List, Optional
from typing import Sequence, Tuple, Union
from collections import Counter
from copy import deepcopy
from itertools import islice
import numpy as np

import srsly
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
from thinc.types import Floats2d, Ints1d, Ints2d

from ._edit_tree_internals.edit_trees import EditTrees
from ._edit_tree_internals.schemas import validate_edit_tree
from .lemmatizer import lemmatizer_score
from .trainable_pipe import TrainablePipe
from ..errors import Errors
from ..language import Language
from ..tokens import Doc
from ..training import Example, validate_examples, validate_get_examples
from ..vocab import Vocab
from .. import util


default_model_config = """
[model]
@architectures = "spacy.Tagger.v2"

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
DEFAULT_EDIT_TREE_LEMMATIZER_MODEL = Config().from_str(default_model_config)["model"]


@Language.factory(
    "trainable_lemmatizer",
    assigns=["token.lemma"],
    requires=[],
    default_config={
        "model": DEFAULT_EDIT_TREE_LEMMATIZER_MODEL,
        "backoff": "orth",
        "min_tree_freq": 3,
        "overwrite": False,
        "top_k": 1,
        "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
    },
    default_score_weights={"lemma_acc": 1.0},
)
def make_edit_tree_lemmatizer(
    nlp: Language,
    name: str,
    model: Model,
    backoff: Optional[str],
    min_tree_freq: int,
    overwrite: bool,
    top_k: int,
    scorer: Optional[Callable],
):
    """Construct an EditTreeLemmatizer component."""
    return EditTreeLemmatizer(
        nlp.vocab,
        model,
        name,
        backoff=backoff,
        min_tree_freq=min_tree_freq,
        overwrite=overwrite,
        top_k=top_k,
        scorer=scorer,
    )


class EditTreeLemmatizer(TrainablePipe):
    """
    Lemmatizer that lemmatizes each word using a predicted edit tree.
    """

    def __init__(
        self,
        vocab: Vocab,
        model: Model,
        name: str = "trainable_lemmatizer",
        *,
        backoff: Optional[str] = "orth",
        min_tree_freq: int = 3,
        overwrite: bool = False,
        top_k: int = 1,
        scorer: Optional[Callable] = lemmatizer_score,
    ):
        """
        Construct an edit tree lemmatizer.

        backoff (Optional[str]): backoff to use when the predicted edit trees
            are not applicable. Must be an attribute of Token or None (leave the
            lemma unset).
        min_tree_freq (int): prune trees that are applied less than this
            frequency in the training data.
        overwrite (bool): overwrite existing lemma annotations.
        top_k (int): try to apply at most the k most probable edit trees.
        """
        self.vocab = vocab
        self.model = model
        self.name = name
        self.backoff = backoff
        self.min_tree_freq = min_tree_freq
        self.overwrite = overwrite
        self.top_k = top_k

        self.trees = EditTrees(self.vocab.strings)
        self.tree2label: Dict[int, int] = {}

        self.cfg: Dict[str, Any] = {"labels": []}
        self.scorer = scorer

    def get_loss(
        self, examples: Iterable[Example], scores: List[Floats2d]
    ) -> Tuple[float, List[Floats2d]]:
        validate_examples(examples, "EditTreeLemmatizer.get_loss")
        loss_func = SequenceCategoricalCrossentropy(normalize=False, missing_value=-1)

        truths = []
        for eg in examples:
            eg_truths = []
            for (predicted, gold_lemma) in zip(
                eg.predicted, eg.get_aligned("LEMMA", as_string=True)
            ):
                if gold_lemma is None:
                    label = -1
                else:
                    tree_id = self.trees.add(predicted.text, gold_lemma)
                    label = self.tree2label.get(tree_id, 0)
                eg_truths.append(label)

            truths.append(eg_truths)

        d_scores, loss = loss_func(scores, truths)  # type: ignore
        if self.model.ops.xp.isnan(loss):
            raise ValueError(Errors.E910.format(name=self.name))

        return float(loss), d_scores

    def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
        n_docs = len(list(docs))
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
            n_labels = len(self.cfg["labels"])
            guesses: List[Ints2d] = [
                self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
            ]
            assert len(guesses) == n_docs
            return guesses
        scores = self.model.predict(docs)
        assert len(scores) == n_docs
        guesses = self._scores2guesses(docs, scores)
        assert len(guesses) == n_docs
        return guesses

    def _scores2guesses(self, docs, scores):
        guesses = []
        for doc, doc_scores in zip(docs, scores):
            if self.top_k == 1:
                doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1)
            else:
                doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]

            if not isinstance(doc_guesses, np.ndarray):
                doc_guesses = doc_guesses.get()

            doc_compat_guesses = []
            for token, candidates in zip(doc, doc_guesses):
                tree_id = -1
                for candidate in candidates:
                    candidate_tree_id = self.cfg["labels"][candidate]

                    if self.trees.apply(candidate_tree_id, token.text) is not None:
                        tree_id = candidate_tree_id
                        break
                doc_compat_guesses.append(tree_id)

            guesses.append(np.array(doc_compat_guesses))

        return guesses

    def set_annotations(self, docs: Iterable[Doc], batch_tree_ids):
        for i, doc in enumerate(docs):
            doc_tree_ids = batch_tree_ids[i]
            if hasattr(doc_tree_ids, "get"):
                doc_tree_ids = doc_tree_ids.get()
            for j, tree_id in enumerate(doc_tree_ids):
                if self.overwrite or doc[j].lemma == 0:
                    # If no applicable tree could be found during prediction,
                    # the special identifier -1 is used. Otherwise the tree
                    # is guaranteed to be applicable.
                    if tree_id == -1:
                        if self.backoff is not None:
                            doc[j].lemma = getattr(doc[j], self.backoff)
                    else:
                        lemma = self.trees.apply(tree_id, doc[j].text)
                        doc[j].lemma_ = lemma

    @property
    def labels(self) -> Tuple[int, ...]:
        """Returns the labels currently added to the component."""
        return tuple(self.cfg["labels"])

    @property
    def hide_labels(self) -> bool:
        return True

    @property
    def label_data(self) -> Dict:
        trees = []
        for tree_id in range(len(self.trees)):
            tree = self.trees[tree_id]
            if "orig" in tree:
                tree["orig"] = self.vocab.strings[tree["orig"]]
            if "subst" in tree:
                tree["subst"] = self.vocab.strings[tree["subst"]]
            trees.append(tree)
        return dict(trees=trees, labels=tuple(self.cfg["labels"]))

    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Optional[Language] = None,
        labels: Optional[Dict] = None,
    ):
        validate_get_examples(get_examples, "EditTreeLemmatizer.initialize")

        if labels is None:
            self._labels_from_data(get_examples)
        else:
            self._add_labels(labels)

        # Sample for the model.
        doc_sample = []
        label_sample = []
        for example in islice(get_examples(), 10):
            doc_sample.append(example.x)
            gold_labels: List[List[float]] = []
            for token in example.reference:
                if token.lemma == 0:
                    gold_label = None
                else:
                    gold_label = self._pair2label(token.text, token.lemma_)

                gold_labels.append(
                    [
                        1.0 if label == gold_label else 0.0
                        for label in self.cfg["labels"]
                    ]
                )

            gold_labels = cast(Floats2d, gold_labels)
            label_sample.append(self.model.ops.asarray(gold_labels, dtype="float32"))

        self._require_labels()
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        assert len(label_sample) > 0, Errors.E923.format(name=self.name)

        self.model.initialize(X=doc_sample, Y=label_sample)

    def from_bytes(self, bytes_data, *, exclude=tuple()):
        deserializers = {
            "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
            "model": lambda b: self.model.from_bytes(b),
            "vocab": lambda b: self.vocab.from_bytes(b, exclude=exclude),
            "trees": lambda b: self.trees.from_bytes(b),
        }

        util.from_bytes(bytes_data, deserializers, exclude)

        return self

    def to_bytes(self, *, exclude=tuple()):
        serializers = {
            "cfg": lambda: srsly.json_dumps(self.cfg),
            "model": lambda: self.model.to_bytes(),
            "vocab": lambda: self.vocab.to_bytes(exclude=exclude),
            "trees": lambda: self.trees.to_bytes(),
        }

        return util.to_bytes(serializers, exclude)

    def to_disk(self, path, exclude=tuple()):
        path = util.ensure_path(path)
        serializers = {
            "cfg": lambda p: srsly.write_json(p, self.cfg),
            "model": lambda p: self.model.to_disk(p),
            "vocab": lambda p: self.vocab.to_disk(p, exclude=exclude),
            "trees": lambda p: self.trees.to_disk(p),
        }
        util.to_disk(path, serializers, exclude)

    def from_disk(self, path, exclude=tuple()):
        def load_model(p):
            try:
                with open(p, "rb") as mfile:
                    self.model.from_bytes(mfile.read())
            except AttributeError:
                raise ValueError(Errors.E149) from None

        deserializers = {
            "cfg": lambda p: self.cfg.update(srsly.read_json(p)),
            "model": load_model,
            "vocab": lambda p: self.vocab.from_disk(p, exclude=exclude),
            "trees": lambda p: self.trees.from_disk(p),
        }

        util.from_disk(path, deserializers, exclude)
        return self

    def _add_labels(self, labels: Dict):
        if "labels" not in labels:
            raise ValueError(Errors.E857.format(name="labels"))
        if "trees" not in labels:
            raise ValueError(Errors.E857.format(name="trees"))

        self.cfg["labels"] = list(labels["labels"])
        trees = []
        for tree in labels["trees"]:
            errors = validate_edit_tree(tree)
            if errors:
                raise ValueError(Errors.E1026.format(errors="\n".join(errors)))

            tree = dict(tree)
            if "orig" in tree:
                tree["orig"] = self.vocab.strings[tree["orig"]]
if "orig" in tree:
|
||||||
|
tree["subst"] = self.vocab.strings[tree["subst"]]
|
||||||
|
|
||||||
|
trees.append(tree)
|
||||||
|
|
||||||
|
self.trees.from_json(trees)
|
||||||
|
|
||||||
|
for label, tree in enumerate(self.labels):
|
||||||
|
self.tree2label[tree] = label
|
||||||
|
|
||||||
|
def _labels_from_data(self, get_examples: Callable[[], Iterable[Example]]):
|
||||||
|
# Count corpus tree frequencies in ad-hoc storage to avoid cluttering
|
||||||
|
# the final pipe/string store.
|
||||||
|
vocab = Vocab()
|
||||||
|
trees = EditTrees(vocab.strings)
|
||||||
|
tree_freqs: Counter = Counter()
|
||||||
|
repr_pairs: Dict = {}
|
||||||
|
for example in get_examples():
|
||||||
|
for token in example.reference:
|
||||||
|
if token.lemma != 0:
|
||||||
|
tree_id = trees.add(token.text, token.lemma_)
|
||||||
|
tree_freqs[tree_id] += 1
|
||||||
|
repr_pairs[tree_id] = (token.text, token.lemma_)
|
||||||
|
|
||||||
|
# Construct trees that make the frequency cut-off using representative
|
||||||
|
# form - token pairs.
|
||||||
|
for tree_id, freq in tree_freqs.items():
|
||||||
|
if freq >= self.min_tree_freq:
|
||||||
|
form, lemma = repr_pairs[tree_id]
|
||||||
|
self._pair2label(form, lemma, add_label=True)
|
||||||
|
|
||||||
|
def _pair2label(self, form, lemma, add_label=False):
|
||||||
|
"""
|
||||||
|
Look up the edit tree identifier for a form/label pair. If the edit
|
||||||
|
tree is unknown and "add_label" is set, the edit tree will be added to
|
||||||
|
the labels.
|
||||||
|
"""
|
||||||
|
tree_id = self.trees.add(form, lemma)
|
||||||
|
if tree_id not in self.tree2label:
|
||||||
|
if not add_label:
|
||||||
|
return None
|
||||||
|
|
||||||
|
self.tree2label[tree_id] = len(self.cfg["labels"])
|
||||||
|
self.cfg["labels"].append(tree_id)
|
||||||
|
return self.tree2label[tree_id]
|
|
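A minimal usage sketch of the new component, assuming a blank English pipeline; the config values shown simply mirror the factory defaults above, and the training data handling is not shown.

import spacy

# Sketch: add the edit tree lemmatizer under its registered factory name.
nlp = spacy.blank("en")
nlp.add_pipe("trainable_lemmatizer", config={"backoff": "orth", "top_k": 1})
# Before it can predict lemmas, the component has to be initialized/trained,
# e.g. nlp.initialize(get_examples=lambda: train_examples) with Example objects.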
@@ -6,17 +6,17 @@ import srsly
 import random
 from thinc.api import CosineDistance, Model, Optimizer, Config
 from thinc.api import set_dropout_rate
-import warnings

 from ..kb import KnowledgeBase, Candidate
 from ..ml import empty_kb
 from ..tokens import Doc, Span
 from .pipe import deserialize_config
+from .legacy.entity_linker import EntityLinker_v1
 from .trainable_pipe import TrainablePipe
 from ..language import Language
 from ..vocab import Vocab
 from ..training import Example, validate_examples, validate_get_examples
-from ..errors import Errors, Warnings
+from ..errors import Errors
 from ..util import SimpleFrozenList, registry
 from .. import util
 from ..scorer import Scorer

@@ -26,7 +26,7 @@ BACKWARD_OVERWRITE = True

 default_model_config = """
 [model]
-@architectures = "spacy.EntityLinker.v1"
+@architectures = "spacy.EntityLinker.v2"

 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"

@@ -55,6 +55,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
         "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
         "overwrite": True,
         "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
+        "use_gold_ents": True,
     },
     default_score_weights={
         "nel_micro_f": 1.0,

@@ -75,6 +76,7 @@ def make_entity_linker(
     get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
     overwrite: bool,
     scorer: Optional[Callable],
+    use_gold_ents: bool,
 ):
     """Construct an EntityLinker component.

@@ -90,6 +92,22 @@ def make_entity_linker(
         produces a list of candidates, given a certain knowledge base and a textual mention.
     scorer (Optional[Callable]): The scoring method.
     """
+
+    if not model.attrs.get("include_span_maker", False):
+        # The only difference in arguments here is that use_gold_ents is not available
+        return EntityLinker_v1(
+            nlp.vocab,
+            model,
+            name,
+            labels_discard=labels_discard,
+            n_sents=n_sents,
+            incl_prior=incl_prior,
+            incl_context=incl_context,
+            entity_vector_length=entity_vector_length,
+            get_candidates=get_candidates,
+            overwrite=overwrite,
+            scorer=scorer,
+        )
     return EntityLinker(
         nlp.vocab,
         model,

@@ -102,6 +120,7 @@ def make_entity_linker(
         get_candidates=get_candidates,
         overwrite=overwrite,
         scorer=scorer,
+        use_gold_ents=use_gold_ents,
     )

@@ -136,6 +155,7 @@ class EntityLinker(TrainablePipe):
         get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
         overwrite: bool = BACKWARD_OVERWRITE,
         scorer: Optional[Callable] = entity_linker_score,
+        use_gold_ents: bool,
     ) -> None:
         """Initialize an entity linker.

@@ -152,6 +172,8 @@ class EntityLinker(TrainablePipe):
             produces a list of candidates, given a certain knowledge base and a textual mention.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_links.
+        use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
+            component must provide entity annotations.

         DOCS: https://spacy.io/api/entitylinker#init
         """

@@ -169,6 +191,7 @@ class EntityLinker(TrainablePipe):
         # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
         self.kb = empty_kb(entity_vector_length)(self.vocab)
         self.scorer = scorer
+        self.use_gold_ents = use_gold_ents

     def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
         """Define the KB of this pipe by providing a function that will

@@ -212,14 +235,48 @@ class EntityLinker(TrainablePipe):
         doc_sample = []
         vector_sample = []
         for example in islice(get_examples(), 10):
-            doc_sample.append(example.x)
+            doc = example.x
+            if self.use_gold_ents:
+                doc.ents = example.y.ents
+            doc_sample.append(doc)
             vector_sample.append(self.model.ops.alloc1f(nO))
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
         assert len(vector_sample) > 0, Errors.E923.format(name=self.name)
+
+        # XXX In order for size estimation to work, there has to be at least
+        # one entity. It's not used for training so it doesn't have to be real,
+        # so we add a fake one if none are present.
+        # We can't use Doc.has_annotation here because it can be True for docs
+        # that have been through an NER component but got no entities.
+        has_annotations = any([doc.ents for doc in doc_sample])
+        if not has_annotations:
+            doc = doc_sample[0]
+            ent = doc[0:1]
+            ent.label_ = "XXX"
+            doc.ents = (ent,)
+
         self.model.initialize(
             X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
         )
+
+        if not has_annotations:
+            # Clean up dummy annotation
+            doc.ents = []
+
+    def batch_has_learnable_example(self, examples):
+        """Check if a batch contains a learnable example.
+
+        If one isn't present, then the update step needs to be skipped.
+        """
+
+        for eg in examples:
+            for ent in eg.predicted.ents:
+                candidates = list(self.get_candidates(self.kb, ent))
+                if candidates:
+                    return True
+
+        return False
+
     def update(
         self,
         examples: Iterable[Example],

@@ -247,35 +304,29 @@ class EntityLinker(TrainablePipe):
         if not examples:
             return losses
         validate_examples(examples, "EntityLinker.update")
-        sentence_docs = []
-        for eg in examples:
-            sentences = [s for s in eg.reference.sents]
-            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
-            for ent in eg.reference.ents:
-                # KB ID of the first token is the same as the whole span
-                kb_id = kb_ids[ent.start]
-                if kb_id:
-                    try:
-                        # find the sentence in the list of sentences.
-                        sent_index = sentences.index(ent.sent)
-                    except AttributeError:
-                        # Catch the exception when ent.sent is None and provide a user-friendly warning
-                        raise RuntimeError(Errors.E030) from None
-                    # get n previous sentences, if there are any
-                    start_sentence = max(0, sent_index - self.n_sents)
-                    # get n posterior sentences, or as many < n as there are
-                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
-                    # get token positions
-                    start_token = sentences[start_sentence].start
-                    end_token = sentences[end_sentence].end
-                    # append that span as a doc to training
-                    sent_doc = eg.predicted[start_token:end_token].as_doc()
-                    sentence_docs.append(sent_doc)
         set_dropout_rate(self.model, drop)
-        if not sentence_docs:
-            warnings.warn(Warnings.W093.format(name="Entity Linker"))
+        docs = [eg.predicted for eg in examples]
+        # save to restore later
+        old_ents = [doc.ents for doc in docs]
+
+        for doc, ex in zip(docs, examples):
+            if self.use_gold_ents:
+                doc.ents = ex.reference.ents
+            else:
+                # only keep matching ents
+                doc.ents = ex.get_matching_ents()
+
+        # make sure we have something to learn from, if not, short-circuit
+        if not self.batch_has_learnable_example(examples):
             return losses
-        sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
+
+        sentence_encodings, bp_context = self.model.begin_update(docs)
+
+        # now restore the ents
+        for doc, old in zip(docs, old_ents):
+            doc.ents = old
+
         loss, d_scores = self.get_loss(
             sentence_encodings=sentence_encodings, examples=examples
         )

@@ -288,24 +339,38 @@ class EntityLinker(TrainablePipe):
     def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
         validate_examples(examples, "EntityLinker.get_loss")
         entity_encodings = []
+        eidx = 0  # indices in gold entities to keep
+        keep_ents = []  # indices in sentence_encodings to keep
+
         for eg in examples:
             kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
+
             for ent in eg.reference.ents:
                 kb_id = kb_ids[ent.start]
                 if kb_id:
                     entity_encoding = self.kb.get_vector(kb_id)
                     entity_encodings.append(entity_encoding)
+                    keep_ents.append(eidx)
+
+                eidx += 1
         entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
-        if sentence_encodings.shape != entity_encodings.shape:
+        selected_encodings = sentence_encodings[keep_ents]
+
+        # If the entity encodings list is empty, then
+        if selected_encodings.shape != entity_encodings.shape:
             err = Errors.E147.format(
                 method="get_loss", msg="gold entities do not match up"
             )
             raise RuntimeError(err)
         # TODO: fix typing issue here
-        gradients = self.distance.get_grad(sentence_encodings, entity_encodings)  # type: ignore
-        loss = self.distance.get_loss(sentence_encodings, entity_encodings)  # type: ignore
+        gradients = self.distance.get_grad(selected_encodings, entity_encodings)  # type: ignore
+        # to match the input size, we need to give a zero gradient for items not in the kb
+        out = self.model.ops.alloc2f(*sentence_encodings.shape)
+        out[keep_ents] = gradients
+
+        loss = self.distance.get_loss(selected_encodings, entity_encodings)  # type: ignore
         loss = loss / len(entity_encodings)
-        return float(loss), gradients
+        return float(loss), out

     def predict(self, docs: Iterable[Doc]) -> List[str]:
         """Apply the pipeline's model to a batch of docs, without modifying them.
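A hedged configuration sketch for the new use_gold_ents setting; `nlp` and `create_kb` are assumed to be defined elsewhere.

# Sketch: the flag is part of the component's default_config, so it can be
# passed when the pipe is added. create_kb is a hypothetical KB loader.
entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": True})
entity_linker.set_kb(create_kb)  # assumed: Callable[[Vocab], KnowledgeBase]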
3  spacy/pipeline/legacy/__init__.py  Normal file

@@ -0,0 +1,3 @@
from .entity_linker import EntityLinker_v1

__all__ = ["EntityLinker_v1"]
427  spacy/pipeline/legacy/entity_linker.py  Normal file

@@ -0,0 +1,427 @@
# This file is present to provide a prior version of the EntityLinker component
# for backwards compatibility. For details see #9669.

from typing import Optional, Iterable, Callable, Dict, Union, List, Any
from thinc.types import Floats2d
from pathlib import Path
from itertools import islice
import srsly
import random
from thinc.api import CosineDistance, Model, Optimizer, Config
from thinc.api import set_dropout_rate
import warnings

from ...kb import KnowledgeBase, Candidate
from ...ml import empty_kb
from ...tokens import Doc, Span
from ..pipe import deserialize_config
from ..trainable_pipe import TrainablePipe
from ...language import Language
from ...vocab import Vocab
from ...training import Example, validate_examples, validate_get_examples
from ...errors import Errors, Warnings
from ...util import SimpleFrozenList, registry
from ... import util
from ...scorer import Scorer

# See #9050
BACKWARD_OVERWRITE = True


def entity_linker_score(examples, **kwargs):
    return Scorer.score_links(examples, negative_labels=[EntityLinker_v1.NIL], **kwargs)


class EntityLinker_v1(TrainablePipe):
    """Pipeline component for named entity linking.

    DOCS: https://spacy.io/api/entitylinker
    """

    NIL = "NIL"  # string used to refer to a non-existing link

    def __init__(
        self,
        vocab: Vocab,
        model: Model,
        name: str = "entity_linker",
        *,
        labels_discard: Iterable[str],
        n_sents: int,
        incl_prior: bool,
        incl_context: bool,
        entity_vector_length: int,
        get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
        overwrite: bool = BACKWARD_OVERWRITE,
        scorer: Optional[Callable] = entity_linker_score,
    ) -> None:
        """Initialize an entity linker.

        vocab (Vocab): The shared vocabulary.
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
        labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
        n_sents (int): The number of neighbouring sentences to take into account.
        incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
        incl_context (bool): Whether or not to include the local context in the model.
        entity_vector_length (int): Size of encoding vectors in the KB.
        get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
            produces a list of candidates, given a certain knowledge base and a textual mention.
        scorer (Optional[Callable]): The scoring method. Defaults to
            Scorer.score_links.

        DOCS: https://spacy.io/api/entitylinker#init
        """
        self.vocab = vocab
        self.model = model
        self.name = name
        self.labels_discard = list(labels_discard)
        self.n_sents = n_sents
        self.incl_prior = incl_prior
        self.incl_context = incl_context
        self.get_candidates = get_candidates
        self.cfg: Dict[str, Any] = {"overwrite": overwrite}
        self.distance = CosineDistance(normalize=False)
        # how many neighbour sentences to take into account
        # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
        self.kb = empty_kb(entity_vector_length)(self.vocab)
        self.scorer = scorer

    def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
        """Define the KB of this pipe by providing a function that will
        create it using this object's vocab."""
        if not callable(kb_loader):
            raise ValueError(Errors.E885.format(arg_type=type(kb_loader)))

        self.kb = kb_loader(self.vocab)

    def validate_kb(self) -> None:
        # Raise an error if the knowledge base is not initialized.
        if self.kb is None:
            raise ValueError(Errors.E1018.format(name=self.name))
        if len(self.kb) == 0:
            raise ValueError(Errors.E139.format(name=self.name))

    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Optional[Language] = None,
        kb_loader: Optional[Callable[[Vocab], KnowledgeBase]] = None,
    ):
        """Initialize the pipe for training, using a representative set
        of data examples.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance.
            Note that providing this argument, will overwrite all data accumulated in the current KB.
            Use this only when loading a KB as-such from file.

        DOCS: https://spacy.io/api/entitylinker#initialize
        """
        validate_get_examples(get_examples, "EntityLinker_v1.initialize")
        if kb_loader is not None:
            self.set_kb(kb_loader)
        self.validate_kb()
        nO = self.kb.entity_vector_length
        doc_sample = []
        vector_sample = []
        for example in islice(get_examples(), 10):
            doc_sample.append(example.x)
            vector_sample.append(self.model.ops.alloc1f(nO))
        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
        assert len(vector_sample) > 0, Errors.E923.format(name=self.name)
        self.model.initialize(
            X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
        )

    def update(
        self,
        examples: Iterable[Example],
        *,
        drop: float = 0.0,
        sgd: Optional[Optimizer] = None,
        losses: Optional[Dict[str, float]] = None,
    ) -> Dict[str, float]:
        """Learn from a batch of documents and gold-standard information,
        updating the pipe's model. Delegates to predict and get_loss.

        examples (Iterable[Example]): A batch of Example objects.
        drop (float): The dropout rate.
        sgd (thinc.api.Optimizer): The optimizer.
        losses (Dict[str, float]): Optional record of the loss during training.
            Updated using the component name as the key.
        RETURNS (Dict[str, float]): The updated losses dictionary.

        DOCS: https://spacy.io/api/entitylinker#update
        """
        self.validate_kb()
        if losses is None:
            losses = {}
        losses.setdefault(self.name, 0.0)
        if not examples:
            return losses
        validate_examples(examples, "EntityLinker_v1.update")
        sentence_docs = []
        for eg in examples:
            sentences = [s for s in eg.reference.sents]
            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
            for ent in eg.reference.ents:
                # KB ID of the first token is the same as the whole span
                kb_id = kb_ids[ent.start]
                if kb_id:
                    try:
                        # find the sentence in the list of sentences.
                        sent_index = sentences.index(ent.sent)
                    except AttributeError:
                        # Catch the exception when ent.sent is None and provide a user-friendly warning
                        raise RuntimeError(Errors.E030) from None
                    # get n previous sentences, if there are any
                    start_sentence = max(0, sent_index - self.n_sents)
                    # get n posterior sentences, or as many < n as there are
                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
                    # get token positions
                    start_token = sentences[start_sentence].start
                    end_token = sentences[end_sentence].end
                    # append that span as a doc to training
                    sent_doc = eg.predicted[start_token:end_token].as_doc()
                    sentence_docs.append(sent_doc)
        set_dropout_rate(self.model, drop)
        if not sentence_docs:
            warnings.warn(Warnings.W093.format(name="Entity Linker"))
            return losses
        sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
        loss, d_scores = self.get_loss(
            sentence_encodings=sentence_encodings, examples=examples
        )
        bp_context(d_scores)
        if sgd is not None:
            self.finish_update(sgd)
        losses[self.name] += loss
        return losses

    def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
        validate_examples(examples, "EntityLinker_v1.get_loss")
        entity_encodings = []
        for eg in examples:
            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
            for ent in eg.reference.ents:
                kb_id = kb_ids[ent.start]
                if kb_id:
                    entity_encoding = self.kb.get_vector(kb_id)
                    entity_encodings.append(entity_encoding)
        entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
        if sentence_encodings.shape != entity_encodings.shape:
            err = Errors.E147.format(
                method="get_loss", msg="gold entities do not match up"
            )
            raise RuntimeError(err)
        # TODO: fix typing issue here
        gradients = self.distance.get_grad(sentence_encodings, entity_encodings)  # type: ignore
        loss = self.distance.get_loss(sentence_encodings, entity_encodings)  # type: ignore
        loss = loss / len(entity_encodings)
        return float(loss), gradients

    def predict(self, docs: Iterable[Doc]) -> List[str]:
        """Apply the pipeline's model to a batch of docs, without modifying them.
        Returns the KB IDs for each entity in each doc, including NIL if there is
        no prediction.

        docs (Iterable[Doc]): The documents to predict.
        RETURNS (List[str]): The models prediction for each document.

        DOCS: https://spacy.io/api/entitylinker#predict
        """
        self.validate_kb()
        entity_count = 0
        final_kb_ids: List[str] = []
        if not docs:
            return final_kb_ids
        if isinstance(docs, Doc):
            docs = [docs]
        for i, doc in enumerate(docs):
            sentences = [s for s in doc.sents]
            if len(doc) > 0:
                # Looping through each entity (TODO: rewrite)
                for ent in doc.ents:
                    sent = ent.sent
                    sent_index = sentences.index(sent)
                    assert sent_index >= 0
                    # get n_neighbour sentences, clipped to the length of the document
                    start_sentence = max(0, sent_index - self.n_sents)
                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
                    start_token = sentences[start_sentence].start
                    end_token = sentences[end_sentence].end
                    sent_doc = doc[start_token:end_token].as_doc()
                    # currently, the context is the same for each entity in a sentence (should be refined)
                    xp = self.model.ops.xp
                    if self.incl_context:
                        sentence_encoding = self.model.predict([sent_doc])[0]
                        sentence_encoding_t = sentence_encoding.T
                        sentence_norm = xp.linalg.norm(sentence_encoding_t)
                    entity_count += 1
                    if ent.label_ in self.labels_discard:
                        # ignoring this entity - setting to NIL
                        final_kb_ids.append(self.NIL)
                    else:
                        candidates = list(self.get_candidates(self.kb, ent))
                        if not candidates:
                            # no prediction possible for this entity - setting to NIL
                            final_kb_ids.append(self.NIL)
                        elif len(candidates) == 1:
                            # shortcut for efficiency reasons: take the 1 candidate
                            # TODO: thresholding
                            final_kb_ids.append(candidates[0].entity_)
                        else:
                            random.shuffle(candidates)
                            # set all prior probabilities to 0 if incl_prior=False
                            prior_probs = xp.asarray([c.prior_prob for c in candidates])
                            if not self.incl_prior:
                                prior_probs = xp.asarray([0.0 for _ in candidates])
                            scores = prior_probs
                            # add in similarity from the context
                            if self.incl_context:
                                entity_encodings = xp.asarray(
                                    [c.entity_vector for c in candidates]
                                )
                                entity_norm = xp.linalg.norm(entity_encodings, axis=1)
                                if len(entity_encodings) != len(prior_probs):
                                    raise RuntimeError(
                                        Errors.E147.format(
                                            method="predict",
                                            msg="vectors not of equal length",
                                        )
                                    )
                                # cosine similarity
                                sims = xp.dot(entity_encodings, sentence_encoding_t) / (
                                    sentence_norm * entity_norm
                                )
                                if sims.shape != prior_probs.shape:
                                    raise ValueError(Errors.E161)
                                scores = prior_probs + sims - (prior_probs * sims)
                            # TODO: thresholding
                            best_index = scores.argmax().item()
                            best_candidate = candidates[best_index]
                            final_kb_ids.append(best_candidate.entity_)
        if not (len(final_kb_ids) == entity_count):
            err = Errors.E147.format(
                method="predict", msg="result variables not of equal length"
            )
            raise RuntimeError(err)
        return final_kb_ids

    def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
        """Modify a batch of documents, using pre-computed scores.

        docs (Iterable[Doc]): The documents to modify.
        kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.

        DOCS: https://spacy.io/api/entitylinker#set_annotations
        """
        count_ents = len([ent for doc in docs for ent in doc.ents])
        if count_ents != len(kb_ids):
            raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
        i = 0
        overwrite = self.cfg["overwrite"]
        for doc in docs:
            for ent in doc.ents:
                kb_id = kb_ids[i]
                i += 1
                for token in ent:
                    if token.ent_kb_id == 0 or overwrite:
                        token.ent_kb_id_ = kb_id

    def to_bytes(self, *, exclude=tuple()):
        """Serialize the pipe to a bytestring.

        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (bytes): The serialized object.

        DOCS: https://spacy.io/api/entitylinker#to_bytes
        """
        self._validate_serialization_attrs()
        serialize = {}
        if hasattr(self, "cfg") and self.cfg is not None:
            serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
        serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
        serialize["kb"] = self.kb.to_bytes
        serialize["model"] = self.model.to_bytes
        return util.to_bytes(serialize, exclude)

    def from_bytes(self, bytes_data, *, exclude=tuple()):
        """Load the pipe from a bytestring.

        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (TrainablePipe): The loaded object.

        DOCS: https://spacy.io/api/entitylinker#from_bytes
        """
        self._validate_serialization_attrs()

        def load_model(b):
            try:
                self.model.from_bytes(b)
            except AttributeError:
                raise ValueError(Errors.E149) from None

        deserialize = {}
        if hasattr(self, "cfg") and self.cfg is not None:
            deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
        deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
        deserialize["kb"] = lambda b: self.kb.from_bytes(b)
        deserialize["model"] = load_model
        util.from_bytes(bytes_data, deserialize, exclude)
        return self

    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> None:
        """Serialize the pipe to disk.

        path (str / Path): Path to a directory.
        exclude (Iterable[str]): String names of serialization fields to exclude.

        DOCS: https://spacy.io/api/entitylinker#to_disk
        """
        serialize = {}
        serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
        serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
        serialize["kb"] = lambda p: self.kb.to_disk(p)
        serialize["model"] = lambda p: self.model.to_disk(p)
        util.to_disk(path, serialize, exclude)

    def from_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
    ) -> "EntityLinker_v1":
        """Load the pipe from disk. Modifies the object in place and returns it.

        path (str / Path): Path to a directory.
        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (EntityLinker): The modified EntityLinker object.

        DOCS: https://spacy.io/api/entitylinker#from_disk
        """

        def load_model(p):
            try:
                with p.open("rb") as infile:
                    self.model.from_bytes(infile.read())
            except AttributeError:
                raise ValueError(Errors.E149) from None

        deserialize: Dict[str, Callable[[Any], Any]] = {}
        deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
        deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude)
        deserialize["kb"] = lambda p: self.kb.from_disk(p)
        deserialize["model"] = load_model
        util.from_disk(path, deserialize, exclude)
        return self

    def rehearse(self, examples, *, sgd=None, losses=None, **config):
        raise NotImplementedError

    def add_label(self, label):
        raise NotImplementedError
@@ -25,7 +25,7 @@ BACKWARD_EXTEND = False

 default_model_config = """
 [model]
-@architectures = "spacy.Tagger.v1"
+@architectures = "spacy.Tagger.v2"

 [model.tok2vec]
 @architectures = "spacy.Tok2Vec.v2"
@@ -26,6 +26,8 @@ class Pipe:
     @property
     def labels(self) -> Tuple[str, ...]: ...
     @property
+    def hide_labels(self) -> bool: ...
+    @property
     def label_data(self) -> Any: ...
     def _require_labels(self) -> None: ...
     def set_error_handler(
@@ -102,6 +102,10 @@ cdef class Pipe:
     def labels(self) -> Tuple[str, ...]:
         return tuple()

+    @property
+    def hide_labels(self) -> bool:
+        return False
+
     @property
     def label_data(self):
         """Optional JSON-serializable data that would be sufficient to recreate
@@ -1,6 +1,6 @@
 # cython: infer_types=True, profile=True, binding=True
-from itertools import islice
 from typing import Optional, Callable
+from itertools import islice

 import srsly
 from thinc.api import Model, SequenceCategoricalCrossentropy, Config

@@ -20,7 +20,7 @@ BACKWARD_OVERWRITE = False

 default_model_config = """
 [model]
-@architectures = "spacy.Tagger.v1"
+@architectures = "spacy.Tagger.v2"

 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"

@@ -99,6 +99,10 @@ class SentenceRecognizer(Tagger):
         # are 0
         return tuple(["I", "S"])

+    @property
+    def hide_labels(self):
+        return True
+
     @property
     def label_data(self):
         return None
@@ -1,9 +1,10 @@
-import numpy
 from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
 from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
 from thinc.api import Optimizer
 from thinc.types import Ragged, Ints2d, Floats2d, Ints1d

+import numpy
+
 from ..compat import Protocol, runtime_checkable
 from ..scorer import Scorer
 from ..language import Language

@@ -271,6 +272,24 @@ class SpanCategorizer(TrainablePipe):
         scores = self.model.predict((docs, indices))  # type: ignore
         return indices, scores

+    def set_candidates(
+        self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
+    ) -> None:
+        """Use the spancat suggester to add a list of span candidates to a list of docs.
+        This method is intended to be used for debugging purposes.
+
+        docs (Iterable[Doc]): The documents to modify.
+        candidates_key (str): Key of the Doc.spans dict to save the candidate spans under.
+
+        DOCS: https://spacy.io/api/spancategorizer#set_candidates
+        """
+        suggester_output = self.suggester(docs, ops=self.model.ops)
+
+        for candidates, doc in zip(suggester_output, docs):  # type: ignore
+            doc.spans[candidates_key] = []
+            for index in candidates.dataXd:
+                doc.spans[candidates_key].append(doc[index[0] : index[1]])
+
     def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
         """Modify a batch of Doc objects, using pre-computed scores.

@@ -377,7 +396,7 @@ class SpanCategorizer(TrainablePipe):
         # If the prediction is 0.9 and it's false, the gradient will be
         # 0.9 (0.9 - 0.0)
         d_scores = scores - target
-        loss = float((d_scores ** 2).sum())
+        loss = float((d_scores**2).sum())
         return loss, d_scores

     def initialize(
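A short sketch of the new debugging helper added above, assuming a pipeline `nlp` with a configured "spancat" component.

# Sketch: write the suggester's candidate spans into doc.spans for inspection.
doc = nlp("The quick brown fox jumps over the lazy dog.")
spancat = nlp.get_pipe("spancat")
spancat.set_candidates([doc], candidates_key="candidates")
for span in doc.spans["candidates"]:
    print(span.start, span.end, span.text)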
@@ -27,7 +27,7 @@ BACKWARD_OVERWRITE = False

 default_model_config = """
 [model]
-@architectures = "spacy.Tagger.v1"
+@architectures = "spacy.Tagger.v2"

 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"

@@ -225,6 +225,7 @@ class Tagger(TrainablePipe):

         DOCS: https://spacy.io/api/tagger#rehearse
         """
+        loss_func = SequenceCategoricalCrossentropy()
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.0)

@@ -236,12 +237,12 @@ class Tagger(TrainablePipe):
             # Handle cases where there are no tokens in any docs.
             return losses
         set_dropout_rate(self.model, drop)
-        guesses, backprop = self.model.begin_update(docs)
-        target = self._rehearsal_model(examples)
-        gradient = guesses - target
-        backprop(gradient)
+        tag_scores, bp_tag_scores = self.model.begin_update(docs)
+        tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs)
+        grads, loss = loss_func(tag_scores, tutor_tag_scores)
+        bp_tag_scores(grads)
         self.finish_update(sgd)
-        losses[self.name] += (gradient**2).sum()
+        losses[self.name] += loss
         return losses

     def get_loss(self, examples, scores):
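A sketch of how the reworked rehearsal update is typically driven, assuming a trained pipeline `nlp` containing a "tagger" and a list `raw_texts` of plain strings.

from spacy.training import Example

# Sketch: rehearsal uses unlabelled examples; the tutor model supplies targets.
raw_examples = [Example.from_dict(nlp.make_doc(text), {}) for text in raw_texts]
optimizer = nlp.resume_training()
losses = {}
nlp.rehearse(raw_examples, sgd=optimizer, losses=losses)
print(losses.get("tagger"))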
@@ -158,6 +158,13 @@ class TextCategorizer(TrainablePipe):
         self.cfg = dict(cfg)
         self.scorer = scorer

+    @property
+    def support_missing_values(self):
+        # There are no missing values as the textcat should always
+        # predict exactly one label. All other labels are 0.0
+        # Subclasses may override this property to change internal behaviour.
+        return False
+
     @property
     def labels(self) -> Tuple[str]:
         """RETURNS (Tuple[str]): The labels currently added to the component.

@@ -276,12 +283,12 @@ class TextCategorizer(TrainablePipe):
             return losses
         set_dropout_rate(self.model, drop)
         scores, bp_scores = self.model.begin_update(docs)
-        target = self._rehearsal_model(examples)
+        target, _ = self._rehearsal_model.begin_update(docs)
         gradient = scores - target
         bp_scores(gradient)
         if sgd is not None:
             self.finish_update(sgd)
-        losses[self.name] += (gradient ** 2).sum()
+        losses[self.name] += (gradient**2).sum()
         return losses

     def _examples_to_truth(

@@ -294,7 +301,7 @@ class TextCategorizer(TrainablePipe):
             for j, label in enumerate(self.labels):
                 if label in eg.reference.cats:
                     truths[i, j] = eg.reference.cats[label]
-                else:
+                elif self.support_missing_values:
                     not_missing[i, j] = 0.0
         truths = self.model.ops.asarray(truths)  # type: ignore
         return truths, not_missing  # type: ignore

@@ -313,9 +320,9 @@ class TextCategorizer(TrainablePipe):
         self._validate_categories(examples)
         truths, not_missing = self._examples_to_truth(examples)
         not_missing = self.model.ops.asarray(not_missing)  # type: ignore
-        d_scores = (scores - truths) / scores.shape[0]
+        d_scores = scores - truths
         d_scores *= not_missing
-        mean_square_error = (d_scores ** 2).sum(axis=1).mean()
+        mean_square_error = (d_scores**2).mean()
        return float(mean_square_error), d_scores

     def add_label(self, label: str) -> int:
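A small illustration of the annotation convention the new support_missing_values property encodes; the category names here are made up.

# Sketch: for the exclusive-label textcat (support_missing_values = False),
# a label absent from the gold cats dict is scored as 0.0; for the
# multilabel subclass (support_missing_values = True) it stays missing.
gold_cats_exclusive = {"POSITIVE": 1.0}   # "NEGATIVE" treated as 0.0
gold_cats_multilabel = {"SPORTS": 1.0}    # other labels remain unannotated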
@@ -1,8 +1,8 @@
-from itertools import islice
 from typing import Iterable, Optional, Dict, List, Callable, Any
-from thinc.api import Model, Config
 from thinc.types import Floats2d
+from thinc.api import Model, Config
+
+from itertools import islice

 from ..language import Language
 from ..training import Example, validate_get_examples

@@ -158,6 +158,10 @@ class MultiLabel_TextCategorizer(TextCategorizer):
         self.cfg = dict(cfg)
         self.scorer = scorer

+    @property
+    def support_missing_values(self):
+        return True
+
     def initialize(  # type: ignore[override]
         self,
         get_examples: Callable[[], Iterable[Example]],
@@ -118,6 +118,10 @@ class Tok2Vec(TrainablePipe):

        DOCS: https://spacy.io/api/tok2vec#predict
        """
+        if not any(len(doc) for doc in docs):
+            # Handle cases where there are no tokens in any docs.
+            width = self.model.get_dim("nO")
+            return [self.model.ops.alloc((0, width)) for doc in docs]
        tokvecs = self.model.predict(docs)
        batch_id = Tok2VecListener.get_batch_id(docs)
        for listener in self.listeners:
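The added guard above short-circuits prediction when every Doc in the batch is empty, returning one zero-row array of the model's output width per doc instead of calling the model. A rough NumPy-only sketch of the same idea, with a made-up width and empty stand-in docs:

# Sketch of the empty-batch guard, with NumPy standing in for the model ops.
import numpy as np

docs = ["", ""]          # stand-ins for Doc objects with no tokens
width = 96               # stand-in for self.model.get_dim("nO")
if not any(len(doc) for doc in docs):
    # one (0, width) array per doc, no model call
    tokvecs = [np.zeros((0, width), dtype="f") for _ in docs]
    print([v.shape for v in tokvecs])  # [(0, 96), (0, 96)]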
@@ -228,7 +228,7 @@ class Scorer:
            if token.orth_.isspace():
                continue
            if align.x2y.lengths[token.i] == 1:
-                gold_i = align.x2y[token.i].dataXd[0, 0]
+                gold_i = align.x2y[token.i][0]
                if gold_i not in missing_indices:
                    pred_tags.add((gold_i, getter(token, attr)))
        tag_score.score_set(pred_tags, gold_tags)

@@ -287,7 +287,7 @@ class Scorer:
            if token.orth_.isspace():
                continue
            if align.x2y.lengths[token.i] == 1:
-                gold_i = align.x2y[token.i].dataXd[0, 0]
+                gold_i = align.x2y[token.i][0]
                if gold_i not in missing_indices:
                    value = getter(token, attr)
                    morph = gold_doc.vocab.strings[value]
@@ -553,7 +553,8 @@ class Scorer:
            getter(doc, attr) should return the values for the individual doc.
        labels (Iterable[str]): The set of possible labels. Defaults to [].
        multi_label (bool): Whether the attribute allows multiple labels.
-            Defaults to True.
+            Defaults to True. When set to False (exclusive labels), missing
+            gold labels are interpreted as 0.0.
        positive_label (str): The positive label for a binary task with
            exclusive classes. Defaults to None.
        threshold (float): Cutoff to consider a prediction "positive". Defaults
@@ -592,13 +593,15 @@ class Scorer:

            for label in labels:
                pred_score = pred_cats.get(label, 0.0)
-                gold_score = gold_cats.get(label, 0.0)
+                gold_score = gold_cats.get(label)
+                if not gold_score and not multi_label:
+                    gold_score = 0.0
                if gold_score is not None:
                    auc_per_type[label].score_set(pred_score, gold_score)
            if multi_label:
                for label in labels:
                    pred_score = pred_cats.get(label, 0.0)
-                    gold_score = gold_cats.get(label, 0.0)
+                    gold_score = gold_cats.get(label)
                    if gold_score is not None:
                        if pred_score >= threshold and gold_score > 0:
                            f_per_type[label].tp += 1
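The change above makes gold_cats.get(label) return None for unannotated labels so they can be skipped, except in the exclusive-label case, where a missing gold label now falls back to 0.0. A plain-Python sketch of that rule with illustrative category dicts:

# Sketch of the missing-gold-label rule from the hunk above (values are assumptions).
pred_cats = {"POS": 0.8, "NEG": 0.1}
gold_cats = {"POS": 1.0}             # "NEG" is not annotated
multi_label = False

for label in ["POS", "NEG"]:
    pred_score = pred_cats.get(label, 0.0)
    gold_score = gold_cats.get(label)
    if not gold_score and not multi_label:
        gold_score = 0.0             # exclusive labels: missing counts as negative
    if gold_score is not None:
        print(label, pred_score, gold_score)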
@@ -610,16 +613,15 @@ class Scorer:
                # Get the highest-scoring for each.
                pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
                gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
-                if gold_score is not None:
-                    if pred_label == gold_label and pred_score >= threshold:
-                        f_per_type[pred_label].tp += 1
-                    else:
-                        f_per_type[gold_label].fn += 1
-                        if pred_score >= threshold:
-                            f_per_type[pred_label].fp += 1
+                if pred_label == gold_label and pred_score >= threshold:
+                    f_per_type[pred_label].tp += 1
+                else:
+                    f_per_type[gold_label].fn += 1
+                    if pred_score >= threshold:
+                        f_per_type[pred_label].fp += 1
            elif gold_cats:
                gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
-                if gold_score is not None and gold_score > 0:
+                if gold_score > 0:
                    f_per_type[gold_label].fn += 1
            elif pred_cats:
                pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
|
@ -800,13 +802,13 @@ class Scorer:
|
||||||
if align.x2y.lengths[token.i] != 1:
|
if align.x2y.lengths[token.i] != 1:
|
||||||
gold_i = None # type: ignore
|
gold_i = None # type: ignore
|
||||||
else:
|
else:
|
||||||
gold_i = align.x2y[token.i].dataXd[0, 0]
|
gold_i = align.x2y[token.i][0]
|
||||||
if gold_i not in missing_indices:
|
if gold_i not in missing_indices:
|
||||||
dep = getter(token, attr)
|
dep = getter(token, attr)
|
||||||
head = head_getter(token, head_attr)
|
head = head_getter(token, head_attr)
|
||||||
if dep not in ignore_labels and token.orth_.strip():
|
if dep not in ignore_labels and token.orth_.strip():
|
||||||
if align.x2y.lengths[head.i] == 1:
|
if align.x2y.lengths[head.i] == 1:
|
||||||
gold_head = align.x2y[head.i].dataXd[0, 0]
|
gold_head = align.x2y[head.i][0]
|
||||||
else:
|
else:
|
||||||
gold_head = None
|
gold_head = None
|
||||||
# None is indistinct, so we can't just add it to the set
|
# None is indistinct, so we can't just add it to the set
|
||||||
|
@@ -856,7 +858,7 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
        for pred_ent in eg.x.ents:
            if pred_ent.label_ not in score_per_type:
                score_per_type[pred_ent.label_] = PRFScore()
-            indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel()
+            indices = align_x2y[pred_ent.start : pred_ent.end]
            if len(indices):
                g_span = eg.y[indices[0] : indices[-1] + 1]
                # Check we aren't missing annotation on this span. If so,
@@ -99,6 +99,11 @@ def de_vocab():
     return get_lang_class("de")().vocab


+@pytest.fixture(scope="session")
+def dsb_tokenizer():
+    return get_lang_class("dsb")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def el_tokenizer():
     return get_lang_class("el")().tokenizer

@@ -155,6 +160,11 @@ def fr_tokenizer():
     return get_lang_class("fr")().tokenizer


+@pytest.fixture(scope="session")
+def fr_vocab():
+    return get_lang_class("fr")().vocab
+
+
 @pytest.fixture(scope="session")
 def ga_tokenizer():
     return get_lang_class("ga")().tokenizer

@@ -205,18 +215,41 @@ def it_tokenizer():
     return get_lang_class("it")().tokenizer


+@pytest.fixture(scope="session")
+def it_vocab():
+    return get_lang_class("it")().vocab
+
+
 @pytest.fixture(scope="session")
 def ja_tokenizer():
     pytest.importorskip("sudachipy")
     return get_lang_class("ja")().tokenizer


+@pytest.fixture(scope="session")
+def hsb_tokenizer():
+    return get_lang_class("hsb")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def ko_tokenizer():
     pytest.importorskip("natto")
     return get_lang_class("ko")().tokenizer


+@pytest.fixture(scope="session")
+def ko_tokenizer_tokenizer():
+    config = {
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.Tokenizer.v1",
+            }
+        }
+    }
+    nlp = get_lang_class("ko").from_config(config)
+    return nlp.tokenizer
+
+
 @pytest.fixture(scope="session")
 def lb_tokenizer():
     return get_lang_class("lb")().tokenizer

@@ -324,6 +357,11 @@ def sv_tokenizer():
     return get_lang_class("sv")().tokenizer


+@pytest.fixture(scope="session")
+def ta_tokenizer():
+    return get_lang_class("ta")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def th_tokenizer():
     pytest.importorskip("pythainlp")
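The new ko_tokenizer_tokenizer fixture above swaps the default Korean tokenizer for the rule-based spacy.Tokenizer.v1 through the nlp config. The same override can be sketched outside the test suite roughly as follows; it assumes spaCy with its "ko" language data is installed, and the sample sentence is illustrative:

# Sketch of the config-based tokenizer override used by the fixture above.
from spacy.util import get_lang_class

config = {
    "nlp": {
        "tokenizer": {
            "@tokenizers": "spacy.Tokenizer.v1",
        }
    }
}
nlp = get_lang_class("ko").from_config(config)  # no MeCab/natto needed with this tokenizer
doc = nlp.tokenizer("안녕하세요 세계")
print([t.text for t in doc])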
@@ -684,6 +684,7 @@ def test_has_annotation(en_vocab):
     attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
     for attr in attrs:
         assert not doc.has_annotation(attr)
+        assert not doc.has_annotation(attr, require_complete=True)

     doc[0].tag_ = "A"
     doc[0].pos_ = "X"

@@ -709,6 +710,27 @@ def test_has_annotation(en_vocab):
         assert doc.has_annotation(attr, require_complete=True)


+def test_has_annotation_sents(en_vocab):
+    doc = Doc(en_vocab, words=["Hello", "beautiful", "world"])
+    attrs = ("SENT_START", "IS_SENT_START", "IS_SENT_END")
+    for attr in attrs:
+        assert not doc.has_annotation(attr)
+        assert not doc.has_annotation(attr, require_complete=True)
+
+    # The first token (index 0) is always assumed to be a sentence start,
+    # and ignored by the check in doc.has_annotation
+
+    doc[1].is_sent_start = False
+    for attr in attrs:
+        assert doc.has_annotation(attr)
+        assert not doc.has_annotation(attr, require_complete=True)
+
+    doc[2].is_sent_start = False
+    for attr in attrs:
+        assert doc.has_annotation(attr)
+        assert doc.has_annotation(attr, require_complete=True)
+
+
 def test_is_flags_deprecated(en_tokenizer):
     doc = en_tokenizer("test")
     with pytest.deprecated_call():
@@ -573,6 +573,55 @@ def test_span_with_vectors(doc):
     doc.vocab.vectors = prev_vectors


+# fmt: off
+def test_span_comparison(doc):
+
+    # Identical start, end, only differ in label and kb_id
+    assert Span(doc, 0, 3) == Span(doc, 0, 3)
+    assert Span(doc, 0, 3, "LABEL") == Span(doc, 0, 3, "LABEL")
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") == Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+
+    assert Span(doc, 0, 3) != Span(doc, 0, 3, "LABEL")
+    assert Span(doc, 0, 3) != Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+    assert Span(doc, 0, 3, "LABEL") != Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+
+    assert Span(doc, 0, 3) <= Span(doc, 0, 3) and Span(doc, 0, 3) >= Span(doc, 0, 3)
+    assert Span(doc, 0, 3, "LABEL") <= Span(doc, 0, 3, "LABEL") and Span(doc, 0, 3, "LABEL") >= Span(doc, 0, 3, "LABEL")
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+
+    assert (Span(doc, 0, 3) < Span(doc, 0, 3, "", kb_id="KB_ID") < Span(doc, 0, 3, "LABEL") < Span(doc, 0, 3, "LABEL", kb_id="KB_ID"))
+    assert (Span(doc, 0, 3) <= Span(doc, 0, 3, "", kb_id="KB_ID") <= Span(doc, 0, 3, "LABEL") <= Span(doc, 0, 3, "LABEL", kb_id="KB_ID"))
+
+    assert (Span(doc, 0, 3, "LABEL", kb_id="KB_ID") > Span(doc, 0, 3, "LABEL") > Span(doc, 0, 3, "", kb_id="KB_ID") > Span(doc, 0, 3))
+    assert (Span(doc, 0, 3, "LABEL", kb_id="KB_ID") >= Span(doc, 0, 3, "LABEL") >= Span(doc, 0, 3, "", kb_id="KB_ID") >= Span(doc, 0, 3))
+
+    # Different end
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 0, 4, "LABEL", kb_id="KB_ID")
+
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 0, 4)
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 0, 4)
+    assert Span(doc, 0, 4) > Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+    assert Span(doc, 0, 4) >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+
+    # Different start
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") != Span(doc, 1, 3, "LABEL", kb_id="KB_ID")
+
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 1, 3)
+    assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 1, 3)
+    assert Span(doc, 1, 3) > Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+    assert Span(doc, 1, 3) >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
+
+    # Different start & different end
+    assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") != Span(doc, 1, 3, "LABEL", kb_id="KB_ID")
+
+    assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") < Span(doc, 1, 3)
+    assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") <= Span(doc, 1, 3)
+    assert Span(doc, 1, 3) > Span(doc, 0, 4, "LABEL", kb_id="KB_ID")
+    assert Span(doc, 1, 3) >= Span(doc, 0, 4, "LABEL", kb_id="KB_ID")
+# fmt: on
+
+
 @pytest.mark.parametrize(
     "start,end,expected_sentences,expected_sentences_with_hook",
     [

@@ -606,3 +655,16 @@ def test_span_sents(doc, start, end, expected_sentences, expected_sentences_with
 def test_span_sents_not_parsed(doc_not_parsed):
     with pytest.raises(ValueError):
         list(Span(doc_not_parsed, 0, 3).sents)
+
+
+def test_span_group_copy(doc):
+    doc.spans["test"] = [doc[0:1], doc[2:4]]
+    assert len(doc.spans["test"]) == 2
+    doc_copy = doc.copy()
+    # check that the spans were indeed copied
+    assert len(doc_copy.spans["test"]) == 2
+    # add a new span to the original doc
+    doc.spans["test"].append(doc[3:4])
+    assert len(doc.spans["test"]) == 3
+    # check that the copy spans were not modified and this is an isolated doc
+    assert len(doc_copy.spans["test"]) == 2

spacy/tests/doc/test_span_group.py (new file, 242 lines)
@@ -0,0 +1,242 @@
import pytest
from random import Random
from spacy.matcher import Matcher
from spacy.tokens import Span, SpanGroup


@pytest.fixture
def doc(en_tokenizer):
    doc = en_tokenizer("0 1 2 3 4 5 6")
    matcher = Matcher(en_tokenizer.vocab, validate=True)

    # fmt: off
    matcher.add("4", [[{}, {}, {}, {}]])
    matcher.add("2", [[{}, {}, ]])
    matcher.add("1", [[{}, ]])
    # fmt: on
    matches = matcher(doc)
    spans = []
    for match in matches:
        spans.append(
            Span(doc, match[1], match[2], en_tokenizer.vocab.strings[match[0]])
        )
    Random(42).shuffle(spans)
    doc.spans["SPANS"] = SpanGroup(
        doc, name="SPANS", attrs={"key": "value"}, spans=spans
    )
    return doc


@pytest.fixture
def other_doc(en_tokenizer):
    doc = en_tokenizer("0 1 2 3 4 5 6")
    matcher = Matcher(en_tokenizer.vocab, validate=True)

    # fmt: off
    matcher.add("4", [[{}, {}, {}, {}]])
    matcher.add("2", [[{}, {}, ]])
    matcher.add("1", [[{}, ]])
    # fmt: on

    matches = matcher(doc)
    spans = []
    for match in matches:
        spans.append(
            Span(doc, match[1], match[2], en_tokenizer.vocab.strings[match[0]])
        )
    Random(42).shuffle(spans)
    doc.spans["SPANS"] = SpanGroup(
        doc, name="SPANS", attrs={"key": "value"}, spans=spans
    )
    return doc


@pytest.fixture
def span_group(en_tokenizer):
    doc = en_tokenizer("0 1 2 3 4 5 6")
    matcher = Matcher(en_tokenizer.vocab, validate=True)

    # fmt: off
    matcher.add("4", [[{}, {}, {}, {}]])
    matcher.add("2", [[{}, {}, ]])
    matcher.add("1", [[{}, ]])
    # fmt: on

    matches = matcher(doc)
    spans = []
    for match in matches:
        spans.append(
            Span(doc, match[1], match[2], en_tokenizer.vocab.strings[match[0]])
        )
    Random(42).shuffle(spans)
    doc.spans["SPANS"] = SpanGroup(
        doc, name="SPANS", attrs={"key": "value"}, spans=spans
    )


def test_span_group_copy(doc):
    span_group = doc.spans["SPANS"]
    clone = span_group.copy()
    assert clone != span_group
    assert clone.name == span_group.name
    assert clone.attrs == span_group.attrs
    assert len(clone) == len(span_group)
    assert list(span_group) == list(clone)
    clone.name = "new_name"
    clone.attrs["key"] = "new_value"
    clone.append(Span(doc, 0, 6, "LABEL"))
    assert clone.name != span_group.name
    assert clone.attrs != span_group.attrs
    assert span_group.attrs["key"] == "value"
    assert list(span_group) != list(clone)


def test_span_group_set_item(doc, other_doc):
    span_group = doc.spans["SPANS"]

    index = 5
    span = span_group[index]
    span.label_ = "NEW LABEL"
    span.kb_id = doc.vocab.strings["KB_ID"]

    assert span_group[index].label != span.label
    assert span_group[index].kb_id != span.kb_id

    span_group[index] = span
    assert span_group[index].start == span.start
    assert span_group[index].end == span.end
    assert span_group[index].label == span.label
    assert span_group[index].kb_id == span.kb_id
    assert span_group[index] == span

    with pytest.raises(IndexError):
        span_group[-100] = span
    with pytest.raises(IndexError):
        span_group[100] = span

    span = Span(other_doc, 0, 2)
    with pytest.raises(ValueError):
        span_group[index] = span


def test_span_group_has_overlap(doc):
    span_group = doc.spans["SPANS"]
    assert span_group.has_overlap


def test_span_group_concat(doc, other_doc):
    span_group_1 = doc.spans["SPANS"]
    spans = [doc[0:5], doc[0:6]]
    span_group_2 = SpanGroup(
        doc,
        name="MORE_SPANS",
        attrs={"key": "new_value", "new_key": "new_value"},
        spans=spans,
    )
    span_group_3 = span_group_1._concat(span_group_2)
    assert span_group_3.name == span_group_1.name
    assert span_group_3.attrs == {"key": "value", "new_key": "new_value"}
    span_list_expected = list(span_group_1) + list(span_group_2)
    assert list(span_group_3) == list(span_list_expected)

    # Inplace
    span_list_expected = list(span_group_1) + list(span_group_2)
    span_group_3 = span_group_1._concat(span_group_2, inplace=True)
    assert span_group_3 == span_group_1
    assert span_group_3.name == span_group_1.name
    assert span_group_3.attrs == {"key": "value", "new_key": "new_value"}
    assert list(span_group_3) == list(span_list_expected)

    span_group_2 = other_doc.spans["SPANS"]
    with pytest.raises(ValueError):
        span_group_1._concat(span_group_2)


def test_span_doc_delitem(doc):
    span_group = doc.spans["SPANS"]
    length = len(span_group)
    index = 5
    span = span_group[index]
    next_span = span_group[index + 1]
    del span_group[index]
    assert len(span_group) == length - 1
    assert span_group[index] != span
    assert span_group[index] == next_span

    with pytest.raises(IndexError):
        del span_group[-100]
    with pytest.raises(IndexError):
        del span_group[100]


def test_span_group_add(doc):
    span_group_1 = doc.spans["SPANS"]
    spans = [doc[0:5], doc[0:6]]
    span_group_2 = SpanGroup(
        doc,
        name="MORE_SPANS",
        attrs={"key": "new_value", "new_key": "new_value"},
        spans=spans,
    )

    span_group_3_expected = span_group_1._concat(span_group_2)

    span_group_3 = span_group_1 + span_group_2
    assert len(span_group_3) == len(span_group_3_expected)
    assert span_group_3.attrs == {"key": "value", "new_key": "new_value"}
    assert list(span_group_3) == list(span_group_3_expected)


def test_span_group_iadd(doc):
    span_group_1 = doc.spans["SPANS"].copy()
    spans = [doc[0:5], doc[0:6]]
    span_group_2 = SpanGroup(
        doc,
        name="MORE_SPANS",
        attrs={"key": "new_value", "new_key": "new_value"},
        spans=spans,
    )

    span_group_1_expected = span_group_1._concat(span_group_2)

    span_group_1 += span_group_2
    assert len(span_group_1) == len(span_group_1_expected)
    assert span_group_1.attrs == {"key": "value", "new_key": "new_value"}
    assert list(span_group_1) == list(span_group_1_expected)

    span_group_1 = doc.spans["SPANS"].copy()
    span_group_1 += spans
    assert len(span_group_1) == len(span_group_1_expected)
    assert span_group_1.attrs == {
        "key": "value",
    }
    assert list(span_group_1) == list(span_group_1_expected)


def test_span_group_extend(doc):
    span_group_1 = doc.spans["SPANS"].copy()
    spans = [doc[0:5], doc[0:6]]
    span_group_2 = SpanGroup(
        doc,
        name="MORE_SPANS",
        attrs={"key": "new_value", "new_key": "new_value"},
        spans=spans,
    )

    span_group_1_expected = span_group_1._concat(span_group_2)

    span_group_1.extend(span_group_2)
    assert len(span_group_1) == len(span_group_1_expected)
    assert span_group_1.attrs == {"key": "value", "new_key": "new_value"}
    assert list(span_group_1) == list(span_group_1_expected)

    span_group_1 = doc.spans["SPANS"]
    span_group_1.extend(spans)
    assert len(span_group_1) == len(span_group_1_expected)
    assert span_group_1.attrs == {"key": "value"}
    assert list(span_group_1) == list(span_group_1_expected)


def test_span_group_dealloc(span_group):
    with pytest.raises(AttributeError):
        print(span_group.doc)
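The new test_span_group.py above exercises SpanGroup copying, concatenation and extension. A minimal usage sketch follows; it assumes a spaCy build that already ships the SpanGroup.copy/extend behaviour these tests cover, and the blank English pipeline and example spans are illustrative:

# Quick SpanGroup usage sketch (assumes the behaviour tested above is available).
import spacy
from spacy.tokens import SpanGroup

nlp = spacy.blank("en")
doc = nlp("0 1 2 3 4 5 6")
group = SpanGroup(doc, name="SPANS", attrs={"key": "value"}, spans=[doc[0:2], doc[2:4]])
clone = group.copy()        # independent copy: same name, attrs and spans
group.extend([doc[4:6]])    # extending the original leaves the clone untouched
print(len(group), len(clone))  # 3 2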
@@ -1,5 +1,5 @@
 import pytest
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Span


 @pytest.fixture()

@@ -60,3 +60,13 @@ def test_doc_to_json_underscore_error_serialize(doc):
     Doc.set_extension("json_test4", method=lambda doc: doc.text)
     with pytest.raises(ValueError):
         doc.to_json(underscore=["json_test4"])
+
+
+def test_doc_to_json_span(doc):
+    """Test that Doc.to_json() includes spans"""
+    doc.spans["test"] = [Span(doc, 0, 2, "test"), Span(doc, 0, 1, "test")]
+    json_doc = doc.to_json()
+    assert "spans" in json_doc
+    assert len(json_doc["spans"]) == 1
+    assert len(json_doc["spans"]["test"]) == 2
+    assert json_doc["spans"]["test"][0]["start"] == 0

spacy/tests/lang/dsb/__init__.py (new, empty file)

spacy/tests/lang/dsb/test_text.py (new file, 25 lines)
@@ -0,0 +1,25 @@
import pytest


@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("1", True),
        ("10,000", True),
        ("10,00", True),
        ("jadno", True),
        ("dwanassćo", True),
        ("milion", True),
        ("sto", True),
        ("ceła", False),
        ("kopica", False),
        ("narěcow", False),
        (",", False),
        ("1/2", True),
    ],
)
def test_lex_attrs_like_number(dsb_tokenizer, text, match):
    tokens = dsb_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].like_num == match

spacy/tests/lang/dsb/test_tokenizer.py (new file, 29 lines)
@@ -0,0 +1,29 @@
import pytest

DSB_BASIC_TOKENIZATION_TESTS = [
    (
        "Ale eksistěrujo mimo togo ceła kopica narěcow, ako na pśikład slěpjańska.",
        [
            "Ale",
            "eksistěrujo",
            "mimo",
            "togo",
            "ceła",
            "kopica",
            "narěcow",
            ",",
            "ako",
            "na",
            "pśikład",
            "slěpjańska",
            ".",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", DSB_BASIC_TOKENIZATION_TESTS)
def test_dsb_tokenizer_basic(dsb_tokenizer, text, expected_tokens):
    tokens = dsb_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
spacy/tests/lang/fi/test_noun_chunks.py (new file, 189 lines)
@@ -0,0 +1,189 @@
import pytest
from spacy.tokens import Doc


FI_NP_TEST_EXAMPLES = [
|
(
|
||||||
|
"Kaksi tyttöä potkii punaista palloa",
|
||||||
|
["NUM", "NOUN", "VERB", "ADJ", "NOUN"],
|
||||||
|
["nummod", "nsubj", "ROOT", "amod", "obj"],
|
||||||
|
[1, 1, 0, 1, -2],
|
||||||
|
["Kaksi tyttöä", "punaista palloa"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Erittäin vaarallinen leijona karkasi kiertävän sirkuksen eläintenkesyttäjältä",
|
||||||
|
["ADV", "ADJ", "NOUN", "VERB", "ADJ", "NOUN", "NOUN"],
|
||||||
|
["advmod", "amod", "nsubj", "ROOT", "amod", "nmod:poss", "obl"],
|
||||||
|
[1, 1, 1, 0, 1, 1, -3],
|
||||||
|
["Erittäin vaarallinen leijona", "kiertävän sirkuksen eläintenkesyttäjältä"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Leijona raidallisine tassuineen piileksii Porin kaupungin lähellä",
|
||||||
|
["NOUN", "ADJ", "NOUN", "VERB", "PROPN", "NOUN", "ADP"],
|
||||||
|
["nsubj", "amod", "nmod", "ROOT", "nmod:poss", "obl", "case"],
|
||||||
|
[3, 1, -2, 0, 1, -2, -1],
|
||||||
|
["Leijona raidallisine tassuineen", "Porin kaupungin"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Lounaalla nautittiin salaattia, maukasta kanaa ja raikasta vettä",
|
||||||
|
["NOUN", "VERB", "NOUN", "PUNCT", "ADJ", "NOUN", "CCONJ", "ADJ", "NOUN"],
|
||||||
|
["obl", "ROOT", "obj", "punct", "amod", "conj", "cc", "amod", "conj"],
|
||||||
|
[1, 0, -1, 2, 1, -3, 2, 1, -6],
|
||||||
|
["Lounaalla", "salaattia", "maukasta kanaa", "raikasta vettä"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Minua houkuttaa maalle muuttaminen talven jälkeen",
|
||||||
|
["PRON", "VERB", "NOUN", "NOUN", "NOUN", "ADP"],
|
||||||
|
["obj", "ROOT", "nmod", "nsubj", "obl", "case"],
|
||||||
|
[1, 0, 1, -2, -3, -1],
|
||||||
|
["maalle muuttaminen", "talven"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Päivän kohokohta oli vierailu museossa kummilasten kanssa",
|
||||||
|
["NOUN", "NOUN", "AUX", "NOUN", "NOUN", "NOUN", "ADP"],
|
||||||
|
["nmod:poss", "nsubj:cop", "cop", "ROOT", "nmod", "obl", "case"],
|
||||||
|
[1, 2, 1, 0, -1, -2, -1],
|
||||||
|
["Päivän kohokohta", "vierailu museossa", "kummilasten"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Yrittäjät maksoivat tuomioistuimen määräämät korvaukset",
|
||||||
|
["NOUN", "VERB", "NOUN", "VERB", "NOUN"],
|
||||||
|
["nsubj", "ROOT", "nsubj", "acl", "obj"],
|
||||||
|
[1, 0, 1, 1, -3],
|
||||||
|
["Yrittäjät", "tuomioistuimen", "korvaukset"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Julkisoikeudelliset tai niihin rinnastettavat saatavat ovat suoraan ulosottokelpoisia",
|
||||||
|
["ADJ", "CCONJ", "PRON", "VERB", "NOUN", "AUX", "ADV", "NOUN"],
|
||||||
|
["amod", "cc", "obl", "acl", "nsubj:cop", "cop", "advmod", "ROOT"],
|
||||||
|
[4, 3, 1, 1, 3, 2, 1, 0],
|
||||||
|
["Julkisoikeudelliset tai niihin rinnastettavat saatavat", "ulosottokelpoisia"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Se oli ala-arvoista käytöstä kaikilta oppilailta, myös valvojaoppilailta",
|
||||||
|
["PRON", "AUX", "ADJ", "NOUN", "PRON", "NOUN", "PUNCT", "ADV", "NOUN"],
|
||||||
|
["nsubj:cop", "cop", "amod", "ROOT", "det", "nmod", "punct", "advmod", "appos"],
|
||||||
|
[3, 2, 1, 0, 1, -2, 2, 1, -3],
|
||||||
|
["ala-arvoista käytöstä kaikilta oppilailta", "valvojaoppilailta"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Isä souti veneellä, jonka hän oli vuokrannut",
|
||||||
|
["NOUN", "VERB", "NOUN", "PUNCT", "PRON", "PRON", "AUX", "VERB"],
|
||||||
|
["nsubj", "ROOT", "obl", "punct", "obj", "nsubj", "aux", "acl:relcl"],
|
||||||
|
[1, 0, -1, 4, 3, 2, 1, -5],
|
||||||
|
["Isä", "veneellä"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Kirja, jonka poimin hyllystä, kertoo norsuista",
|
||||||
|
["NOUN", "PUNCT", "PRON", "VERB", "NOUN", "PUNCT", "VERB", "NOUN"],
|
||||||
|
["nsubj", "punct", "obj", "acl:relcl", "obl", "punct", "ROOT", "obl"],
|
||||||
|
[6, 2, 1, -3, -1, 1, 0, -1],
|
||||||
|
["Kirja", "hyllystä", "norsuista"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Huomenna on päivä, jota olemme odottaneet",
|
||||||
|
["NOUN", "AUX", "NOUN", "PUNCT", "PRON", "AUX", "VERB"],
|
||||||
|
["ROOT", "cop", "nsubj:cop", "punct", "obj", "aux", "acl:relcl"],
|
||||||
|
[0, -1, -2, 3, 2, 1, -4],
|
||||||
|
["Huomenna", "päivä"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Liikkuvuuden lisääminen on yksi korkeakoulutuksen keskeisistä kehittämiskohteista",
|
||||||
|
["NOUN", "NOUN", "AUX", "PRON", "NOUN", "ADJ", "NOUN"],
|
||||||
|
["nmod:gobj", "nsubj:cop", "cop", "ROOT", "nmod:poss", "amod", "nmod"],
|
||||||
|
[1, 2, 1, 0, 2, 1, -3],
|
||||||
|
[
|
||||||
|
"Liikkuvuuden lisääminen",
|
||||||
|
"korkeakoulutuksen keskeisistä kehittämiskohteista",
|
||||||
|
],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Kaupalliset palvelut jätetään yksityisten palveluntarjoajien tarjottavaksi",
|
||||||
|
["ADJ", "NOUN", "VERB", "ADJ", "NOUN", "NOUN"],
|
||||||
|
["amod", "obj", "ROOT", "amod", "nmod:gsubj", "obl"],
|
||||||
|
[1, 1, 0, 1, 1, -3],
|
||||||
|
["Kaupalliset palvelut", "yksityisten palveluntarjoajien tarjottavaksi"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"New York tunnetaan kaupunkina, joka ei koskaan nuku",
|
||||||
|
["PROPN", "PROPN", "VERB", "NOUN", "PUNCT", "PRON", "AUX", "ADV", "VERB"],
|
||||||
|
[
|
||||||
|
"obj",
|
||||||
|
"flat:name",
|
||||||
|
"ROOT",
|
||||||
|
"obl",
|
||||||
|
"punct",
|
||||||
|
"nsubj",
|
||||||
|
"aux",
|
||||||
|
"advmod",
|
||||||
|
"acl:relcl",
|
||||||
|
],
|
||||||
|
[2, -1, 0, -1, 4, 3, 2, 1, -5],
|
||||||
|
["New York", "kaupunkina"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Loput vihjeet saat herra Möttöseltä",
|
||||||
|
["NOUN", "NOUN", "VERB", "NOUN", "PROPN"],
|
||||||
|
["compound:nn", "obj", "ROOT", "compound:nn", "obj"],
|
||||||
|
[1, 1, 0, 1, -2],
|
||||||
|
["Loput vihjeet", "herra Möttöseltä"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"mahdollisuus tukea muita päivystysyksiköitä",
|
||||||
|
["NOUN", "VERB", "PRON", "NOUN"],
|
||||||
|
["ROOT", "acl", "det", "obj"],
|
||||||
|
[0, -1, 1, -2],
|
||||||
|
["mahdollisuus", "päivystysyksiköitä"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"sairaanhoitopiirit harjoittavat leikkaustoimintaa alueellaan useammassa sairaalassa",
|
||||||
|
["NOUN", "VERB", "NOUN", "NOUN", "ADJ", "NOUN"],
|
||||||
|
["nsubj", "ROOT", "obj", "obl", "amod", "obl"],
|
||||||
|
[1, 0, -1, -1, 1, -3],
|
||||||
|
[
|
||||||
|
"sairaanhoitopiirit",
|
||||||
|
"leikkaustoimintaa",
|
||||||
|
"alueellaan",
|
||||||
|
"useammassa sairaalassa",
|
||||||
|
],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Lain mukaan varhaiskasvatus on suunnitelmallista toimintaa",
|
||||||
|
["NOUN", "ADP", "NOUN", "AUX", "ADJ", "NOUN"],
|
||||||
|
["obl", "case", "nsubj:cop", "cop", "amod", "ROOT"],
|
||||||
|
[5, -1, 3, 2, 1, 0],
|
||||||
|
["Lain", "varhaiskasvatus", "suunnitelmallista toimintaa"],
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_noun_chunks_is_parsed(fi_tokenizer):
    """Test that noun_chunks raises Value Error for 'fi' language if Doc is not parsed.
    To check this test, we're constructing a Doc
    with a new Vocab here and forcing is_parsed to 'False'
    to make sure the noun chunks don't run.
    """
    doc = fi_tokenizer("Tämä on testi")
    with pytest.raises(ValueError):
        list(doc.noun_chunks)


@pytest.mark.parametrize(
    "text,pos,deps,heads,expected_noun_chunks", FI_NP_TEST_EXAMPLES
)
def test_fi_noun_chunks(fi_tokenizer, text, pos, deps, heads, expected_noun_chunks):
    tokens = fi_tokenizer(text)

    assert len(heads) == len(pos)
    doc = Doc(
        tokens.vocab,
        words=[t.text for t in tokens],
        heads=[head + i for i, head in enumerate(heads)],
        deps=deps,
        pos=pos,
    )

    noun_chunks = list(doc.noun_chunks)
    assert len(noun_chunks) == len(expected_noun_chunks)
    for i, np in enumerate(noun_chunks):
        assert np.text == expected_noun_chunks[i]
@@ -1,8 +1,230 @@
+from spacy.tokens import Doc
 import pytest


+# fmt: off
+@pytest.mark.parametrize(
+    "words,heads,deps,pos,chunk_offsets",
+    [
|
# determiner + noun
|
||||||
|
# un nom -> un nom
|
||||||
|
(
|
||||||
|
["un", "nom"],
|
||||||
|
[1, 1],
|
||||||
|
["det", "ROOT"],
|
||||||
|
["DET", "NOUN"],
|
||||||
|
[(0, 2)],
|
||||||
|
),
|
||||||
|
# determiner + noun starting with vowel
|
||||||
|
# l'heure -> l'heure
|
||||||
|
(
|
||||||
|
["l'", "heure"],
|
||||||
|
[1, 1],
|
||||||
|
["det", "ROOT"],
|
||||||
|
["DET", "NOUN"],
|
||||||
|
[(0, 2)],
|
||||||
|
),
|
||||||
|
# determiner + plural noun
|
||||||
|
# les romans -> les romans
|
||||||
|
(
|
||||||
|
["les", "romans"],
|
||||||
|
[1, 1],
|
||||||
|
["det", "ROOT"],
|
||||||
|
["DET", "NOUN"],
|
||||||
|
[(0, 2)],
|
||||||
|
),
|
||||||
|
# det + adj + noun
|
||||||
|
# Le vieux Londres -> Le vieux Londres
|
||||||
|
(
|
||||||
|
['Les', 'vieux', 'Londres'],
|
||||||
|
[2, 2, 2],
|
||||||
|
["det", "amod", "ROOT"],
|
||||||
|
["DET", "ADJ", "NOUN"],
|
||||||
|
[(0,3)]
|
||||||
|
),
|
||||||
|
# det + noun + adj
|
||||||
|
# le nom propre -> le nom propre a proper noun
|
||||||
|
(
|
||||||
|
["le", "nom", "propre"],
|
||||||
|
[1, 1, 1],
|
||||||
|
["det", "ROOT", "amod"],
|
||||||
|
["DET", "NOUN", "ADJ"],
|
||||||
|
[(0, 3)],
|
||||||
|
),
|
||||||
|
# det + noun + adj plural
|
||||||
|
# Les chiens bruns -> les chiens bruns
|
||||||
|
(
|
||||||
|
["Les", "chiens", "bruns"],
|
||||||
|
[1, 1, 1],
|
||||||
|
["det", "ROOT", "amod"],
|
||||||
|
["DET", "NOUN", "ADJ"],
|
||||||
|
[(0, 3)],
|
||||||
|
),
|
||||||
|
# multiple adjectives: one adj before the noun, one adj after the noun
|
||||||
|
# un nouveau film intéressant -> un nouveau film intéressant
|
||||||
|
(
|
||||||
|
["un", "nouveau", "film", "intéressant"],
|
||||||
|
[2, 2, 2, 2],
|
||||||
|
["det", "amod", "ROOT", "amod"],
|
||||||
|
["DET", "ADJ", "NOUN", "ADJ"],
|
||||||
|
[(0,4)]
|
||||||
|
),
|
||||||
|
# multiple adjectives, both adjs after the noun
|
||||||
|
# une personne intelligente et drôle -> une personne intelligente et drôle
|
||||||
|
(
|
||||||
|
["une", "personne", "intelligente", "et", "drôle"],
|
||||||
|
[1, 1, 1, 4, 2],
|
||||||
|
["det", "ROOT", "amod", "cc", "conj"],
|
||||||
|
["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
|
||||||
|
[(0,5)]
|
||||||
|
),
|
||||||
|
# relative pronoun
|
||||||
|
# un bus qui va au ville -> un bus, qui, ville
|
||||||
|
(
|
||||||
|
['un', 'bus', 'qui', 'va', 'au', 'ville'],
|
||||||
|
[1, 1, 3, 1, 5, 3],
|
||||||
|
['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'],
|
||||||
|
['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'],
|
||||||
|
[(0,2), (2,3), (5,6)]
|
||||||
|
),
|
||||||
|
# relative subclause
|
||||||
|
# Voilà la maison que nous voulons acheter -> la maison, nous That's the house that we want to buy.
|
||||||
|
(
|
||||||
|
['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'],
|
||||||
|
[0, 2, 0, 5, 5, 2, 5],
|
||||||
|
['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'],
|
||||||
|
['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'],
|
||||||
|
[(1,3), (4,5)]
|
||||||
|
),
|
||||||
|
# Person name and title by flat
|
||||||
|
# Louis XIV -> Louis XIV
|
||||||
|
(
|
||||||
|
["Louis", "XIV"],
|
||||||
|
[0, 0],
|
||||||
|
["ROOT", "flat:name"],
|
||||||
|
["PROPN", "PROPN"],
|
||||||
|
[(0,2)]
|
||||||
|
),
|
||||||
|
# Organization name by flat
|
||||||
|
# Nations Unies -> Nations Unies
|
||||||
|
(
|
||||||
|
["Nations", "Unies"],
|
||||||
|
[0, 0],
|
||||||
|
["ROOT", "flat:name"],
|
||||||
|
["PROPN", "PROPN"],
|
||||||
|
[(0,2)]
|
||||||
|
),
|
||||||
|
# Noun compound, person name created by two flats
|
||||||
|
# Louise de Bratagne -> Louise de Bratagne
|
||||||
|
(
|
||||||
|
["Louise", "de", "Bratagne"],
|
||||||
|
[0, 0, 0],
|
||||||
|
["ROOT", "flat:name", "flat:name"],
|
||||||
|
["PROPN", "PROPN", "PROPN"],
|
||||||
|
[(0,3)]
|
||||||
|
),
|
||||||
|
# Noun compound, person name created by two flats
|
||||||
|
# Louis François Joseph -> Louis François Joseph
|
||||||
|
(
|
||||||
|
["Louis", "François", "Joseph"],
|
||||||
|
[0, 0, 0],
|
||||||
|
["ROOT", "flat:name", "flat:name"],
|
||||||
|
["PROPN", "PROPN", "PROPN"],
|
||||||
|
[(0,3)]
|
||||||
|
),
|
||||||
|
# one determiner + one noun + one adjective qualified by an adverb
|
||||||
|
# quelques agriculteurs très riches -> quelques agriculteurs très riches
|
||||||
|
(
|
||||||
|
["quelques", "agriculteurs", "très", "riches"],
|
||||||
|
[1, 1, 3, 1],
|
||||||
|
['det', 'ROOT', 'advmod', 'amod'],
|
||||||
|
['DET', 'NOUN', 'ADV', 'ADJ'],
|
||||||
|
[(0,4)]
|
||||||
|
),
|
||||||
|
# Two NPs conjuncted
|
||||||
|
# Il a un chien et un chat -> Il, un chien, un chat
|
||||||
|
(
|
||||||
|
['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'],
|
||||||
|
[1, 1, 3, 1, 6, 6, 3],
|
||||||
|
['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
|
||||||
|
['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
|
||||||
|
[(0,1), (2,4), (5,7)]
|
||||||
|
|
||||||
|
),
|
||||||
|
# Two NPs together
|
||||||
|
# l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado
|
||||||
|
(
|
||||||
|
["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'],
|
||||||
|
[1, 1, 1, 1, 3],
|
||||||
|
['det', 'ROOT', 'amod', 'appos', 'flat:name'],
|
||||||
|
['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
|
||||||
|
[(0, 3), (3, 5)]
|
||||||
|
),
|
||||||
|
# nmod relation between NPs
|
||||||
|
# la destruction de la ville -> la destruction, la ville
|
||||||
|
(
|
||||||
|
['la', 'destruction', 'de', 'la', 'ville'],
|
||||||
|
[1, 1, 4, 4, 1],
|
||||||
|
['det', 'ROOT', 'case', 'det', 'nmod'],
|
||||||
|
['DET', 'NOUN', 'ADP', 'DET', 'NOUN'],
|
||||||
|
[(0,2), (3,5)]
|
||||||
|
),
|
||||||
|
# nmod relation between NPs
|
||||||
|
# Archiduchesse d’Autriche -> Archiduchesse, Autriche
|
||||||
|
(
|
||||||
|
['Archiduchesse', 'd’', 'Autriche'],
|
||||||
|
[0, 2, 0],
|
||||||
|
['ROOT', 'case', 'nmod'],
|
||||||
|
['NOUN', 'ADP', 'PROPN'],
|
||||||
|
[(0,1), (2,3)]
|
||||||
|
),
|
||||||
|
# Compounding by nmod, several NPs chained together
|
||||||
|
# la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement
|
||||||
|
(
|
||||||
|
["la", "première", "usine", "de", "drogue", "du", "gouvernement"],
|
||||||
|
[2, 2, 2, 4, 2, 6, 2],
|
||||||
|
['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
|
||||||
|
['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
|
||||||
|
[(0, 3), (4, 5), (6, 7)]
|
||||||
|
),
|
||||||
|
# several NPs
|
||||||
|
# Traduction du rapport de Susana -> Traduction, rapport, Susana
|
||||||
|
(
|
||||||
|
['Traduction', 'du', 'raport', 'de', 'Susana'],
|
||||||
|
[0, 2, 0, 4, 2],
|
||||||
|
['ROOT', 'case', 'nmod', 'case', 'nmod'],
|
||||||
|
['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
|
||||||
|
[(0,1), (2,3), (4,5)]
|
||||||
|
|
||||||
|
),
|
||||||
|
# Several NPs
|
||||||
|
# Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie
|
||||||
|
(
|
||||||
|
['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'],
|
||||||
|
[2, 2, 2, 4, 2, 7, 7, 2],
|
||||||
|
['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'],
|
||||||
|
['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
|
||||||
|
[(0,3), (4,5), (6,8)]
|
||||||
|
),
|
||||||
|
# Passive subject
|
||||||
|
# Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le grand compte bancaire, Clinton
|
||||||
|
(
|
||||||
|
['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'],
|
||||||
|
[2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8],
|
||||||
|
['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'],
|
||||||
|
['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'],
|
||||||
|
[(0, 3), (6, 10), (11, 12)]
|
||||||
|
)
|
||||||
|
],
|
||||||
|
)
|
||||||
|
# fmt: on
|
||||||
|
def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets):
|
||||||
|
doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos)
|
||||||
|
assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
|
||||||
|
|
||||||
|
|
||||||
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
|
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
|
||||||
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
|
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
|
||||||
doc = fr_tokenizer("trouver des travaux antérieurs")
|
doc = fr_tokenizer("Je suis allé à l'école")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(doc.noun_chunks)
|
list(doc.noun_chunks)
|
||||||
|
|
spacy/tests/lang/hsb/__init__.py (new, empty file)

spacy/tests/lang/hsb/test_text.py (new file, 25 lines)
@@ -0,0 +1,25 @@
import pytest


@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("1", True),
        ("10,000", True),
        ("10,00", True),
        ("jedne", True),
        ("dwanaće", True),
        ("milion", True),
        ("sto", True),
        ("załožene", False),
        ("wona", False),
        ("powšitkownej", False),
        (",", False),
        ("1/2", True),
    ],
)
def test_lex_attrs_like_number(hsb_tokenizer, text, match):
    tokens = hsb_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].like_num == match

spacy/tests/lang/hsb/test_tokenizer.py (new file, 32 lines)
@@ -0,0 +1,32 @@
import pytest

HSB_BASIC_TOKENIZATION_TESTS = [
    (
        "Hornjoserbšćina wobsteji resp. wobsteješe z wjacorych dialektow, kotrež so zdźěla chětro wot so rozeznawachu.",
        [
            "Hornjoserbšćina",
            "wobsteji",
            "resp.",
            "wobsteješe",
            "z",
            "wjacorych",
            "dialektow",
            ",",
            "kotrež",
            "so",
            "zdźěla",
            "chětro",
            "wot",
            "so",
            "rozeznawachu",
            ".",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", HSB_BASIC_TOKENIZATION_TESTS)
def test_hsb_tokenizer_basic(hsb_tokenizer, text, expected_tokens):
    tokens = hsb_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
spacy/tests/lang/it/test_noun_chunks.py (new file, 221 lines)
@@ -0,0 +1,221 @@
from spacy.tokens import Doc
import pytest


# fmt: off
@pytest.mark.parametrize(
    "words,heads,deps,pos,chunk_offsets",
    [
|
# determiner + noun
|
||||||
|
# un pollo -> un pollo
|
||||||
|
(
|
||||||
|
["un", "pollo"],
|
||||||
|
[1, 1],
|
||||||
|
["det", "ROOT"],
|
||||||
|
["DET", "NOUN"],
|
||||||
|
[(0,2)],
|
||||||
|
),
|
||||||
|
# two determiners + noun
|
||||||
|
# il mio cane -> il mio cane
|
||||||
|
(
|
||||||
|
["il", "mio", "cane"],
|
||||||
|
[2, 2, 2],
|
||||||
|
["det", "det:poss", "ROOT"],
|
||||||
|
["DET", "DET", "NOUN"],
|
||||||
|
[(0,3)],
|
||||||
|
),
|
||||||
|
# two determiners, one is after noun. rare usage but still testing
|
||||||
|
# il cane mio-> il cane mio
|
||||||
|
(
|
||||||
|
["il", "cane", "mio"],
|
||||||
|
[1, 1, 1],
|
||||||
|
["det", "ROOT", "det:poss"],
|
||||||
|
["DET", "NOUN", "DET"],
|
||||||
|
[(0,3)],
|
||||||
|
),
|
||||||
|
# relative pronoun
|
||||||
|
# È molto bello il vestito che hai acquistat -> il vestito, che the dress that you bought is very pretty.
|
||||||
|
(
|
||||||
|
["È", "molto", "bello", "il", "vestito", "che", "hai", "acquistato"],
|
||||||
|
[2, 2, 2, 4, 2, 7, 7, 4],
|
||||||
|
['cop', 'advmod', 'ROOT', 'det', 'nsubj', 'obj', 'aux', 'acl:relcl'],
|
||||||
|
['AUX', 'ADV', 'ADJ', 'DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
|
||||||
|
[(3,5), (5,6)]
|
||||||
|
),
|
||||||
|
# relative subclause
|
||||||
|
# il computer che hai comprato -> il computer, che the computer that you bought
|
||||||
|
(
|
||||||
|
['il', 'computer', 'che', 'hai', 'comprato'],
|
||||||
|
[1, 1, 4, 4, 1],
|
||||||
|
['det', 'ROOT', 'nsubj', 'aux', 'acl:relcl'],
|
||||||
|
['DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
|
||||||
|
[(0,2), (2,3)]
|
||||||
|
),
|
||||||
|
# det + noun + adj
|
||||||
|
# Una macchina grande -> Una macchina grande
|
||||||
|
(
|
||||||
|
["Una", "macchina", "grande"],
|
||||||
|
[1, 1, 1],
|
||||||
|
["det", "ROOT", "amod"],
|
||||||
|
["DET", "NOUN", "ADJ"],
|
||||||
|
[(0,3)],
|
||||||
|
),
|
||||||
|
# noun + adj plural
|
||||||
|
# mucche bianche
|
||||||
|
(
|
||||||
|
["mucche", "bianche"],
|
||||||
|
[0, 0],
|
||||||
|
["ROOT", "amod"],
|
||||||
|
["NOUN", "ADJ"],
|
||||||
|
[(0,2)],
|
||||||
|
),
|
||||||
|
# det + adj + noun
|
||||||
|
# Una grande macchina -> Una grande macchina
|
||||||
|
(
|
||||||
|
['Una', 'grande', 'macchina'],
|
||||||
|
[2, 2, 2],
|
||||||
|
["det", "amod", "ROOT"],
|
||||||
|
["DET", "ADJ", "NOUN"],
|
||||||
|
[(0,3)]
|
||||||
|
),
|
||||||
|
# det + adj + noun, det with apostrophe
|
||||||
|
# un'importante associazione -> un'importante associazione
|
||||||
|
(
|
||||||
|
["Un'", 'importante', 'associazione'],
|
||||||
|
[2, 2, 2],
|
||||||
|
["det", "amod", "ROOT"],
|
||||||
|
["DET", "ADJ", "NOUN"],
|
||||||
|
[(0,3)]
|
||||||
|
),
|
||||||
|
# multiple adjectives
|
||||||
|
# Un cane piccolo e marrone -> Un cane piccolo e marrone
|
||||||
|
(
|
||||||
|
["Un", "cane", "piccolo", "e", "marrone"],
|
||||||
|
[1, 1, 1, 4, 2],
|
||||||
|
["det", "ROOT", "amod", "cc", "conj"],
|
||||||
|
["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
|
||||||
|
[(0,5)]
|
||||||
|
),
|
||||||
|
# determiner, adjective, compound created by flat
|
||||||
|
# le Nazioni Unite -> le Nazioni Unite
|
||||||
|
(
|
||||||
|
["le", "Nazioni", "Unite"],
|
||||||
|
[1, 1, 1],
|
||||||
|
["det", "ROOT", "flat:name"],
|
||||||
|
["DET", "PROPN", "PROPN"],
|
||||||
|
[(0,3)]
|
||||||
|
),
|
||||||
|
# one determiner + one noun + one adjective qualified by an adverb
|
||||||
|
# alcuni contadini molto ricchi -> alcuni contadini molto ricchi some very rich farmers
|
||||||
|
(
|
||||||
|
['alcuni', 'contadini', 'molto', 'ricchi'],
|
||||||
|
[1, 1, 3, 1],
|
||||||
|
['det', 'ROOT', 'advmod', 'amod'],
|
||||||
|
['DET', 'NOUN', 'ADV', 'ADJ'],
|
||||||
|
[(0,4)]
|
||||||
|
),
|
||||||
|
# Two NPs conjuncted
|
||||||
|
# Ho un cane e un gatto -> un cane, un gatto
|
||||||
|
(
|
||||||
|
['Ho', 'un', 'cane', 'e', 'un', 'gatto'],
|
||||||
|
[0, 2, 0, 5, 5, 0],
|
||||||
|
['ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
|
||||||
|
['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
|
||||||
|
[(1,3), (4,6)]
|
||||||
|
|
||||||
|
),
|
||||||
|
# Two NPs together
|
||||||
|
# lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado
|
||||||
|
(
|
||||||
|
['lo', 'scrittore', 'brasiliano', 'Aníbal', 'Machado'],
|
||||||
|
[1, 1, 1, 1, 3],
|
||||||
|
['det', 'ROOT', 'amod', 'nmod', 'flat:name'],
|
||||||
|
['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
|
||||||
|
[(0, 3), (3, 5)]
|
||||||
|
),
|
||||||
|
# Noun compound, person name and titles
|
||||||
|
# Dom Pedro II -> Dom Pedro II
|
||||||
|
(
|
||||||
|
["Dom", "Pedro", "II"],
|
||||||
|
[0, 0, 0],
|
||||||
|
["ROOT", "flat:name", "flat:name"],
|
||||||
|
["PROPN", "PROPN", "PROPN"],
|
||||||
|
[(0,3)]
|
||||||
|
),
|
||||||
|
# Noun compound created by flat
|
||||||
|
# gli Stati Uniti
|
||||||
|
(
|
||||||
|
["gli", "Stati", "Uniti"],
|
||||||
|
[1, 1, 1],
|
||||||
|
["det", "ROOT", "flat:name"],
|
||||||
|
["DET", "PROPN", "PROPN"],
|
||||||
|
[(0,3)]
|
||||||
|
),
|
||||||
|
# nmod relation between NPs
|
||||||
|
# la distruzione della città -> la distruzione, città
|
||||||
|
(
|
||||||
|
['la', 'distruzione', 'della', 'città'],
|
||||||
|
[1, 1, 3, 1],
|
||||||
|
['det', 'ROOT', 'case', 'nmod'],
|
||||||
|
['DET', 'NOUN', 'ADP', 'NOUN'],
|
||||||
|
[(0,2), (3,4)]
|
||||||
|
),
|
||||||
|
# Compounding by nmod, several NPs chained together
|
||||||
|
# la prima fabbrica di droga del governo -> la prima fabbrica, droga, governo
|
||||||
|
(
|
||||||
|
["la", "prima", "fabbrica", "di", "droga", "del", "governo"],
|
||||||
|
[2, 2, 2, 4, 2, 6, 2],
|
||||||
|
['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
|
||||||
|
['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
|
||||||
|
[(0, 3), (4, 5), (6, 7)]
|
||||||
|
),
|
||||||
|
# several NPs
|
||||||
|
# Traduzione del rapporto di Susana -> Traduzione, rapporto, Susana
|
||||||
|
(
|
||||||
|
['Traduzione', 'del', 'rapporto', 'di', 'Susana'],
|
||||||
|
[0, 2, 0, 4, 2],
|
||||||
|
['ROOT', 'case', 'nmod', 'case', 'nmod'],
|
||||||
|
['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
|
||||||
|
[(0,1), (2,3), (4,5)]
|
||||||
|
|
||||||
|
),
|
||||||
|
# Several NPs
|
||||||
|
# Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua amica
|
||||||
|
(
|
||||||
|
['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'],
|
||||||
|
[1, 1, 1, 4, 1, 8, 8, 8, 1],
|
||||||
|
['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'],
|
||||||
|
['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'DET', 'NOUN'],
|
||||||
|
[(0,3), (4,5), (6,9)]
|
||||||
|
),
|
||||||
|
# Passive subject
|
||||||
|
# La nuova spesa è alimentata dal grande conto in banca di Clinton -> Le nuova spesa, grande conto, banca, Clinton
|
||||||
|
(
|
||||||
|
['La', 'nuova', 'spesa', 'è', 'alimentata', 'dal', 'grande', 'conto', 'in', 'banca', 'di', 'Clinton'],
|
||||||
|
[2, 2, 4, 4, 4, 7, 7, 4, 9, 7, 11, 9],
|
||||||
|
['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'case', 'nmod', 'case', 'nmod'],
|
||||||
|
['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
|
||||||
|
[(0, 3), (6, 8), (9, 10), (11,12)]
|
||||||
|
),
|
||||||
|
# Misc
|
||||||
|
# Ma mentre questo prestito possa ora sembrare gestibile, un improvviso cambiamento delle circostanze potrebbe portare a problemi di debiti -> questo prestiti, un provisso cambiento, circostanze, problemi, debiti
|
||||||
|
(
|
||||||
|
['Ma', 'mentre', 'questo', 'prestito', 'possa', 'ora', 'sembrare', 'gestibile', ',', 'un', 'improvviso', 'cambiamento', 'delle', 'circostanze', 'potrebbe', 'portare', 'a', 'problemi', 'di', 'debitii'],
|
||||||
|
[15, 6, 3, 6, 6, 6, 15, 6, 6, 11, 11, 15, 13, 11, 15, 15, 17, 15, 19, 17],
|
||||||
|
['cc', 'mark', 'det', 'nsubj', 'aux', 'advmod', 'advcl', 'xcomp', 'punct', 'det', 'amod', 'nsubj', 'case', 'nmod', 'aux', 'ROOT', 'case', 'obl', 'case', 'nmod'],
|
||||||
|
['CCONJ', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'VERB', 'ADJ', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN'],
|
||||||
|
[(2,4), (9,12), (13,14), (17,18), (19,20)]
|
||||||
|
)
|
||||||
|
],
|
||||||
|
)
|
||||||
|
# fmt: on
|
||||||
|
def test_it_noun_chunks(it_vocab, words, heads, deps, pos, chunk_offsets):
    doc = Doc(it_vocab, words=words, heads=heads, deps=deps, pos=pos)
    assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets


def test_noun_chunks_is_parsed_it(it_tokenizer):
    """Test that noun_chunks raises Value Error for 'it' language if Doc is not parsed."""
    doc = it_tokenizer("Sei andato a Oxford")
    with pytest.raises(ValueError):
        list(doc.noun_chunks)
spacy/tests/lang/it/test_stopwords.py (new file, 17 lines)
@@ -0,0 +1,17 @@
import pytest


@pytest.mark.parametrize(
    "word", ["un", "lo", "dell", "dall", "si", "ti", "mi", "quest", "quel", "quello"]
)
def test_stopwords_basic(it_tokenizer, word):
    tok = it_tokenizer(word)[0]
    assert tok.is_stop


@pytest.mark.parametrize(
    "word", ["quest'uomo", "l'ho", "un'amica", "dell'olio", "s'arrende", "m'ascolti"]
)
def test_stopwords_elided(it_tokenizer, word):
    tok = it_tokenizer(word)[0]
    assert tok.is_stop
Some files were not shown because too many files have changed in this diff.