Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-18 12:12:20 +03:00)

Commit 683f470852: Merge branch 'master' into feature/coref
.github/ISSUE_TEMPLATE/01_bugs.md (2 changes)
@@ -4,6 +4,8 @@ about: Use this template if you came across a bug or unexpected behaviour differ
---

<!-- NOTE: For questions or install related issues, please open a Discussion instead. -->

## How to reproduce the behaviour

<!-- Include a code example or the steps that led to the problem. Please try to be as specific as possible. -->
.github/ISSUE_TEMPLATE/config.yml (3 changes)
@@ -1,8 +1,5 @@
blank_issues_enabled: false
contact_links:
  - name: ⚠️ Python 3.10 Support
    url: https://github.com/explosion/spaCy/discussions/9418
    about: Python 3.10 wheels haven't been released yet, see the link for details.
  - name: 🗯 Discussions Forum
    url: https://github.com/explosion/spaCy/discussions
    about: Install issues, usage questions, general discussion and anything else that isn't a bug report.
.github/azure-steps.yml (34 changes)
@@ -64,12 +64,12 @@ steps:
    displayName: "Run GPU tests"
    condition: eq(${{ parameters.gpu }}, true)

  - script: |
      python -m spacy download ca_core_news_sm
      python -m spacy download ca_core_news_md
      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
    displayName: 'Test download CLI'
    condition: eq(variables['python_version'], '3.8')
  # - script: |
  #     python -m spacy download ca_core_news_sm
  #     python -m spacy download ca_core_news_md
  #     python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
  #   displayName: 'Test download CLI'
  #   condition: eq(variables['python_version'], '3.8')

  - script: |
      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .

@@ -93,17 +93,17 @@ steps:
    displayName: 'Test train CLI'
    condition: eq(variables['python_version'], '3.8')

  - script: |
      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
    displayName: 'Test assemble CLI'
    condition: eq(variables['python_version'], '3.8')

  - script: |
      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
    displayName: 'Test assemble CLI vectors warning'
    condition: eq(variables['python_version'], '3.8')
  # - script: |
  #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
  #     PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
  #   displayName: 'Test assemble CLI'
  #   condition: eq(variables['python_version'], '3.8')
  #
  # - script: |
  #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
  #     python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
  #   displayName: 'Test assemble CLI vectors warning'
  #   condition: eq(variables['python_version'], '3.8')

  - script: |
      python .github/validate_universe_json.py website/meta/universe.json
.github/contributors/fonfonx.md (new file, 106 lines)
@@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

    * you hereby assign to us joint ownership, and to the extent that such
    assignment is or becomes invalid, ineffective or unenforceable, you hereby
    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
    royalty-free, unrestricted license to exercise all rights under those
    copyrights. This includes, at our option, the right to sublicense these same
    rights to third parties through multiple levels of sublicensees or other
    licensing arrangements;

    * you agree that each of us can do all things in relation to your
    contribution as if each of us were the sole owners, and if one of us makes
    a derivative work of your contribution, the one who makes the derivative
    work (or has it made) will be the sole owner of that derivative work;

    * you agree that you will not assert any moral rights in your contribution
    against us, our licensees or transferees;

    * you agree that we may register a copyright in your contribution and
    exercise all ownership rights associated with it; and

    * you agree that neither of us has any duty to consult with, obtain the
    consent of, pay or render an accounting to the other for any use or
    distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

    * make, have made, use, sell, offer to sell, import, and otherwise transfer
    your contribution in whole or in part, alone or in combination with or
    included in any product, work or materials arising out of the project to
    which your contribution was submitted, and

    * at our option, to sublicense these same rights to third parties through
    multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

    * Each contribution that you submit is and shall be an original work of
    authorship and you can legally grant the rights set out in this SCA;

    * to the best of your knowledge, each contribution will not violate any
    third party's copyrights, trademarks, patents, or other intellectual
    property rights; and

    * each contribution shall be in compliance with U.S. export control laws and
    other applicable export and import laws. You agree to notify us if you
    become aware of any circumstance which would make any of the foregoing
    representations inaccurate in any respect. We may publicly disclose your
    participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statement below. Please do NOT
mark both statements:

    * [x] I am signing on behalf of myself as an individual and no other person
    or entity, including my employer, has or will have rights with respect to my
    contributions.

    * [ ] I am signing on behalf of my employer or a legal entity and I have the
    actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                |
| ------------------------------ | -------------------- |
| Name                           | Xavier Fontaine      |
| Company name (if applicable)   |                      |
| Title or role (if applicable)  |                      |
| Date                           | 2022-04-13           |
| GitHub username                | fonfonx              |
| Website (optional)             |                      |
.github/workflows/gputests.yml (new file, 21 lines)
@@ -0,0 +1,21 @@
name: Weekly GPU tests

on:
  schedule:
    - cron: '0 1 * * MON'

jobs:
  weekly-gputests:
    strategy:
      fail-fast: false
      matrix:
        branch: [master, v4]
    runs-on: ubuntu-latest
    steps:
      - name: Trigger buildkite build
        uses: buildkite/trigger-pipeline-action@v1.2.0
        env:
          PIPELINE: explosion-ai/spacy-slow-gpu-tests
          BRANCH: ${{ matrix.branch }}
          MESSAGE: ":github: Weekly GPU + slow tests - triggered from a GitHub Action"
          BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
.github/workflows/slowtests.yml (new file, 37 lines)
@@ -0,0 +1,37 @@
name: Daily slow tests

on:
  schedule:
    - cron: '0 0 * * *'

jobs:
  daily-slowtests:
    strategy:
      fail-fast: false
      matrix:
        branch: [master, v4]
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v1
        with:
          ref: ${{ matrix.branch }}
      - name: Get commits from past 24 hours
        id: check_commits
        run: |
          today=$(date '+%Y-%m-%d %H:%M:%S')
          yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
          if git log --after="$yesterday" --before="$today" | grep commit ; then
            echo "::set-output name=run_tests::true"
          else
            echo "::set-output name=run_tests::false"
          fi

      - name: Trigger buildkite build
        if: steps.check_commits.outputs.run_tests == 'true'
        uses: buildkite/trigger-pipeline-action@v1.2.0
        env:
          PIPELINE: explosion-ai/spacy-slow-tests
          BRANCH: ${{ matrix.branch }}
          MESSAGE: ":github: Daily slow tests - triggered from a GitHub Action"
          BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
.gitignore (1 change)
@@ -9,7 +9,6 @@ keys/
spacy/tests/package/setup.cfg
spacy/tests/package/pyproject.toml
spacy/tests/package/requirements.txt
spacy/tests/universe/universe.json

# Website
website/.cache/
@@ -233,7 +233,7 @@ also want to keep an eye on unused declared variables or repeated
(i.e. overwritten) dictionary keys. If your code was formatted with `black`
(see above), you shouldn't see any formatting-related warnings.

The [`.flake8`](.flake8) config defines the configuration we use for this
The `flake8` section in [`setup.cfg`](setup.cfg) defines the configuration we use for this
codebase. For example, we're not super strict about the line length, and we're
excluding very large files like lemmatization and tokenizer exception tables.
@@ -33,7 +33,7 @@ open-source software, released under the MIT license.
## 📖 Documentation

| Documentation | |
| -------------------------- | -------------------------------------------------------------- |
| --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
| 📚 **[Usage Guides]** | How to use spaCy and its features. |
| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |

@@ -45,6 +45,7 @@ open-source software, released under the MIT license.
| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
| 🛠 **[Changelog]** | Changes and version history. |
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |

[spacy 101]: https://spacy.io/usage/spacy-101
[new in v3.0]: https://spacy.io/usage/v3

@@ -60,9 +61,7 @@ open-source software, released under the MIT license.
## 💬 Where to ask questions

The spaCy project is maintained by **[@honnibal](https://github.com/honnibal)**,
**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)**,
**[@adrianeboyd](https://github.com/adrianeboyd)** and **[@polm](https://github.com/polm)**.
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
Please understand that we won't be able to provide individual support via email.
We also believe that help is much more valuable if it's shared publicly, so that
more people can benefit from it.
@@ -11,12 +11,14 @@ trigger:
  exclude:
    - "website/*"
    - "*.md"
    - ".github/workflows/*"
pr:
  paths:
    exclude:
      - "*.md"
      - "website/docs/*"
      - "website/src/*"
      - ".github/workflows/*"

jobs:
  # Perform basic checks for most important errors (syntax etc.) Uses the config
@@ -137,7 +137,7 @@ If any of the TODOs you've added are important and should be fixed soon, you should

## Type hints

We use Python type hints across the `.py` files wherever possible. This makes it easy to understand what a function expects and returns, and modern editors will be able to show this information to you when you call an annotated function. Type hints are not currently used in the `.pyx` (Cython) code, except for definitions of registered functions and component factories, where they're used for config validation.
We use Python type hints across the `.py` files wherever possible. This makes it easy to understand what a function expects and returns, and modern editors will be able to show this information to you when you call an annotated function. Type hints are not currently used in the `.pyx` (Cython) code, except for definitions of registered functions and component factories, where they're used for config validation. Ideally when developing, run `mypy spacy` on the code base to inspect any issues.

If possible, you should always use the more descriptive type hints like `List[str]` or even `List[Any]` instead of only `list`. We also annotate arguments and return types of `Callable` – although, you can simplify this if the type otherwise gets too verbose (e.g. functions that return factories to create callbacks). Remember that `Callable` takes two values: a **list** of the argument type(s) in order, and the return values.

@@ -155,6 +155,13 @@ def create_callback(some_arg: bool) -> Callable[[str, int], List[str]]:
    return callback
```

For typing variables, we prefer the explicit format.

```diff
- var = value  # type: Type
+ var: Type = value
```

For model architectures, Thinc also provides a collection of [custom types](https://thinc.ai/docs/api-types), including more specific types for arrays and model inputs/outputs. Even outside of static type checking, using these types will make the code a lot easier to read and follow, since it's always clear what array types are expected (and what might go wrong if the output is different from the expected type).

```python
@@ -5,7 +5,7 @@ requires = [
    "cymem>=2.0.2,<2.1.0",
    "preshed>=3.0.2,<3.1.0",
    "murmurhash>=0.28.0,<1.1.0",
    "thinc>=8.0.12,<8.1.0",
    "thinc>=8.0.14,<8.1.0",
    "blis>=0.4.0,<0.8.0",
    "pathy",
    "numpy>=1.15.0",
@@ -1,14 +1,14 @@
# Our libraries
spacy-legacy>=3.0.8,<3.1.0
spacy-legacy>=3.0.9,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.12,<8.1.0
thinc>=8.0.14,<8.1.0
blis>=0.4.0,<0.8.0
ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.8.1,<1.1.0
srsly>=2.4.1,<3.0.0
wasabi>=0.9.1,<1.1.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.5.0
pathy>=0.3.5

@@ -26,7 +26,7 @@ typing_extensions>=3.7.4.1,<4.0.0.0; python_version < "3.8"
# Development dependencies
pre-commit>=2.13.0
cython>=0.25,<3.0
pytest>=5.2.0
pytest>=5.2.0,!=7.1.0
pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.8.0,<3.10.0

@@ -35,3 +35,4 @@ mypy==0.910
types-dataclasses>=0.1.3; python_version < "3.7"
types-mock>=0.1.1
types-requests
black>=22.0,<23.0
setup.cfg (10 changes)
@@ -38,18 +38,18 @@ setup_requires =
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    murmurhash>=0.28.0,<1.1.0
    thinc>=8.0.12,<8.1.0
    thinc>=8.0.14,<8.1.0
install_requires =
    # Our libraries
    spacy-legacy>=3.0.8,<3.1.0
    spacy-legacy>=3.0.9,<3.1.0
    spacy-loggers>=1.0.0,<2.0.0
    murmurhash>=0.28.0,<1.1.0
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    thinc>=8.0.12,<8.1.0
    thinc>=8.0.14,<8.1.0
    blis>=0.4.0,<0.8.0
    wasabi>=0.8.1,<1.1.0
    srsly>=2.4.1,<3.0.0
    wasabi>=0.9.1,<1.1.0
    srsly>=2.4.3,<3.0.0
    catalogue>=2.0.6,<2.1.0
    typer>=0.3.0,<0.5.0
    pathy>=0.3.5
setup.py (3 changes)
@@ -23,6 +23,7 @@ Options.docstrings = True
PACKAGES = find_packages()
MOD_NAMES = [
    "spacy.training.alignment_array",
    "spacy.training.example",
    "spacy.parts_of_speech",
    "spacy.strings",

@@ -33,6 +34,7 @@ MOD_NAMES = [
    "spacy.ml.parser_model",
    "spacy.morphology",
    "spacy.pipeline.dep_parser",
    "spacy.pipeline._edit_tree_internals.edit_trees",
    "spacy.pipeline.morphologizer",
    "spacy.pipeline.multitask",
    "spacy.pipeline.ner",

@@ -81,7 +83,6 @@ COPY_FILES = {
    ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package",
    ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package",
    ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package",
    ROOT / "website" / "meta" / "universe.json": PACKAGE_ROOT / "tests" / "universe",
}
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.2.1"
__version__ = "3.3.0.dev0"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"
@@ -14,6 +14,7 @@ from .pretrain import pretrain  # noqa: F401
from .debug_data import debug_data  # noqa: F401
from .debug_config import debug_config  # noqa: F401
from .debug_model import debug_model  # noqa: F401
from .debug_diff import debug_diff  # noqa: F401
from .evaluate import evaluate  # noqa: F401
from .convert import convert  # noqa: F401
from .init_pipeline import init_pipeline_cli  # noqa: F401
@@ -360,7 +360,7 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
    src = str(src)
    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
        with dest.open(mode="wb") as output_file:
            output_file.write(input_file.read())
            shutil.copyfileobj(input_file, output_file)


def ensure_pathy(path):
@@ -19,6 +19,7 @@ from ..morphology import Morphology
from ..language import Language
from ..util import registry, resolve_dot_names
from ..compat import Literal
from ..vectors import Mode as VectorsMode
from .. import util


@@ -170,6 +171,14 @@ def debug_data(
        show=verbose,
    )
    if len(nlp.vocab.vectors):
        if nlp.vocab.vectors.mode == VectorsMode.floret:
            msg.info(
                f"floret vectors with {len(nlp.vocab.vectors)} vectors, "
                f"{nlp.vocab.vectors_length} dimensions, "
                f"{nlp.vocab.vectors.minn}-{nlp.vocab.vectors.maxn} char "
                f"n-gram subwords"
            )
        else:
            msg.info(
                f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
                f"unique keys, {nlp.vocab.vectors_length} dimensions)"

@@ -193,6 +202,70 @@ def debug_data(
    else:
        msg.info("No word vectors present in the package")

    if "spancat" in factory_names:
        model_labels_spancat = _get_labels_from_spancat(nlp)
        has_low_data_warning = False
        has_no_neg_warning = False

        msg.divider("Span Categorization")
        msg.table(model_labels_spancat, header=["Spans Key", "Labels"], divider=True)

        msg.text("Label counts in train data: ", show=verbose)
        for spans_key, data_labels in gold_train_data["spancat"].items():
            msg.text(
                f"Key: {spans_key}, {_format_labels(data_labels.items(), counts=True)}",
                show=verbose,
            )
        # Data checks: only take the spans keys in the actual spancat components
        data_labels_in_component = {
            spans_key: gold_train_data["spancat"][spans_key]
            for spans_key in model_labels_spancat.keys()
        }
        for spans_key, data_labels in data_labels_in_component.items():
            for label, count in data_labels.items():
                # Check for missing labels
                spans_key_in_model = spans_key in model_labels_spancat.keys()
                if (spans_key_in_model) and (
                    label not in model_labels_spancat[spans_key]
                ):
                    msg.warn(
                        f"Label '{label}' is not present in the model labels of key '{spans_key}'. "
                        "Performance may degrade after training."
                    )
                # Check for low number of examples per label
                if count <= NEW_LABEL_THRESHOLD:
                    msg.warn(
                        f"Low number of examples for label '{label}' in key '{spans_key}' ({count})"
                    )
                    has_low_data_warning = True
                # Check for negative examples
                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(
                        train_dataset, label, "spancat", spans_key
                    )
                if neg_docs == 0:
                    msg.warn(f"No examples for texts WITHOUT new label '{label}'")
                    has_no_neg_warning = True

        if has_low_data_warning:
            msg.text(
                f"To train a new span type, your data should include at "
                f"least {NEW_LABEL_THRESHOLD} instances of the new label",
                show=verbose,
            )
        else:
            msg.good("Good amount of examples for all labels")

        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of spans "
                "in context, as well as examples without a given span "
                "type.",
                show=verbose,
            )
        else:
            msg.good("Examples without ocurrences available for all labels")

    if "ner" in factory_names:
        # Get all unique NER labels present in the data
        labels = set(

@@ -238,7 +311,7 @@ def debug_data(
                has_low_data_warning = True

            with msg.loading("Analyzing label distribution..."):
                neg_docs = _get_examples_without_label(train_dataset, label)
                neg_docs = _get_examples_without_label(train_dataset, label, "ner")
            if neg_docs == 0:
                msg.warn(f"No examples for texts WITHOUT new label '{label}'")
                has_no_neg_warning = True

@@ -573,6 +646,7 @@ def _compile_gold(
        "deps": Counter(),
        "words": Counter(),
        "roots": Counter(),
        "spancat": dict(),
        "ws_ents": 0,
        "boundary_cross_ents": 0,
        "n_words": 0,

@@ -603,6 +677,7 @@ def _compile_gold(
                if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                    data["words_missing_vectors"].update([word])
        if "ner" in factory_names:
            sent_starts = eg.get_aligned_sent_starts()
            for i, label in enumerate(eg.get_aligned_ner()):
                if label is None:
                    continue

@@ -612,10 +687,19 @@ def _compile_gold(
                if label.startswith(("B-", "U-")):
                    combined_label = label.split("-")[1]
                    data["ner"][combined_label] += 1
                if gold[i].is_sent_start and label.startswith(("I-", "L-")):
                if sent_starts[i] == True and label.startswith(("I-", "L-")):
                    data["boundary_cross_ents"] += 1
                elif label == "-":
                    data["ner"]["-"] += 1
        if "spancat" in factory_names:
            for span_key in list(eg.reference.spans.keys()):
                if span_key not in data["spancat"]:
                    data["spancat"][span_key] = Counter()
                for i, span in enumerate(eg.reference.spans[span_key]):
                    if span.label_ is None:
                        continue
                    else:
                        data["spancat"][span_key][span.label_] += 1
        if "textcat" in factory_names or "textcat_multilabel" in factory_names:
            data["cats"].update(gold.cats)
            if any(val not in (0, 1) for val in gold.cats.values()):

@@ -686,14 +770,28 @@ def _format_labels(
    return ", ".join([f"'{l}'" for l in cast(Iterable[str], labels)])


def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
def _get_examples_without_label(
    data: Sequence[Example],
    label: str,
    component: Literal["ner", "spancat"] = "ner",
    spans_key: Optional[str] = "sc",
) -> int:
    count = 0
    for eg in data:
        if component == "ner":
            labels = [
                label.split("-")[1]
                for label in eg.get_aligned_ner()
                if label not in ("O", "-", None)
            ]

        if component == "spancat":
            labels = (
                [span.label_ for span in eg.reference.spans[spans_key]]
                if spans_key in eg.reference.spans
                else []
            )

        if label not in labels:
            count += 1
    return count
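Editorial aside (not part of the diff above): the new spancat branch of `_get_examples_without_label` treats a training example as a negative for a label when that label never occurs under the given spans key of the reference doc. A minimal sketch of that check, assuming a blank English pipeline and the default `"sc"` spans key:

```python
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Berlin is a city")
# Annotate one span under the default key used by the spancat debug checks
doc.spans["sc"] = [Span(doc, 0, 1, label="PLACE")]

# "GPE" never occurs under "sc", so this doc would be counted as an example
# WITHOUT the label "GPE", i.e. a negative example for that label.
labels = [span.label_ for span in doc.spans.get("sc", [])]
print("GPE" not in labels)  # True
```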
spacy/cli/debug_diff.py (new file, 89 lines)
@@ -0,0 +1,89 @@
from typing import Optional

import typer
from wasabi import Printer, diff_strings, MarkdownRenderer
from pathlib import Path
from thinc.api import Config

from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
from ..util import load_config
from .init_config import init_config, Optimizations


@debug_cli.command(
    "diff-config",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def debug_diff_cli(
    # fmt: off
    ctx: typer.Context,
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    compare_to: Optional[Path] = Opt(None, help="Path to a config file to diff against, or `None` to compare against default settings", exists=True, allow_dash=True),
    optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether the user config was optimized for efficiency or accuracy. Only relevant when comparing against the default config."),
    gpu: bool = Opt(False, "--gpu", "-G", help="Whether the original config can run on a GPU. Only relevant when comparing against the default config."),
    pretraining: bool = Opt(False, "--pretraining", "--pt", help="Whether to compare on a config with pretraining involved. Only relevant when comparing against the default config."),
    markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues")
    # fmt: on
):
    """Show a diff of a config file with respect to spaCy's defaults or another config file. If
    additional settings were used in the creation of the config file, then you
    must supply these as extra parameters to the command when comparing to the default settings. The generated diff
    can also be used when posting to the discussion forum to provide more
    information for the maintainers.

    The `optimize`, `gpu`, and `pretraining` options are only relevant when
    comparing against the default configuration (or specifically when `compare_to` is None).

    DOCS: https://spacy.io/api/cli#debug-diff
    """
    debug_diff(
        config_path=config_path,
        compare_to=compare_to,
        gpu=gpu,
        optimize=optimize,
        pretraining=pretraining,
        markdown=markdown,
    )


def debug_diff(
    config_path: Path,
    compare_to: Optional[Path],
    gpu: bool,
    optimize: Optimizations,
    pretraining: bool,
    markdown: bool,
):
    msg = Printer()
    with show_validation_error(hint_fill=False):
        user_config = load_config(config_path)
        if compare_to:
            other_config = load_config(compare_to)
        else:
            # Recreate a default config based from user's config
            lang = user_config["nlp"]["lang"]
            pipeline = list(user_config["nlp"]["pipeline"])
            msg.info(f"Found user-defined language: '{lang}'")
            msg.info(f"Found user-defined pipelines: {pipeline}")
            other_config = init_config(
                lang=lang,
                pipeline=pipeline,
                optimize=optimize.value,
                gpu=gpu,
                pretraining=pretraining,
                silent=True,
            )

    user = user_config.to_str()
    other = other_config.to_str()

    if user == other:
        msg.warn("No diff to show: configs are identical")
    else:
        diff_text = diff_strings(other, user, add_symbols=markdown)
        if markdown:
            md = MarkdownRenderer()
            md.add(md.code_block(diff_text, "diff"))
            print(md.text)
        else:
            print(diff_text)
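Editorial note: the `diff-config` command above delegates the actual comparison to wasabi's `diff_strings` helper, imported at the top of the new file. A rough sketch of what that helper does on two config strings (the exact formatting and coloring of the output is up to wasabi):

```python
from wasabi import diff_strings

old_cfg = '[nlp]\nlang = "en"\npipeline = ["ner"]\n'
new_cfg = '[nlp]\nlang = "en"\npipeline = ["tok2vec", "ner"]\n'

# Returns a line-based diff of the two strings; debug_diff prints it directly,
# or wraps it in a Markdown code block when --markdown is passed.
print(diff_strings(old_cfg, new_cfg))
```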
@@ -7,6 +7,7 @@ from collections import defaultdict
from catalogue import RegistryError
import srsly
import sys
import re

from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
from ..schemas import validate, ModelMetaSchema

@@ -109,6 +110,24 @@ def package(
        ", ".join(meta["requirements"]),
    )
    if name is not None:
        if not name.isidentifier():
            msg.fail(
                f"Model name ('{name}') is not a valid module name. "
                "This is required so it can be imported as a module.",
                "We recommend names that use ASCII A-Z, a-z, _ (underscore), "
                "and 0-9. "
                "For specific details see: https://docs.python.org/3/reference/lexical_analysis.html#identifiers",
                exits=1,
            )
        if not _is_permitted_package_name(name):
            msg.fail(
                f"Model name ('{name}') is not a permitted package name. "
                "This is required to correctly load the model with spacy.load.",
                "We recommend names that use ASCII A-Z, a-z, _ (underscore), "
                "and 0-9. "
                "For specific details see: https://www.python.org/dev/peps/pep-0426/#name",
                exits=1,
            )
        meta["name"] = name
    if version is not None:
        meta["version"] = version

@@ -162,7 +181,7 @@ def package(
        imports="\n".join(f"from . import {m}" for m in imports)
    )
    create_file(package_path / "__init__.py", init_py)
    msg.good(f"Successfully created package '{model_name_v}'", main_path)
    msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
    if create_sdist:
        with util.working_dir(main_path):
            util.run_command([sys.executable, "setup.py", "sdist"], capture=False)

@@ -171,8 +190,14 @@ def package(
    if create_wheel:
        with util.working_dir(main_path):
            util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False)
        wheel = main_path / "dist" / f"{model_name_v}{WHEEL_SUFFIX}"
        wheel_name_squashed = re.sub("_+", "_", model_name_v)
        wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
        msg.good(f"Successfully created binary wheel", wheel)
    if "__" in model_name:
        msg.warn(
            f"Model name ('{model_name}') contains a run of underscores. "
            "Runs of underscores are not significant in installed package names.",
        )


def has_wheel() -> bool:

@@ -422,6 +447,14 @@ def _format_label_scheme(data: Dict[str, Any]) -> str:
    return md.text


def _is_permitted_package_name(package_name: str) -> bool:
    # regex from: https://www.python.org/dev/peps/pep-0426/#name
    permitted_match = re.search(
        r"^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$", package_name, re.IGNORECASE
    )
    return permitted_match is not None


TEMPLATE_SETUP = """
#!/usr/bin/env python
import io
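Editorial sketch (not part of the diff): what the new PEP 426 package-name check accepts and rejects, using the same regex as `_is_permitted_package_name` above:

```python
import re

def is_permitted_package_name(package_name: str) -> bool:
    # Same pattern as in the diff: the name must start and end with an ASCII
    # letter or digit, and may contain ".", "_" and "-" in between.
    permitted_match = re.search(
        r"^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$", package_name, re.IGNORECASE
    )
    return permitted_match is not None

print(is_permitted_package_name("en_core_web_sm"))  # True
print(is_permitted_package_name("my-model_1.0"))    # True
print(is_permitted_package_name("_private"))        # False: leading underscore
print(is_permitted_package_name("model!"))          # False: "!" is not allowed
```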
@@ -3,9 +3,15 @@ the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #}
{%- set use_transformer = hardware != "cpu" -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
[paths]
train = null
dev = null
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = null
{% else -%}
vectors = "{{ word_vectors }}"
{% endif -%}

[system]
{% if use_transformer -%}

@@ -19,10 +25,10 @@ lang = "{{ lang }}"
{%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
{%- set with_accuracy = optimize == "accuracy" -%}
{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "entity_linker" in components or has_accurate_textcat) -%}
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%}
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
{%- else -%}
{%- set full_pipeline = components %}
{%- set full_pipeline = components -%}
{%- endif %}
pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }}
batch_size = {{ 128 if hardware == "gpu" else 1000 }}

@@ -49,7 +55,7 @@ stride = 96
factory = "morphologizer"

[components.morphologizer.model]
@architectures = "spacy.Tagger.v1"
@architectures = "spacy.Tagger.v2"
nO = null

[components.morphologizer.model.tok2vec]

@@ -65,7 +71,7 @@ grad_factor = 1.0
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v1"
@architectures = "spacy.Tagger.v2"
nO = null

[components.tagger.model.tok2vec]

@@ -118,6 +124,60 @@ grad_factor = 1.0
@layers = "reduce_mean.v1"
{% endif -%}

{% if "spancat" in components -%}
[components.spancat]
factory = "spancat"
max_positive = null
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
spans_key = "sc"
threshold = 0.5

[components.spancat.model]
@architectures = "spacy.SpanCategorizer.v1"

[components.spancat.model.reducer]
@layers = "spacy.mean_max_reducer.v1"
hidden_size = 128

[components.spancat.model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = null
nI = null

[components.spancat.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0

[components.spancat.model.tok2vec.pooling]
@layers = "reduce_mean.v1"

[components.spancat.suggester]
@misc = "spacy.ngram_suggester.v1"
sizes = [1,2,3]
{% endif -%}

{% if "trainable_lemmatizer" in components -%}
[components.trainable_lemmatizer]
factory = "trainable_lemmatizer"
backoff = "orth"
min_tree_freq = 3
overwrite = false
scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"}
top_k = 1

[components.trainable_lemmatizer.model]
@architectures = "spacy.Tagger.v2"
nO = null
normalize = false

[components.trainable_lemmatizer.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0

[components.trainable_lemmatizer.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{% endif -%}

{% if "entity_linker" in components -%}
[components.entity_linker]
factory = "entity_linker"

@@ -126,7 +186,7 @@ incl_context = true
incl_prior = true

[components.entity_linker.model]
@architectures = "spacy.EntityLinker.v1"
@architectures = "spacy.EntityLinker.v2"
nO = null

[components.entity_linker.model.tok2vec]

@@ -233,7 +293,7 @@ maxout_pieces = 3
factory = "morphologizer"

[components.morphologizer.model]
@architectures = "spacy.Tagger.v1"
@architectures = "spacy.Tagger.v2"
nO = null

[components.morphologizer.model.tok2vec]

@@ -246,7 +306,7 @@ width = ${components.tok2vec.model.encode.width}
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v1"
@architectures = "spacy.Tagger.v2"
nO = null

[components.tagger.model.tok2vec]

@@ -290,6 +350,54 @@ nO = null
width = ${components.tok2vec.model.encode.width}
{% endif %}

{% if "spancat" in components %}
[components.spancat]
factory = "spancat"
max_positive = null
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
spans_key = "sc"
threshold = 0.5

[components.spancat.model]
@architectures = "spacy.SpanCategorizer.v1"

[components.spancat.model.reducer]
@layers = "spacy.mean_max_reducer.v1"
hidden_size = 128

[components.spancat.model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = null
nI = null

[components.spancat.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}

[components.spancat.suggester]
@misc = "spacy.ngram_suggester.v1"
sizes = [1,2,3]
{% endif %}

{% if "trainable_lemmatizer" in components -%}
[components.trainable_lemmatizer]
factory = "trainable_lemmatizer"
backoff = "orth"
min_tree_freq = 3
overwrite = false
scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"}
top_k = 1

[components.trainable_lemmatizer.model]
@architectures = "spacy.Tagger.v2"
nO = null
normalize = false

[components.trainable_lemmatizer.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
{% endif -%}

{% if "entity_linker" in components -%}
[components.entity_linker]
factory = "entity_linker"

@@ -298,7 +406,7 @@ incl_context = true
incl_prior = true

[components.entity_linker.model]
@architectures = "spacy.EntityLinker.v1"
@architectures = "spacy.EntityLinker.v2"
nO = null

[components.entity_linker.model.tok2vec]

@@ -364,7 +472,7 @@ no_output_layer = false
{% endif %}

{% for pipe in components %}
{% if pipe not in ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker"] %}
{% if pipe not in listener_components %}
{# Other components defined by the user: we just assume they're factories #}
[components.{{ pipe }}]
factory = "{{ pipe }}"

@@ -421,8 +529,4 @@ compound = 1.001
{% endif %}

[initialize]
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = ${paths.vectors}
{% else -%}
vectors = "{{ word_vectors }}"
{% endif -%}
@@ -4,10 +4,10 @@ spaCy's built in visualization suite for dependencies and named entities.
DOCS: https://spacy.io/api/top-level#displacy
USAGE: https://spacy.io/usage/visualizers
"""
from typing import Union, Iterable, Optional, Dict, Any, Callable
from typing import List, Union, Iterable, Optional, Dict, Any, Callable
import warnings

from .render import DependencyRenderer, EntityRenderer
from .render import DependencyRenderer, EntityRenderer, SpanRenderer
from ..tokens import Doc, Span
from ..errors import Errors, Warnings
from ..util import is_in_jupyter

@@ -44,6 +44,7 @@ def render(
    factories = {
        "dep": (DependencyRenderer, parse_deps),
        "ent": (EntityRenderer, parse_ents),
        "span": (SpanRenderer, parse_spans),
    }
    if style not in factories:
        raise ValueError(Errors.E087.format(style=style))

@@ -203,6 +204,42 @@ def parse_ents(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
    return {"text": doc.text, "ents": ents, "title": title, "settings": settings}


def parse_spans(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
    """Generate spans in [{start: i, end: i, label: 'label'}] format.

    doc (Doc): Document to parse.
    options (Dict[str, any]): Span-specific visualisation options.
    RETURNS (dict): Generated span types keyed by text (original text) and spans.
    """
    kb_url_template = options.get("kb_url_template", None)
    spans_key = options.get("spans_key", "sc")
    spans = [
        {
            "start": span.start_char,
            "end": span.end_char,
            "start_token": span.start,
            "end_token": span.end,
            "label": span.label_,
            "kb_id": span.kb_id_ if span.kb_id_ else "",
            "kb_url": kb_url_template.format(span.kb_id_) if kb_url_template else "#",
        }
        for span in doc.spans[spans_key]
    ]
    tokens = [token.text for token in doc]

    if not spans:
        warnings.warn(Warnings.W117.format(spans_key=spans_key))
    title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None
    settings = get_doc_settings(doc)
    return {
        "text": doc.text,
        "spans": spans,
        "title": title,
        "settings": settings,
        "tokens": tokens,
    }


def set_render_wrapper(func: Callable[[str], str]) -> None:
    """Set an optional wrapper function that is called around the generated
    HTML markup on displacy.render. This can be used to allow integration into
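Editorial sketch (not part of the diff): how the new `"span"` style registered in the `factories` dict above can be used, assuming spans are stored under the default `"sc"` key that `parse_spans` reads:

```python
import spacy
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Welcome to the Bank of China")
# Overlapping spans are allowed; each token gets a slice per covering span
doc.spans["sc"] = [Span(doc, 3, 6, label="ORG"), Span(doc, 5, 6, label="GPE")]

html = displacy.render(doc, style="span")  # returns HTML markup for the spans
```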
@@ -1,12 +1,15 @@
from typing import Dict, Any, List, Optional, Union
from typing import Any, Dict, List, Optional, Tuple, Union
import uuid
import itertools

from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
from .templates import TPL_ENTS, TPL_KB_LINK
from ..util import minify_html, escape_html, registry
from ..errors import Errors

from ..util import escape_html, minify_html, registry
from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS
from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS
from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN
from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL
from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS
from .templates import TPL_TITLE

DEFAULT_LANG = "en"
DEFAULT_DIR = "ltr"

@@ -33,6 +36,168 @@ DEFAULT_LABEL_COLORS = {
}


class SpanRenderer:
    """Render Spans as SVGs."""

    style = "span"

    def __init__(self, options: Dict[str, Any] = {}) -> None:
        """Initialise span renderer

        options (dict): Visualiser-specific options (colors, spans)
        """
        # Set up the colors and overall look
        colors = dict(DEFAULT_LABEL_COLORS)
        user_colors = registry.displacy_colors.get_all()
        for user_color in user_colors.values():
            if callable(user_color):
                # Since this comes from the function registry, we want to make
                # sure we support functions that *return* a dict of colors
                user_color = user_color()
            if not isinstance(user_color, dict):
                raise ValueError(Errors.E925.format(obj=type(user_color)))
            colors.update(user_color)
        colors.update(options.get("colors", {}))
        self.default_color = DEFAULT_ENTITY_COLOR
        self.colors = {label.upper(): color for label, color in colors.items()}

        # Set up how the text and labels will be rendered
        self.direction = DEFAULT_DIR
        self.lang = DEFAULT_LANG
        self.top_offset = options.get("top_offset", 40)
        self.top_offset_step = options.get("top_offset_step", 17)

        # Set up which templates will be used
        template = options.get("template")
        if template:
            self.span_template = template["span"]
            self.span_slice_template = template["slice"]
            self.span_start_template = template["start"]
        else:
            if self.direction == "rtl":
                self.span_template = TPL_SPAN_RTL
                self.span_slice_template = TPL_SPAN_SLICE_RTL
                self.span_start_template = TPL_SPAN_START_RTL
            else:
                self.span_template = TPL_SPAN
                self.span_slice_template = TPL_SPAN_SLICE
                self.span_start_template = TPL_SPAN_START

    def render(
        self, parsed: List[Dict[str, Any]], page: bool = False, minify: bool = False
    ) -> str:
        """Render complete markup.

        parsed (list): Dependency parses to render.
        page (bool): Render parses wrapped as full HTML page.
        minify (bool): Minify HTML markup.
        RETURNS (str): Rendered HTML markup.
        """
        rendered = []
        for i, p in enumerate(parsed):
            if i == 0:
                settings = p.get("settings", {})
                self.direction = settings.get("direction", DEFAULT_DIR)
                self.lang = settings.get("lang", DEFAULT_LANG)
            rendered.append(self.render_spans(p["tokens"], p["spans"], p.get("title")))

        if page:
            docs = "".join([TPL_FIGURE.format(content=doc) for doc in rendered])
            markup = TPL_PAGE.format(content=docs, lang=self.lang, dir=self.direction)
        else:
            markup = "".join(rendered)
        if minify:
            return minify_html(markup)
        return markup

    def render_spans(
        self,
        tokens: List[str],
        spans: List[Dict[str, Any]],
        title: Optional[str],
    ) -> str:
        """Render span types in text.

        Spans are rendered per-token, this means that for each token, we check if it's part
        of a span slice (a member of a span type) or a span start (the starting token of a
        given span type).

        tokens (list): Individual tokens in the text
        spans (list): Individual entity spans and their start, end, label, kb_id and kb_url.
        title (str / None): Document title set in Doc.user_data['title'].
        """
        per_token_info = []
        for idx, token in enumerate(tokens):
            # Identify if a token belongs to a Span (and which) and if it's a
            # start token of said Span. We'll use this for the final HTML render
            token_markup: Dict[str, Any] = {}
            token_markup["text"] = token
            entities = []
            for span in spans:
                ent = {}
                if span["start_token"] <= idx < span["end_token"]:
                    ent["label"] = span["label"]
                    ent["is_start"] = True if idx == span["start_token"] else False
                    kb_id = span.get("kb_id", "")
                    kb_url = span.get("kb_url", "#")
                    ent["kb_link"] = (
                        TPL_KB_LINK.format(kb_id=kb_id, kb_url=kb_url) if kb_id else ""
                    )
                    entities.append(ent)
            token_markup["entities"] = entities
            per_token_info.append(token_markup)

        markup = self._render_markup(per_token_info)
        markup = TPL_SPANS.format(content=markup, dir=self.direction)
        if title:
            markup = TPL_TITLE.format(title=title) + markup
        return markup

    def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str:
        """Render the markup from per-token information"""
        markup = ""
        for token in per_token_info:
            entities = sorted(token["entities"], key=lambda d: d["label"])
            if entities:
                slices = self._get_span_slices(token["entities"])
                starts = self._get_span_starts(token["entities"])
                markup += self.span_template.format(
                    text=token["text"], span_slices=slices, span_starts=starts
                )
            else:
                markup += escape_html(token["text"] + " ")
        return markup

    def _get_span_slices(self, entities: List[Dict]) -> str:
        """Get the rendered markup of all Span slices"""
        span_slices = []
        for entity, step in zip(entities, itertools.count(step=self.top_offset_step)):
            color = self.colors.get(entity["label"].upper(), self.default_color)
            span_slice = self.span_slice_template.format(
                bg=color, top_offset=self.top_offset + step
            )
            span_slices.append(span_slice)
        return "".join(span_slices)

    def _get_span_starts(self, entities: List[Dict]) -> str:
        """Get the rendered markup of all Span start tokens"""
        span_starts = []
        for entity, step in zip(entities, itertools.count(step=self.top_offset_step)):
            color = self.colors.get(entity["label"].upper(), self.default_color)
            span_start = (
                self.span_start_template.format(
                    bg=color,
                    top_offset=self.top_offset + step,
                    label=entity["label"],
                    kb_link=entity["kb_link"],
                )
                if entity["is_start"]
                else ""
            )
            span_starts.append(span_start)
        return "".join(span_starts)


class DependencyRenderer:
    """Render dependency parses as SVGs."""


@@ -105,7 +270,7 @@ class DependencyRenderer:
        RETURNS (str): Rendered SVG markup.
        """
        self.levels = self.get_levels(arcs)
        self.highest_level = len(self.levels)
        self.highest_level = max(self.levels.values(), default=0)
        self.offset_y = self.distance / 2 * self.highest_level + self.arrow_stroke
        self.width = self.offset_x + len(words) * self.distance
        self.height = self.offset_y + 3 * self.word_spacing

@@ -165,7 +330,7 @@ class DependencyRenderer:
        if start < 0 or end < 0:
            error_args = dict(start=start, end=end, label=label, dir=direction)
            raise ValueError(Errors.E157.format(**error_args))
        level = self.levels.index(end - start) + 1
        level = self.levels[(start, end, label)]
        x_start = self.offset_x + start * self.distance + self.arrow_spacing
        if self.direction == "rtl":
            x_start = self.width - x_start

@@ -181,7 +346,7 @@ class DependencyRenderer:
        y_curve = self.offset_y - level * self.distance / 2
        if self.compact:
            y_curve = self.offset_y - level * self.distance / 6
        if y_curve == 0 and len(self.levels) > 5:
        if y_curve == 0 and max(self.levels.values(), default=0) > 5:
            y_curve = -self.distance
        arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
        arc = self.get_arc(x_start, y, y_curve, x_end)

@@ -225,15 +390,23 @@ class DependencyRenderer:
        p1, p2, p3 = (end, end + self.arrow_width - 2, end - self.arrow_width + 2)
        return f"M{p1},{y + 2} L{p2},{y - self.arrow_width} {p3},{y - self.arrow_width}"

    def get_levels(self, arcs: List[Dict[str, Any]]) -> List[int]:
    def get_levels(self, arcs: List[Dict[str, Any]]) -> Dict[Tuple[int, int, str], int]:
        """Calculate available arc height "levels".
        Used to calculate arrow heights dynamically and without wasting space.

        args (list): Individual arcs and their start, end, direction and label.
        RETURNS (list): Arc levels sorted from lowest to highest.
        RETURNS (dict): Arc levels keyed by (start, end, label).
        """
        levels = set(map(lambda arc: arc["end"] - arc["start"], arcs))
        return sorted(list(levels))
        arcs = [dict(t) for t in {tuple(sorted(arc.items())) for arc in arcs}]
        length = max([arc["end"] for arc in arcs], default=0)
        max_level = [0] * length
        levels = {}
        for arc in sorted(arcs, key=lambda arc: arc["end"] - arc["start"]):
            level = max(max_level[arc["start"] : arc["end"]]) + 1
            for i in range(arc["start"], arc["end"]):
                max_level[i] = level
            levels[(arc["start"], arc["end"], arc["label"])] = level
        return levels


class EntityRenderer:

@@ -242,7 +415,7 @@ class EntityRenderer:
    style = "ent"

    def __init__(self, options: Dict[str, Any] = {}) -> None:
        """Initialise dependency renderer.
        """Initialise entity renderer.

        options (dict): Visualiser-specific options (colors, ents)
        """
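Editorial sketch (not part of the diff): the rewritten `get_levels` above assigns each arc a nesting level keyed by `(start, end, label)`. Shorter arcs are placed first, and an arc sits one level above the highest level already used anywhere inside its token range:

```python
from typing import Any, Dict, List, Tuple

def get_levels(arcs: List[Dict[str, Any]]) -> Dict[Tuple[int, int, str], int]:
    # Deduplicate arcs, then assign levels from the shortest arc upwards
    arcs = [dict(t) for t in {tuple(sorted(arc.items())) for arc in arcs}]
    length = max([arc["end"] for arc in arcs], default=0)
    max_level = [0] * length
    levels = {}
    for arc in sorted(arcs, key=lambda arc: arc["end"] - arc["start"]):
        level = max(max_level[arc["start"] : arc["end"]]) + 1
        for i in range(arc["start"], arc["end"]):
            max_level[i] = level
        levels[(arc["start"], arc["end"], arc["label"])] = level
    return levels

arcs = [
    {"start": 0, "end": 1, "label": "det"},
    {"start": 1, "end": 2, "label": "amod"},
    {"start": 0, "end": 2, "label": "nsubj"},
]
# The two length-1 arcs get level 1; the enclosing arc gets level 2
print(get_levels(arcs))
```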
@ -62,6 +62,55 @@ TPL_ENT_RTL = """
|
|||
</mark>
|
||||
"""
|
||||
|
||||
TPL_SPANS = """
|
||||
<div class="spans" style="line-height: 2.5; direction: {dir}">{content}</div>
|
||||
"""
|
||||
|
||||
TPL_SPAN = """
|
||||
<span style="font-weight: bold; display: inline-block; position: relative;">
|
||||
{text}
|
||||
{span_slices}
|
||||
{span_starts}
|
||||
</span>
|
||||
"""
|
||||
|
||||
TPL_SPAN_SLICE = """
|
||||
<span style="background: {bg}; top: {top_offset}px; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
|
||||
</span>
|
||||
"""
|
||||
|
||||
|
||||
TPL_SPAN_START = """
|
||||
<span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
|
||||
<span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
|
||||
{label}{kb_link}
|
||||
</span>
|
||||
</span>
|
||||
|
||||
"""
|
||||
|
||||
TPL_SPAN_RTL = """
|
||||
<span style="font-weight: bold; display: inline-block; position: relative;">
|
||||
{text}
|
||||
{span_slices}
|
||||
{span_starts}
|
||||
</span>
|
||||
"""
|
||||
|
||||
TPL_SPAN_SLICE_RTL = """
|
||||
<span style="background: {bg}; top: {top_offset}px; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
|
||||
</span>
|
||||
"""
|
||||
|
||||
TPL_SPAN_START_RTL = """
|
||||
<span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
|
||||
<span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
|
||||
{label}{kb_link}
|
||||
</span>
|
||||
</span>
|
||||
"""
|
||||
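The templates above are plain str.format strings; a rough illustration of how they compose for a single token (all values are invented, this is not the renderer's actual code path):

# Build the underline slice and the label marker for one token, then wrap the
# token text with them, mirroring how the span renderer fills the templates.
slice_html = TPL_SPAN_SLICE.format(bg="#7aecec", top_offset=40)
start_html = TPL_SPAN_START.format(bg="#7aecec", top_offset=40, label="ORG", kb_link="")
token_html = TPL_SPAN.format(text="Bank", span_slices=slice_html, span_starts=start_html)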
|
||||
|
||||
# Important: this needs to start with a space!
|
||||
TPL_KB_LINK = """
|
||||
<a style="text-decoration: none; color: inherit; font-weight: normal" href="{kb_url}">{kb_id}</a>
|
||||
|
|
|
@ -192,6 +192,13 @@ class Warnings(metaclass=ErrorsWithCodes):
|
|||
W115 = ("Skipping {method}: the floret vector table cannot be modified. "
|
||||
"Vectors are calculated from character ngrams.")
|
||||
W116 = ("Unable to clean attribute '{attr}'.")
|
||||
W117 = ("No spans to visualize found in Doc object with spans_key: '{spans_key}'. If this is "
|
||||
"surprising to you, make sure the Doc was processed using a model "
|
||||
"that supports span categorization, and check the `doc.spans[spans_key]` "
|
||||
"property manually if necessary.")
|
||||
W118 = ("Term '{term}' not found in glossary. It may however be explained in documentation "
|
||||
"for the corpora used to train the language. Please check "
|
||||
"`nlp.meta[\"sources\"]` for any relevant links.")
|
||||
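W117 in particular surfaces from the new span visualizer; a small, hypothetical reproduction (the pipeline and spans key are illustrative):

import spacy
from spacy import displacy

nlp = spacy.blank("en")
doc = nlp("Welcome to the Bank of China.")
# doc.spans["sc"] was never populated, so rendering with the "span" style is
# expected to emit W117 and fall back to showing the plain text.
html = displacy.render(doc, style="span", options={"spans_key": "sc"})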
|
||||
|
||||
class Errors(metaclass=ErrorsWithCodes):
|
||||
|
@ -483,7 +490,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"components, since spans are only views of the Doc. Use Doc and "
|
||||
"Token attributes (or custom extension attributes) only and remove "
|
||||
"the following: {attrs}")
|
||||
E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. "
|
||||
E181 = ("Received invalid attributes for unknown object {obj}: {attrs}. "
|
||||
"Only Doc and Token attributes are supported.")
|
||||
E182 = ("Received invalid attribute declaration: {attr}\nDid you forget "
|
||||
"to define the attribute? For example: `{attr}.???`")
|
||||
|
@ -520,10 +527,14 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
|
||||
|
||||
# New errors added in v3.x
|
||||
E855 = ("Invalid {obj}: {obj} is not from the same doc.")
|
||||
E856 = ("Error accessing span at position {i}: out of bounds in span group "
|
||||
"of length {length}.")
|
||||
E857 = ("Entry '{name}' not found in edit tree lemmatizer labels.")
|
||||
E858 = ("The {mode} vector table does not support this operation. "
|
||||
"{alternative}")
|
||||
E859 = ("The floret vector table cannot be modified.")
|
||||
E860 = ("Can't truncate fasttext-bloom vectors.")
|
||||
E860 = ("Can't truncate floret vectors.")
|
||||
E861 = ("No 'keys' should be provided when initializing floret vectors "
|
||||
"with 'minn' and 'maxn'.")
|
||||
E862 = ("'hash_count' must be between 1-4 for floret vectors.")
|
||||
|
@ -566,9 +577,6 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E879 = ("Unexpected type for 'spans' data. Provide a dictionary mapping keys to "
|
||||
"a list of spans, with each span represented by a tuple (start_char, end_char). "
|
||||
"The tuple can be optionally extended with a label and a KB ID.")
|
||||
E880 = ("The 'wandb' library could not be found - did you install it? "
|
||||
"Alternatively, specify the 'ConsoleLogger' in the 'training.logger' "
|
||||
"config section, instead of the 'WandbLogger'.")
|
||||
E884 = ("The pipeline could not be initialized because the vectors "
|
||||
"could not be found at '{vectors}'. If your pipeline was already "
|
||||
"initialized/trained before, call 'resume_training' instead of 'initialize', "
|
||||
|
@ -894,6 +902,9 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"patterns.")
|
||||
E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
|
||||
"supported values are: 'I', 'O', 'B' and ''")
|
||||
E1026 = ("Edit tree has an invalid format:\n{errors}")
|
||||
E1027 = ("AlignmentArray only supports slicing with a step of 1.")
|
||||
E1028 = ("AlignmentArray only supports indexing using an int or a slice.")
|
||||
|
||||
|
||||
# Deprecated model shortcuts, only used in errors and warnings
|
||||
|
|
|
@ -1,3 +1,7 @@
|
|||
import warnings
|
||||
from .errors import Warnings
|
||||
|
||||
|
||||
def explain(term):
|
||||
"""Get a description for a given POS tag, dependency label or entity type.
|
||||
|
||||
|
@ -11,6 +15,8 @@ def explain(term):
|
|||
"""
|
||||
if term in GLOSSARY:
|
||||
return GLOSSARY[term]
|
||||
else:
|
||||
warnings.warn(Warnings.W118.format(term=term))
|
||||
|
||||
|
||||
GLOSSARY = {
|
||||
|
@ -310,7 +316,6 @@ GLOSSARY = {
|
|||
"re": "repeated element",
|
||||
"rs": "reported speech",
|
||||
"sb": "subject",
|
||||
"sb": "subject",
|
||||
"sbp": "passivized subject (PP)",
|
||||
"sp": "subject or predicate",
|
||||
"svp": "separable verb prefix",
|
||||
|
|
|
@ -45,6 +45,10 @@ _hangul_syllables = r"\uAC00-\uD7AF"
|
|||
_hangul_jamo = r"\u1100-\u11FF"
|
||||
_hangul = _hangul_syllables + _hangul_jamo
|
||||
|
||||
_hiragana = r"\u3040-\u309F"
|
||||
_katakana = r"\u30A0-\u30FFー"
|
||||
_kana = _hiragana + _katakana
|
||||
|
||||
# letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh
|
||||
_latin_u_extendedA = (
|
||||
r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C"
|
||||
|
@ -244,6 +248,7 @@ _uncased = (
|
|||
+ _tamil
|
||||
+ _telugu
|
||||
+ _hangul
|
||||
+ _kana
|
||||
+ _cjk
|
||||
)
|
||||
|
||||
|
|
16
spacy/lang/dsb/__init__.py
Normal file
|
@ -0,0 +1,16 @@
|
|||
from .lex_attrs import LEX_ATTRS
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language, BaseDefaults
|
||||
|
||||
|
||||
class LowerSorbianDefaults(BaseDefaults):
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class LowerSorbian(Language):
|
||||
lang = "dsb"
|
||||
Defaults = LowerSorbianDefaults
|
||||
|
||||
|
||||
__all__ = ["LowerSorbian"]
|
15
spacy/lang/dsb/examples.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.dsb.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
|
||||
sentences = [
|
||||
"Z tym stwori so wuměnjenje a zakład za dalše wobdźěłanje přez analyzu tekstoweje struktury a semantisku anotaciju a z tym tež za tu předstajenu digitalnu online-wersiju.",
|
||||
"Mi so tu jara derje spodoba.",
|
||||
"Kotre nowniny chceće měć?",
|
||||
"Tak ako w slědnem lěśe jo teke lětosa jano doma zapustowaś móžno.",
|
||||
"Zwóstanjo pótakem hyšći wjele źěła.",
|
||||
]
|
113
spacy/lang/dsb/lex_attrs.py
Normal file
|
@ -0,0 +1,113 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
_num_words = [
|
||||
"nul",
|
||||
"jaden",
|
||||
"jadna",
|
||||
"jadno",
|
||||
"dwa",
|
||||
"dwě",
|
||||
"tśi",
|
||||
"tśo",
|
||||
"styri",
|
||||
"styrjo",
|
||||
"pěś",
|
||||
"pěśo",
|
||||
"šesć",
|
||||
"šesćo",
|
||||
"sedym",
|
||||
"sedymjo",
|
||||
"wósym",
|
||||
"wósymjo",
|
||||
"źewjeś",
|
||||
"źewjeśo",
|
||||
"źaseś",
|
||||
"źaseśo",
|
||||
"jadnassćo",
|
||||
"dwanassćo",
|
||||
"tśinasćo",
|
||||
"styrnasćo",
|
||||
"pěśnasćo",
|
||||
"šesnasćo",
|
||||
"sedymnasćo",
|
||||
"wósymnasćo",
|
||||
"źewjeśnasćo",
|
||||
"dwanasćo",
|
||||
"dwaźasća",
|
||||
"tśiźasća",
|
||||
"styrźasća",
|
||||
"pěśźaset",
|
||||
"šesćźaset",
|
||||
"sedymźaset",
|
||||
"wósymźaset",
|
||||
"źewjeśźaset",
|
||||
"sto",
|
||||
"tysac",
|
||||
"milion",
|
||||
"miliarda",
|
||||
"bilion",
|
||||
"biliarda",
|
||||
"trilion",
|
||||
"triliarda",
|
||||
]
|
||||
|
||||
_ordinal_words = [
|
||||
"prědny",
|
||||
"prědna",
|
||||
"prědne",
|
||||
"drugi",
|
||||
"druga",
|
||||
"druge",
|
||||
"tśeśi",
|
||||
"tśeśa",
|
||||
"tśeśe",
|
||||
"stwórty",
|
||||
"stwórta",
|
||||
"stwórte",
|
||||
"pêty",
|
||||
"pěta",
|
||||
"pête",
|
||||
"šesty",
|
||||
"šesta",
|
||||
"šeste",
|
||||
"sedymy",
|
||||
"sedyma",
|
||||
"sedyme",
|
||||
"wósymy",
|
||||
"wósyma",
|
||||
"wósyme",
|
||||
"źewjety",
|
||||
"źewjeta",
|
||||
"źewjete",
|
||||
"źasety",
|
||||
"źaseta",
|
||||
"źasete",
|
||||
"jadnasty",
|
||||
"jadnasta",
|
||||
"jadnaste",
|
||||
"dwanasty",
|
||||
"dwanasta",
|
||||
"dwanaste",
|
||||
]
|
||||
|
||||
|
||||
def like_num(text):
|
||||
if text.startswith(("+", "-", "±", "~")):
|
||||
text = text[1:]
|
||||
text = text.replace(",", "").replace(".", "")
|
||||
if text.isdigit():
|
||||
return True
|
||||
if text.count("/") == 1:
|
||||
num, denom = text.split("/")
|
||||
if num.isdigit() and denom.isdigit():
|
||||
return True
|
||||
text_lower = text.lower()
|
||||
if text_lower in _num_words:
|
||||
return True
|
||||
# Check ordinal number
|
||||
if text_lower in _ordinal_words:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
LEX_ATTRS = {LIKE_NUM: like_num}
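A few hand-checked spot checks against the lists above (illustrative only, not part of the test suite):

from spacy.lang.dsb.lex_attrs import like_num

assert like_num("tśi")        # cardinal from _num_words
assert like_num("prědny")     # ordinal from _ordinal_words
assert like_num("12")         # plain digits
assert like_num("3/4")        # simple fractions are accepted
assert not like_num("spaCy")  # anything else is rejected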
|
15
spacy/lang/dsb/stop_words.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
a abo aby ako ale až
|
||||
|
||||
daniž dokulaž
|
||||
|
||||
gaž
|
||||
|
||||
jolic
|
||||
|
||||
pak pótom
|
||||
|
||||
teke togodla
|
||||
""".split()
|
||||
)
|
|
@ -447,7 +447,6 @@ for exc_data in [
|
|||
{ORTH: "La.", NORM: "Louisiana"},
|
||||
{ORTH: "Mar.", NORM: "March"},
|
||||
{ORTH: "Mass.", NORM: "Massachusetts"},
|
||||
{ORTH: "May.", NORM: "May"},
|
||||
{ORTH: "Mich.", NORM: "Michigan"},
|
||||
{ORTH: "Minn.", NORM: "Minnesota"},
|
||||
{ORTH: "Miss.", NORM: "Mississippi"},
|
||||
|
|
|
@ -47,6 +47,41 @@ _num_words = [
|
|||
]
|
||||
|
||||
|
||||
_ordinal_words = [
|
||||
"primero",
|
||||
"segundo",
|
||||
"tercero",
|
||||
"cuarto",
|
||||
"quinto",
|
||||
"sexto",
|
||||
"séptimo",
|
||||
"octavo",
|
||||
"noveno",
|
||||
"décimo",
|
||||
"undécimo",
|
||||
"duodécimo",
|
||||
"decimotercero",
|
||||
"decimocuarto",
|
||||
"decimoquinto",
|
||||
"decimosexto",
|
||||
"decimoséptimo",
|
||||
"decimoctavo",
|
||||
"decimonoveno",
|
||||
"vigésimo",
|
||||
"trigésimo",
|
||||
"cuadragésimo",
|
||||
"quincuagésimo",
|
||||
"sexagésimo",
|
||||
"septuagésimo",
|
||||
"octogésima",
|
||||
"nonagésima",
|
||||
"centésima",
|
||||
"milésima",
|
||||
"millonésima",
|
||||
"billonésima",
|
||||
]
|
||||
|
||||
|
||||
def like_num(text):
|
||||
if text.startswith(("+", "-", "±", "~")):
|
||||
text = text[1:]
|
||||
|
@ -57,7 +92,11 @@ def like_num(text):
|
|||
num, denom = text.split("/")
|
||||
if num.isdigit() and denom.isdigit():
|
||||
return True
|
||||
if text.lower() in _num_words:
|
||||
text_lower = text.lower()
|
||||
if text_lower in _num_words:
|
||||
return True
|
||||
# Check ordinal number
|
||||
if text_lower in _ordinal_words:
|
||||
return True
|
||||
return False
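With the ordinal list wired in, the Spanish like_num now flags ordinals as well; a couple of hand-checked examples (illustrative only):

from spacy.lang.es.lex_attrs import like_num

assert like_num("décimo")         # ordinal, newly recognised
assert like_num("decimotercero")  # compound ordinal from _ordinal_words
assert like_num("100")
assert not like_num("palabra")    # ordinary word, not number-like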
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language, BaseDefaults
|
||||
|
||||
|
||||
|
@ -11,6 +12,7 @@ class FinnishDefaults(BaseDefaults):
|
|||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class Finnish(Language):
|
||||
|
|
79
spacy/lang/fi/syntax_iterators.py
Normal file
|
@ -0,0 +1,79 @@
|
|||
from typing import Iterator, Tuple, Union
|
||||
from ...tokens import Doc, Span
|
||||
from ...symbols import NOUN, PROPN, PRON
|
||||
from ...errors import Errors
|
||||
|
||||
|
||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||
"""Detect base noun phrases from a dependency parse. Works on both Doc and Span."""
|
||||
labels = [
|
||||
"appos",
|
||||
"nsubj",
|
||||
"nsubj:cop",
|
||||
"obj",
|
||||
"obl",
|
||||
"ROOT",
|
||||
]
|
||||
extend_labels = [
|
||||
"amod",
|
||||
"compound",
|
||||
"compound:nn",
|
||||
"flat:name",
|
||||
"nmod",
|
||||
"nmod:gobj",
|
||||
"nmod:gsubj",
|
||||
"nmod:poss",
|
||||
"nummod",
|
||||
]
|
||||
|
||||
def potential_np_head(word):
|
||||
return word.pos in (NOUN, PROPN) and (
|
||||
word.dep in np_deps or word.head.pos == PRON
|
||||
)
|
||||
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
extend_deps = [doc.vocab.strings[label] for label in extend_labels]
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
conj_label = doc.vocab.strings.add("conj")
|
||||
|
||||
rbracket = 0
|
||||
prev_end = -1
|
||||
for i, word in enumerate(doclike):
|
||||
if i < rbracket:
|
||||
continue
|
||||
|
||||
# Is this a potential independent NP head or coordinated with
|
||||
# a NOUN that is itself an independent NP head?
|
||||
#
|
||||
# e.g. "Terveyden ja hyvinvoinnin laitos"
|
||||
if potential_np_head(word) or (
|
||||
word.dep == conj_label and potential_np_head(word.head)
|
||||
):
|
||||
# Try to extend to the left to include adjective/num
|
||||
# modifiers, compound words etc.
|
||||
lbracket = word.i
|
||||
for ldep in word.lefts:
|
||||
if ldep.dep in extend_deps:
|
||||
lbracket = ldep.left_edge.i
|
||||
break
|
||||
|
||||
# Prevent nested chunks from being produced
|
||||
if lbracket <= prev_end:
|
||||
continue
|
||||
|
||||
rbracket = word.i
|
||||
# Try to extend the span to the right to capture
|
||||
# appositions and noun modifiers
|
||||
for rdep in word.rights:
|
||||
if rdep.dep in extend_deps:
|
||||
rbracket = rdep.i
|
||||
prev_end = rbracket
|
||||
|
||||
yield lbracket, rbracket + 1, np_label
|
||||
|
||||
|
||||
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
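The iterator is registered as doc.noun_chunks for Finnish. A rough usage sketch follows; the pipeline name is hypothetical, any Finnish pipeline with a dependency parser would do, and without DEP annotation this raises E029 as shown above.

import spacy

nlp = spacy.load("fi_core_news_sm")  # hypothetical Finnish pipeline with a parser
doc = nlp("Terveyden ja hyvinvoinnin laitos")
for chunk in doc.noun_chunks:
    print(chunk.text)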
|
|
@ -3,7 +3,7 @@ from ...attrs import LIKE_NUM
|
|||
|
||||
_num_words = set(
|
||||
"""
|
||||
zero un deux trois quatre cinq six sept huit neuf dix
|
||||
zero un une deux trois quatre cinq six sept huit neuf dix
|
||||
onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf
|
||||
vingt trente quarante cinquante soixante soixante-dix septante quatre-vingt huitante quatre-vingt-dix nonante
|
||||
cent mille mil million milliard billion quadrillion quintillion
|
||||
|
@ -13,7 +13,7 @@ sextillion septillion octillion nonillion decillion
|
|||
|
||||
_ordinal_words = set(
|
||||
"""
|
||||
premier deuxième second troisième quatrième cinquième sixième septième huitième neuvième dixième
|
||||
premier première deuxième second seconde troisième quatrième cinquième sixième septième huitième neuvième dixième
|
||||
onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neuvième
|
||||
vingtième trentième quarantième cinquantième soixantième soixante-dixième septantième quatre-vingtième huitantième quatre-vingt-dixième nonantième
|
||||
centième millième millionnième milliardième billionnième quadrillionnième quintillionnième
|
||||
|
|
|
@ -6,16 +6,35 @@ from ...tokens import Doc, Span
|
|||
|
||||
|
||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||
# fmt: off
|
||||
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
||||
# fmt: on
|
||||
"""
|
||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||
"""
|
||||
labels = [
|
||||
"nsubj",
|
||||
"nsubj:pass",
|
||||
"obj",
|
||||
"obl",
|
||||
"obl:agent",
|
||||
"obl:arg",
|
||||
"obl:mod",
|
||||
"nmod",
|
||||
"pcomp",
|
||||
"appos",
|
||||
"ROOT",
|
||||
]
|
||||
post_modifiers = ["flat", "flat:name", "flat:foreign", "fixed", "compound"]
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
np_deps = {doc.vocab.strings.add(label) for label in labels}
|
||||
np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
adj_label = doc.vocab.strings.add("amod")
|
||||
det_label = doc.vocab.strings.add("det")
|
||||
det_pos = doc.vocab.strings.add("DET")
|
||||
adp_pos = doc.vocab.strings.add("ADP")
|
||||
conj_label = doc.vocab.strings.add("conj")
|
||||
conj_pos = doc.vocab.strings.add("CCONJ")
|
||||
prev_end = -1
|
||||
for i, word in enumerate(doclike):
|
||||
if word.pos not in (NOUN, PROPN, PRON):
|
||||
|
@ -24,16 +43,43 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
|||
if word.left_edge.i <= prev_end:
|
||||
continue
|
||||
if word.dep in np_deps:
|
||||
prev_end = word.right_edge.i
|
||||
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
||||
elif word.dep == conj:
|
||||
right_childs = list(word.rights)
|
||||
right_child = right_childs[0] if right_childs else None
|
||||
|
||||
if right_child:
|
||||
if (
|
||||
right_child.dep == adj_label
|
||||
): # allow chain of adjectives by expanding to right
|
||||
right_end = right_child.right_edge
|
||||
elif (
|
||||
right_child.dep == det_label and right_child.pos == det_pos
|
||||
): # cut relative pronouns here
|
||||
right_end = right_child
|
||||
elif right_child.dep in np_modifs: # Check if we can expand to right
|
||||
right_end = word.right_edge
|
||||
else:
|
||||
right_end = word
|
||||
else:
|
||||
right_end = word
|
||||
prev_end = right_end.i
|
||||
|
||||
left_index = word.left_edge.i
|
||||
left_index = left_index + 1 if word.left_edge.pos == adp_pos else left_index
|
||||
|
||||
yield left_index, right_end.i + 1, np_label
|
||||
elif word.dep == conj_label:
|
||||
head = word.head
|
||||
while head.dep == conj and head.head.i < head.i:
|
||||
while head.dep == conj_label and head.head.i < head.i:
|
||||
head = head.head
|
||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||
if head.dep in np_deps:
|
||||
prev_end = word.right_edge.i
|
||||
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
||||
prev_end = word.i
|
||||
|
||||
left_index = word.left_edge.i # eliminate left attached conjunction
|
||||
left_index = (
|
||||
left_index + 1 if word.left_edge.pos == conj_pos else left_index
|
||||
)
|
||||
yield left_index, word.i + 1, np_label
|
||||
|
||||
|
||||
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
||||
|
|
18
spacy/lang/hsb/__init__.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
from .lex_attrs import LEX_ATTRS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from ...language import Language, BaseDefaults
|
||||
|
||||
|
||||
class UpperSorbianDefaults(BaseDefaults):
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
class UpperSorbian(Language):
|
||||
lang = "hsb"
|
||||
Defaults = UpperSorbianDefaults
|
||||
|
||||
|
||||
__all__ = ["UpperSorbian"]
|
15
spacy/lang/hsb/examples.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.hsb.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
|
||||
sentences = [
|
||||
"To běšo wjelgin raźone a jo se wót luźi derje pśiwzeło. Tak som dožywiła wjelgin",
|
||||
"Jogo pśewóźowarce stej groniłej, až how w serbskich stronach njama Santa Claus nic pytaś.",
|
||||
"A ten sobuźěłaśeŕ Statneje biblioteki w Barlinju jo pśimjeł drogotne knigły bźez rukajcowu z nagima rukoma!",
|
||||
"Take wobchadanje z našym kulturnym derbstwom zewšym njejźo.",
|
||||
"Wopśimjeśe drugich pśinoskow jo było na wusokem niwowje, ako pśecej.",
|
||||
]
|
106
spacy/lang/hsb/lex_attrs.py
Normal file
|
@ -0,0 +1,106 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
_num_words = [
|
||||
"nul",
|
||||
"jedyn",
|
||||
"jedna",
|
||||
"jedne",
|
||||
"dwaj",
|
||||
"dwě",
|
||||
"tři",
|
||||
"třo",
|
||||
"štyri",
|
||||
"štyrjo",
|
||||
"pjeć",
|
||||
"šěsć",
|
||||
"sydom",
|
||||
"wosom",
|
||||
"dźewjeć",
|
||||
"dźesać",
|
||||
"jědnaće",
|
||||
"dwanaće",
|
||||
"třinaće",
|
||||
"štyrnaće",
|
||||
"pjatnaće",
|
||||
"šěsnaće",
|
||||
"sydomnaće",
|
||||
"wosomnaće",
|
||||
"dźewjatnaće",
|
||||
"dwaceći",
|
||||
"třiceći",
|
||||
"štyrceći",
|
||||
"pjećdźesat",
|
||||
"šěsćdźesat",
|
||||
"sydomdźesat",
|
||||
"wosomdźesat",
|
||||
"dźewjećdźesat",
|
||||
"sto",
|
||||
"tysac",
|
||||
"milion",
|
||||
"miliarda",
|
||||
"bilion",
|
||||
"biliarda",
|
||||
"trilion",
|
||||
"triliarda",
|
||||
]
|
||||
|
||||
_ordinal_words = [
|
||||
"prěni",
|
||||
"prěnja",
|
||||
"prěnje",
|
||||
"druhi",
|
||||
"druha",
|
||||
"druhe",
|
||||
"třeći",
|
||||
"třeća",
|
||||
"třeće",
|
||||
"štwórty",
|
||||
"štwórta",
|
||||
"štwórte",
|
||||
"pjaty",
|
||||
"pjata",
|
||||
"pjate",
|
||||
"šěsty",
|
||||
"šěsta",
|
||||
"šěste",
|
||||
"sydmy",
|
||||
"sydma",
|
||||
"sydme",
|
||||
"wosmy",
|
||||
"wosma",
|
||||
"wosme",
|
||||
"dźewjaty",
|
||||
"dźewjata",
|
||||
"dźewjate",
|
||||
"dźesaty",
|
||||
"dźesata",
|
||||
"dźesate",
|
||||
"jědnaty",
|
||||
"jědnata",
|
||||
"jědnate",
|
||||
"dwanaty",
|
||||
"dwanata",
|
||||
"dwanate",
|
||||
]
|
||||
|
||||
|
||||
def like_num(text):
|
||||
if text.startswith(("+", "-", "±", "~")):
|
||||
text = text[1:]
|
||||
text = text.replace(",", "").replace(".", "")
|
||||
if text.isdigit():
|
||||
return True
|
||||
if text.count("/") == 1:
|
||||
num, denom = text.split("/")
|
||||
if num.isdigit() and denom.isdigit():
|
||||
return True
|
||||
text_lower = text.lower()
|
||||
if text_lower in _num_words:
|
||||
return True
|
||||
# Check ordinal number
|
||||
if text_lower in _ordinal_words:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
LEX_ATTRS = {LIKE_NUM: like_num}
|
19
spacy/lang/hsb/stop_words.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
a abo ale ani
|
||||
|
||||
dokelž
|
||||
|
||||
hdyž
|
||||
|
||||
jeli jelizo
|
||||
|
||||
kaž
|
||||
|
||||
pak potom
|
||||
|
||||
tež tohodla
|
||||
|
||||
zo zoby
|
||||
""".split()
|
||||
)
|
18
spacy/lang/hsb/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, NORM
|
||||
from ...util import update_exc
|
||||
|
||||
_exc = dict()
|
||||
for exc_data in [
|
||||
{ORTH: "mil.", NORM: "milion"},
|
||||
{ORTH: "wob.", NORM: "wobydler"},
|
||||
]:
|
||||
_exc[exc_data[ORTH]] = [exc_data]
|
||||
|
||||
for orth in [
|
||||
"resp.",
|
||||
]:
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
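A quick, illustrative check that the new Upper Sorbian exceptions keep the abbreviations as single tokens with the norms defined above:

from spacy.lang.hsb import UpperSorbian

nlp = UpperSorbian()
doc = nlp("mil. wob.")
print([(t.text, t.norm_) for t in doc])
# expected: [('mil.', 'milion'), ('wob.', 'wobydler')]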
|
|
@ -6,13 +6,15 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from ...language import Language, BaseDefaults
|
||||
from .lemmatizer import ItalianLemmatizer
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class ItalianDefaults(BaseDefaults):
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
stop_words = STOP_WORDS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
stop_words = STOP_WORDS
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class Italian(Language):
|
||||
|
|
|
@ -10,18 +10,18 @@ avresti avrete avrà avrò avuta avute avuti avuto
|
|||
|
||||
basta bene benissimo brava bravo
|
||||
|
||||
casa caso cento certa certe certi certo che chi chicchessia chiunque ci
|
||||
casa caso cento certa certe certi certo che chi chicchessia chiunque ci c'
|
||||
ciascuna ciascuno cima cio cioe circa citta città co codesta codesti codesto
|
||||
cogli coi col colei coll coloro colui come cominci comunque con concernente
|
||||
conciliarsi conclusione consiglio contro cortesia cos cosa cosi così cui
|
||||
|
||||
da dagl dagli dai dal dall dalla dalle dallo dappertutto davanti degl degli
|
||||
dei del dell della delle dello dentro detto deve di dice dietro dire
|
||||
d' da dagl dagli dai dal dall dall' dalla dalle dallo dappertutto davanti degl degli
|
||||
dei del dell dell' della delle dello dentro detto deve di dice dietro dire
|
||||
dirimpetto diventa diventare diventato dopo dov dove dovra dovrà dovunque due
|
||||
dunque durante
|
||||
|
||||
ebbe ebbero ebbi ecc ecco ed effettivamente egli ella entrambi eppure era
|
||||
erano eravamo eravate eri ero esempio esse essendo esser essere essi ex
|
||||
e ebbe ebbero ebbi ecc ecco ed effettivamente egli ella entrambi eppure era
|
||||
erano eravamo eravate eri ero esempio esse essendo esser essere essi ex è
|
||||
|
||||
fa faccia facciamo facciano facciate faccio facemmo facendo facesse facessero
|
||||
facessi facessimo faceste facesti faceva facevamo facevano facevate facevi
|
||||
|
@ -30,21 +30,21 @@ fareste faresti farete farà farò fatto favore fece fecero feci fin finalmente
|
|||
finche fine fino forse forza fosse fossero fossi fossimo foste fosti fra
|
||||
frattempo fu fui fummo fuori furono futuro generale
|
||||
|
||||
gia già giacche giorni giorno gli gliela gliele glieli glielo gliene governo
|
||||
gia già giacche giorni giorno gli gl' gliela gliele glieli glielo gliene governo
|
||||
grande grazie gruppo
|
||||
|
||||
ha haha hai hanno ho
|
||||
|
||||
ieri il improvviso in inc infatti inoltre insieme intanto intorno invece io
|
||||
|
||||
la là lasciato lato lavoro le lei li lo lontano loro lui lungo luogo
|
||||
l' la là lasciato lato lavoro le lei li lo lontano loro lui lungo luogo
|
||||
|
||||
ma macche magari maggior mai male malgrado malissimo mancanza marche me
|
||||
m' ma macche magari maggior mai male malgrado malissimo mancanza marche me
|
||||
medesimo mediante meglio meno mentre mesi mezzo mi mia mie miei mila miliardi
|
||||
milioni minimi ministro mio modo molti moltissimo molto momento mondo mosto
|
||||
|
||||
nazionale ne negl negli nei nel nell nella nelle nello nemmeno neppure nessun
|
||||
nessuna nessuno niente no noi non nondimeno nonostante nonsia nostra nostre
|
||||
nazionale ne negl negli nei nel nell nella nelle nello nemmeno neppure nessun nessun'
|
||||
nessuna nessuno nient' niente no noi non nondimeno nonostante nonsia nostra nostre
|
||||
nostri nostro novanta nove nulla nuovo
|
||||
|
||||
od oggi ogni ognuna ognuno oltre oppure ora ore osi ossia ottanta otto
|
||||
|
@ -56,12 +56,12 @@ potrebbe preferibilmente presa press prima primo principalmente probabilmente
|
|||
proprio puo può pure purtroppo
|
||||
|
||||
qualche qualcosa qualcuna qualcuno quale quali qualunque quando quanta quante
|
||||
quanti quanto quantunque quasi quattro quel quella quelle quelli quello quest
|
||||
quanti quanto quantunque quasi quattro quel quel' quella quelle quelli quello quest quest'
|
||||
questa queste questi questo qui quindi
|
||||
|
||||
realmente recente recentemente registrazione relativo riecco salvo
|
||||
|
||||
sara sarà sarai saranno sarebbe sarebbero sarei saremmo saremo sareste
|
||||
s' sara sarà sarai saranno sarebbe sarebbero sarei saremmo saremo sareste
|
||||
saresti sarete saro sarò scola scopo scorso se secondo seguente seguito sei
|
||||
sembra sembrare sembrato sembri sempre senza sette si sia siamo siano siate
|
||||
siete sig solito solo soltanto sono sopra sotto spesso srl sta stai stando
|
||||
|
@ -72,12 +72,12 @@ steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua
|
|||
subito successivamente successivo sue sugl sugli sui sul sull sulla sulle
|
||||
sullo suo suoi
|
||||
|
||||
tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta
|
||||
t' tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta
|
||||
troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto
|
||||
|
||||
uguali ulteriore ultimo un una uno uomo
|
||||
uguali ulteriore ultimo un un' una uno uomo
|
||||
|
||||
va vale vari varia varie vario verso vi via vicino visto vita voi volta volte
|
||||
v' va vale vari varia varie vario verso vi via vicino visto vita voi volta volte
|
||||
vostra vostre vostri vostro
|
||||
""".split()
|
||||
)
|
||||
|
|
86
spacy/lang/it/syntax_iterators.py
Normal file
|
@ -0,0 +1,86 @@
|
|||
from typing import Union, Iterator, Tuple
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON
|
||||
from ...errors import Errors
|
||||
from ...tokens import Doc, Span
|
||||
|
||||
|
||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||
"""
|
||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||
"""
|
||||
labels = [
|
||||
"nsubj",
|
||||
"nsubj:pass",
|
||||
"obj",
|
||||
"obl",
|
||||
"obl:agent",
|
||||
"nmod",
|
||||
"pcomp",
|
||||
"appos",
|
||||
"ROOT",
|
||||
]
|
||||
post_modifiers = ["flat", "flat:name", "fixed", "compound"]
|
||||
dets = ["det", "det:poss"]
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
np_deps = {doc.vocab.strings.add(label) for label in labels}
|
||||
np_modifs = {doc.vocab.strings.add(modifier) for modifier in post_modifiers}
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
adj_label = doc.vocab.strings.add("amod")
|
||||
det_labels = {doc.vocab.strings.add(det) for det in dets}
|
||||
det_pos = doc.vocab.strings.add("DET")
|
||||
adp_label = doc.vocab.strings.add("ADP")
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
conj_pos = doc.vocab.strings.add("CCONJ")
|
||||
prev_end = -1
|
||||
for i, word in enumerate(doclike):
|
||||
if word.pos not in (NOUN, PROPN, PRON):
|
||||
continue
|
||||
# Prevent nested chunks from being produced
|
||||
if word.left_edge.i <= prev_end:
|
||||
continue
|
||||
if word.dep in np_deps:
|
||||
right_childs = list(word.rights)
|
||||
right_child = right_childs[0] if right_childs else None
|
||||
|
||||
if right_child:
|
||||
if (
|
||||
right_child.dep == adj_label
|
||||
): # allow chain of adjectives by expanding to right
|
||||
right_end = right_child.right_edge
|
||||
elif (
|
||||
right_child.dep in det_labels and right_child.pos == det_pos
|
||||
): # cut relative pronouns here
|
||||
right_end = right_child
|
||||
elif right_child.dep in np_modifs: # Check if we can expand to right
|
||||
right_end = word.right_edge
|
||||
else:
|
||||
right_end = word
|
||||
else:
|
||||
right_end = word
|
||||
prev_end = right_end.i
|
||||
|
||||
left_index = word.left_edge.i
|
||||
left_index = (
|
||||
left_index + 1 if word.left_edge.pos == adp_label else left_index
|
||||
)
|
||||
|
||||
yield left_index, right_end.i + 1, np_label
|
||||
elif word.dep == conj:
|
||||
head = word.head
|
||||
while head.dep == conj and head.head.i < head.i:
|
||||
head = head.head
|
||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||
if head.dep in np_deps:
|
||||
prev_end = word.i
|
||||
|
||||
left_index = word.left_edge.i # eliminate left attached conjunction
|
||||
left_index = (
|
||||
left_index + 1 if word.left_edge.pos == conj_pos else left_index
|
||||
)
|
||||
yield left_index, word.i + 1, np_label
|
||||
|
||||
|
||||
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
|
@ -1,12 +1,13 @@
|
|||
from typing import Iterator, Any, Dict
|
||||
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tag_map import TAG_MAP
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language, BaseDefaults
|
||||
from ...tokens import Doc
|
||||
from ...scorer import Scorer
|
||||
from ...symbols import POS
|
||||
from ...symbols import POS, X
|
||||
from ...training import validate_examples
|
||||
from ...util import DummyTokenizer, registry, load_config_from_str
|
||||
from ...vocab import Vocab
|
||||
|
@ -31,15 +32,24 @@ def create_tokenizer():
|
|||
class KoreanTokenizer(DummyTokenizer):
|
||||
def __init__(self, vocab: Vocab):
|
||||
self.vocab = vocab
|
||||
MeCab = try_mecab_import() # type: ignore[func-returns-value]
|
||||
self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
|
||||
self._mecab = try_mecab_import() # type: ignore[func-returns-value]
|
||||
self._mecab_tokenizer = None
|
||||
|
||||
@property
|
||||
def mecab_tokenizer(self):
|
||||
# This is a property so that initializing a pipeline with blank:ko is
|
||||
# possible without actually requiring mecab-ko, e.g. to run
|
||||
# `spacy init vectors ko` for a pipeline that will have a different
|
||||
# tokenizer in the end. The languages need to match for the vectors
|
||||
# to be imported and there's no way to pass a custom config to
|
||||
# `init vectors`.
|
||||
if self._mecab_tokenizer is None:
|
||||
self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
|
||||
return self._mecab_tokenizer
|
||||
|
||||
def __reduce__(self):
|
||||
return KoreanTokenizer, (self.vocab,)
|
||||
|
||||
def __del__(self):
|
||||
self.mecab_tokenizer.__del__()
|
||||
|
||||
def __call__(self, text: str) -> Doc:
|
||||
dtokens = list(self.detailed_tokens(text))
|
||||
surfaces = [dt["surface"] for dt in dtokens]
|
||||
|
@ -47,7 +57,10 @@ class KoreanTokenizer(DummyTokenizer):
|
|||
for token, dtoken in zip(doc, dtokens):
|
||||
first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
|
||||
token.tag_ = first_tag # stem(어간) or pre-final(선어말 어미)
|
||||
if token.tag_ in TAG_MAP:
|
||||
token.pos = TAG_MAP[token.tag_][POS]
|
||||
else:
|
||||
token.pos = X
|
||||
token.lemma_ = dtoken["lemma"]
|
||||
doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
|
||||
return doc
|
||||
|
@ -76,6 +89,7 @@ class KoreanDefaults(BaseDefaults):
|
|||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||
infixes = TOKENIZER_INFIXES
|
||||
|
||||
|
||||
class Korean(Language):
|
||||
|
@ -90,7 +104,8 @@ def try_mecab_import() -> None:
|
|||
return MeCab
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
|
||||
'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
|
||||
"[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
|
||||
"[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
|
||||
"and [natto-py](https://github.com/buruzaemon/natto-py)"
|
||||
) from None
|
||||
|
|
12
spacy/lang/ko/punctuation.py
Normal file
|
@ -0,0 +1,12 @@
|
|||
from ..char_classes import LIST_QUOTES
|
||||
from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
|
||||
|
||||
|
||||
_infixes = (
|
||||
["·", "ㆍ", "\(", "\)"]
|
||||
+ [r"(?<=[0-9])~(?=[0-9-])"]
|
||||
+ LIST_QUOTES
|
||||
+ BASE_TOKENIZER_INFIXES
|
||||
)
|
||||
|
||||
TOKENIZER_INFIXES = _infixes
|
|
@ -4,46 +4,42 @@ alle allerede alt and andre annen annet at av
|
|||
|
||||
bak bare bedre beste blant ble bli blir blitt bris by både
|
||||
|
||||
da dag de del dem den denne der dermed det dette disse drept du
|
||||
da dag de del dem den denne der dermed det dette disse du
|
||||
|
||||
eller en enn er et ett etter
|
||||
|
||||
fem fikk fire fjor flere folk for fortsatt fotball fra fram frankrike fredag
|
||||
fem fikk fire fjor flere folk for fortsatt fra fram
|
||||
funnet få får fått før først første
|
||||
|
||||
gang gi gikk gjennom gjorde gjort gjør gjøre god godt grunn gå går
|
||||
|
||||
ha hadde ham han hans har hele helt henne hennes her hun hva hvor hvordan
|
||||
hvorfor
|
||||
ha hadde ham han hans har hele helt henne hennes her hun
|
||||
|
||||
i ifølge igjen ikke ingen inn
|
||||
|
||||
ja jeg
|
||||
|
||||
kamp kampen kan kl klart kom komme kommer kontakt kort kroner kunne kveld
|
||||
kvinner
|
||||
|
||||
la laget land landet langt leder ligger like litt løpet lørdag
|
||||
la laget land landet langt leder ligger like litt løpet
|
||||
|
||||
man mandag mange mannen mars med meg mellom men mener menn mennesker mens mer
|
||||
millioner minutter mot msci mye må mål måtte
|
||||
man mange med meg mellom men mener mennesker mens mer mot mye må mål måtte
|
||||
|
||||
ned neste noe noen nok norge norsk norske ntb ny nye nå når
|
||||
ned neste noe noen nok ny nye nå når
|
||||
|
||||
og også om onsdag opp opplyser oslo oss over
|
||||
og også om opp opplyser oss over
|
||||
|
||||
personer plass poeng politidistrikt politiet president prosent på
|
||||
personer plass poeng på
|
||||
|
||||
regjeringen runde rundt russland
|
||||
runde rundt
|
||||
|
||||
sa saken samme sammen samtidig satt se seg seks selv senere september ser sett
|
||||
sa saken samme sammen samtidig satt se seg seks selv senere ser sett
|
||||
siden sier sin sine siste sitt skal skriver skulle slik som sted stedet stor
|
||||
store står sverige svært så søndag
|
||||
store står svært så
|
||||
|
||||
ta tatt tid tidligere til tilbake tillegg tirsdag to tok torsdag tre tror
|
||||
tyskland
|
||||
ta tatt tid tidligere til tilbake tillegg tok tror
|
||||
|
||||
under usa ut uten utenfor
|
||||
under ut uten utenfor
|
||||
|
||||
vant var ved veldig vi videre viktig vil ville viser vår være vært
|
||||
|
||||
|
|
|
@ -1,56 +1,219 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
_num_words = [
|
||||
"ноль",
|
||||
"один",
|
||||
"два",
|
||||
"три",
|
||||
"четыре",
|
||||
"пять",
|
||||
"шесть",
|
||||
"семь",
|
||||
"восемь",
|
||||
"девять",
|
||||
"десять",
|
||||
"одиннадцать",
|
||||
"двенадцать",
|
||||
"тринадцать",
|
||||
"четырнадцать",
|
||||
"пятнадцать",
|
||||
"шестнадцать",
|
||||
"семнадцать",
|
||||
"восемнадцать",
|
||||
"девятнадцать",
|
||||
"двадцать",
|
||||
"тридцать",
|
||||
"сорок",
|
||||
"пятьдесят",
|
||||
"шестьдесят",
|
||||
"семьдесят",
|
||||
"восемьдесят",
|
||||
"девяносто",
|
||||
"сто",
|
||||
"двести",
|
||||
"триста",
|
||||
"четыреста",
|
||||
"пятьсот",
|
||||
"шестьсот",
|
||||
"семьсот",
|
||||
"восемьсот",
|
||||
"девятьсот",
|
||||
"тысяча",
|
||||
"миллион",
|
||||
"миллиард",
|
||||
"триллион",
|
||||
"квадриллион",
|
||||
"квинтиллион",
|
||||
]
|
||||
_num_words = list(
|
||||
set(
|
||||
"""
|
||||
ноль ноля нолю нолём ноле нулевой нулевого нулевому нулевым нулевом нулевая нулевую нулевое нулевые нулевых нулевыми
|
||||
|
||||
четверть четверти четвертью четвертей четвертям четвертями четвертях
|
||||
|
||||
треть трети третью третей третям третями третях
|
||||
|
||||
половина половины половине половину половиной половин половинам половинами половинах половиною
|
||||
|
||||
один одного одному одним одном
|
||||
первой первого первому первом первый первым первых
|
||||
во-первых
|
||||
единица единицы единице единицу единицей единиц единицам единицами единицах единицею
|
||||
|
||||
два двумя двум двух двоих двое две
|
||||
второго второму второй втором вторым вторых
|
||||
двойка двойки двойке двойку двойкой двоек двойкам двойками двойках двойкою
|
||||
во-вторых
|
||||
оба обе обеим обеими обеих обоим обоими обоих
|
||||
|
||||
полтора полторы полутора
|
||||
|
||||
три третьего третьему третьем третьим третий тремя трем трех трое троих трёх
|
||||
тройка тройки тройке тройку тройкою троек тройкам тройками тройках тройкой
|
||||
троечка троечки троечке троечку троечкой троечек троечкам троечками троечках троечкой
|
||||
трешка трешки трешке трешку трешкой трешек трешкам трешками трешках трешкою
|
||||
трёшка трёшки трёшке трёшку трёшкой трёшек трёшкам трёшками трёшках трёшкою
|
||||
трояк трояка трояку трояком трояке трояки трояков троякам трояками трояках
|
||||
треха треху трехой
|
||||
трёха трёху трёхой
|
||||
втроем втроём
|
||||
|
||||
четыре четвертого четвертому четвертом четвертый четвертым четверка четырьмя четырем четырех четверо четырёх четверым
|
||||
четверых
|
||||
вчетвером
|
||||
|
||||
пять пятого пятому пятом пятый пятым пятью пяти пятеро пятерых пятерыми
|
||||
впятером
|
||||
пятерочка пятерочки пятерочке пятерочками пятерочкой пятерочку пятерочкой пятерочками
|
||||
пятёрочка пятёрочки пятёрочке пятёрочками пятёрочкой пятёрочку пятёрочкой пятёрочками
|
||||
пятерка пятерки пятерке пятерками пятеркой пятерку пятерками
|
||||
пятёрка пятёрки пятёрке пятёрками пятёркой пятёрку пятёрками
|
||||
пятёра пятёры пятёре пятёрами пятёрой пятёру пятёрами
|
||||
пятера пятеры пятере пятерами пятерой пятеру пятерами
|
||||
пятак пятаки пятаке пятаками пятаком пятаку пятаками
|
||||
|
||||
шесть шестерка шестого шестому шестой шестом шестым шестью шести шестеро шестерых
|
||||
вшестером
|
||||
|
||||
семь семерка седьмого седьмому седьмой седьмом седьмым семью семи семеро седьмых
|
||||
всемером
|
||||
|
||||
восемь восьмерка восьмого восьмому восемью восьмой восьмом восьмым восеми восьмером восьми восьмью
|
||||
восьмерых
|
||||
ввосьмером
|
||||
|
||||
девять девятого девятому девятка девятом девятый девятым девятью девяти девятером вдевятером девятерых
|
||||
вдевятером
|
||||
|
||||
десять десятого десятому десятка десятом десятый десятым десятью десяти десятером десятых
|
||||
вдесятером
|
||||
|
||||
одиннадцать одиннадцатого одиннадцатому одиннадцатом одиннадцатый одиннадцатым одиннадцатью одиннадцати
|
||||
одиннадцатых
|
||||
|
||||
двенадцать двенадцатого двенадцатому двенадцатом двенадцатый двенадцатым двенадцатью двенадцати
|
||||
двенадцатых
|
||||
|
||||
тринадцать тринадцатого тринадцатому тринадцатом тринадцатый тринадцатым тринадцатью тринадцати
|
||||
тринадцатых
|
||||
|
||||
четырнадцать четырнадцатого четырнадцатому четырнадцатом четырнадцатый четырнадцатым четырнадцатью четырнадцати
|
||||
четырнадцатых
|
||||
|
||||
пятнадцать пятнадцатого пятнадцатому пятнадцатом пятнадцатый пятнадцатым пятнадцатью пятнадцати
|
||||
пятнадцатых
|
||||
пятнарик пятнарику пятнариком пятнарики
|
||||
|
||||
шестнадцать шестнадцатого шестнадцатому шестнадцатом шестнадцатый шестнадцатым шестнадцатью шестнадцати
|
||||
шестнадцатых
|
||||
|
||||
семнадцать семнадцатого семнадцатому семнадцатом семнадцатый семнадцатым семнадцатью семнадцати семнадцатых
|
||||
|
||||
восемнадцать восемнадцатого восемнадцатому восемнадцатом восемнадцатый восемнадцатым восемнадцатью восемнадцати
|
||||
восемнадцатых
|
||||
|
||||
девятнадцать девятнадцатого девятнадцатому девятнадцатом девятнадцатый девятнадцатым девятнадцатью девятнадцати
|
||||
девятнадцатых
|
||||
|
||||
двадцать двадцатого двадцатому двадцатом двадцатый двадцатым двадцатью двадцати двадцатых
|
||||
|
||||
четвертак четвертака четвертаке четвертаку четвертаки четвертаком четвертаками
|
||||
|
||||
тридцать тридцатого тридцатому тридцатом тридцатый тридцатым тридцатью тридцати тридцатых
|
||||
тридцадка тридцадку тридцадке тридцадки тридцадкой тридцадкою тридцадками
|
||||
|
||||
тридевять тридевяти тридевятью
|
||||
|
||||
сорок сорокового сороковому сороковом сороковым сороковой сороковых
|
||||
сорокет сорокета сорокету сорокете сорокеты сорокетом сорокетами сорокетам
|
||||
|
||||
пятьдесят пятьдесятого пятьдесятому пятьюдесятью пятьдесятом пятьдесятый пятьдесятым пятидесяти пятьдесятых
|
||||
полтинник полтинника полтиннике полтиннику полтинники полтинником полтинниками полтинникам полтинниках
|
||||
пятидесятка пятидесятке пятидесятку пятидесятки пятидесяткой пятидесятками пятидесяткам пятидесятках
|
||||
полтос полтоса полтосе полтосу полтосы полтосом полтосами полтосам полтосах
|
||||
|
||||
шестьдесят шестьдесятого шестьдесятому шестьюдесятью шестьдесятом шестьдесятый шестьдесятым шестидесятые шестидесяти
|
||||
шестьдесятых
|
||||
|
||||
семьдесят семьдесятого семьдесятому семьюдесятью семьдесятом семьдесятый семьдесятым семидесяти семьдесятых
|
||||
|
||||
восемьдесят восемьдесятого восемьдесятому восемьюдесятью восемьдесятом восемьдесятый восемьдесятым восемидесяти
|
||||
восьмидесяти восьмидесятых
|
||||
|
||||
девяносто девяностого девяностому девяностом девяностый девяностым девяноста девяностых
|
||||
|
||||
сто сотого сотому сотом сотен сотый сотым ста
|
||||
стольник стольника стольнику стольнике стольники стольником стольниками
|
||||
сотка сотки сотке соткой сотками соткам сотках
|
||||
сотня сотни сотне сотней сотнями сотням сотнях
|
||||
|
||||
двести двумястами двухсотого двухсотому двухсотом двухсотый двухсотым двумстам двухстах двухсот
|
||||
|
||||
триста тремястами трехсотого трехсотому трехсотом трехсотый трехсотым тремстам трехстах трехсот
|
||||
|
||||
четыреста четырехсотого четырехсотому четырьмястами четырехсотом четырехсотый четырехсотым четыремстам четырехстах
|
||||
четырехсот
|
||||
|
||||
пятьсот пятисотого пятисотому пятьюстами пятисотом пятисотый пятисотым пятистам пятистах пятисот
|
||||
пятисотка пятисотки пятисотке пятисоткой пятисотками пятисоткам пятисоткою пятисотках
|
||||
пятихатка пятихатки пятихатке пятихаткой пятихатками пятихаткам пятихаткою пятихатках
|
||||
пятифан пятифаны пятифане пятифаном пятифанами пятифанах
|
||||
|
||||
шестьсот шестисотого шестисотому шестьюстами шестисотом шестисотый шестисотым шестистам шестистах шестисот
|
||||
|
||||
семьсот семисотого семисотому семьюстами семисотом семисотый семисотым семистам семистах семисот
|
||||
|
||||
восемьсот восемисотого восемисотому восемисотом восемисотый восемисотым восьмистами восьмистам восьмистах восьмисот
|
||||
|
||||
девятьсот девятисотого девятисотому девятьюстами девятисотом девятисотый девятисотым девятистам девятистах девятисот
|
||||
|
||||
тысяча тысячного тысячному тысячном тысячный тысячным тысячам тысячах тысячей тысяч тысячи тыс
|
||||
косарь косаря косару косарем косарями косарях косарям косарей
|
||||
|
||||
десятитысячный десятитысячного десятитысячному десятитысячным десятитысячном десятитысячная десятитысячной
|
||||
десятитысячную десятитысячною десятитысячное десятитысячные десятитысячных десятитысячными
|
||||
|
||||
двадцатитысячный двадцатитысячного двадцатитысячному двадцатитысячным двадцатитысячном двадцатитысячная
|
||||
двадцатитысячной двадцатитысячную двадцатитысячною двадцатитысячное двадцатитысячные двадцатитысячных
|
||||
двадцатитысячными
|
||||
|
||||
тридцатитысячный тридцатитысячного тридцатитысячному тридцатитысячным тридцатитысячном тридцатитысячная
|
||||
тридцатитысячной тридцатитысячную тридцатитысячною тридцатитысячное тридцатитысячные тридцатитысячных
|
||||
тридцатитысячными
|
||||
|
||||
сорокатысячный сорокатысячного сорокатысячному сорокатысячным сорокатысячном сорокатысячная
|
||||
сорокатысячной сорокатысячную сорокатысячною сорокатысячное сорокатысячные сорокатысячных
|
||||
сорокатысячными
|
||||
|
||||
пятидесятитысячный пятидесятитысячного пятидесятитысячному пятидесятитысячным пятидесятитысячном пятидесятитысячная
|
||||
пятидесятитысячной пятидесятитысячную пятидесятитысячною пятидесятитысячное пятидесятитысячные пятидесятитысячных
|
||||
пятидесятитысячными
|
||||
|
||||
шестидесятитысячный шестидесятитысячного шестидесятитысячному шестидесятитысячным шестидесятитысячном шестидесятитысячная
|
||||
шестидесятитысячной шестидесятитысячную шестидесятитысячною шестидесятитысячное шестидесятитысячные шестидесятитысячных
|
||||
шестидесятитысячными
|
||||
|
||||
семидесятитысячный семидесятитысячного семидесятитысячному семидесятитысячным семидесятитысячном семидесятитысячная
|
||||
семидесятитысячной семидесятитысячную семидесятитысячною семидесятитысячное семидесятитысячные семидесятитысячных
|
||||
семидесятитысячными
|
||||
|
||||
восьмидесятитысячный восьмидесятитысячного восьмидесятитысячному восьмидесятитысячным восьмидесятитысячном восьмидесятитысячная
|
||||
восьмидесятитысячной восьмидесятитысячную восьмидесятитысячною восьмидесятитысячное восьмидесятитысячные восьмидесятитысячных
|
||||
восьмидесятитысячными
|
||||
|
||||
стотысячный стотысячного стотысячному стотысячным стотысячном стотысячная стотысячной стотысячную стотысячное
|
||||
стотысячные стотысячных стотысячными стотысячною
|
||||
|
||||
миллион миллионного миллионов миллионному миллионном миллионный миллионным миллионом миллиона миллионе миллиону
|
||||
миллионов
|
||||
лям ляма лямы лямом лямами лямах лямов
|
||||
млн
|
||||
|
||||
десятимиллионная десятимиллионной десятимиллионными десятимиллионный десятимиллионным десятимиллионному
|
||||
десятимиллионными десятимиллионную десятимиллионное десятимиллионные десятимиллионных десятимиллионною
|
||||
|
||||
миллиард миллиардного миллиардному миллиардном миллиардный миллиардным миллиардом миллиарда миллиарде миллиарду
|
||||
миллиардов
|
||||
лярд лярда лярды лярдом лярдами лярдах лярдов
|
||||
млрд
|
||||
|
||||
триллион триллионного триллионному триллионном триллионный триллионным триллионом триллиона триллионе триллиону
|
||||
триллионов трлн
|
||||
|
||||
квадриллион квадриллионного квадриллионному квадриллионный квадриллионным квадриллионом квадриллиона квадриллионе
|
||||
квадриллиону квадриллионов квадрлн
|
||||
|
||||
квинтиллион квинтиллионного квинтиллионному квинтиллионный квинтиллионным квинтиллионом квинтиллиона квинтиллионе
|
||||
квинтиллиону квинтиллионов квинтлн
|
||||
|
||||
i ii iii iv v vi vii viii ix x xi xii xiii xiv xv xvi xvii xviii xix xx xxi xxii xxiii xxiv xxv xxvi xxvii xxvii xxix
|
||||
""".split()
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def like_num(text):
|
||||
if text.startswith(("+", "-", "±", "~")):
|
||||
text = text[1:]
|
||||
if text.endswith("%"):
|
||||
text = text[:-1]
|
||||
text = text.replace(",", "").replace(".", "")
|
||||
if text.isdigit():
|
||||
return True
|
||||
|
|
|
@ -1,52 +1,111 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
а
|
||||
а авось ага агу аж ай али алло ау ах ая
|
||||
|
||||
будем будет будете будешь буду будут будучи будь будьте бы был была были было
|
||||
быть
|
||||
б будем будет будете будешь буду будут будучи будь будьте бы был была были было
|
||||
быть бац без безусловно бишь благо благодаря ближайшие близко более больше
|
||||
будто бывает бывала бывали бываю бывают бытует
|
||||
|
||||
в вам вами вас весь во вот все всё всего всей всем всём всеми всему всех всею
|
||||
всея всю вся вы
|
||||
всея всю вся вы ваш ваша ваше ваши вдали вдобавок вдруг ведь везде вернее
|
||||
взаимно взаправду видно вишь включая вместо внакладе вначале вне вниз внизу
|
||||
вновь вовсе возможно воистину вокруг вон вообще вопреки вперекор вплоть
|
||||
вполне вправду вправе впрочем впрямь вресноту вроде вряд всегда всюду
|
||||
всякий всякого всякой всячески вчеред
|
||||
|
||||
да для до
|
||||
г го где гораздо гав
|
||||
|
||||
его едим едят ее её ей ел ела ем ему емъ если ест есть ешь еще ещё ею
|
||||
д да для до дабы давайте давно давным даже далее далеко дальше данная
|
||||
данного данное данной данном данному данные данный данных дану данунах
|
||||
даром де действительно довольно доколе доколь долго должен должна
|
||||
должно должны должный дополнительно другая другие другим другими
|
||||
других другое другой
|
||||
|
||||
же
|
||||
е его едим едят ее её ей ел ела ем ему емъ если ест есть ешь еще ещё ею едва
|
||||
ежели еле
|
||||
|
||||
за
|
||||
ж же
|
||||
|
||||
и из или им ими имъ их
|
||||
з за затем зато зачем здесь значит зря
|
||||
|
||||
и из или им ими имъ их ибо иль имеет имел имела имело именно иметь иначе
|
||||
иногда иным иными итак ишь
|
||||
|
||||
й
|
||||
|
||||
к как кем ко когда кого ком кому комья которая которого которое которой котором
|
||||
которому которою которую которые который которым которыми которых кто
|
||||
которому которою которую которые который которым которыми которых кто ка кабы
|
||||
каждая каждое каждые каждый кажется казалась казались казалось казался казаться
|
||||
какая какие каким какими каков какого какой какому какою касательно кой коли
|
||||
коль конечно короче кроме кстати ку куда
|
||||
|
||||
меня мне мной мною мог моги могите могла могли могло могу могут мое моё моего
|
||||
л ли либо лишь любая любого любое любой любом любую любыми любых
|
||||
|
||||
м меня мне мной мною мог моги могите могла могли могло могу могут мое моё моего
|
||||
моей моем моём моему моею можем может можете можешь мои мой моим моими моих
|
||||
мочь мою моя мы
|
||||
мочь мою моя мы мало меж между менее меньше мимо многие много многого многое
|
||||
многом многому можно мол му
|
||||
|
||||
на нам нами нас наса наш наша наше нашего нашей нашем нашему нашею наши нашим
|
||||
н на нам нами нас наса наш наша наше нашего нашей нашем нашему нашею наши нашим
|
||||
нашими наших нашу не него нее неё ней нем нём нему нет нею ним ними них но
|
||||
наверняка наверху навряд навыворот над надо назад наиболее наизворот
|
||||
наизнанку наипаче накануне наконец наоборот наперед наперекор наподобие
|
||||
например напротив напрямую насилу настоящая настоящее настоящие настоящий
|
||||
насчет нате находиться начала начале неважно негде недавно недалеко незачем
|
||||
некем некогда некому некоторая некоторые некоторый некоторых некто некуда
|
||||
нельзя немногие немногим немного необходимо необходимости необходимые
|
||||
необходимым неоткуда непрерывно нередко несколько нету неужели нечего
|
||||
нечем нечему нечто нешто нибудь нигде ниже низко никак никакой никем
|
||||
никогда никого никому никто никуда ниоткуда нипочем ничего ничем ничему
|
||||
ничто ну нужная нужно нужного нужные нужный нужных ныне нынешнее нынешней
|
||||
нынешних нынче
|
||||
|
||||
о об один одна одни одним одними одних одно одного одной одном одному одною
|
||||
одну он она оне они оно от
|
||||
одну он она оне они оно от оба общую обычно ого однажды однако ой около оный
|
||||
оп опять особенно особо особую особые откуда отнелижа отнелиже отовсюду
|
||||
отсюда оттого оттот оттуда отчего отчему ох очевидно очень ом
|
||||
|
||||
по при
|
||||
п по при паче перед под подавно поди подобная подобно подобного подобные
|
||||
подобный подобным подобных поелику пожалуй пожалуйста позже поистине
|
||||
пока покамест поколе поколь покуда покудова помимо понеже поприще пор
|
||||
пора посему поскольку после посреди посредством потом потому потомушта
|
||||
похожем почему почти поэтому прежде притом причем про просто прочего
|
||||
прочее прочему прочими проще прям пусть
|
||||
|
||||
р ради разве ранее рано раньше рядом
|
||||
|
||||
с сам сама сами самим самими самих само самого самом самому саму свое своё
|
||||
своего своей своем своём своему своею свои свой своим своими своих свою своя
|
||||
себе себя собой собою
|
||||
себе себя собой собою самая самое самой самый самых сверх свыше се сего сей
|
||||
сейчас сие сих сквозь сколько скорее скоро следует слишком смогут сможет
|
||||
сначала снова со собственно совсем сперва спокону спустя сразу среди сродни
|
||||
стал стала стали стало стать суть сызнова
|
||||
|
||||
та так такая такие таким такими таких такого такое такой таком такому такою
|
||||
такую те тебе тебя тем теми тех то тобой тобою того той только том томах тому
|
||||
тот тою ту ты
|
||||
та то ту ты ти так такая такие таким такими таких такого такое такой таком такому такою
|
||||
такую те тебе тебя тем теми тех тобой тобою того той только том томах тому
|
||||
тот тою также таки таков такова там твои твоим твоих твой твоя твоё
|
||||
теперь тогда тоже тотчас точно туда тут тьфу тая
|
||||
|
||||
у уже
|
||||
у уже увы уж ура ух ую
|
||||
|
||||
чего чем чём чему что чтобы
|
||||
ф фу
|
||||
|
||||
эта эти этим этими этих это этого этой этом этому этот этою эту
|
||||
х ха хе хорошо хотел хотела хотелось хотеть хоть хотя хочешь хочу хуже
|
||||
|
||||
я
|
||||
ч чего чем чём чему что чтобы часто чаще чей через чтоб чуть чхать чьим
|
||||
чьих чьё чё
|
||||
|
||||
ш ша
|
||||
|
||||
щ ща щас
|
||||
|
||||
ы ых ые ый
|
||||
|
||||
э эта эти этим этими этих это этого этой этом этому этот этою эту эдак эдакий
|
||||
эй эка экий этак этакий эх
|
||||
|
||||
ю
|
||||
|
||||
я явно явных яко якобы якоже
|
||||
""".split()
|
||||
)
|
||||
|
|
|
@ -2,7 +2,6 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
|||
from ...symbols import ORTH, NORM
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
||||
_abbrev_exc = [
|
||||
|
@ -42,7 +41,6 @@ _abbrev_exc = [
|
|||
{ORTH: "дек", NORM: "декабрь"},
|
||||
]
|
||||
|
||||
|
||||
for abbrev_desc in _abbrev_exc:
|
||||
abbrev = abbrev_desc[ORTH]
|
||||
for orth in (abbrev, abbrev.capitalize(), abbrev.upper()):
|
||||
|
@ -50,17 +48,354 @@ for abbrev_desc in _abbrev_exc:
|
|||
_exc[orth + "."] = [{ORTH: orth + ".", NORM: abbrev_desc[NORM]}]
|
||||
|
||||
|
||||
_slang_exc = [
|
||||
for abbr in [
|
||||
# Year slang abbreviations
|
||||
{ORTH: "2к15", NORM: "2015"},
|
||||
{ORTH: "2к16", NORM: "2016"},
|
||||
{ORTH: "2к17", NORM: "2017"},
|
||||
{ORTH: "2к18", NORM: "2018"},
|
||||
{ORTH: "2к19", NORM: "2019"},
|
||||
{ORTH: "2к20", NORM: "2020"},
|
||||
]
|
||||
{ORTH: "2к21", NORM: "2021"},
|
||||
{ORTH: "2к22", NORM: "2022"},
|
||||
{ORTH: "2к23", NORM: "2023"},
|
||||
{ORTH: "2к24", NORM: "2024"},
|
||||
{ORTH: "2к25", NORM: "2025"},
|
||||
]:
|
||||
_exc[abbr[ORTH]] = [abbr]
|
||||
|
||||
for slang_desc in _slang_exc:
|
||||
_exc[slang_desc[ORTH]] = [slang_desc]
|
||||
for abbr in [
|
||||
# Profession and academic titles abbreviations
|
||||
{ORTH: "ак.", NORM: "академик"},
|
||||
{ORTH: "акад.", NORM: "академик"},
|
||||
{ORTH: "д-р архитектуры", NORM: "доктор архитектуры"},
|
||||
{ORTH: "д-р биол. наук", NORM: "доктор биологических наук"},
|
||||
{ORTH: "д-р ветеринар. наук", NORM: "доктор ветеринарных наук"},
|
||||
{ORTH: "д-р воен. наук", NORM: "доктор военных наук"},
|
||||
{ORTH: "д-р геогр. наук", NORM: "доктор географических наук"},
|
||||
{ORTH: "д-р геол.-минерал. наук", NORM: "доктор геолого-минералогических наук"},
|
||||
{ORTH: "д-р искусствоведения", NORM: "доктор искусствоведения"},
|
||||
{ORTH: "д-р ист. наук", NORM: "доктор исторических наук"},
|
||||
{ORTH: "д-р культурологии", NORM: "доктор культурологии"},
|
||||
{ORTH: "д-р мед. наук", NORM: "доктор медицинских наук"},
|
||||
{ORTH: "д-р пед. наук", NORM: "доктор педагогических наук"},
|
||||
{ORTH: "д-р полит. наук", NORM: "доктор политических наук"},
|
||||
{ORTH: "д-р психол. наук", NORM: "доктор психологических наук"},
|
||||
{ORTH: "д-р с.-х. наук", NORM: "доктор сельскохозяйственных наук"},
|
||||
{ORTH: "д-р социол. наук", NORM: "доктор социологических наук"},
|
||||
{ORTH: "д-р техн. наук", NORM: "доктор технических наук"},
|
||||
{ORTH: "д-р фармацевт. наук", NORM: "доктор фармацевтических наук"},
|
||||
{ORTH: "д-р физ.-мат. наук", NORM: "доктор физико-математических наук"},
|
||||
{ORTH: "д-р филол. наук", NORM: "доктор филологических наук"},
|
||||
{ORTH: "д-р филос. наук", NORM: "доктор философских наук"},
|
||||
{ORTH: "д-р хим. наук", NORM: "доктор химических наук"},
|
||||
{ORTH: "д-р экон. наук", NORM: "доктор экономических наук"},
|
||||
{ORTH: "д-р юрид. наук", NORM: "доктор юридических наук"},
|
||||
{ORTH: "д-р", NORM: "доктор"},
|
||||
{ORTH: "д.б.н.", NORM: "доктор биологических наук"},
|
||||
{ORTH: "д.г.-м.н.", NORM: "доктор геолого-минералогических наук"},
|
||||
{ORTH: "д.г.н.", NORM: "доктор географических наук"},
|
||||
{ORTH: "д.и.н.", NORM: "доктор исторических наук"},
|
||||
{ORTH: "д.иск.", NORM: "доктор искусствоведения"},
|
||||
{ORTH: "д.м.н.", NORM: "доктор медицинских наук"},
|
||||
{ORTH: "д.п.н.", NORM: "доктор психологических наук"},
|
||||
{ORTH: "д.пед.н.", NORM: "доктор педагогических наук"},
|
||||
{ORTH: "д.полит.н.", NORM: "доктор политических наук"},
|
||||
{ORTH: "д.с.-х.н.", NORM: "доктор сельскохозяйственных наук"},
|
||||
{ORTH: "д.социол.н.", NORM: "доктор социологических наук"},
|
||||
{ORTH: "д.т.н.", NORM: "доктор технических наук"},
|
||||
{ORTH: "д.т.н", NORM: "доктор технических наук"},
|
||||
{ORTH: "д.ф.-м.н.", NORM: "доктор физико-математических наук"},
|
||||
{ORTH: "д.ф.н.", NORM: "доктор филологических наук"},
|
||||
{ORTH: "д.филос.н.", NORM: "доктор философских наук"},
|
||||
{ORTH: "д.фил.н.", NORM: "доктор филологических наук"},
|
||||
{ORTH: "д.х.н.", NORM: "доктор химических наук"},
|
||||
{ORTH: "д.э.н.", NORM: "доктор экономических наук"},
|
||||
{ORTH: "д.э.н", NORM: "доктор экономических наук"},
|
||||
{ORTH: "д.ю.н.", NORM: "доктор юридических наук"},
|
||||
{ORTH: "доц.", NORM: "доцент"},
|
||||
{ORTH: "и.о.", NORM: "исполняющий обязанности"},
|
||||
{ORTH: "к.б.н.", NORM: "кандидат биологических наук"},
|
||||
{ORTH: "к.воен.н.", NORM: "кандидат военных наук"},
|
||||
{ORTH: "к.г.-м.н.", NORM: "кандидат геолого-минералогических наук"},
|
||||
{ORTH: "к.г.н.", NORM: "кандидат географических наук"},
|
||||
{ORTH: "к.геогр.н", NORM: "кандидат географических наук"},
|
||||
{ORTH: "к.геогр.наук", NORM: "кандидат географических наук"},
|
||||
{ORTH: "к.и.н.", NORM: "кандидат исторических наук"},
|
||||
{ORTH: "к.иск.", NORM: "кандидат искусствоведения"},
|
||||
{ORTH: "к.м.н.", NORM: "кандидат медицинских наук"},
|
||||
{ORTH: "к.п.н.", NORM: "кандидат психологических наук"},
|
||||
{ORTH: "к.псх.н.", NORM: "кандидат психологических наук"},
|
||||
{ORTH: "к.пед.н.", NORM: "кандидат педагогических наук"},
|
||||
{ORTH: "канд.пед.наук", NORM: "кандидат педагогических наук"},
|
||||
{ORTH: "к.полит.н.", NORM: "кандидат политических наук"},
|
||||
{ORTH: "к.с.-х.н.", NORM: "кандидат сельскохозяйственных наук"},
|
||||
{ORTH: "к.социол.н.", NORM: "кандидат социологических наук"},
|
||||
{ORTH: "к.с.н.", NORM: "кандидат социологических наук"},
|
||||
{ORTH: "к.т.н.", NORM: "кандидат технических наук"},
|
||||
{ORTH: "к.ф.-м.н.", NORM: "кандидат физико-математических наук"},
|
||||
{ORTH: "к.ф.н.", NORM: "кандидат филологических наук"},
|
||||
{ORTH: "к.фил.н.", NORM: "кандидат филологических наук"},
|
||||
{ORTH: "к.филол.н", NORM: "кандидат филологических наук"},
|
||||
{ORTH: "к.фарм.наук", NORM: "кандидат фармакологических наук"},
|
||||
{ORTH: "к.фарм.н.", NORM: "кандидат фармакологических наук"},
|
||||
{ORTH: "к.фарм.н", NORM: "кандидат фармакологических наук"},
|
||||
{ORTH: "к.филос.наук", NORM: "кандидат философских наук"},
|
||||
{ORTH: "к.филос.н.", NORM: "кандидат философских наук"},
|
||||
{ORTH: "к.филос.н", NORM: "кандидат философских наук"},
|
||||
{ORTH: "к.х.н.", NORM: "кандидат химических наук"},
|
||||
{ORTH: "к.х.н", NORM: "кандидат химических наук"},
|
||||
{ORTH: "к.э.н.", NORM: "кандидат экономических наук"},
|
||||
{ORTH: "к.э.н", NORM: "кандидат экономических наук"},
|
||||
{ORTH: "к.ю.н.", NORM: "кандидат юридических наук"},
|
||||
{ORTH: "к.ю.н", NORM: "кандидат юридических наук"},
|
||||
{ORTH: "канд. архитектуры", NORM: "кандидат архитектуры"},
|
||||
{ORTH: "канд. биол. наук", NORM: "кандидат биологических наук"},
|
||||
{ORTH: "канд. ветеринар. наук", NORM: "кандидат ветеринарных наук"},
|
||||
{ORTH: "канд. воен. наук", NORM: "кандидат военных наук"},
|
||||
{ORTH: "канд. геогр. наук", NORM: "кандидат географических наук"},
|
||||
{ORTH: "канд. геол.-минерал. наук", NORM: "кандидат геолого-минералогических наук"},
|
||||
{ORTH: "канд. искусствоведения", NORM: "кандидат искусствоведения"},
|
||||
{ORTH: "канд. ист. наук", NORM: "кандидат исторических наук"},
|
||||
{ORTH: "к.ист.н.", NORM: "кандидат исторических наук"},
|
||||
{ORTH: "канд. культурологии", NORM: "кандидат культурологии"},
|
||||
{ORTH: "канд. мед. наук", NORM: "кандидат медицинских наук"},
|
||||
{ORTH: "канд. пед. наук", NORM: "кандидат педагогических наук"},
|
||||
{ORTH: "канд. полит. наук", NORM: "кандидат политических наук"},
|
||||
{ORTH: "канд. психол. наук", NORM: "кандидат психологических наук"},
|
||||
{ORTH: "канд. с.-х. наук", NORM: "кандидат сельскохозяйственных наук"},
|
||||
{ORTH: "канд. социол. наук", NORM: "кандидат социологических наук"},
|
||||
{ORTH: "к.соц.наук", NORM: "кандидат социологических наук"},
|
||||
{ORTH: "к.соц.н.", NORM: "кандидат социологических наук"},
|
||||
{ORTH: "к.соц.н", NORM: "кандидат социологических наук"},
|
||||
{ORTH: "канд. техн. наук", NORM: "кандидат технических наук"},
|
||||
{ORTH: "канд. фармацевт. наук", NORM: "кандидат фармацевтических наук"},
|
||||
{ORTH: "канд. физ.-мат. наук", NORM: "кандидат физико-математических наук"},
|
||||
{ORTH: "канд. филол. наук", NORM: "кандидат филологических наук"},
|
||||
{ORTH: "канд. филос. наук", NORM: "кандидат философских наук"},
|
||||
{ORTH: "канд. хим. наук", NORM: "кандидат химических наук"},
|
||||
{ORTH: "канд. экон. наук", NORM: "кандидат экономических наук"},
|
||||
{ORTH: "канд. юрид. наук", NORM: "кандидат юридических наук"},
|
||||
{ORTH: "в.н.с.", NORM: "ведущий научный сотрудник"},
|
||||
{ORTH: "мл. науч. сотр.", NORM: "младший научный сотрудник"},
|
||||
{ORTH: "м.н.с.", NORM: "младший научный сотрудник"},
|
||||
{ORTH: "проф.", NORM: "профессор"},
|
||||
{ORTH: "профессор.кафедры", NORM: "профессор кафедры"},
|
||||
{ORTH: "ст. науч. сотр.", NORM: "старший научный сотрудник"},
|
||||
{ORTH: "чл.-к.", NORM: "член корреспондент"},
|
||||
{ORTH: "чл.-корр.", NORM: "член-корреспондент"},
|
||||
{ORTH: "чл.-кор.", NORM: "член-корреспондент"},
|
||||
{ORTH: "дир.", NORM: "директор"},
|
||||
{ORTH: "зам. дир.", NORM: "заместитель директора"},
|
||||
{ORTH: "зав. каф.", NORM: "заведующий кафедрой"},
|
||||
{ORTH: "зав.кафедрой", NORM: "заведующий кафедрой"},
|
||||
{ORTH: "зав. кафедрой", NORM: "заведующий кафедрой"},
|
||||
{ORTH: "асп.", NORM: "аспирант"},
|
||||
{ORTH: "гл. науч. сотр.", NORM: "главный научный сотрудник"},
|
||||
{ORTH: "вед. науч. сотр.", NORM: "ведущий научный сотрудник"},
|
||||
{ORTH: "науч. сотр.", NORM: "научный сотрудник"},
|
||||
{ORTH: "к.м.с.", NORM: "кандидат в мастера спорта"},
|
||||
]:
|
||||
_exc[abbr[ORTH]] = [abbr]
|
||||
|
||||
|
||||
for abbr in [
|
||||
# Literary phrases abbreviations
|
||||
{ORTH: "и т.д.", NORM: "и так далее"},
|
||||
{ORTH: "и т.п.", NORM: "и тому подобное"},
|
||||
{ORTH: "т.д.", NORM: "так далее"},
|
||||
{ORTH: "т.п.", NORM: "тому подобное"},
|
||||
{ORTH: "т.е.", NORM: "то есть"},
|
||||
{ORTH: "т.к.", NORM: "так как"},
|
||||
{ORTH: "в т.ч.", NORM: "в том числе"},
|
||||
{ORTH: "и пр.", NORM: "и прочие"},
|
||||
{ORTH: "и др.", NORM: "и другие"},
|
||||
{ORTH: "т.н.", NORM: "так называемый"},
|
||||
]:
|
||||
_exc[abbr[ORTH]] = [abbr]
|
||||
|
||||
|
||||
for abbr in [
|
||||
# Appeal to a person abbreviations
|
||||
{ORTH: "г-н", NORM: "господин"},
|
||||
{ORTH: "г-да", NORM: "господа"},
|
||||
{ORTH: "г-жа", NORM: "госпожа"},
|
||||
{ORTH: "тов.", NORM: "товарищ"},
|
||||
]:
|
||||
_exc[abbr[ORTH]] = [abbr]
|
||||
|
||||
|
||||
for abbr in [
|
||||
# Time periods abbreviations
|
||||
{ORTH: "до н.э.", NORM: "до нашей эры"},
|
||||
{ORTH: "по н.в.", NORM: "по настоящее время"},
|
||||
{ORTH: "в н.в.", NORM: "в настоящее время"},
|
||||
{ORTH: "наст.", NORM: "настоящий"},
|
||||
{ORTH: "наст. время", NORM: "настоящее время"},
|
||||
{ORTH: "г.г.", NORM: "годы"},
|
||||
{ORTH: "гг.", NORM: "годы"},
|
||||
{ORTH: "т.г.", NORM: "текущий год"},
|
||||
]:
|
||||
_exc[abbr[ORTH]] = [abbr]
|
||||
|
||||
|
||||
for abbr in [
|
||||
# Address forming elements abbreviations
|
||||
{ORTH: "респ.", NORM: "республика"},
|
||||
{ORTH: "обл.", NORM: "область"},
|
||||
{ORTH: "г.ф.з.", NORM: "город федерального значения"},
|
||||
{ORTH: "а.обл.", NORM: "автономная область"},
|
||||
{ORTH: "а.окр.", NORM: "автономный округ"},
|
||||
{ORTH: "м.р-н", NORM: "муниципальный район"},
|
||||
{ORTH: "г.о.", NORM: "городской округ"},
|
||||
{ORTH: "г.п.", NORM: "городское поселение"},
|
||||
{ORTH: "с.п.", NORM: "сельское поселение"},
|
||||
{ORTH: "вн.р-н", NORM: "внутригородской район"},
|
||||
{ORTH: "вн.тер.г.", NORM: "внутригородская территория города"},
|
||||
{ORTH: "пос.", NORM: "поселение"},
|
||||
{ORTH: "р-н", NORM: "район"},
|
||||
{ORTH: "с/с", NORM: "сельсовет"},
|
||||
{ORTH: "г.", NORM: "город"},
|
||||
{ORTH: "п.г.т.", NORM: "поселок городского типа"},
|
||||
{ORTH: "пгт.", NORM: "поселок городского типа"},
|
||||
{ORTH: "р.п.", NORM: "рабочий поселок"},
|
||||
{ORTH: "рп.", NORM: "рабочий поселок"},
|
||||
{ORTH: "кп.", NORM: "курортный поселок"},
|
||||
{ORTH: "гп.", NORM: "городской поселок"},
|
||||
{ORTH: "п.", NORM: "поселок"},
|
||||
{ORTH: "в-ки", NORM: "выселки"},
|
||||
{ORTH: "г-к", NORM: "городок"},
|
||||
{ORTH: "з-ка", NORM: "заимка"},
|
||||
{ORTH: "п-к", NORM: "починок"},
|
||||
{ORTH: "киш.", NORM: "кишлак"},
|
||||
{ORTH: "п. ст. ", NORM: "поселок станция"},
|
||||
{ORTH: "п. ж/д ст. ", NORM: "поселок при железнодорожной станции"},
|
||||
{ORTH: "ж/д бл-ст", NORM: "железнодорожный блокпост"},
|
||||
{ORTH: "ж/д б-ка", NORM: "железнодорожная будка"},
|
||||
{ORTH: "ж/д в-ка", NORM: "железнодорожная ветка"},
|
||||
{ORTH: "ж/д к-ма", NORM: "железнодорожная казарма"},
|
||||
{ORTH: "ж/д к-т", NORM: "железнодорожный комбинат"},
|
||||
{ORTH: "ж/д пл-ма", NORM: "железнодорожная платформа"},
|
||||
{ORTH: "ж/д пл-ка", NORM: "железнодорожная площадка"},
|
||||
{ORTH: "ж/д п.п.", NORM: "железнодорожный путевой пост"},
|
||||
{ORTH: "ж/д о.п.", NORM: "железнодорожный остановочный пункт"},
|
||||
{ORTH: "ж/д рзд.", NORM: "железнодорожный разъезд"},
|
||||
{ORTH: "ж/д ст. ", NORM: "железнодорожная станция"},
|
||||
{ORTH: "м-ко", NORM: "местечко"},
|
||||
{ORTH: "д.", NORM: "деревня"},
|
||||
{ORTH: "с.", NORM: "село"},
|
||||
{ORTH: "сл.", NORM: "слобода"},
|
||||
{ORTH: "ст. ", NORM: "станция"},
|
||||
{ORTH: "ст-ца", NORM: "станица"},
|
||||
{ORTH: "у.", NORM: "улус"},
|
||||
{ORTH: "х.", NORM: "хутор"},
|
||||
{ORTH: "рзд.", NORM: "разъезд"},
|
||||
{ORTH: "зим.", NORM: "зимовье"},
|
||||
{ORTH: "б-г", NORM: "берег"},
|
||||
{ORTH: "ж/р", NORM: "жилой район"},
|
||||
{ORTH: "кв-л", NORM: "квартал"},
|
||||
{ORTH: "мкр.", NORM: "микрорайон"},
|
||||
{ORTH: "ост-в", NORM: "остров"},
|
||||
{ORTH: "платф.", NORM: "платформа"},
|
||||
{ORTH: "п/р", NORM: "промышленный район"},
|
||||
{ORTH: "р-н", NORM: "район"},
|
||||
{ORTH: "тер.", NORM: "территория"},
|
||||
{
|
||||
ORTH: "тер. СНО",
|
||||
NORM: "территория садоводческих некоммерческих объединений граждан",
|
||||
},
|
||||
{
|
||||
ORTH: "тер. ОНО",
|
||||
NORM: "территория огороднических некоммерческих объединений граждан",
|
||||
},
|
||||
{ORTH: "тер. ДНО", NORM: "территория дачных некоммерческих объединений граждан"},
|
||||
{ORTH: "тер. СНТ", NORM: "территория садоводческих некоммерческих товариществ"},
|
||||
{ORTH: "тер. ОНТ", NORM: "территория огороднических некоммерческих товариществ"},
|
||||
{ORTH: "тер. ДНТ", NORM: "территория дачных некоммерческих товариществ"},
|
||||
{ORTH: "тер. СПК", NORM: "территория садоводческих потребительских кооперативов"},
|
||||
{ORTH: "тер. ОПК", NORM: "территория огороднических потребительских кооперативов"},
|
||||
{ORTH: "тер. ДПК", NORM: "территория дачных потребительских кооперативов"},
|
||||
{ORTH: "тер. СНП", NORM: "территория садоводческих некоммерческих партнерств"},
|
||||
{ORTH: "тер. ОНП", NORM: "территория огороднических некоммерческих партнерств"},
|
||||
{ORTH: "тер. ДНП", NORM: "территория дачных некоммерческих партнерств"},
|
||||
{ORTH: "тер. ТСН", NORM: "территория товарищества собственников недвижимости"},
|
||||
{ORTH: "тер. ГСК", NORM: "территория гаражно-строительного кооператива"},
|
||||
{ORTH: "ус.", NORM: "усадьба"},
|
||||
{ORTH: "тер.ф.х.", NORM: "территория фермерского хозяйства"},
|
||||
{ORTH: "ю.", NORM: "юрты"},
|
||||
{ORTH: "ал.", NORM: "аллея"},
|
||||
{ORTH: "б-р", NORM: "бульвар"},
|
||||
{ORTH: "взв.", NORM: "взвоз"},
|
||||
{ORTH: "взд.", NORM: "въезд"},
|
||||
{ORTH: "дор.", NORM: "дорога"},
|
||||
{ORTH: "ззд.", NORM: "заезд"},
|
||||
{ORTH: "км", NORM: "километр"},
|
||||
{ORTH: "к-цо", NORM: "кольцо"},
|
||||
{ORTH: "лн.", NORM: "линия"},
|
||||
{ORTH: "мгстр.", NORM: "магистраль"},
|
||||
{ORTH: "наб.", NORM: "набережная"},
|
||||
{ORTH: "пер-д", NORM: "переезд"},
|
||||
{ORTH: "пер.", NORM: "переулок"},
|
||||
{ORTH: "пл-ка", NORM: "площадка"},
|
||||
{ORTH: "пл.", NORM: "площадь"},
|
||||
{ORTH: "пр-д", NORM: "проезд"},
|
||||
{ORTH: "пр-к", NORM: "просек"},
|
||||
{ORTH: "пр-ка", NORM: "просека"},
|
||||
{ORTH: "пр-лок", NORM: "проселок"},
|
||||
{ORTH: "пр-кт", NORM: "проспект"},
|
||||
{ORTH: "проул.", NORM: "проулок"},
|
||||
{ORTH: "рзд.", NORM: "разъезд"},
|
||||
{ORTH: "ряд", NORM: "ряд(ы)"},
|
||||
{ORTH: "с-р", NORM: "сквер"},
|
||||
{ORTH: "с-к", NORM: "спуск"},
|
||||
{ORTH: "сзд.", NORM: "съезд"},
|
||||
{ORTH: "туп.", NORM: "тупик"},
|
||||
{ORTH: "ул.", NORM: "улица"},
|
||||
{ORTH: "ш.", NORM: "шоссе"},
|
||||
{ORTH: "влд.", NORM: "владение"},
|
||||
{ORTH: "г-ж", NORM: "гараж"},
|
||||
{ORTH: "д.", NORM: "дом"},
|
||||
{ORTH: "двлд.", NORM: "домовладение"},
|
||||
{ORTH: "зд.", NORM: "здание"},
|
||||
{ORTH: "з/у", NORM: "земельный участок"},
|
||||
{ORTH: "кв.", NORM: "квартира"},
|
||||
{ORTH: "ком.", NORM: "комната"},
|
||||
{ORTH: "подв.", NORM: "подвал"},
|
||||
{ORTH: "кот.", NORM: "котельная"},
|
||||
{ORTH: "п-б", NORM: "погреб"},
|
||||
{ORTH: "к.", NORM: "корпус"},
|
||||
{ORTH: "ОНС", NORM: "объект незавершенного строительства"},
|
||||
{ORTH: "оф.", NORM: "офис"},
|
||||
{ORTH: "пав.", NORM: "павильон"},
|
||||
{ORTH: "помещ.", NORM: "помещение"},
|
||||
{ORTH: "раб.уч.", NORM: "рабочий участок"},
|
||||
{ORTH: "скл.", NORM: "склад"},
|
||||
{ORTH: "coop.", NORM: "сооружение"},
|
||||
{ORTH: "стр.", NORM: "строение"},
|
||||
{ORTH: "торг.зал", NORM: "торговый зал"},
|
||||
{ORTH: "а/п", NORM: "аэропорт"},
|
||||
{ORTH: "им.", NORM: "имени"},
|
||||
]:
|
||||
_exc[abbr[ORTH]] = [abbr]
|
||||
|
||||
|
||||
for abbr in [
|
||||
# Others abbreviations
|
||||
{ORTH: "тыс.руб.", NORM: "тысяч рублей"},
|
||||
{ORTH: "тыс.", NORM: "тысяч"},
|
||||
{ORTH: "руб.", NORM: "рубль"},
|
||||
{ORTH: "долл.", NORM: "доллар"},
|
||||
{ORTH: "прим.", NORM: "примечание"},
|
||||
{ORTH: "прим.ред.", NORM: "примечание редакции"},
|
||||
{ORTH: "см. также", NORM: "смотри также"},
|
||||
{ORTH: "кв.м.", NORM: "квадрантный метр"},
|
||||
{ORTH: "м2", NORM: "квадрантный метр"},
|
||||
{ORTH: "б/у", NORM: "бывший в употреблении"},
|
||||
{ORTH: "сокр.", NORM: "сокращение"},
|
||||
{ORTH: "чел.", NORM: "человек"},
|
||||
{ORTH: "б.п.", NORM: "базисный пункт"},
|
||||
]:
|
||||
_exc[abbr[ORTH]] = [abbr]
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
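As a brief illustrative aside (not part of the diff): each entry added to _exc keeps the abbreviation as a single token, with the expansion exposed via token.norm_. A minimal sketch, assuming a spaCy build that includes these exceptions:

import spacy

nlp = spacy.blank("ru")
doc = nlp("проф. Иванов, т.е. профессор")
# Single-token exceptions such as "проф." and "т.е." should surface their
# NORM values ("профессор", "то есть") on token.norm_.
print([(t.text, t.norm_) for t in doc])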
|
||||
|
|
18
spacy/lang/sl/examples.py
Normal file

@ -0,0 +1,18 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.sl.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
"Apple načrtuje nakup britanskega startupa za 1 bilijon dolarjev",
"France Prešeren je umrl 8. februarja 1849 v Kranju",
"Staro ljubljansko letališče Moste bo obnovila družba BTC",
"London je največje mesto v Združenem kraljestvu.",
"Kje se skrivaš?",
"Kdo je predsednik Francije?",
"Katero je glavno mesto Združenih držav Amerike?",
"Kdaj je bil rojen Milan Kučan?",
]
|
@ -1,13 +1,10 @@
|
|||
# Source: https://github.com/stopwords-iso/stopwords-sl
|
||||
# TODO: probably needs to be tidied up – the list seems to have month names in
|
||||
# it, which shouldn't be considered stop words.
|
||||
# Removed various words that are not normally considered stop words, such as months.
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
a
|
||||
ali
|
||||
april
|
||||
avgust
|
||||
b
|
||||
bi
|
||||
bil
|
||||
|
@ -19,7 +16,6 @@ biti
|
|||
blizu
|
||||
bo
|
||||
bodo
|
||||
bojo
|
||||
bolj
|
||||
bom
|
||||
bomo
|
||||
|
@ -37,16 +33,6 @@ da
|
|||
daleč
|
||||
dan
|
||||
danes
|
||||
datum
|
||||
december
|
||||
deset
|
||||
deseta
|
||||
deseti
|
||||
deseto
|
||||
devet
|
||||
deveta
|
||||
deveti
|
||||
deveto
|
||||
do
|
||||
dober
|
||||
dobra
|
||||
|
@ -54,16 +40,7 @@ dobri
|
|||
dobro
|
||||
dokler
|
||||
dol
|
||||
dolg
|
||||
dolga
|
||||
dolgi
|
||||
dovolj
|
||||
drug
|
||||
druga
|
||||
drugi
|
||||
drugo
|
||||
dva
|
||||
dve
|
||||
e
|
||||
eden
|
||||
en
|
||||
|
@ -74,7 +51,6 @@ enkrat
|
|||
eno
|
||||
etc.
|
||||
f
|
||||
februar
|
||||
g
|
||||
g.
|
||||
ga
|
||||
|
@ -93,16 +69,12 @@ iv
|
|||
ix
|
||||
iz
|
||||
j
|
||||
januar
|
||||
jaz
|
||||
je
|
||||
ji
|
||||
jih
|
||||
jim
|
||||
jo
|
||||
julij
|
||||
junij
|
||||
jutri
|
||||
k
|
||||
kadarkoli
|
||||
kaj
|
||||
|
@ -123,41 +95,23 @@ kje
|
|||
kjer
|
||||
kjerkoli
|
||||
ko
|
||||
koder
|
||||
koderkoli
|
||||
koga
|
||||
komu
|
||||
kot
|
||||
kratek
|
||||
kratka
|
||||
kratke
|
||||
kratki
|
||||
l
|
||||
lahka
|
||||
lahke
|
||||
lahki
|
||||
lahko
|
||||
le
|
||||
lep
|
||||
lepa
|
||||
lepe
|
||||
lepi
|
||||
lepo
|
||||
leto
|
||||
m
|
||||
maj
|
||||
majhen
|
||||
majhna
|
||||
majhni
|
||||
malce
|
||||
malo
|
||||
manj
|
||||
marec
|
||||
me
|
||||
med
|
||||
medtem
|
||||
mene
|
||||
mesec
|
||||
mi
|
||||
midva
|
||||
midve
|
||||
|
@ -183,7 +137,6 @@ najmanj
|
|||
naju
|
||||
največ
|
||||
nam
|
||||
narobe
|
||||
nas
|
||||
nato
|
||||
nazaj
|
||||
|
@ -192,7 +145,6 @@ naša
|
|||
naše
|
||||
ne
|
||||
nedavno
|
||||
nedelja
|
||||
nek
|
||||
neka
|
||||
nekaj
|
||||
|
@ -236,7 +188,6 @@ njuna
|
|||
njuno
|
||||
no
|
||||
nocoj
|
||||
november
|
||||
npr.
|
||||
o
|
||||
ob
|
||||
|
@ -244,51 +195,23 @@ oba
|
|||
obe
|
||||
oboje
|
||||
od
|
||||
odprt
|
||||
odprta
|
||||
odprti
|
||||
okoli
|
||||
oktober
|
||||
on
|
||||
onadva
|
||||
one
|
||||
oni
|
||||
onidve
|
||||
osem
|
||||
osma
|
||||
osmi
|
||||
osmo
|
||||
oz.
|
||||
p
|
||||
pa
|
||||
pet
|
||||
peta
|
||||
petek
|
||||
peti
|
||||
peto
|
||||
po
|
||||
pod
|
||||
pogosto
|
||||
poleg
|
||||
poln
|
||||
polna
|
||||
polni
|
||||
polno
|
||||
ponavadi
|
||||
ponedeljek
|
||||
ponovno
|
||||
potem
|
||||
povsod
|
||||
pozdravljen
|
||||
pozdravljeni
|
||||
prav
|
||||
prava
|
||||
prave
|
||||
pravi
|
||||
pravo
|
||||
prazen
|
||||
prazna
|
||||
prazno
|
||||
prbl.
|
||||
precej
|
||||
pred
|
||||
|
@ -297,19 +220,10 @@ preko
|
|||
pri
|
||||
pribl.
|
||||
približno
|
||||
primer
|
||||
pripravljen
|
||||
pripravljena
|
||||
pripravljeni
|
||||
proti
|
||||
prva
|
||||
prvi
|
||||
prvo
|
||||
r
|
||||
ravno
|
||||
redko
|
||||
res
|
||||
reč
|
||||
s
|
||||
saj
|
||||
sam
|
||||
|
@ -321,29 +235,17 @@ se
|
|||
sebe
|
||||
sebi
|
||||
sedaj
|
||||
sedem
|
||||
sedma
|
||||
sedmi
|
||||
sedmo
|
||||
sem
|
||||
september
|
||||
seveda
|
||||
si
|
||||
sicer
|
||||
skoraj
|
||||
skozi
|
||||
slab
|
||||
smo
|
||||
so
|
||||
sobota
|
||||
spet
|
||||
sreda
|
||||
srednja
|
||||
srednji
|
||||
sta
|
||||
ste
|
||||
stran
|
||||
stvar
|
||||
sva
|
||||
t
|
||||
ta
|
||||
|
@ -358,10 +260,6 @@ te
|
|||
tebe
|
||||
tebi
|
||||
tega
|
||||
težak
|
||||
težka
|
||||
težki
|
||||
težko
|
||||
ti
|
||||
tista
|
||||
tiste
|
||||
|
@ -371,11 +269,6 @@ tj.
|
|||
tja
|
||||
to
|
||||
toda
|
||||
torek
|
||||
tretja
|
||||
tretje
|
||||
tretji
|
||||
tri
|
||||
tu
|
||||
tudi
|
||||
tukaj
|
||||
|
@ -392,10 +285,6 @@ vaša
|
|||
vaše
|
||||
ve
|
||||
vedno
|
||||
velik
|
||||
velika
|
||||
veliki
|
||||
veliko
|
||||
vendar
|
||||
ves
|
||||
več
|
||||
|
@ -403,10 +292,6 @@ vi
|
|||
vidva
|
||||
vii
|
||||
viii
|
||||
visok
|
||||
visoka
|
||||
visoke
|
||||
visoki
|
||||
vsa
|
||||
vsaj
|
||||
vsak
|
||||
|
@ -420,34 +305,21 @@ vsega
|
|||
vsi
|
||||
vso
|
||||
včasih
|
||||
včeraj
|
||||
x
|
||||
z
|
||||
za
|
||||
zadaj
|
||||
zadnji
|
||||
zakaj
|
||||
zaprta
|
||||
zaprti
|
||||
zaprto
|
||||
zdaj
|
||||
zelo
|
||||
zunaj
|
||||
č
|
||||
če
|
||||
često
|
||||
četrta
|
||||
četrtek
|
||||
četrti
|
||||
četrto
|
||||
čez
|
||||
čigav
|
||||
š
|
||||
šest
|
||||
šesta
|
||||
šesti
|
||||
šesto
|
||||
štiri
|
||||
ž
|
||||
že
|
||||
""".split()
|
||||
|
|
|
@ -53,7 +53,7 @@ _ordinal_words = [
|
|||
"doksanıncı",
|
||||
"yüzüncü",
|
||||
"bininci",
|
||||
"mliyonuncu",
|
||||
"milyonuncu",
|
||||
"milyarıncı",
|
||||
"trilyonuncu",
|
||||
"katrilyonuncu",
|
||||
|
|
|
@ -6,19 +6,30 @@ from ...util import update_exc
|
|||
_exc = {}
|
||||
|
||||
for exc_data in [
|
||||
{ORTH: "обл.", NORM: "область"},
|
||||
{ORTH: "р-н.", NORM: "район"},
|
||||
{ORTH: "р-н", NORM: "район"},
|
||||
{ORTH: "м.", NORM: "місто"},
|
||||
{ORTH: "вул.", NORM: "вулиця"},
|
||||
{ORTH: "ім.", NORM: "імені"},
|
||||
{ORTH: "просп.", NORM: "проспект"},
|
||||
{ORTH: "пр-кт", NORM: "проспект"},
|
||||
{ORTH: "бул.", NORM: "бульвар"},
|
||||
{ORTH: "пров.", NORM: "провулок"},
|
||||
{ORTH: "пл.", NORM: "площа"},
|
||||
{ORTH: "майд.", NORM: "майдан"},
|
||||
{ORTH: "мкр.", NORM: "мікрорайон"},
|
||||
{ORTH: "ст.", NORM: "станція"},
|
||||
{ORTH: "ж/м", NORM: "житловий масив"},
|
||||
{ORTH: "наб.", NORM: "набережна"},
|
||||
{ORTH: "в/ч", NORM: "військова частина"},
|
||||
{ORTH: "в/м", NORM: "військове містечко"},
|
||||
{ORTH: "оз.", NORM: "озеро"},
|
||||
{ORTH: "ім.", NORM: "імені"},
|
||||
{ORTH: "г.", NORM: "гора"},
|
||||
{ORTH: "п.", NORM: "пан"},
|
||||
{ORTH: "м.", NORM: "місто"},
|
||||
{ORTH: "проф.", NORM: "професор"},
|
||||
{ORTH: "акад.", NORM: "академік"},
|
||||
{ORTH: "доц.", NORM: "доцент"},
|
||||
{ORTH: "оз.", NORM: "озеро"},
|
||||
]:
|
||||
_exc[exc_data[ORTH]] = [exc_data]
|
||||
|
||||
|
|
|
@ -131,7 +131,7 @@ class Language:
|
|||
self,
|
||||
vocab: Union[Vocab, bool] = True,
|
||||
*,
|
||||
max_length: int = 10 ** 6,
|
||||
max_length: int = 10**6,
|
||||
meta: Dict[str, Any] = {},
|
||||
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
|
||||
batch_size: int = 1000,
|
||||
|
@ -354,12 +354,15 @@ class Language:
|
|||
@property
|
||||
def pipe_labels(self) -> Dict[str, List[str]]:
|
||||
"""Get the labels set by the pipeline components, if available (if
|
||||
the component exposes a labels property).
|
||||
the component exposes a labels property and the labels are not
|
||||
hidden).
|
||||
|
||||
RETURNS (Dict[str, List[str]]): Labels keyed by component name.
|
||||
"""
|
||||
labels = {}
|
||||
for name, pipe in self._components:
|
||||
if hasattr(pipe, "hide_labels") and pipe.hide_labels is True:
|
||||
continue
|
||||
if hasattr(pipe, "labels"):
|
||||
labels[name] = list(pipe.labels)
|
||||
return SimpleFrozenDict(labels)
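A toy sketch of the behaviour added here, using duck-typed stand-ins rather than real pipeline components (names are illustrative only):

class Tagger:
    labels = ("NOUN", "VERB")

class Internal:
    labels = ("X",)
    hide_labels = True

components = [("tagger", Tagger()), ("internal", Internal())]
labels = {}
for name, pipe in components:
    if getattr(pipe, "hide_labels", False) is True:
        continue  # hidden labels are skipped, as in pipe_labels above
    if hasattr(pipe, "labels"):
        labels[name] = list(pipe.labels)
print(labels)  # {'tagger': ['NOUN', 'VERB']}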
|
||||
|
@ -1219,8 +1222,9 @@ class Language:
|
|||
component_cfg = {}
|
||||
grads = {}
|
||||
|
||||
def get_grads(W, dW, key=None):
|
||||
def get_grads(key, W, dW):
|
||||
grads[key] = (W, dW)
|
||||
return W, dW
|
||||
|
||||
get_grads.learn_rate = sgd.learn_rate # type: ignore[attr-defined, union-attr]
|
||||
get_grads.b1 = sgd.b1 # type: ignore[attr-defined, union-attr]
|
||||
|
@ -1233,7 +1237,7 @@ class Language:
|
|||
examples, sgd=get_grads, losses=losses, **component_cfg.get(name, {})
|
||||
)
|
||||
for key, (W, dW) in grads.items():
|
||||
sgd(W, dW, key=key) # type: ignore[call-arg, misc]
|
||||
sgd(key, W, dW) # type: ignore[call-arg, misc]
|
||||
return losses
|
||||
|
||||
def begin_training(
|
||||
|
|
|
@ -244,6 +244,10 @@ cdef class Matcher:
|
|||
pipe = "parser"
|
||||
error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
|
||||
raise ValueError(error_msg)
|
||||
|
||||
if self.patterns.empty():
|
||||
matches = []
|
||||
else:
|
||||
matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
|
||||
extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments)
|
||||
final_matches = []
|
||||
|
|
|
@ -14,7 +14,7 @@ class PhraseMatcher:
|
|||
def add(
|
||||
self,
|
||||
key: str,
|
||||
docs: List[List[Dict[str, Any]]],
|
||||
docs: List[Doc],
|
||||
*,
|
||||
on_match: Optional[
|
||||
Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
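Usage consistent with the corrected annotation: the patterns passed to PhraseMatcher.add are Doc objects, typically created with nlp.make_doc.

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
matcher.add("OBAMA", [nlp.make_doc("Barack Obama")])
doc = nlp("Barack Obama was the 44th president.")
print(matcher(doc))  # a list of (match_id, start, end) tuples, here one match over tokens 0-2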
|
||||
|
|
|
@ -63,4 +63,4 @@ def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:
|
|||
|
||||
|
||||
def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:
|
||||
return (Ragged(to_numpy(spans.dataXd), to_numpy(spans.lengths)), to_numpy(lengths))
|
||||
return Ragged(to_numpy(spans.dataXd), to_numpy(spans.lengths)), to_numpy(lengths)
|
||||
|
|
|
@ -1,34 +1,82 @@
|
|||
from pathlib import Path
|
||||
from typing import Optional, Callable, Iterable, List
|
||||
from typing import Optional, Callable, Iterable, List, Tuple
|
||||
from thinc.types import Floats2d
|
||||
from thinc.api import chain, clone, list2ragged, reduce_mean, residual
|
||||
from thinc.api import Model, Maxout, Linear
|
||||
from thinc.api import Model, Maxout, Linear, noop, tuplify, Ragged
|
||||
|
||||
from ...util import registry
|
||||
from ...kb import KnowledgeBase, Candidate, get_candidates
|
||||
from ...vocab import Vocab
|
||||
from ...tokens import Span, Doc
|
||||
from ..extract_spans import extract_spans
|
||||
from ...errors import Errors
|
||||
|
||||
|
||||
@registry.architectures("spacy.EntityLinker.v1")
|
||||
@registry.architectures("spacy.EntityLinker.v2")
|
||||
def build_nel_encoder(
|
||||
tok2vec: Model, nO: Optional[int] = None
|
||||
) -> Model[List[Doc], Floats2d]:
|
||||
with Model.define_operators({">>": chain, "**": clone}):
|
||||
with Model.define_operators({">>": chain, "&": tuplify}):
|
||||
token_width = tok2vec.maybe_get_dim("nO")
|
||||
output_layer = Linear(nO=nO, nI=token_width)
|
||||
model = (
|
||||
tok2vec
|
||||
>> list2ragged()
|
||||
((tok2vec >> list2ragged()) & build_span_maker())
|
||||
>> extract_spans()
|
||||
>> reduce_mean()
|
||||
>> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0)) # type: ignore[arg-type]
|
||||
>> output_layer
|
||||
)
|
||||
model.set_ref("output_layer", output_layer)
|
||||
model.set_ref("tok2vec", tok2vec)
|
||||
# flag to show this isn't legacy
|
||||
model.attrs["include_span_maker"] = True
|
||||
return model
|
||||
|
||||
|
||||
def build_span_maker(n_sents: int = 0) -> Model:
|
||||
model: Model = Model("span_maker", forward=span_maker_forward)
|
||||
model.attrs["n_sents"] = n_sents
|
||||
return model
|
||||
|
||||
|
||||
def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callable]:
|
||||
ops = model.ops
|
||||
n_sents = model.attrs["n_sents"]
|
||||
candidates = []
|
||||
for doc in docs:
|
||||
cands = []
|
||||
try:
|
||||
sentences = [s for s in doc.sents]
|
||||
except ValueError:
|
||||
# no sentence info, normal in initialization
|
||||
for tok in doc:
|
||||
tok.is_sent_start = tok.i == 0
|
||||
sentences = [doc[:]]
|
||||
for ent in doc.ents:
|
||||
try:
|
||||
# find the sentence in the list of sentences.
|
||||
sent_index = sentences.index(ent.sent)
|
||||
except AttributeError:
|
||||
# Catch the exception when ent.sent is None and provide a user-friendly warning
|
||||
raise RuntimeError(Errors.E030) from None
|
||||
# get n previous sentences, if there are any
|
||||
start_sentence = max(0, sent_index - n_sents)
|
||||
# get n posterior sentences, or as many < n as there are
|
||||
end_sentence = min(len(sentences) - 1, sent_index + n_sents)
|
||||
# get token positions
|
||||
start_token = sentences[start_sentence].start
|
||||
end_token = sentences[end_sentence].end
|
||||
# save positions for extraction
|
||||
cands.append((start_token, end_token))
|
||||
|
||||
candidates.append(ops.asarray2i(cands))
|
||||
candlens = ops.asarray1i([len(cands) for cands in candidates])
|
||||
candidates = ops.xp.concatenate(candidates)
|
||||
outputs = Ragged(candidates, candlens)
|
||||
# because this is just rearranging docs, the backprop does nothing
|
||||
return outputs, lambda x: []
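A minimal plain-Python sketch of the windowing logic above, assuming sentences are given as (start, end) token offsets and each entity is identified by the index of its sentence:

def candidate_windows(sent_bounds, ent_sent_indices, n_sents=0):
    windows = []
    for sent_index in ent_sent_indices:
        # same window arithmetic as span_maker_forward
        start_sentence = max(0, sent_index - n_sents)
        end_sentence = min(len(sent_bounds) - 1, sent_index + n_sents)
        start_token = sent_bounds[start_sentence][0]
        end_token = sent_bounds[end_sentence][1]
        windows.append((start_token, end_token))
    return windows

# Two 5-token sentences, one entity in each; with n_sents=1 each window
# stretches over both sentences:
print(candidate_windows([(0, 5), (5, 10)], [0, 1], n_sents=1))  # [(0, 10), (0, 10)]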
|
||||
|
||||
|
||||
@registry.misc("spacy.KBFromFile.v1")
|
||||
def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
|
||||
def kb_from_file(vocab):
|
||||
|
|
|
@ -85,7 +85,7 @@ def get_characters_loss(ops, docs, prediction, nr_char):
|
|||
target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
|
||||
target = target.reshape((-1, 256 * nr_char))
|
||||
diff = prediction - target
|
||||
loss = (diff ** 2).sum()
|
||||
loss = (diff**2).sum()
|
||||
d_target = diff / float(prediction.shape[0])
|
||||
return loss, d_target
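A rough numpy sketch of the squared-error loss above, with nr_char=1 and hand-built one-hot targets (illustrative only, not the thinc code path):

import numpy

prediction = numpy.zeros((2, 256), dtype="f")  # two tokens, 256 character classes
target = numpy.zeros((2, 256), dtype="f")
target[0, ord("a")] = 1.0
target[1, ord("b")] = 1.0
diff = prediction - target
loss = (diff**2).sum()
d_target = diff / float(prediction.shape[0])
print(loss)  # 2.0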
|
||||
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
from typing import Optional, List
|
||||
from thinc.api import zero_init, with_array, Softmax, chain, Model
|
||||
from thinc.api import zero_init, with_array, Softmax_v2, chain, Model
|
||||
from thinc.types import Floats2d
|
||||
|
||||
from ...util import registry
|
||||
from ...tokens import Doc
|
||||
|
||||
|
||||
@registry.architectures("spacy.Tagger.v1")
|
||||
@registry.architectures("spacy.Tagger.v2")
|
||||
def build_tagger_model(
|
||||
tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None
|
||||
tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None, normalize=False
|
||||
) -> Model[List[Doc], List[Floats2d]]:
|
||||
"""Build a tagger model, using a provided token-to-vector component. The tagger
|
||||
model simply adds a linear layer with softmax activation to predict scores
|
||||
|
@ -19,7 +19,9 @@ def build_tagger_model(
|
|||
"""
|
||||
# TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
|
||||
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
||||
output_layer = Softmax(nO, t2v_width, init_W=zero_init)
|
||||
output_layer = Softmax_v2(
|
||||
nO, t2v_width, init_W=zero_init, normalize_outputs=normalize
|
||||
)
|
||||
softmax = with_array(output_layer) # type: ignore
|
||||
model = chain(tok2vec, softmax)
|
||||
model.set_ref("tok2vec", tok2vec)
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
from .attributeruler import AttributeRuler
|
||||
from .coref import CoreferenceResolver
|
||||
from .dep_parser import DependencyParser
|
||||
from .edit_tree_lemmatizer import EditTreeLemmatizer
|
||||
from .entity_linker import EntityLinker
|
||||
from .ner import EntityRecognizer
|
||||
from .entityruler import EntityRuler
|
||||
|
|
0
spacy/pipeline/_edit_tree_internals/__init__.py
Normal file
93
spacy/pipeline/_edit_tree_internals/edit_trees.pxd
Normal file
|
@ -0,0 +1,93 @@
|
|||
from libc.stdint cimport uint32_t, uint64_t
|
||||
from libcpp.unordered_map cimport unordered_map
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from ...typedefs cimport attr_t, hash_t, len_t
|
||||
from ...strings cimport StringStore
|
||||
|
||||
cdef extern from "<algorithm>" namespace "std" nogil:
|
||||
void swap[T](T& a, T& b) except + # Only available in Cython 3.
|
||||
|
||||
# An edit tree (Müller et al., 2015) is a tree structure that consists of
|
||||
# edit operations. The two types of operations are string matches
|
||||
# and string substitutions. Given an input string s and an output string t,
|
||||
# substitution and match nodes should be interpreted as follows:
|
||||
#
|
||||
# * Substitution node: consists of an original string and substitute string.
|
||||
# If s matches the original string, then t is the substitute. Otherwise,
|
||||
# the node does not apply.
|
||||
# * Match node: consists of a prefix length, suffix length, prefix edit tree,
|
||||
# and suffix edit tree. If s is composed of a prefix, middle part, and suffix
|
||||
# with the given suffix and prefix lengths, then t is the concatenation
|
||||
# prefix_tree(prefix) + middle + suffix_tree(suffix).
|
||||
#
|
||||
# For efficiency, we represent strings in substitution nodes as integers, with
|
||||
# the actual strings stored in a StringStore. Subtrees in match nodes are stored
|
||||
# as tree identifiers (rather than pointers) to simplify serialization.
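As a hypothetical plain-Python illustration of the two node types described above (not part of the diff): rewriting the Dutch form "gegooid" into "gooien" keeps the middle "gooi" and substitutes the affixes.

def apply_match(form, prefix_len, suffix_len, prefix_fn, suffix_fn):
    # match node: split into prefix, middle, suffix and recurse on the affixes
    middle = form[prefix_len:len(form) - suffix_len]
    return prefix_fn(form[:prefix_len]) + middle + suffix_fn(form[len(form) - suffix_len:])

def subst(orig, repl):
    # substitution node: only applies when the input equals the original string
    def fn(part):
        if part != orig:
            raise ValueError("tree does not apply")
        return repl
    return fn

print(apply_match("gegooid", 2, 1, subst("ge", ""), subst("d", "en")))  # gooien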
|
||||
|
||||
cdef uint32_t NULL_TREE_ID
|
||||
|
||||
cdef struct MatchNodeC:
|
||||
len_t prefix_len
|
||||
len_t suffix_len
|
||||
uint32_t prefix_tree
|
||||
uint32_t suffix_tree
|
||||
|
||||
cdef struct SubstNodeC:
|
||||
attr_t orig
|
||||
attr_t subst
|
||||
|
||||
cdef union NodeC:
|
||||
MatchNodeC match_node
|
||||
SubstNodeC subst_node
|
||||
|
||||
cdef struct EditTreeC:
|
||||
bint is_match_node
|
||||
NodeC inner
|
||||
|
||||
cdef inline EditTreeC edittree_new_match(len_t prefix_len, len_t suffix_len,
|
||||
uint32_t prefix_tree, uint32_t suffix_tree):
|
||||
cdef MatchNodeC match_node = MatchNodeC(prefix_len=prefix_len,
|
||||
suffix_len=suffix_len, prefix_tree=prefix_tree,
|
||||
suffix_tree=suffix_tree)
|
||||
cdef NodeC inner = NodeC(match_node=match_node)
|
||||
return EditTreeC(is_match_node=True, inner=inner)
|
||||
|
||||
cdef inline EditTreeC edittree_new_subst(attr_t orig, attr_t subst):
|
||||
cdef EditTreeC node
|
||||
cdef SubstNodeC subst_node = SubstNodeC(orig=orig, subst=subst)
|
||||
cdef NodeC inner = NodeC(subst_node=subst_node)
|
||||
return EditTreeC(is_match_node=False, inner=inner)
|
||||
|
||||
cdef inline uint64_t edittree_hash(EditTreeC tree):
|
||||
cdef MatchNodeC match_node
|
||||
cdef SubstNodeC subst_node
|
||||
|
||||
if tree.is_match_node:
|
||||
match_node = tree.inner.match_node
|
||||
return hash((match_node.prefix_len, match_node.suffix_len, match_node.prefix_tree, match_node.suffix_tree))
|
||||
else:
|
||||
subst_node = tree.inner.subst_node
|
||||
return hash((subst_node.orig, subst_node.subst))
|
||||
|
||||
cdef struct LCS:
|
||||
int source_begin
|
||||
int source_end
|
||||
int target_begin
|
||||
int target_end
|
||||
|
||||
cdef inline bint lcs_is_empty(LCS lcs):
|
||||
return lcs.source_begin == 0 and lcs.source_end == 0 and lcs.target_begin == 0 and lcs.target_end == 0
|
||||
|
||||
cdef class EditTrees:
|
||||
cdef vector[EditTreeC] trees
|
||||
cdef unordered_map[hash_t, uint32_t] map
|
||||
cdef StringStore strings
|
||||
|
||||
cpdef uint32_t add(self, str form, str lemma)
|
||||
cpdef str apply(self, uint32_t tree_id, str form)
|
||||
cpdef unicode tree_to_str(self, uint32_t tree_id)
|
||||
|
||||
cdef uint32_t _add(self, str form, str lemma)
|
||||
cdef _apply(self, uint32_t tree_id, str form_part, list lemma_pieces)
|
||||
cdef uint32_t _tree_id(self, EditTreeC tree)
|
305
spacy/pipeline/_edit_tree_internals/edit_trees.pyx
Normal file
|
@ -0,0 +1,305 @@
|
|||
# cython: infer_types=True, binding=True
|
||||
from cython.operator cimport dereference as deref
|
||||
from libc.stdint cimport uint32_t
|
||||
from libc.stdint cimport UINT32_MAX
|
||||
from libc.string cimport memset
|
||||
from libcpp.pair cimport pair
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from ...typedefs cimport hash_t
|
||||
|
||||
from ... import util
|
||||
from ...errors import Errors
|
||||
from ...strings import StringStore
|
||||
from .schemas import validate_edit_tree
|
||||
|
||||
|
||||
NULL_TREE_ID = UINT32_MAX
|
||||
|
||||
cdef LCS find_lcs(str source, str target):
|
||||
"""
|
||||
Find the longest common subsequence (LCS) between two strings. If there are
|
||||
multiple LCSes, only one of them is returned.
|
||||
|
||||
source (str): The first string.
|
||||
target (str): The second string.
|
||||
RETURNS (LCS): The spans of the longest common subsequences.
|
||||
"""
|
||||
cdef Py_ssize_t source_len = len(source)
|
||||
cdef Py_ssize_t target_len = len(target)
|
||||
cdef size_t longest_align = 0;
|
||||
cdef int source_idx, target_idx
|
||||
cdef LCS lcs
|
||||
cdef Py_UCS4 source_cp, target_cp
|
||||
|
||||
memset(&lcs, 0, sizeof(lcs))
|
||||
|
||||
cdef vector[size_t] prev_aligns = vector[size_t](target_len);
|
||||
cdef vector[size_t] cur_aligns = vector[size_t](target_len);
|
||||
|
||||
for (source_idx, source_cp) in enumerate(source):
|
||||
for (target_idx, target_cp) in enumerate(target):
|
||||
if source_cp == target_cp:
|
||||
if source_idx == 0 or target_idx == 0:
|
||||
cur_aligns[target_idx] = 1
|
||||
else:
|
||||
cur_aligns[target_idx] = prev_aligns[target_idx - 1] + 1
|
||||
|
||||
# Check if this is the longest alignment and replace previous
|
||||
# best alignment when this is the case.
|
||||
if cur_aligns[target_idx] > longest_align:
|
||||
longest_align = cur_aligns[target_idx]
|
||||
lcs.source_begin = source_idx - longest_align + 1
|
||||
lcs.source_end = source_idx + 1
|
||||
lcs.target_begin = target_idx - longest_align + 1
|
||||
lcs.target_end = target_idx + 1
|
||||
else:
|
||||
# No match, we start with a zero-length alignment.
|
||||
cur_aligns[target_idx] = 0
|
||||
swap(prev_aligns, cur_aligns)
|
||||
|
||||
return lcs
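For intuition, a rough plain-Python rendering of the alignment loop above (same variable roles, no Cython types); the example pair mirrors the "gegooid"/"gooien" case used later in this file.

def find_lcs_py(source, target):
    longest, best = 0, (0, 0, 0, 0)
    prev = [0] * len(target)
    for i, s in enumerate(source):
        cur = [0] * len(target)
        for j, t in enumerate(target):
            if s == t:
                cur[j] = 1 if i == 0 or j == 0 else prev[j - 1] + 1
                if cur[j] > longest:
                    longest = cur[j]
                    best = (i - longest + 1, i + 1, j - longest + 1, j + 1)
        prev = cur
    return best  # (source_begin, source_end, target_begin, target_end)

print(find_lcs_py("gegooid", "gooien"))  # (2, 6, 0, 4) -> the common span "gooi"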
|
||||
|
||||
cdef class EditTrees:
|
||||
"""Container for constructing and storing edit trees."""
|
||||
def __init__(self, strings: StringStore):
|
||||
"""Create a container for edit trees.
|
||||
|
||||
strings (StringStore): the string store to use."""
|
||||
self.strings = strings
|
||||
|
||||
cpdef uint32_t add(self, str form, str lemma):
|
||||
"""Add an edit tree that rewrites the given string into the given lemma.
|
||||
|
||||
RETURNS (int): identifier of the edit tree in the container.
|
||||
"""
|
||||
# Treat two empty strings as a special case. Generating an edit
|
||||
# tree for identical strings results in a match node. However,
|
||||
# since two empty strings have a zero-length LCS, a substitution
|
||||
# node would be created. Since we do not want to clutter the
|
||||
# recursive tree construction with logic for this case, handle
|
||||
# it in this wrapper method.
|
||||
if len(form) == 0 and len(lemma) == 0:
|
||||
tree = edittree_new_match(0, 0, NULL_TREE_ID, NULL_TREE_ID)
|
||||
return self._tree_id(tree)
|
||||
|
||||
return self._add(form, lemma)
|
||||
|
||||
cdef uint32_t _add(self, str form, str lemma):
|
||||
cdef LCS lcs = find_lcs(form, lemma)
|
||||
|
||||
cdef EditTreeC tree
|
||||
cdef uint32_t tree_id, prefix_tree, suffix_tree
|
||||
if lcs_is_empty(lcs):
|
||||
tree = edittree_new_subst(self.strings.add(form), self.strings.add(lemma))
|
||||
else:
|
||||
# If we have a non-empty LCS, such as "gooi" in "ge[gooi]d" and "[gooi]en",
|
||||
# create edit trees for the prefix pair ("ge"/"") and the suffix pair ("d"/"en").
|
||||
prefix_tree = NULL_TREE_ID
|
||||
if lcs.source_begin != 0 or lcs.target_begin != 0:
|
||||
prefix_tree = self.add(form[:lcs.source_begin], lemma[:lcs.target_begin])
|
||||
|
||||
suffix_tree = NULL_TREE_ID
|
||||
if lcs.source_end != len(form) or lcs.target_end != len(lemma):
|
||||
suffix_tree = self.add(form[lcs.source_end:], lemma[lcs.target_end:])
|
||||
|
||||
tree = edittree_new_match(lcs.source_begin, len(form) - lcs.source_end, prefix_tree, suffix_tree)
|
||||
|
||||
return self._tree_id(tree)
|
||||
|
||||
cdef uint32_t _tree_id(self, EditTreeC tree):
|
||||
# If this tree has been constructed before, return its identifier.
|
||||
cdef hash_t hash = edittree_hash(tree)
|
||||
cdef unordered_map[hash_t, uint32_t].iterator iter = self.map.find(hash)
|
||||
if iter != self.map.end():
|
||||
return deref(iter).second
|
||||
|
||||
# The tree hasn't been seen before, store it.
|
||||
cdef uint32_t tree_id = self.trees.size()
|
||||
self.trees.push_back(tree)
|
||||
self.map.insert(pair[hash_t, uint32_t](hash, tree_id))
|
||||
|
||||
return tree_id
|
||||
|
||||
cpdef str apply(self, uint32_t tree_id, str form):
|
||||
"""Apply an edit tree to a form.
|
||||
|
||||
tree_id (uint32_t): the identifier of the edit tree to apply.
|
||||
form (str): the form to apply the edit tree to.
|
||||
RETURNS (str): the transformed form or None if the edit tree
|
||||
could not be applied to the form.
|
||||
"""
|
||||
if tree_id >= self.trees.size():
|
||||
raise IndexError("Edit tree identifier out of range")
|
||||
|
||||
lemma_pieces = []
|
||||
try:
|
||||
self._apply(tree_id, form, lemma_pieces)
|
||||
except ValueError:
|
||||
return None
|
||||
return "".join(lemma_pieces)
|
||||
|
||||
cdef _apply(self, uint32_t tree_id, str form_part, list lemma_pieces):
|
||||
"""Recursively apply an edit tree to a form, adding pieces to
|
||||
the lemma_pieces list."""
|
||||
assert tree_id <= self.trees.size()
|
||||
|
||||
cdef EditTreeC tree = self.trees[tree_id]
|
||||
cdef MatchNodeC match_node
|
||||
cdef int suffix_start
|
||||
|
||||
if tree.is_match_node:
|
||||
match_node = tree.inner.match_node
|
||||
|
||||
if match_node.prefix_len + match_node.suffix_len > len(form_part):
|
||||
raise ValueError("Edit tree cannot be applied to form")
|
||||
|
||||
suffix_start = len(form_part) - match_node.suffix_len
|
||||
|
||||
if match_node.prefix_tree != NULL_TREE_ID:
|
||||
self._apply(match_node.prefix_tree, form_part[:match_node.prefix_len], lemma_pieces)
|
||||
|
||||
lemma_pieces.append(form_part[match_node.prefix_len:suffix_start])
|
||||
|
||||
if match_node.suffix_tree != NULL_TREE_ID:
|
||||
self._apply(match_node.suffix_tree, form_part[suffix_start:], lemma_pieces)
|
||||
else:
|
||||
if form_part == self.strings[tree.inner.subst_node.orig]:
|
||||
lemma_pieces.append(self.strings[tree.inner.subst_node.subst])
|
||||
else:
|
||||
raise ValueError("Edit tree cannot be applied to form")
|
||||
|
||||
cpdef unicode tree_to_str(self, uint32_t tree_id):
|
||||
"""Return the tree as a string. The tree tree string is formatted
|
||||
like an S-expression. This is primarily useful for debugging. Match
|
||||
nodes have the following format:
|
||||
|
||||
(m prefix_len suffix_len prefix_tree suffix_tree)
|
||||
|
||||
Substitution nodes have the following format:
|
||||
|
||||
(s original substitute)
|
||||
|
||||
tree_id (uint32_t): the identifier of the edit tree.
|
||||
RETURNS (str): the tree as an S-expression.
|
||||
"""
|
||||
|
||||
if tree_id >= self.trees.size():
|
||||
raise IndexError("Edit tree identifier out of range")
|
||||
|
||||
cdef EditTreeC tree = self.trees[tree_id]
|
||||
cdef SubstNodeC subst_node
|
||||
|
||||
if not tree.is_match_node:
|
||||
subst_node = tree.inner.subst_node
|
||||
return f"(s '{self.strings[subst_node.orig]}' '{self.strings[subst_node.subst]}')"
|
||||
|
||||
cdef MatchNodeC match_node = tree.inner.match_node
|
||||
|
||||
prefix_tree = "()"
|
||||
if match_node.prefix_tree != NULL_TREE_ID:
|
||||
prefix_tree = self.tree_to_str(match_node.prefix_tree)
|
||||
|
||||
suffix_tree = "()"
|
||||
if match_node.suffix_tree != NULL_TREE_ID:
|
||||
suffix_tree = self.tree_to_str(match_node.suffix_tree)
|
||||
|
||||
return f"(m {match_node.prefix_len} {match_node.suffix_len} {prefix_tree} {suffix_tree})"
|
||||
|
||||
def from_json(self, trees: list) -> "EditTrees":
|
||||
self.trees.clear()
|
||||
|
||||
for tree in trees:
|
||||
tree = _dict2tree(tree)
|
||||
self.trees.push_back(tree)
|
||||
|
||||
self._rebuild_tree_map()
|
||||
|
||||
def from_bytes(self, bytes_data: bytes) -> "EditTrees":
|
||||
def deserialize_trees(tree_dicts):
|
||||
cdef EditTreeC c_tree
|
||||
for tree_dict in tree_dicts:
|
||||
c_tree = _dict2tree(tree_dict)
|
||||
self.trees.push_back(c_tree)
|
||||
|
||||
deserializers = {}
|
||||
deserializers["trees"] = lambda n: deserialize_trees(n)
|
||||
util.from_bytes(bytes_data, deserializers, [])
|
||||
|
||||
self._rebuild_tree_map()
|
||||
|
||||
return self
|
||||
|
||||
def to_bytes(self, **kwargs) -> bytes:
|
||||
tree_dicts = []
|
||||
for tree in self.trees:
|
||||
tree = _tree2dict(tree)
|
||||
tree_dicts.append(tree)
|
||||
|
||||
serializers = {}
|
||||
serializers["trees"] = lambda: tree_dicts
|
||||
|
||||
return util.to_bytes(serializers, [])
|
||||
|
||||
def to_disk(self, path, **kwargs) -> "EditTrees":
|
||||
path = util.ensure_path(path)
|
||||
with path.open("wb") as file_:
|
||||
file_.write(self.to_bytes())
|
||||
|
||||
def from_disk(self, path, **kwargs) -> "EditTrees":
|
||||
path = util.ensure_path(path)
|
||||
if path.exists():
|
||||
with path.open("rb") as file_:
|
||||
data = file_.read()
|
||||
return self.from_bytes(data)
|
||||
|
||||
return self
|
||||
|
||||
def __getitem__(self, idx):
|
||||
return _tree2dict(self.trees[idx])
|
||||
|
||||
def __len__(self):
|
||||
return self.trees.size()
|
||||
|
||||
def _rebuild_tree_map(self):
|
||||
"""Rebuild the tree hash -> tree id mapping"""
|
||||
cdef EditTreeC c_tree
|
||||
cdef uint32_t tree_id
|
||||
cdef hash_t tree_hash
|
||||
|
||||
self.map.clear()
|
||||
|
||||
for tree_id in range(self.trees.size()):
|
||||
c_tree = self.trees[tree_id]
|
||||
tree_hash = edittree_hash(c_tree)
|
||||
self.map.insert(pair[hash_t, uint32_t](tree_hash, tree_id))
|
||||
|
||||
def __reduce__(self):
|
||||
return (unpickle_edittrees, (self.strings, self.to_bytes()))
|
||||
|
||||
|
||||
def unpickle_edittrees(strings, trees_data):
|
||||
return EditTrees(strings).from_bytes(trees_data)
|
||||
|
||||
|
||||
def _tree2dict(tree):
|
||||
if tree["is_match_node"]:
|
||||
tree = tree["inner"]["match_node"]
|
||||
else:
|
||||
tree = tree["inner"]["subst_node"]
|
||||
return dict(tree)
|
||||
|
||||
def _dict2tree(tree):
|
||||
errors = validate_edit_tree(tree)
|
||||
if errors:
|
||||
raise ValueError(Errors.E1026.format(errors="\n".join(errors)))
|
||||
|
||||
tree = dict(tree)
|
||||
if "prefix_len" in tree:
|
||||
tree = {"is_match_node": True, "inner": {"match_node": tree}}
|
||||
else:
|
||||
tree = {"is_match_node": False, "inner": {"subst_node": tree}}
|
||||
|
||||
return tree
|
44
spacy/pipeline/_edit_tree_internals/schemas.py
Normal file
|
@ -0,0 +1,44 @@
|
|||
from typing import Any, Dict, List, Union
|
||||
from collections import defaultdict
|
||||
from pydantic import BaseModel, Field, ValidationError
|
||||
from pydantic.types import StrictBool, StrictInt, StrictStr
|
||||
|
||||
|
||||
class MatchNodeSchema(BaseModel):
|
||||
prefix_len: StrictInt = Field(..., title="Prefix length")
|
||||
suffix_len: StrictInt = Field(..., title="Suffix length")
|
||||
prefix_tree: StrictInt = Field(..., title="Prefix tree")
|
||||
suffix_tree: StrictInt = Field(..., title="Suffix tree")
|
||||
|
||||
class Config:
|
||||
extra = "forbid"
|
||||
|
||||
|
||||
class SubstNodeSchema(BaseModel):
|
||||
orig: Union[int, StrictStr] = Field(..., title="Original substring")
|
||||
subst: Union[int, StrictStr] = Field(..., title="Replacement substring")
|
||||
|
||||
class Config:
|
||||
extra = "forbid"
|
||||
|
||||
|
||||
class EditTreeSchema(BaseModel):
|
||||
__root__: Union[MatchNodeSchema, SubstNodeSchema]
|
||||
|
||||
|
||||
def validate_edit_tree(obj: Dict[str, Any]) -> List[str]:
|
||||
"""Validate edit tree.
|
||||
|
||||
obj (Dict[str, Any]): JSON-serializable data to validate.
|
||||
RETURNS (List[str]): A list of error messages, if available.
|
||||
"""
|
||||
try:
|
||||
EditTreeSchema.parse_obj(obj)
|
||||
return []
|
||||
except ValidationError as e:
|
||||
errors = e.errors()
|
||||
data = defaultdict(list)
|
||||
for error in errors:
|
||||
err_loc = " -> ".join([str(p) for p in error.get("loc", [])])
|
||||
data[err_loc].append(error.get("msg"))
|
||||
return [f"[{loc}] {', '.join(msg)}" for loc, msg in data.items()] # type: ignore[arg-type]
|
|
@ -3,6 +3,7 @@ from libc.string cimport memcpy, memset
|
|||
from libc.stdlib cimport calloc, free
|
||||
from libc.stdint cimport uint32_t, uint64_t
|
||||
cimport libcpp
|
||||
from libcpp.unordered_map cimport unordered_map
|
||||
from libcpp.vector cimport vector
|
||||
from libcpp.set cimport set
|
||||
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
||||
|
@ -30,8 +31,8 @@ cdef cppclass StateC:
|
|||
vector[int] _stack
|
||||
vector[int] _rebuffer
|
||||
vector[SpanC] _ents
|
||||
vector[ArcC] _left_arcs
|
||||
vector[ArcC] _right_arcs
|
||||
unordered_map[int, vector[ArcC]] _left_arcs
|
||||
unordered_map[int, vector[ArcC]] _right_arcs
|
||||
vector[libcpp.bool] _unshiftable
|
||||
set[int] _sent_starts
|
||||
TokenC _empty_token
|
||||
|
@ -160,15 +161,22 @@ cdef cppclass StateC:
|
|||
else:
|
||||
return &this._sent[i]
|
||||
|
||||
void get_arcs(vector[ArcC]* arcs) nogil const:
|
||||
for i in range(this._left_arcs.size()):
|
||||
arc = this._left_arcs.at(i)
|
||||
void map_get_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, vector[ArcC]* out) nogil const:
|
||||
cdef const vector[ArcC]* arcs
|
||||
head_arcs_it = heads_arcs.const_begin()
|
||||
while head_arcs_it != heads_arcs.const_end():
|
||||
arcs = &deref(head_arcs_it).second
|
||||
arcs_it = arcs.const_begin()
|
||||
while arcs_it != arcs.const_end():
|
||||
arc = deref(arcs_it)
|
||||
if arc.head != -1 and arc.child != -1:
|
||||
arcs.push_back(arc)
|
||||
for i in range(this._right_arcs.size()):
|
||||
arc = this._right_arcs.at(i)
|
||||
if arc.head != -1 and arc.child != -1:
|
||||
arcs.push_back(arc)
|
||||
out.push_back(arc)
|
||||
incr(arcs_it)
|
||||
incr(head_arcs_it)
|
||||
|
||||
void get_arcs(vector[ArcC]* out) nogil const:
|
||||
this.map_get_arcs(this._left_arcs, out)
|
||||
this.map_get_arcs(this._right_arcs, out)
|
||||
|
||||
int H(int child) nogil const:
|
||||
if child >= this.length or child < 0:
|
||||
|
@ -182,37 +190,35 @@ cdef cppclass StateC:
|
|||
else:
|
||||
return this._ents.back().start
|
||||
|
||||
int L(int head, int idx) nogil const:
|
||||
if idx < 1 or this._left_arcs.size() == 0:
|
||||
int nth_child(const unordered_map[int, vector[ArcC]]& heads_arcs, int head, int idx) nogil const:
|
||||
if idx < 1:
|
||||
return -1
|
||||
|
||||
# Work backwards through left-arcs to find the arc at the
|
||||
head_arcs_it = heads_arcs.const_find(head)
|
||||
if head_arcs_it == heads_arcs.const_end():
|
||||
return -1
|
||||
|
||||
cdef const vector[ArcC]* arcs = &deref(head_arcs_it).second
|
||||
|
||||
# Work backwards through arcs to find the arc at the
|
||||
# requested index more quickly.
|
||||
cdef size_t child_index = 0
|
||||
it = this._left_arcs.const_rbegin()
|
||||
while it != this._left_arcs.rend():
|
||||
arc = deref(it)
|
||||
if arc.head == head and arc.child != -1 and arc.child < head:
|
||||
arcs_it = arcs.const_rbegin()
|
||||
while arcs_it != arcs.const_rend() and child_index != idx:
|
||||
arc = deref(arcs_it)
|
||||
if arc.child != -1:
|
||||
child_index += 1
|
||||
if child_index == idx:
|
||||
return arc.child
|
||||
incr(it)
|
||||
incr(arcs_it)
|
||||
|
||||
return -1
|
||||
|
||||
int L(int head, int idx) nogil const:
|
||||
return this.nth_child(this._left_arcs, head, idx)
|
||||
|
||||
int R(int head, int idx) nogil const:
|
||||
if idx < 1 or this._right_arcs.size() == 0:
|
||||
return -1
|
||||
cdef vector[int] rights
|
||||
for i in range(this._right_arcs.size()):
|
||||
arc = this._right_arcs.at(i)
|
||||
if arc.head == head and arc.child != -1 and arc.child > head:
|
||||
rights.push_back(arc.child)
|
||||
idx = (<int>rights.size()) - idx
|
||||
if idx < 0:
|
||||
return -1
|
||||
else:
|
||||
return rights.at(idx)
|
||||
return this.nth_child(this._right_arcs, head, idx)
|
||||
|
||||
bint empty() nogil const:
|
||||
return this._stack.size() == 0
|
||||
|
@ -254,22 +260,29 @@ cdef cppclass StateC:
|
|||
int r_edge(int word) nogil const:
|
||||
return word
|
||||
|
||||
int n_L(int head) nogil const:
|
||||
int n_arcs(const unordered_map[int, vector[ArcC]] &heads_arcs, int head) nogil const:
|
||||
cdef int n = 0
|
||||
for i in range(this._left_arcs.size()):
|
||||
arc = this._left_arcs.at(i)
|
||||
if arc.head == head and arc.child != -1 and arc.child < arc.head:
|
||||
n += 1
|
||||
head_arcs_it = heads_arcs.const_find(head)
|
||||
if head_arcs_it == heads_arcs.const_end():
|
||||
return n
|
||||
|
||||
int n_R(int head) nogil const:
|
||||
cdef int n = 0
|
||||
for i in range(this._right_arcs.size()):
|
||||
arc = this._right_arcs.at(i)
|
||||
if arc.head == head and arc.child != -1 and arc.child > arc.head:
|
||||
cdef const vector[ArcC]* arcs = &deref(head_arcs_it).second
|
||||
arcs_it = arcs.const_begin()
|
||||
while arcs_it != arcs.end():
|
||||
arc = deref(arcs_it)
|
||||
if arc.child != -1:
|
||||
n += 1
|
||||
incr(arcs_it)
|
||||
|
||||
return n
|
||||
|
||||
|
||||
int n_L(int head) nogil const:
|
||||
return n_arcs(this._left_arcs, head)
|
||||
|
||||
int n_R(int head) nogil const:
|
||||
return n_arcs(this._right_arcs, head)
|
||||
|
||||
bint stack_is_connected() nogil const:
|
||||
return False
|
||||
|
||||
|
@@ -328,19 +341,20 @@ cdef cppclass StateC:
        arc.child = child
        arc.label = label
        if head > child:
            this._left_arcs.push_back(arc)
            this._left_arcs[arc.head].push_back(arc)
        else:
            this._right_arcs.push_back(arc)
            this._right_arcs[arc.head].push_back(arc)
        this._heads[child] = head

    void del_arc(int h_i, int c_i) nogil:
        cdef vector[ArcC]* arcs
        if h_i > c_i:
            arcs = &this._left_arcs
        else:
            arcs = &this._right_arcs
    void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
        arcs_it = heads_arcs.find(h_i)
        if arcs_it == heads_arcs.end():
            return

        arcs = &deref(arcs_it).second
        if arcs.size() == 0:
            return

        arc = arcs.back()
        if arc.head == h_i and arc.child == c_i:
            arcs.pop_back()

@@ -353,6 +367,12 @@ cdef cppclass StateC:
                    arc.label = 0
                    break

    void del_arc(int h_i, int c_i) nogil:
        if h_i > c_i:
            this.map_del_arc(&this._left_arcs, h_i, c_i)
        else:
            this.map_del_arc(&this._right_arcs, h_i, c_i)

    SpanC get_ent() nogil const:
        cdef SpanC ent
        if this._ents.size() == 0:
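With arcs grouped per head, deleting an arc only touches that head's bucket: pop it if it is the most recent arc, otherwise tombstone it in place so later lookups skip it. A rough Python equivalent of map_del_arc, reusing the illustrative Arc/arcs_by_head structure from the sketch above (not spaCy API):

from typing import Dict, List


def map_del_arc(arcs_by_head: Dict[int, List["Arc"]], h_i: int, c_i: int) -> None:
    arcs = arcs_by_head.get(h_i)
    if not arcs:
        return
    last = arcs[-1]
    if last.head == h_i and last.child == c_i:
        # Cheap case: the arc to delete is the most recently added one.
        arcs.pop()
        return
    for arc in arcs:
        if arc.head == h_i and arc.child == c_i:
            # Tombstone in place; readers treat child == -1 as deleted.
            arc.head = -1
            arc.child = -1
            arc.label = 0
            break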
@@ -218,7 +218,7 @@ def _get_aligned_sent_starts(example):
        sent_starts = [False] * len(example.x)
        seen_words = set()
        for y_sent in example.y.sents:
            x_indices = list(align[y_sent.start : y_sent.end].dataXd)
            x_indices = list(align[y_sent.start : y_sent.end])
            if any(x_idx in seen_words for x_idx in x_indices):
                # If there are any tokens in X that align across two sentences,
                # regard the sentence annotations as missing, as we can't
@@ -4,6 +4,10 @@ for doing pseudo-projective parsing implementation uses the HEAD decoration
scheme.
"""
from copy import copy
from libc.limits cimport INT_MAX
from libc.stdlib cimport abs
from libcpp cimport bool
from libcpp.vector cimport vector

from ...tokens.doc cimport Doc, set_children_from_heads
@@ -41,13 +45,18 @@ def contains_cycle(heads):


def is_nonproj_arc(tokenid, heads):
    cdef vector[int] c_heads = _heads_to_c(heads)
    return _is_nonproj_arc(tokenid, c_heads)


cdef bool _is_nonproj_arc(int tokenid, const vector[int]& heads) nogil:
    # definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
    # if there is a token k, h < k < d such that h is not
    # an ancestor of k. Same for h -> d, h > d
    head = heads[tokenid]
    if head == tokenid: # root arcs cannot be non-projective
        return False
    elif head is None: # unattached tokens cannot be non-projective
    elif head < 0: # unattached tokens cannot be non-projective
        return False

    cdef int start, end
@@ -56,19 +65,29 @@ def is_nonproj_arc(tokenid, heads):
    else:
        start, end = (tokenid+1, head)
    for k in range(start, end):
        for ancestor in ancestors(k, heads):
            if ancestor is None: # for unattached tokens/subtrees
                break
            elif ancestor == head: # normal case: k dominated by h
                break
        if _has_head_as_ancestor(k, head, heads):
            continue
        else: # head not in ancestors: d -> h is non-projective
            return True
    return False


cdef bool _has_head_as_ancestor(int tokenid, int head, const vector[int]& heads) nogil:
    ancestor = tokenid
    cnt = 0
    while cnt < heads.size():
        if heads[ancestor] == head or heads[ancestor] < 0:
            return True
        ancestor = heads[ancestor]
        cnt += 1

    return False
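The Cython rewrite above swaps the ancestors() generator for a bounded walk up the head chain. A pure-Python sketch of the same check, using the convention from this file that heads[i] == i marks a root and a negative head marks an unattached token (these helpers are illustrative re-implementations, not the spaCy functions themselves):

from typing import List


def has_head_as_ancestor(tokenid: int, head: int, heads: List[int]) -> bool:
    # Walk up the head chain at most len(heads) steps, which also guards
    # against cycles in malformed annotations.
    ancestor = tokenid
    for _ in range(len(heads)):
        if heads[ancestor] == head or heads[ancestor] < 0:
            return True
        ancestor = heads[ancestor]
    return False


def is_nonproj_arc(tokenid: int, heads: List[int]) -> bool:
    # An arc h -> d is non-projective if some token k strictly between h and d
    # is not dominated by h (Havelka 2007).
    head = heads[tokenid]
    if head == tokenid or head < 0:  # roots and unattached tokens
        return False
    start, end = (head + 1, tokenid) if head < tokenid else (tokenid + 1, head)
    return any(not has_head_as_ancestor(k, head, heads) for k in range(start, end))


# Token 1 is the root; the arc 4 -> 2 crosses the arc 1 -> 3.
heads = [1, 1, 4, 1, 1]
print([is_nonproj_arc(i, heads) for i in range(len(heads))])
# [False, False, True, False, False]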
def is_nonproj_tree(heads):
    cdef vector[int] c_heads = _heads_to_c(heads)
    # a tree is non-projective if at least one arc is non-projective
    return any(is_nonproj_arc(word, heads) for word in range(len(heads)))
    return any(_is_nonproj_arc(word, c_heads) for word in range(len(heads)))


def decompose(label):
@@ -98,16 +117,31 @@ def projectivize(heads, labels):
    # tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
    # which encode a projective and decorated tree.
    proj_heads = copy(heads)
    smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
    if smallest_np_arc is None: # this sentence is already projective

    cdef int new_head
    cdef vector[int] c_proj_heads = _heads_to_c(proj_heads)
    cdef int smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
    if smallest_np_arc == -1: # this sentence is already projective
        return proj_heads, copy(labels)
    while smallest_np_arc is not None:
        _lift(smallest_np_arc, proj_heads)
        smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
    while smallest_np_arc != -1:
        new_head = _lift(smallest_np_arc, proj_heads)
        c_proj_heads[smallest_np_arc] = new_head
        smallest_np_arc = _get_smallest_nonproj_arc(c_proj_heads)
    deco_labels = _decorate(heads, proj_heads, labels)
    return proj_heads, deco_labels


cdef vector[int] _heads_to_c(heads):
    cdef vector[int] c_heads;
    for head in heads:
        if head == None:
            c_heads.push_back(-1)
        else:
            assert head < len(heads)
            c_heads.push_back(head)
    return c_heads


cpdef deprojectivize(Doc doc):
    # Reattach arcs with decorated labels (following HEAD scheme). For each
    # decorated arc X||Y, search top-down, left-to-right, breadth-first until
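The projectivize() change above keeps a C copy of the head array in sync so the smallest non-projective arc can be recomputed cheaply after every lift. A schematic Python version of the same loop; it reuses is_nonproj_arc from the sketch above, and smallest_nonproj_arc/lift are illustrative names rather than the spaCy implementation:

from typing import List, Optional


def smallest_nonproj_arc(heads: List[int]) -> Optional[int]:
    # Smallest offending arc by head-dependent distance, ties broken left to right.
    best, best_size = None, float("inf")
    for tokenid, head in enumerate(heads):
        size = abs(tokenid - head)
        if size < best_size and is_nonproj_arc(tokenid, heads):
            best, best_size = tokenid, size
    return best


def lift(tokenid: int, heads: List[int]) -> int:
    # Reattach the token to its grandparent, or make it a root if its head
    # is already attached to the root.
    head = heads[tokenid]
    grandparent = heads[head]
    new_head = grandparent if head != grandparent else tokenid
    heads[tokenid] = new_head
    return new_head


def projectivize(heads: List[int]) -> List[int]:
    proj_heads = list(heads)
    arc = smallest_nonproj_arc(proj_heads)
    while arc is not None:
        lift(arc, proj_heads)
        arc = smallest_nonproj_arc(proj_heads)
    return proj_heads


print(projectivize([1, 1, 4, 1, 1]))  # [1, 1, 1, 1, 1] after lifting token 2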
@@ -137,27 +171,38 @@ def _decorate(heads, proj_heads, labels):
        deco_labels.append(labels[tokenid])
    return deco_labels

def get_smallest_nonproj_arc_slow(heads):
    cdef vector[int] c_heads = _heads_to_c(heads)
    return _get_smallest_nonproj_arc(c_heads)


def _get_smallest_nonproj_arc(heads):

cdef int _get_smallest_nonproj_arc(const vector[int]& heads) nogil:
    # return the smallest non-proj arc or None
    # where size is defined as the distance between dep and head
    # and ties are broken left to right
    smallest_size = float('inf')
    smallest_np_arc = None
    for tokenid, head in enumerate(heads):
    cdef int smallest_size = INT_MAX
    cdef int smallest_np_arc = -1
    cdef int size
    cdef int tokenid
    cdef int head

    for tokenid in range(heads.size()):
        head = heads[tokenid]
        size = abs(tokenid-head)
        if size < smallest_size and is_nonproj_arc(tokenid, heads):
        if size < smallest_size and _is_nonproj_arc(tokenid, heads):
            smallest_size = size
            smallest_np_arc = tokenid
    return smallest_np_arc


def _lift(tokenid, heads):
cpdef int _lift(tokenid, heads):
    # reattaches a word to it's grandfather
    head = heads[tokenid]
    ghead = heads[head]
    cdef int new_head = ghead if head != ghead else tokenid
    # attach to ghead if head isn't attached to root else attach to root
    heads[tokenid] = ghead if head != ghead else tokenid
    heads[tokenid] = new_head
    return new_head


def _find_new_head(token, headlabel):
spacy/pipeline/edit_tree_lemmatizer.py (new file, 379 lines)
|
@ -0,0 +1,379 @@
|
|||
from typing import cast, Any, Callable, Dict, Iterable, List, Optional
|
||||
from typing import Sequence, Tuple, Union
|
||||
from collections import Counter
|
||||
from copy import deepcopy
|
||||
from itertools import islice
|
||||
import numpy as np
|
||||
|
||||
import srsly
|
||||
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
|
||||
from thinc.types import Floats2d, Ints1d, Ints2d
|
||||
|
||||
from ._edit_tree_internals.edit_trees import EditTrees
|
||||
from ._edit_tree_internals.schemas import validate_edit_tree
|
||||
from .lemmatizer import lemmatizer_score
|
||||
from .trainable_pipe import TrainablePipe
|
||||
from ..errors import Errors
|
||||
from ..language import Language
|
||||
from ..tokens import Doc
|
||||
from ..training import Example, validate_examples, validate_get_examples
|
||||
from ..vocab import Vocab
|
||||
from .. import util
|
||||
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v2"
|
||||
pretrained_vectors = null
|
||||
width = 96
|
||||
depth = 4
|
||||
embed_size = 2000
|
||||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
"""
|
||||
DEFAULT_EDIT_TREE_LEMMATIZER_MODEL = Config().from_str(default_model_config)["model"]
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"trainable_lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
requires=[],
|
||||
default_config={
|
||||
"model": DEFAULT_EDIT_TREE_LEMMATIZER_MODEL,
|
||||
"backoff": "orth",
|
||||
"min_tree_freq": 3,
|
||||
"overwrite": False,
|
||||
"top_k": 1,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_edit_tree_lemmatizer(
|
||||
nlp: Language,
|
||||
name: str,
|
||||
model: Model,
|
||||
backoff: Optional[str],
|
||||
min_tree_freq: int,
|
||||
overwrite: bool,
|
||||
top_k: int,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
"""Construct an EditTreeLemmatizer component."""
|
||||
return EditTreeLemmatizer(
|
||||
nlp.vocab,
|
||||
model,
|
||||
name,
|
||||
backoff=backoff,
|
||||
min_tree_freq=min_tree_freq,
|
||||
overwrite=overwrite,
|
||||
top_k=top_k,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
||||
class EditTreeLemmatizer(TrainablePipe):
|
||||
"""
|
||||
Lemmatizer that lemmatizes each word using a predicted edit tree.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab: Vocab,
|
||||
model: Model,
|
||||
name: str = "trainable_lemmatizer",
|
||||
*,
|
||||
backoff: Optional[str] = "orth",
|
||||
min_tree_freq: int = 3,
|
||||
overwrite: bool = False,
|
||||
top_k: int = 1,
|
||||
scorer: Optional[Callable] = lemmatizer_score,
|
||||
):
|
||||
"""
|
||||
Construct an edit tree lemmatizer.
|
||||
|
||||
backoff (Optional[str]): backoff to use when the predicted edit trees
|
||||
are not applicable. Must be an attribute of Token or None (leave the
|
||||
lemma unset).
|
||||
min_tree_freq (int): prune trees that are applied less than this
|
||||
frequency in the training data.
|
||||
overwrite (bool): overwrite existing lemma annotations.
|
||||
top_k (int): try to apply at most the k most probable edit trees.
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
self.name = name
|
||||
self.backoff = backoff
|
||||
self.min_tree_freq = min_tree_freq
|
||||
self.overwrite = overwrite
|
||||
self.top_k = top_k
|
||||
|
||||
self.trees = EditTrees(self.vocab.strings)
|
||||
self.tree2label: Dict[int, int] = {}
|
||||
|
||||
self.cfg: Dict[str, Any] = {"labels": []}
|
||||
self.scorer = scorer
|
||||
|
||||
def get_loss(
|
||||
self, examples: Iterable[Example], scores: List[Floats2d]
|
||||
) -> Tuple[float, List[Floats2d]]:
|
||||
validate_examples(examples, "EditTreeLemmatizer.get_loss")
|
||||
loss_func = SequenceCategoricalCrossentropy(normalize=False, missing_value=-1)
|
||||
|
||||
truths = []
|
||||
for eg in examples:
|
||||
eg_truths = []
|
||||
for (predicted, gold_lemma) in zip(
|
||||
eg.predicted, eg.get_aligned("LEMMA", as_string=True)
|
||||
):
|
||||
if gold_lemma is None:
|
||||
label = -1
|
||||
else:
|
||||
tree_id = self.trees.add(predicted.text, gold_lemma)
|
||||
label = self.tree2label.get(tree_id, 0)
|
||||
eg_truths.append(label)
|
||||
|
||||
truths.append(eg_truths)
|
||||
|
||||
d_scores, loss = loss_func(scores, truths) # type: ignore
|
||||
if self.model.ops.xp.isnan(loss):
|
||||
raise ValueError(Errors.E910.format(name=self.name))
|
||||
|
||||
return float(loss), d_scores
|
||||
|
||||
def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
|
||||
n_docs = len(list(docs))
|
||||
if not any(len(doc) for doc in docs):
|
||||
# Handle cases where there are no tokens in any docs.
|
||||
n_labels = len(self.cfg["labels"])
|
||||
guesses: List[Ints2d] = [
|
||||
self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
|
||||
]
|
||||
assert len(guesses) == n_docs
|
||||
return guesses
|
||||
scores = self.model.predict(docs)
|
||||
assert len(scores) == n_docs
|
||||
guesses = self._scores2guesses(docs, scores)
|
||||
assert len(guesses) == n_docs
|
||||
return guesses
|
||||
|
||||
def _scores2guesses(self, docs, scores):
|
||||
guesses = []
|
||||
for doc, doc_scores in zip(docs, scores):
|
||||
if self.top_k == 1:
|
||||
doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1)
|
||||
else:
|
||||
doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
|
||||
|
||||
if not isinstance(doc_guesses, np.ndarray):
|
||||
doc_guesses = doc_guesses.get()
|
||||
|
||||
doc_compat_guesses = []
|
||||
for token, candidates in zip(doc, doc_guesses):
|
||||
tree_id = -1
|
||||
for candidate in candidates:
|
||||
candidate_tree_id = self.cfg["labels"][candidate]
|
||||
|
||||
if self.trees.apply(candidate_tree_id, token.text) is not None:
|
||||
tree_id = candidate_tree_id
|
||||
break
|
||||
doc_compat_guesses.append(tree_id)
|
||||
|
||||
guesses.append(np.array(doc_compat_guesses))
|
||||
|
||||
return guesses
|
||||
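For reference, the top_k branch of _scores2guesses keeps the k highest-scoring edit trees per token and then falls back to the first candidate that is actually applicable to the token's form. A small NumPy sketch of that candidate ordering; is_applicable below is a hypothetical stand-in for EditTrees.apply():

import numpy as np


def top_k_candidates(doc_scores: np.ndarray, top_k: int) -> np.ndarray:
    # doc_scores has shape (n_tokens, n_labels); return the indices of the
    # top_k highest-scoring labels per token, best first.
    if top_k == 1:
        return doc_scores.argmax(axis=1).reshape(-1, 1)
    return np.argsort(doc_scores)[..., : -top_k - 1 : -1]


def pick_tree(candidates, labels, is_applicable) -> int:
    # Walk candidates in score order and keep the first applicable tree;
    # -1 means "no applicable tree", which triggers the backoff attribute.
    for candidate in candidates:
        tree_id = labels[candidate]
        if is_applicable(tree_id):
            return tree_id
    return -1


scores = np.array([[0.1, 0.7, 0.2], [0.5, 0.2, 0.3]])
print(top_k_candidates(scores, top_k=2))  # [[1 2] [0 2]]
print(pick_tree([1, 2], labels=[10, 11, 12], is_applicable=lambda t: t == 12))  # 12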
|
||||
def set_annotations(self, docs: Iterable[Doc], batch_tree_ids):
|
||||
for i, doc in enumerate(docs):
|
||||
doc_tree_ids = batch_tree_ids[i]
|
||||
if hasattr(doc_tree_ids, "get"):
|
||||
doc_tree_ids = doc_tree_ids.get()
|
||||
for j, tree_id in enumerate(doc_tree_ids):
|
||||
if self.overwrite or doc[j].lemma == 0:
|
||||
# If no applicable tree could be found during prediction,
|
||||
# the special identifier -1 is used. Otherwise the tree
|
||||
# is guaranteed to be applicable.
|
||||
if tree_id == -1:
|
||||
if self.backoff is not None:
|
||||
doc[j].lemma = getattr(doc[j], self.backoff)
|
||||
else:
|
||||
lemma = self.trees.apply(tree_id, doc[j].text)
|
||||
doc[j].lemma_ = lemma
|
||||
|
||||
@property
|
||||
def labels(self) -> Tuple[int, ...]:
|
||||
"""Returns the labels currently added to the component."""
|
||||
return tuple(self.cfg["labels"])
|
||||
|
||||
@property
|
||||
def hide_labels(self) -> bool:
|
||||
return True
|
||||
|
||||
@property
|
||||
def label_data(self) -> Dict:
|
||||
trees = []
|
||||
for tree_id in range(len(self.trees)):
|
||||
tree = self.trees[tree_id]
|
||||
if "orig" in tree:
|
||||
tree["orig"] = self.vocab.strings[tree["orig"]]
|
||||
if "subst" in tree:
|
||||
tree["subst"] = self.vocab.strings[tree["subst"]]
|
||||
trees.append(tree)
|
||||
return dict(trees=trees, labels=tuple(self.cfg["labels"]))
|
||||
|
||||
def initialize(
|
||||
self,
|
||||
get_examples: Callable[[], Iterable[Example]],
|
||||
*,
|
||||
nlp: Optional[Language] = None,
|
||||
labels: Optional[Dict] = None,
|
||||
):
|
||||
validate_get_examples(get_examples, "EditTreeLemmatizer.initialize")
|
||||
|
||||
if labels is None:
|
||||
self._labels_from_data(get_examples)
|
||||
else:
|
||||
self._add_labels(labels)
|
||||
|
||||
# Sample for the model.
|
||||
doc_sample = []
|
||||
label_sample = []
|
||||
for example in islice(get_examples(), 10):
|
||||
doc_sample.append(example.x)
|
||||
gold_labels: List[List[float]] = []
|
||||
for token in example.reference:
|
||||
if token.lemma == 0:
|
||||
gold_label = None
|
||||
else:
|
||||
gold_label = self._pair2label(token.text, token.lemma_)
|
||||
|
||||
gold_labels.append(
|
||||
[
|
||||
1.0 if label == gold_label else 0.0
|
||||
for label in self.cfg["labels"]
|
||||
]
|
||||
)
|
||||
|
||||
gold_labels = cast(Floats2d, gold_labels)
|
||||
label_sample.append(self.model.ops.asarray(gold_labels, dtype="float32"))
|
||||
|
||||
self._require_labels()
|
||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
||||
|
||||
self.model.initialize(X=doc_sample, Y=label_sample)
|
||||
|
||||
def from_bytes(self, bytes_data, *, exclude=tuple()):
|
||||
deserializers = {
|
||||
"cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
|
||||
"model": lambda b: self.model.from_bytes(b),
|
||||
"vocab": lambda b: self.vocab.from_bytes(b, exclude=exclude),
|
||||
"trees": lambda b: self.trees.from_bytes(b),
|
||||
}
|
||||
|
||||
util.from_bytes(bytes_data, deserializers, exclude)
|
||||
|
||||
return self
|
||||
|
||||
def to_bytes(self, *, exclude=tuple()):
|
||||
serializers = {
|
||||
"cfg": lambda: srsly.json_dumps(self.cfg),
|
||||
"model": lambda: self.model.to_bytes(),
|
||||
"vocab": lambda: self.vocab.to_bytes(exclude=exclude),
|
||||
"trees": lambda: self.trees.to_bytes(),
|
||||
}
|
||||
|
||||
return util.to_bytes(serializers, exclude)
|
||||
|
||||
def to_disk(self, path, exclude=tuple()):
|
||||
path = util.ensure_path(path)
|
||||
serializers = {
|
||||
"cfg": lambda p: srsly.write_json(p, self.cfg),
|
||||
"model": lambda p: self.model.to_disk(p),
|
||||
"vocab": lambda p: self.vocab.to_disk(p, exclude=exclude),
|
||||
"trees": lambda p: self.trees.to_disk(p),
|
||||
}
|
||||
util.to_disk(path, serializers, exclude)
|
||||
|
||||
def from_disk(self, path, exclude=tuple()):
|
||||
def load_model(p):
|
||||
try:
|
||||
with open(p, "rb") as mfile:
|
||||
self.model.from_bytes(mfile.read())
|
||||
except AttributeError:
|
||||
raise ValueError(Errors.E149) from None
|
||||
|
||||
deserializers = {
|
||||
"cfg": lambda p: self.cfg.update(srsly.read_json(p)),
|
||||
"model": load_model,
|
||||
"vocab": lambda p: self.vocab.from_disk(p, exclude=exclude),
|
||||
"trees": lambda p: self.trees.from_disk(p),
|
||||
}
|
||||
|
||||
util.from_disk(path, deserializers, exclude)
|
||||
return self
|
||||
|
||||
def _add_labels(self, labels: Dict):
|
||||
if "labels" not in labels:
|
||||
raise ValueError(Errors.E857.format(name="labels"))
|
||||
if "trees" not in labels:
|
||||
raise ValueError(Errors.E857.format(name="trees"))
|
||||
|
||||
self.cfg["labels"] = list(labels["labels"])
|
||||
trees = []
|
||||
for tree in labels["trees"]:
|
||||
errors = validate_edit_tree(tree)
|
||||
if errors:
|
||||
raise ValueError(Errors.E1026.format(errors="\n".join(errors)))
|
||||
|
||||
tree = dict(tree)
|
||||
if "orig" in tree:
|
||||
tree["orig"] = self.vocab.strings[tree["orig"]]
|
||||
if "orig" in tree:
|
||||
tree["subst"] = self.vocab.strings[tree["subst"]]
|
||||
|
||||
trees.append(tree)
|
||||
|
||||
self.trees.from_json(trees)
|
||||
|
||||
for label, tree in enumerate(self.labels):
|
||||
self.tree2label[tree] = label
|
||||
|
||||
def _labels_from_data(self, get_examples: Callable[[], Iterable[Example]]):
|
||||
# Count corpus tree frequencies in ad-hoc storage to avoid cluttering
|
||||
# the final pipe/string store.
|
||||
vocab = Vocab()
|
||||
trees = EditTrees(vocab.strings)
|
||||
tree_freqs: Counter = Counter()
|
||||
repr_pairs: Dict = {}
|
||||
for example in get_examples():
|
||||
for token in example.reference:
|
||||
if token.lemma != 0:
|
||||
tree_id = trees.add(token.text, token.lemma_)
|
||||
tree_freqs[tree_id] += 1
|
||||
repr_pairs[tree_id] = (token.text, token.lemma_)
|
||||
|
||||
# Construct trees that make the frequency cut-off using representative
|
||||
# form - token pairs.
|
||||
for tree_id, freq in tree_freqs.items():
|
||||
if freq >= self.min_tree_freq:
|
||||
form, lemma = repr_pairs[tree_id]
|
||||
self._pair2label(form, lemma, add_label=True)
|
||||
|
||||
def _pair2label(self, form, lemma, add_label=False):
|
||||
"""
|
||||
Look up the edit tree identifier for a form/label pair. If the edit
|
||||
tree is unknown and "add_label" is set, the edit tree will be added to
|
||||
the labels.
|
||||
"""
|
||||
tree_id = self.trees.add(form, lemma)
|
||||
if tree_id not in self.tree2label:
|
||||
if not add_label:
|
||||
return None
|
||||
|
||||
self.tree2label[tree_id] = len(self.cfg["labels"])
|
||||
self.cfg["labels"].append(tree_id)
|
||||
return self.tree2label[tree_id]
|
|
@ -6,17 +6,17 @@ import srsly
|
|||
import random
|
||||
from thinc.api import CosineDistance, Model, Optimizer, Config
|
||||
from thinc.api import set_dropout_rate
|
||||
import warnings
|
||||
|
||||
from ..kb import KnowledgeBase, Candidate
|
||||
from ..ml import empty_kb
|
||||
from ..tokens import Doc, Span
|
||||
from .pipe import deserialize_config
|
||||
from .legacy.entity_linker import EntityLinker_v1
|
||||
from .trainable_pipe import TrainablePipe
|
||||
from ..language import Language
|
||||
from ..vocab import Vocab
|
||||
from ..training import Example, validate_examples, validate_get_examples
|
||||
from ..errors import Errors, Warnings
|
||||
from ..errors import Errors
|
||||
from ..util import SimpleFrozenList, registry
|
||||
from .. import util
|
||||
from ..scorer import Scorer
|
||||
|
@ -26,7 +26,7 @@ BACKWARD_OVERWRITE = True
|
|||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.EntityLinker.v1"
|
||||
@architectures = "spacy.EntityLinker.v2"
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v2"
|
||||
|
@ -55,6 +55,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
||||
"overwrite": True,
|
||||
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
|
||||
"use_gold_ents": True,
|
||||
},
|
||||
default_score_weights={
|
||||
"nel_micro_f": 1.0,
|
||||
|
@ -75,6 +76,7 @@ def make_entity_linker(
|
|||
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
use_gold_ents: bool,
|
||||
):
|
||||
"""Construct an EntityLinker component.
|
||||
|
||||
|
@ -90,6 +92,22 @@ def make_entity_linker(
|
|||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
"""
|
||||
|
||||
if not model.attrs.get("include_span_maker", False):
|
||||
# The only difference in arguments here is that use_gold_ents is not available
|
||||
return EntityLinker_v1(
|
||||
nlp.vocab,
|
||||
model,
|
||||
name,
|
||||
labels_discard=labels_discard,
|
||||
n_sents=n_sents,
|
||||
incl_prior=incl_prior,
|
||||
incl_context=incl_context,
|
||||
entity_vector_length=entity_vector_length,
|
||||
get_candidates=get_candidates,
|
||||
overwrite=overwrite,
|
||||
scorer=scorer,
|
||||
)
|
||||
return EntityLinker(
|
||||
nlp.vocab,
|
||||
model,
|
||||
|
@ -102,6 +120,7 @@ def make_entity_linker(
|
|||
get_candidates=get_candidates,
|
||||
overwrite=overwrite,
|
||||
scorer=scorer,
|
||||
use_gold_ents=use_gold_ents,
|
||||
)
|
||||
|
||||
|
||||
|
@ -136,6 +155,7 @@ class EntityLinker(TrainablePipe):
|
|||
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
||||
overwrite: bool = BACKWARD_OVERWRITE,
|
||||
scorer: Optional[Callable] = entity_linker_score,
|
||||
use_gold_ents: bool,
|
||||
) -> None:
|
||||
"""Initialize an entity linker.
|
||||
|
||||
|
@ -152,6 +172,8 @@ class EntityLinker(TrainablePipe):
|
|||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_links.
|
||||
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
||||
component must provide entity annotations.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#init
|
||||
"""
|
||||
|
@ -169,6 +191,7 @@ class EntityLinker(TrainablePipe):
|
|||
# create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
|
||||
self.kb = empty_kb(entity_vector_length)(self.vocab)
|
||||
self.scorer = scorer
|
||||
self.use_gold_ents = use_gold_ents
|
||||
|
||||
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
|
||||
"""Define the KB of this pipe by providing a function that will
|
||||
|
@ -212,14 +235,48 @@ class EntityLinker(TrainablePipe):
|
|||
doc_sample = []
|
||||
vector_sample = []
|
||||
for example in islice(get_examples(), 10):
|
||||
doc_sample.append(example.x)
|
||||
doc = example.x
|
||||
if self.use_gold_ents:
|
||||
doc.ents = example.y.ents
|
||||
doc_sample.append(doc)
|
||||
vector_sample.append(self.model.ops.alloc1f(nO))
|
||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||
assert len(vector_sample) > 0, Errors.E923.format(name=self.name)
|
||||
|
||||
# XXX In order for size estimation to work, there has to be at least
|
||||
# one entity. It's not used for training so it doesn't have to be real,
|
||||
# so we add a fake one if none are present.
|
||||
# We can't use Doc.has_annotation here because it can be True for docs
|
||||
# that have been through an NER component but got no entities.
|
||||
has_annotations = any([doc.ents for doc in doc_sample])
|
||||
if not has_annotations:
|
||||
doc = doc_sample[0]
|
||||
ent = doc[0:1]
|
||||
ent.label_ = "XXX"
|
||||
doc.ents = (ent,)
|
||||
|
||||
self.model.initialize(
|
||||
X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
|
||||
)
|
||||
|
||||
if not has_annotations:
|
||||
# Clean up dummy annotation
|
||||
doc.ents = []
|
||||
|
||||
def batch_has_learnable_example(self, examples):
|
||||
"""Check if a batch contains a learnable example.
|
||||
|
||||
If one isn't present, then the update step needs to be skipped.
|
||||
"""
|
||||
|
||||
for eg in examples:
|
||||
for ent in eg.predicted.ents:
|
||||
candidates = list(self.get_candidates(self.kb, ent))
|
||||
if candidates:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def update(
|
||||
self,
|
||||
examples: Iterable[Example],
|
||||
|
@ -247,35 +304,29 @@ class EntityLinker(TrainablePipe):
|
|||
if not examples:
|
||||
return losses
|
||||
validate_examples(examples, "EntityLinker.update")
|
||||
sentence_docs = []
|
||||
for eg in examples:
|
||||
sentences = [s for s in eg.reference.sents]
|
||||
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
|
||||
for ent in eg.reference.ents:
|
||||
# KB ID of the first token is the same as the whole span
|
||||
kb_id = kb_ids[ent.start]
|
||||
if kb_id:
|
||||
try:
|
||||
# find the sentence in the list of sentences.
|
||||
sent_index = sentences.index(ent.sent)
|
||||
except AttributeError:
|
||||
# Catch the exception when ent.sent is None and provide a user-friendly warning
|
||||
raise RuntimeError(Errors.E030) from None
|
||||
# get n previous sentences, if there are any
|
||||
start_sentence = max(0, sent_index - self.n_sents)
|
||||
# get n posterior sentences, or as many < n as there are
|
||||
end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
|
||||
# get token positions
|
||||
start_token = sentences[start_sentence].start
|
||||
end_token = sentences[end_sentence].end
|
||||
# append that span as a doc to training
|
||||
sent_doc = eg.predicted[start_token:end_token].as_doc()
|
||||
sentence_docs.append(sent_doc)
|
||||
|
||||
set_dropout_rate(self.model, drop)
|
||||
if not sentence_docs:
|
||||
warnings.warn(Warnings.W093.format(name="Entity Linker"))
|
||||
docs = [eg.predicted for eg in examples]
|
||||
# save to restore later
|
||||
old_ents = [doc.ents for doc in docs]
|
||||
|
||||
for doc, ex in zip(docs, examples):
|
||||
if self.use_gold_ents:
|
||||
doc.ents = ex.reference.ents
|
||||
else:
|
||||
# only keep matching ents
|
||||
doc.ents = ex.get_matching_ents()
|
||||
|
||||
# make sure we have something to learn from, if not, short-circuit
|
||||
if not self.batch_has_learnable_example(examples):
|
||||
return losses
|
||||
sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
|
||||
|
||||
sentence_encodings, bp_context = self.model.begin_update(docs)
|
||||
|
||||
# now restore the ents
|
||||
for doc, old in zip(docs, old_ents):
|
||||
doc.ents = old
|
||||
|
||||
loss, d_scores = self.get_loss(
|
||||
sentence_encodings=sentence_encodings, examples=examples
|
||||
)
|
||||
|
@@ -288,24 +339,38 @@
    def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
        validate_examples(examples, "EntityLinker.get_loss")
        entity_encodings = []
        eidx = 0 # indices in gold entities to keep
        keep_ents = [] # indices in sentence_encodings to keep

        for eg in examples:
            kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)

            for ent in eg.reference.ents:
                kb_id = kb_ids[ent.start]
                if kb_id:
                    entity_encoding = self.kb.get_vector(kb_id)
                    entity_encodings.append(entity_encoding)
                    keep_ents.append(eidx)

                eidx += 1
        entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
        if sentence_encodings.shape != entity_encodings.shape:
        selected_encodings = sentence_encodings[keep_ents]

        # If the entity encodings list is empty, then
        if selected_encodings.shape != entity_encodings.shape:
            err = Errors.E147.format(
                method="get_loss", msg="gold entities do not match up"
            )
            raise RuntimeError(err)
        # TODO: fix typing issue here
        gradients = self.distance.get_grad(sentence_encodings, entity_encodings) # type: ignore
        loss = self.distance.get_loss(sentence_encodings, entity_encodings) # type: ignore
        gradients = self.distance.get_grad(selected_encodings, entity_encodings) # type: ignore
        # to match the input size, we need to give a zero gradient for items not in the kb
        out = self.model.ops.alloc2f(*sentence_encodings.shape)
        out[keep_ents] = gradients

        loss = self.distance.get_loss(selected_encodings, entity_encodings) # type: ignore
        loss = loss / len(entity_encodings)
        return float(loss), gradients
        return float(loss), out
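The reworked get_loss above computes the cosine-distance gradient only for entities that have a gold KB ID, then scatters it back into a zero matrix so the backprop input keeps one row per predicted entity. A small NumPy sketch of that padding step; the squared-error gradient stands in for thinc's CosineDistance, and padded_gradients is an illustrative name:

import numpy as np


def padded_gradients(sentence_encodings: np.ndarray,
                     entity_encodings: np.ndarray,
                     keep_ents: list) -> np.ndarray:
    # keep_ents holds the row indices of entities that had a gold KB ID.
    selected = sentence_encodings[keep_ents]
    assert selected.shape == entity_encodings.shape
    # Stand-in gradient: derivative of 0.5 * ||selected - gold||^2.
    grads = selected - entity_encodings
    out = np.zeros_like(sentence_encodings)
    out[keep_ents] = grads  # rows without a gold KB ID get a zero gradient
    return out


sent = np.arange(12, dtype="f").reshape(4, 3)  # 4 predicted entities
gold = np.ones((2, 3), dtype="f")              # only 2 of them have gold KB IDs
print(padded_gradients(sent, gold, keep_ents=[0, 2]))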
def predict(self, docs: Iterable[Doc]) -> List[str]:
|
||||
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||
|
|
spacy/pipeline/legacy/__init__.py (new file, 3 lines)
|
@ -0,0 +1,3 @@
|
|||
from .entity_linker import EntityLinker_v1
|
||||
|
||||
__all__ = ["EntityLinker_v1"]
|
spacy/pipeline/legacy/entity_linker.py (new file, 427 lines)
|
@ -0,0 +1,427 @@
|
|||
# This file is present to provide a prior version of the EntityLinker component
|
||||
# for backwards compatability. For details see #9669.
|
||||
|
||||
from typing import Optional, Iterable, Callable, Dict, Union, List, Any
|
||||
from thinc.types import Floats2d
|
||||
from pathlib import Path
|
||||
from itertools import islice
|
||||
import srsly
|
||||
import random
|
||||
from thinc.api import CosineDistance, Model, Optimizer, Config
|
||||
from thinc.api import set_dropout_rate
|
||||
import warnings
|
||||
|
||||
from ...kb import KnowledgeBase, Candidate
|
||||
from ...ml import empty_kb
|
||||
from ...tokens import Doc, Span
|
||||
from ..pipe import deserialize_config
|
||||
from ..trainable_pipe import TrainablePipe
|
||||
from ...language import Language
|
||||
from ...vocab import Vocab
|
||||
from ...training import Example, validate_examples, validate_get_examples
|
||||
from ...errors import Errors, Warnings
|
||||
from ...util import SimpleFrozenList, registry
|
||||
from ... import util
|
||||
from ...scorer import Scorer
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = True
|
||||
|
||||
|
||||
def entity_linker_score(examples, **kwargs):
|
||||
return Scorer.score_links(examples, negative_labels=[EntityLinker_v1.NIL], **kwargs)
|
||||
|
||||
|
||||
class EntityLinker_v1(TrainablePipe):
|
||||
"""Pipeline component for named entity linking.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker
|
||||
"""
|
||||
|
||||
NIL = "NIL" # string used to refer to a non-existing link
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab: Vocab,
|
||||
model: Model,
|
||||
name: str = "entity_linker",
|
||||
*,
|
||||
labels_discard: Iterable[str],
|
||||
n_sents: int,
|
||||
incl_prior: bool,
|
||||
incl_context: bool,
|
||||
entity_vector_length: int,
|
||||
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
||||
overwrite: bool = BACKWARD_OVERWRITE,
|
||||
scorer: Optional[Callable] = entity_linker_score,
|
||||
) -> None:
|
||||
"""Initialize an entity linker.
|
||||
|
||||
vocab (Vocab): The shared vocabulary.
|
||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
|
||||
n_sents (int): The number of neighbouring sentences to take into account.
|
||||
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
|
||||
incl_context (bool): Whether or not to include the local context in the model.
|
||||
entity_vector_length (int): Size of encoding vectors in the KB.
|
||||
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
|
||||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_links.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#init
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
self.name = name
|
||||
self.labels_discard = list(labels_discard)
|
||||
self.n_sents = n_sents
|
||||
self.incl_prior = incl_prior
|
||||
self.incl_context = incl_context
|
||||
self.get_candidates = get_candidates
|
||||
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
|
||||
self.distance = CosineDistance(normalize=False)
|
||||
# how many neighbour sentences to take into account
|
||||
# create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
|
||||
self.kb = empty_kb(entity_vector_length)(self.vocab)
|
||||
self.scorer = scorer
|
||||
|
||||
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
|
||||
"""Define the KB of this pipe by providing a function that will
|
||||
create it using this object's vocab."""
|
||||
if not callable(kb_loader):
|
||||
raise ValueError(Errors.E885.format(arg_type=type(kb_loader)))
|
||||
|
||||
self.kb = kb_loader(self.vocab)
|
||||
|
||||
def validate_kb(self) -> None:
|
||||
# Raise an error if the knowledge base is not initialized.
|
||||
if self.kb is None:
|
||||
raise ValueError(Errors.E1018.format(name=self.name))
|
||||
if len(self.kb) == 0:
|
||||
raise ValueError(Errors.E139.format(name=self.name))
|
||||
|
||||
def initialize(
|
||||
self,
|
||||
get_examples: Callable[[], Iterable[Example]],
|
||||
*,
|
||||
nlp: Optional[Language] = None,
|
||||
kb_loader: Optional[Callable[[Vocab], KnowledgeBase]] = None,
|
||||
):
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance.
|
||||
Note that providing this argument, will overwrite all data accumulated in the current KB.
|
||||
Use this only when loading a KB as-such from file.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#initialize
|
||||
"""
|
||||
validate_get_examples(get_examples, "EntityLinker_v1.initialize")
|
||||
if kb_loader is not None:
|
||||
self.set_kb(kb_loader)
|
||||
self.validate_kb()
|
||||
nO = self.kb.entity_vector_length
|
||||
doc_sample = []
|
||||
vector_sample = []
|
||||
for example in islice(get_examples(), 10):
|
||||
doc_sample.append(example.x)
|
||||
vector_sample.append(self.model.ops.alloc1f(nO))
|
||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||
assert len(vector_sample) > 0, Errors.E923.format(name=self.name)
|
||||
self.model.initialize(
|
||||
X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
|
||||
)
|
||||
|
||||
def update(
|
||||
self,
|
||||
examples: Iterable[Example],
|
||||
*,
|
||||
drop: float = 0.0,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
losses: Optional[Dict[str, float]] = None,
|
||||
) -> Dict[str, float]:
|
||||
"""Learn from a batch of documents and gold-standard information,
|
||||
updating the pipe's model. Delegates to predict and get_loss.
|
||||
|
||||
examples (Iterable[Example]): A batch of Example objects.
|
||||
drop (float): The dropout rate.
|
||||
sgd (thinc.api.Optimizer): The optimizer.
|
||||
losses (Dict[str, float]): Optional record of the loss during training.
|
||||
Updated using the component name as the key.
|
||||
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#update
|
||||
"""
|
||||
self.validate_kb()
|
||||
if losses is None:
|
||||
losses = {}
|
||||
losses.setdefault(self.name, 0.0)
|
||||
if not examples:
|
||||
return losses
|
||||
validate_examples(examples, "EntityLinker_v1.update")
|
||||
sentence_docs = []
|
||||
for eg in examples:
|
||||
sentences = [s for s in eg.reference.sents]
|
||||
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
|
||||
for ent in eg.reference.ents:
|
||||
# KB ID of the first token is the same as the whole span
|
||||
kb_id = kb_ids[ent.start]
|
||||
if kb_id:
|
||||
try:
|
||||
# find the sentence in the list of sentences.
|
||||
sent_index = sentences.index(ent.sent)
|
||||
except AttributeError:
|
||||
# Catch the exception when ent.sent is None and provide a user-friendly warning
|
||||
raise RuntimeError(Errors.E030) from None
|
||||
# get n previous sentences, if there are any
|
||||
start_sentence = max(0, sent_index - self.n_sents)
|
||||
# get n posterior sentences, or as many < n as there are
|
||||
end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
|
||||
# get token positions
|
||||
start_token = sentences[start_sentence].start
|
||||
end_token = sentences[end_sentence].end
|
||||
# append that span as a doc to training
|
||||
sent_doc = eg.predicted[start_token:end_token].as_doc()
|
||||
sentence_docs.append(sent_doc)
|
||||
set_dropout_rate(self.model, drop)
|
||||
if not sentence_docs:
|
||||
warnings.warn(Warnings.W093.format(name="Entity Linker"))
|
||||
return losses
|
||||
sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
|
||||
loss, d_scores = self.get_loss(
|
||||
sentence_encodings=sentence_encodings, examples=examples
|
||||
)
|
||||
bp_context(d_scores)
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
losses[self.name] += loss
|
||||
return losses
|
||||
|
||||
def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
|
||||
validate_examples(examples, "EntityLinker_v1.get_loss")
|
||||
entity_encodings = []
|
||||
for eg in examples:
|
||||
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
|
||||
for ent in eg.reference.ents:
|
||||
kb_id = kb_ids[ent.start]
|
||||
if kb_id:
|
||||
entity_encoding = self.kb.get_vector(kb_id)
|
||||
entity_encodings.append(entity_encoding)
|
||||
entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
|
||||
if sentence_encodings.shape != entity_encodings.shape:
|
||||
err = Errors.E147.format(
|
||||
method="get_loss", msg="gold entities do not match up"
|
||||
)
|
||||
raise RuntimeError(err)
|
||||
# TODO: fix typing issue here
|
||||
gradients = self.distance.get_grad(sentence_encodings, entity_encodings) # type: ignore
|
||||
loss = self.distance.get_loss(sentence_encodings, entity_encodings) # type: ignore
|
||||
loss = loss / len(entity_encodings)
|
||||
return float(loss), gradients
|
||||
|
||||
def predict(self, docs: Iterable[Doc]) -> List[str]:
|
||||
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||
Returns the KB IDs for each entity in each doc, including NIL if there is
|
||||
no prediction.
|
||||
|
||||
docs (Iterable[Doc]): The documents to predict.
|
||||
RETURNS (List[str]): The models prediction for each document.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#predict
|
||||
"""
|
||||
self.validate_kb()
|
||||
entity_count = 0
|
||||
final_kb_ids: List[str] = []
|
||||
if not docs:
|
||||
return final_kb_ids
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
for i, doc in enumerate(docs):
|
||||
sentences = [s for s in doc.sents]
|
||||
if len(doc) > 0:
|
||||
# Looping through each entity (TODO: rewrite)
|
||||
for ent in doc.ents:
|
||||
sent = ent.sent
|
||||
sent_index = sentences.index(sent)
|
||||
assert sent_index >= 0
|
||||
# get n_neighbour sentences, clipped to the length of the document
|
||||
start_sentence = max(0, sent_index - self.n_sents)
|
||||
end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
|
||||
start_token = sentences[start_sentence].start
|
||||
end_token = sentences[end_sentence].end
|
||||
sent_doc = doc[start_token:end_token].as_doc()
|
||||
# currently, the context is the same for each entity in a sentence (should be refined)
|
||||
xp = self.model.ops.xp
|
||||
if self.incl_context:
|
||||
sentence_encoding = self.model.predict([sent_doc])[0]
|
||||
sentence_encoding_t = sentence_encoding.T
|
||||
sentence_norm = xp.linalg.norm(sentence_encoding_t)
|
||||
entity_count += 1
|
||||
if ent.label_ in self.labels_discard:
|
||||
# ignoring this entity - setting to NIL
|
||||
final_kb_ids.append(self.NIL)
|
||||
else:
|
||||
candidates = list(self.get_candidates(self.kb, ent))
|
||||
if not candidates:
|
||||
# no prediction possible for this entity - setting to NIL
|
||||
final_kb_ids.append(self.NIL)
|
||||
elif len(candidates) == 1:
|
||||
# shortcut for efficiency reasons: take the 1 candidate
|
||||
# TODO: thresholding
|
||||
final_kb_ids.append(candidates[0].entity_)
|
||||
else:
|
||||
random.shuffle(candidates)
|
||||
# set all prior probabilities to 0 if incl_prior=False
|
||||
prior_probs = xp.asarray([c.prior_prob for c in candidates])
|
||||
if not self.incl_prior:
|
||||
prior_probs = xp.asarray([0.0 for _ in candidates])
|
||||
scores = prior_probs
|
||||
# add in similarity from the context
|
||||
if self.incl_context:
|
||||
entity_encodings = xp.asarray(
|
||||
[c.entity_vector for c in candidates]
|
||||
)
|
||||
entity_norm = xp.linalg.norm(entity_encodings, axis=1)
|
||||
if len(entity_encodings) != len(prior_probs):
|
||||
raise RuntimeError(
|
||||
Errors.E147.format(
|
||||
method="predict",
|
||||
msg="vectors not of equal length",
|
||||
)
|
||||
)
|
||||
# cosine similarity
|
||||
sims = xp.dot(entity_encodings, sentence_encoding_t) / (
|
||||
sentence_norm * entity_norm
|
||||
)
|
||||
if sims.shape != prior_probs.shape:
|
||||
raise ValueError(Errors.E161)
|
||||
scores = prior_probs + sims - (prior_probs * sims)
|
||||
# TODO: thresholding
|
||||
best_index = scores.argmax().item()
|
||||
best_candidate = candidates[best_index]
|
||||
final_kb_ids.append(best_candidate.entity_)
|
||||
if not (len(final_kb_ids) == entity_count):
|
||||
err = Errors.E147.format(
|
||||
method="predict", msg="result variables not of equal length"
|
||||
)
|
||||
raise RuntimeError(err)
|
||||
return final_kb_ids
|
||||
|
||||
def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
|
||||
"""Modify a batch of documents, using pre-computed scores.
|
||||
|
||||
docs (Iterable[Doc]): The documents to modify.
|
||||
kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#set_annotations
|
||||
"""
|
||||
count_ents = len([ent for doc in docs for ent in doc.ents])
|
||||
if count_ents != len(kb_ids):
|
||||
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
|
||||
i = 0
|
||||
overwrite = self.cfg["overwrite"]
|
||||
for doc in docs:
|
||||
for ent in doc.ents:
|
||||
kb_id = kb_ids[i]
|
||||
i += 1
|
||||
for token in ent:
|
||||
if token.ent_kb_id == 0 or overwrite:
|
||||
token.ent_kb_id_ = kb_id
|
||||
|
||||
def to_bytes(self, *, exclude=tuple()):
|
||||
"""Serialize the pipe to a bytestring.
|
||||
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized object.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#to_bytes
|
||||
"""
|
||||
self._validate_serialization_attrs()
|
||||
serialize = {}
|
||||
if hasattr(self, "cfg") and self.cfg is not None:
|
||||
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
||||
serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
|
||||
serialize["kb"] = self.kb.to_bytes
|
||||
serialize["model"] = self.model.to_bytes
|
||||
return util.to_bytes(serialize, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, *, exclude=tuple()):
|
||||
"""Load the pipe from a bytestring.
|
||||
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (TrainablePipe): The loaded object.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#from_bytes
|
||||
"""
|
||||
self._validate_serialization_attrs()
|
||||
|
||||
def load_model(b):
|
||||
try:
|
||||
self.model.from_bytes(b)
|
||||
except AttributeError:
|
||||
raise ValueError(Errors.E149) from None
|
||||
|
||||
deserialize = {}
|
||||
if hasattr(self, "cfg") and self.cfg is not None:
|
||||
deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
|
||||
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
|
||||
deserialize["kb"] = lambda b: self.kb.from_bytes(b)
|
||||
deserialize["model"] = load_model
|
||||
util.from_bytes(bytes_data, deserialize, exclude)
|
||||
return self
|
||||
|
||||
def to_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> None:
|
||||
"""Serialize the pipe to disk.
|
||||
|
||||
path (str / Path): Path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#to_disk
|
||||
"""
|
||||
serialize = {}
|
||||
serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
|
||||
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
||||
serialize["kb"] = lambda p: self.kb.to_disk(p)
|
||||
serialize["model"] = lambda p: self.model.to_disk(p)
|
||||
util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> "EntityLinker_v1":
|
||||
"""Load the pipe from disk. Modifies the object in place and returns it.
|
||||
|
||||
path (str / Path): Path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (EntityLinker): The modified EntityLinker object.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#from_disk
|
||||
"""
|
||||
|
||||
def load_model(p):
|
||||
try:
|
||||
with p.open("rb") as infile:
|
||||
self.model.from_bytes(infile.read())
|
||||
except AttributeError:
|
||||
raise ValueError(Errors.E149) from None
|
||||
|
||||
deserialize: Dict[str, Callable[[Any], Any]] = {}
|
||||
deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
|
||||
deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude)
|
||||
deserialize["kb"] = lambda p: self.kb.from_disk(p)
|
||||
deserialize["model"] = load_model
|
||||
util.from_disk(path, deserialize, exclude)
|
||||
return self
|
||||
|
||||
def rehearse(self, examples, *, sgd=None, losses=None, **config):
|
||||
raise NotImplementedError
|
||||
|
||||
def add_label(self, label):
|
||||
raise NotImplementedError
|
|
@ -25,7 +25,7 @@ BACKWARD_EXTEND = False
|
|||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.Tok2Vec.v2"
|
||||
|
|
|
@ -26,6 +26,8 @@ class Pipe:
|
|||
@property
|
||||
def labels(self) -> Tuple[str, ...]: ...
|
||||
@property
|
||||
def hide_labels(self) -> bool: ...
|
||||
@property
|
||||
def label_data(self) -> Any: ...
|
||||
def _require_labels(self) -> None: ...
|
||||
def set_error_handler(
|
||||
|
|
|
@ -102,6 +102,10 @@ cdef class Pipe:
|
|||
def labels(self) -> Tuple[str, ...]:
|
||||
return tuple()
|
||||
|
||||
@property
|
||||
def hide_labels(self) -> bool:
|
||||
return False
|
||||
|
||||
@property
|
||||
def label_data(self):
|
||||
"""Optional JSON-serializable data that would be sufficient to recreate
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# cython: infer_types=True, profile=True, binding=True
|
||||
from itertools import islice
|
||||
from typing import Optional, Callable
|
||||
from itertools import islice
|
||||
|
||||
import srsly
|
||||
from thinc.api import Model, SequenceCategoricalCrossentropy, Config
|
||||
|
@ -20,7 +20,7 @@ BACKWARD_OVERWRITE = False
|
|||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v2"
|
||||
|
@ -99,6 +99,10 @@ class SentenceRecognizer(Tagger):
|
|||
# are 0
|
||||
return tuple(["I", "S"])
|
||||
|
||||
@property
|
||||
def hide_labels(self):
|
||||
return True
|
||||
|
||||
@property
|
||||
def label_data(self):
|
||||
return None
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
import numpy
|
||||
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
|
||||
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
|
||||
from thinc.api import Optimizer
|
||||
from thinc.types import Ragged, Ints2d, Floats2d, Ints1d
|
||||
|
||||
import numpy
|
||||
|
||||
from ..compat import Protocol, runtime_checkable
|
||||
from ..scorer import Scorer
|
||||
from ..language import Language
|
||||
|
@ -271,6 +272,24 @@ class SpanCategorizer(TrainablePipe):
|
|||
scores = self.model.predict((docs, indices)) # type: ignore
|
||||
return indices, scores
|
||||
|
||||
def set_candidates(
|
||||
self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
|
||||
) -> None:
|
||||
"""Use the spancat suggester to add a list of span candidates to a list of docs.
|
||||
This method is intended to be used for debugging purposes.
|
||||
|
||||
docs (Iterable[Doc]): The documents to modify.
|
||||
candidates_key (str): Key of the Doc.spans dict to save the candidate spans under.
|
||||
|
||||
DOCS: https://spacy.io/api/spancategorizer#set_candidates
|
||||
"""
|
||||
suggester_output = self.suggester(docs, ops=self.model.ops)
|
||||
|
||||
for candidates, doc in zip(suggester_output, docs): # type: ignore
|
||||
doc.spans[candidates_key] = []
|
||||
for index in candidates.dataXd:
|
||||
doc.spans[candidates_key].append(doc[index[0] : index[1]])
|
||||
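set_candidates above is a debugging helper: it runs only the suggester and writes the raw candidate spans into doc.spans, so the spans the classifier would score can be inspected directly. A hedged usage sketch, assuming a pipeline that already contains a configured spancat component ("my_spancat_model" is a hypothetical package name):

import spacy

nlp = spacy.load("my_spancat_model")  # hypothetical pipeline with a spancat
spancat = nlp.get_pipe("spancat")

doc = nlp.make_doc("The quick brown fox jumps over the lazy dog")
spancat.set_candidates([doc], candidates_key="candidates")

# Inspect what the suggester proposed before any scoring happens.
for span in doc.spans["candidates"]:
    print(span.start, span.end, span.text)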
|
||||
def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
|
||||
"""Modify a batch of Doc objects, using pre-computed scores.
|
||||
|
||||
|
@ -377,7 +396,7 @@ class SpanCategorizer(TrainablePipe):
|
|||
# If the prediction is 0.9 and it's false, the gradient will be
|
||||
# 0.9 (0.9 - 0.0)
|
||||
d_scores = scores - target
|
||||
loss = float((d_scores ** 2).sum())
|
||||
loss = float((d_scores**2).sum())
|
||||
return loss, d_scores
|
||||
|
||||
def initialize(
|
||||
|
|
|
@ -27,7 +27,7 @@ BACKWARD_OVERWRITE = False
|
|||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v2"
|
||||
|
@@ -225,6 +225,7 @@ class Tagger(TrainablePipe):

        DOCS: https://spacy.io/api/tagger#rehearse
        """
        loss_func = SequenceCategoricalCrossentropy()
        if losses is None:
            losses = {}
        losses.setdefault(self.name, 0.0)

@@ -236,12 +237,12 @@ class Tagger(TrainablePipe):
            # Handle cases where there are no tokens in any docs.
            return losses
        set_dropout_rate(self.model, drop)
        guesses, backprop = self.model.begin_update(docs)
        target = self._rehearsal_model(examples)
        gradient = guesses - target
        backprop(gradient)
        tag_scores, bp_tag_scores = self.model.begin_update(docs)
        tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs)
        grads, loss = loss_func(tag_scores, tutor_tag_scores)
        bp_tag_scores(grads)
        self.finish_update(sgd)
        losses[self.name] += (gradient**2).sum()
        losses[self.name] += loss
        return losses
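The rehearse() change above switches from a raw squared-error gradient against the frozen "tutor" model to a categorical cross-entropy between the student's and the tutor's tag distributions. A toy NumPy sketch of that teacher-student objective; the real code uses thinc's SequenceCategoricalCrossentropy over per-doc score arrays, so this is only a shape-level illustration:

import numpy as np


def softmax(x: np.ndarray) -> np.ndarray:
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)


def rehearsal_loss(student_logits: np.ndarray, tutor_logits: np.ndarray):
    # Cross-entropy of the student's distribution against the tutor's soft
    # targets, averaged over tokens; grads is the gradient w.r.t. the
    # student's logits.
    student = softmax(student_logits)
    tutor = softmax(tutor_logits)
    loss = float(-(tutor * np.log(student + 1e-12)).sum(axis=-1).mean())
    grads = student - tutor
    return loss, grads


student = np.array([[2.0, 0.5, 0.1], [0.2, 1.5, 0.3]])
tutor = np.array([[1.8, 0.6, 0.2], [0.1, 1.4, 0.5]])
loss, grads = rehearsal_loss(student, tutor)
print(loss, grads.shape)  # scalar loss and a (2, 3) gradient array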
|
||||
def get_loss(self, examples, scores):
|
||||
|
|
|
@ -158,6 +158,13 @@ class TextCategorizer(TrainablePipe):
|
|||
self.cfg = dict(cfg)
|
||||
self.scorer = scorer
|
||||
|
||||
@property
|
||||
def support_missing_values(self):
|
||||
# There are no missing values as the textcat should always
|
||||
# predict exactly one label. All other labels are 0.0
|
||||
# Subclasses may override this property to change internal behaviour.
|
||||
return False
|
||||
|
||||
@property
|
||||
def labels(self) -> Tuple[str]:
|
||||
"""RETURNS (Tuple[str]): The labels currently added to the component.
|
||||
|
@ -276,12 +283,12 @@ class TextCategorizer(TrainablePipe):
|
|||
return losses
|
||||
set_dropout_rate(self.model, drop)
|
||||
scores, bp_scores = self.model.begin_update(docs)
|
||||
target = self._rehearsal_model(examples)
|
||||
target, _ = self._rehearsal_model.begin_update(docs)
|
||||
gradient = scores - target
|
||||
bp_scores(gradient)
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
losses[self.name] += (gradient ** 2).sum()
|
||||
losses[self.name] += (gradient**2).sum()
|
||||
return losses
|
||||
|
||||
def _examples_to_truth(
|
||||
|
@ -294,7 +301,7 @@ class TextCategorizer(TrainablePipe):
|
|||
for j, label in enumerate(self.labels):
|
||||
if label in eg.reference.cats:
|
||||
truths[i, j] = eg.reference.cats[label]
|
||||
else:
|
||||
elif self.support_missing_values:
|
||||
not_missing[i, j] = 0.0
|
||||
truths = self.model.ops.asarray(truths) # type: ignore
|
||||
return truths, not_missing # type: ignore
|
||||
|
@@ -313,9 +320,9 @@ class TextCategorizer(TrainablePipe):
        self._validate_categories(examples)
        truths, not_missing = self._examples_to_truth(examples)
        not_missing = self.model.ops.asarray(not_missing) # type: ignore
        d_scores = (scores - truths) / scores.shape[0]
        d_scores = scores - truths
        d_scores *= not_missing
        mean_square_error = (d_scores ** 2).sum(axis=1).mean()
        mean_square_error = (d_scores**2).mean()
        return float(mean_square_error), d_scores
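The get_loss change above drops the division by the batch size and averages the squared error over all cells instead of summing per row, while still zeroing out labels that are missing from the gold annotation. A compact NumPy sketch of that masked objective (illustrative; in spaCy the not_missing mask comes from _examples_to_truth):

import numpy as np


def masked_mse(scores: np.ndarray, truths: np.ndarray, not_missing: np.ndarray):
    # d_scores doubles as the gradient; masked-out cells contribute nothing.
    d_scores = (scores - truths) * not_missing
    loss = float((d_scores ** 2).mean())
    return loss, d_scores


scores = np.array([[0.9, 0.1], [0.4, 0.6]])
truths = np.array([[1.0, 0.0], [0.0, 0.0]])
not_missing = np.array([[1.0, 1.0], [1.0, 0.0]])  # second label of doc 2 is missing
print(masked_mse(scores, truths, not_missing))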
|
||||
def add_label(self, label: str) -> int:
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
from itertools import islice
|
||||
from typing import Iterable, Optional, Dict, List, Callable, Any
|
||||
|
||||
from thinc.api import Model, Config
|
||||
from thinc.types import Floats2d
|
||||
from thinc.api import Model, Config
|
||||
|
||||
from itertools import islice
|
||||
|
||||
from ..language import Language
|
||||
from ..training import Example, validate_get_examples
|
||||
|
@ -158,6 +158,10 @@ class MultiLabel_TextCategorizer(TextCategorizer):
|
|||
self.cfg = dict(cfg)
|
||||
self.scorer = scorer
|
||||
|
||||
@property
|
||||
def support_missing_values(self):
|
||||
return True
|
||||
|
||||
def initialize( # type: ignore[override]
|
||||
self,
|
||||
get_examples: Callable[[], Iterable[Example]],
|
||||
|
|
|
@ -118,6 +118,10 @@ class Tok2Vec(TrainablePipe):
|
|||
|
||||
DOCS: https://spacy.io/api/tok2vec#predict
|
||||
"""
|
||||
if not any(len(doc) for doc in docs):
|
||||
# Handle cases where there are no tokens in any docs.
|
||||
width = self.model.get_dim("nO")
|
||||
return [self.model.ops.alloc((0, width)) for doc in docs]
|
||||
tokvecs = self.model.predict(docs)
|
||||
batch_id = Tok2VecListener.get_batch_id(docs)
|
||||
for listener in self.listeners:
|
||||
|
|
|
@ -228,7 +228,7 @@ class Scorer:
|
|||
if token.orth_.isspace():
|
||||
continue
|
||||
if align.x2y.lengths[token.i] == 1:
|
||||
gold_i = align.x2y[token.i].dataXd[0, 0]
|
||||
gold_i = align.x2y[token.i][0]
|
||||
if gold_i not in missing_indices:
|
||||
pred_tags.add((gold_i, getter(token, attr)))
|
||||
tag_score.score_set(pred_tags, gold_tags)
|
||||
|
@ -287,7 +287,7 @@ class Scorer:
|
|||
if token.orth_.isspace():
|
||||
continue
|
||||
if align.x2y.lengths[token.i] == 1:
|
||||
gold_i = align.x2y[token.i].dataXd[0, 0]
|
||||
gold_i = align.x2y[token.i][0]
|
||||
if gold_i not in missing_indices:
|
||||
value = getter(token, attr)
|
||||
morph = gold_doc.vocab.strings[value]
|
||||
|
@ -553,7 +553,8 @@ class Scorer:
|
|||
getter(doc, attr) should return the values for the individual doc.
|
||||
labels (Iterable[str]): The set of possible labels. Defaults to [].
|
||||
multi_label (bool): Whether the attribute allows multiple labels.
|
||||
Defaults to True.
|
||||
Defaults to True. When set to False (exclusive labels), missing
|
||||
gold labels are interpreted as 0.0.
|
||||
positive_label (str): The positive label for a binary task with
|
||||
exclusive classes. Defaults to None.
|
||||
threshold (float): Cutoff to consider a prediction "positive". Defaults
|
||||
|
@ -592,13 +593,15 @@ class Scorer:
|
|||
|
||||
for label in labels:
|
||||
pred_score = pred_cats.get(label, 0.0)
|
||||
gold_score = gold_cats.get(label, 0.0)
|
||||
gold_score = gold_cats.get(label)
|
||||
if not gold_score and not multi_label:
|
||||
gold_score = 0.0
|
||||
if gold_score is not None:
|
||||
auc_per_type[label].score_set(pred_score, gold_score)
|
||||
if multi_label:
|
||||
for label in labels:
|
||||
pred_score = pred_cats.get(label, 0.0)
|
||||
gold_score = gold_cats.get(label, 0.0)
|
||||
gold_score = gold_cats.get(label)
|
||||
if gold_score is not None:
|
||||
if pred_score >= threshold and gold_score > 0:
|
||||
f_per_type[label].tp += 1
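Illustrative only (plain dicts, not the Scorer API): the gold-score handling introduced above. With exclusive labels (multi_label=False) a missing gold category is scored as 0.0, while in the multilabel branch it stays None and the label is skipped.

gold_cats = {"POSITIVE": 1.0}  # "NEGATIVE" left unannotated
multi_label = False
for label in ("POSITIVE", "NEGATIVE"):
    gold_score = gold_cats.get(label)
    if not gold_score and not multi_label:
        gold_score = 0.0  # exclusive labels: a missing gold label counts as a negative
    if gold_score is not None:
        print(label, gold_score)  # with multi_label=True, NEGATIVE would be skipped here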
|
||||
|
@ -610,7 +613,6 @@ class Scorer:
|
|||
# Get the highest-scoring for each.
|
||||
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
|
||||
gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
|
||||
if gold_score is not None:
|
||||
if pred_label == gold_label and pred_score >= threshold:
|
||||
f_per_type[pred_label].tp += 1
|
||||
else:
|
||||
|
@ -619,7 +621,7 @@ class Scorer:
|
|||
f_per_type[pred_label].fp += 1
|
||||
elif gold_cats:
|
||||
gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
|
||||
if gold_score is not None and gold_score > 0:
|
||||
if gold_score > 0:
|
||||
f_per_type[gold_label].fn += 1
|
||||
elif pred_cats:
|
||||
pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
|
||||
|
@ -800,13 +802,13 @@ class Scorer:
|
|||
if align.x2y.lengths[token.i] != 1:
|
||||
gold_i = None # type: ignore
|
||||
else:
|
||||
gold_i = align.x2y[token.i].dataXd[0, 0]
|
||||
gold_i = align.x2y[token.i][0]
|
||||
if gold_i not in missing_indices:
|
||||
dep = getter(token, attr)
|
||||
head = head_getter(token, head_attr)
|
||||
if dep not in ignore_labels and token.orth_.strip():
|
||||
if align.x2y.lengths[head.i] == 1:
|
||||
gold_head = align.x2y[head.i].dataXd[0, 0]
|
||||
gold_head = align.x2y[head.i][0]
|
||||
else:
|
||||
gold_head = None
|
||||
# None is indistinct, so we can't just add it to the set
|
||||
|
@ -856,7 +858,7 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
|||
for pred_ent in eg.x.ents:
|
||||
if pred_ent.label_ not in score_per_type:
|
||||
score_per_type[pred_ent.label_] = PRFScore()
|
||||
indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel()
|
||||
indices = align_x2y[pred_ent.start : pred_ent.end]
|
||||
if len(indices):
|
||||
g_span = eg.y[indices[0] : indices[-1] + 1]
|
||||
# Check we aren't missing annotation on this span. If so,
|
||||
|
|
|
@ -99,6 +99,11 @@ def de_vocab():
|
|||
return get_lang_class("de")().vocab
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def dsb_tokenizer():
|
||||
return get_lang_class("dsb")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def el_tokenizer():
|
||||
return get_lang_class("el")().tokenizer
|
||||
|
@ -155,6 +160,11 @@ def fr_tokenizer():
|
|||
return get_lang_class("fr")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def fr_vocab():
|
||||
return get_lang_class("fr")().vocab
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def ga_tokenizer():
|
||||
return get_lang_class("ga")().tokenizer
|
||||
|
@ -205,18 +215,41 @@ def it_tokenizer():
|
|||
return get_lang_class("it")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def it_vocab():
|
||||
return get_lang_class("it")().vocab
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def ja_tokenizer():
|
||||
pytest.importorskip("sudachipy")
|
||||
return get_lang_class("ja")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def hsb_tokenizer():
|
||||
return get_lang_class("hsb")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def ko_tokenizer():
|
||||
pytest.importorskip("natto")
|
||||
return get_lang_class("ko")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def ko_tokenizer_tokenizer():
|
||||
config = {
|
||||
"nlp": {
|
||||
"tokenizer": {
|
||||
"@tokenizers": "spacy.Tokenizer.v1",
|
||||
}
|
||||
}
|
||||
}
|
||||
nlp = get_lang_class("ko").from_config(config)
|
||||
return nlp.tokenizer
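For reference, the same tokenizer override used by the `ko_tokenizer_tokenizer` fixture can be applied outside the test suite. A hedged sketch (assumes spaCy v3; no Korean-specific dependencies such as mecab-ko are needed with this config):

import spacy

config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.Tokenizer.v1"}}}
nlp = spacy.blank("ko", config=config)   # rule-based tokenizer instead of the default mecab-based one
doc = nlp.make_doc("안녕하세요 세계")
print([token.text for token in doc])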
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def lb_tokenizer():
|
||||
return get_lang_class("lb")().tokenizer
|
||||
|
@ -324,6 +357,11 @@ def sv_tokenizer():
|
|||
return get_lang_class("sv")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def ta_tokenizer():
|
||||
return get_lang_class("ta")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def th_tokenizer():
|
||||
pytest.importorskip("pythainlp")
|
||||
|
|
|
@ -684,6 +684,7 @@ def test_has_annotation(en_vocab):
|
|||
attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
|
||||
for attr in attrs:
|
||||
assert not doc.has_annotation(attr)
|
||||
assert not doc.has_annotation(attr, require_complete=True)
|
||||
|
||||
doc[0].tag_ = "A"
|
||||
doc[0].pos_ = "X"
|
||||
|
@ -709,6 +710,27 @@ def test_has_annotation(en_vocab):
|
|||
assert doc.has_annotation(attr, require_complete=True)
|
||||
|
||||
|
||||
def test_has_annotation_sents(en_vocab):
|
||||
doc = Doc(en_vocab, words=["Hello", "beautiful", "world"])
|
||||
attrs = ("SENT_START", "IS_SENT_START", "IS_SENT_END")
|
||||
for attr in attrs:
|
||||
assert not doc.has_annotation(attr)
|
||||
assert not doc.has_annotation(attr, require_complete=True)
|
||||
|
||||
# The first token (index 0) is always assumed to be a sentence start,
|
||||
# and ignored by the check in doc.has_annotation
|
||||
|
||||
doc[1].is_sent_start = False
|
||||
for attr in attrs:
|
||||
assert doc.has_annotation(attr)
|
||||
assert not doc.has_annotation(attr, require_complete=True)
|
||||
|
||||
doc[2].is_sent_start = False
|
||||
for attr in attrs:
|
||||
assert doc.has_annotation(attr)
|
||||
assert doc.has_annotation(attr, require_complete=True)
|
||||
|
||||
|
||||
def test_is_flags_deprecated(en_tokenizer):
|
||||
doc = en_tokenizer("test")
|
||||
with pytest.deprecated_call():
|
||||
|
|
|
@ -573,6 +573,55 @@ def test_span_with_vectors(doc):
|
|||
doc.vocab.vectors = prev_vectors
|
||||
|
||||
|
||||
# fmt: off
|
||||
def test_span_comparison(doc):
|
||||
|
||||
# Identical start, end, only differ in label and kb_id
|
||||
assert Span(doc, 0, 3) == Span(doc, 0, 3)
|
||||
assert Span(doc, 0, 3, "LABEL") == Span(doc, 0, 3, "LABEL")
|
||||
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") == Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
|
||||
|
||||
assert Span(doc, 0, 3) != Span(doc, 0, 3, "LABEL")
|
||||
assert Span(doc, 0, 3) != Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
|
||||
assert Span(doc, 0, 3, "LABEL") != Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
|
||||
|
||||
assert Span(doc, 0, 3) <= Span(doc, 0, 3) and Span(doc, 0, 3) >= Span(doc, 0, 3)
|
||||
assert Span(doc, 0, 3, "LABEL") <= Span(doc, 0, 3, "LABEL") and Span(doc, 0, 3, "LABEL") >= Span(doc, 0, 3, "LABEL")
|
||||
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
|
||||
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
|
||||
|
||||
assert (Span(doc, 0, 3) < Span(doc, 0, 3, "", kb_id="KB_ID") < Span(doc, 0, 3, "LABEL") < Span(doc, 0, 3, "LABEL", kb_id="KB_ID"))
|
||||
assert (Span(doc, 0, 3) <= Span(doc, 0, 3, "", kb_id="KB_ID") <= Span(doc, 0, 3, "LABEL") <= Span(doc, 0, 3, "LABEL", kb_id="KB_ID"))
|
||||
|
||||
assert (Span(doc, 0, 3, "LABEL", kb_id="KB_ID") > Span(doc, 0, 3, "LABEL") > Span(doc, 0, 3, "", kb_id="KB_ID") > Span(doc, 0, 3))
|
||||
assert (Span(doc, 0, 3, "LABEL", kb_id="KB_ID") >= Span(doc, 0, 3, "LABEL") >= Span(doc, 0, 3, "", kb_id="KB_ID") >= Span(doc, 0, 3))
|
||||
|
||||
# Different end
|
||||
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 0, 4, "LABEL", kb_id="KB_ID")
|
||||
|
||||
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 0, 4)
|
||||
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 0, 4)
|
||||
assert Span(doc, 0, 4) > Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
|
||||
assert Span(doc, 0, 4) >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
|
||||
|
||||
# Different start
|
||||
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") != Span(doc, 1, 3, "LABEL", kb_id="KB_ID")
|
||||
|
||||
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 1, 3)
|
||||
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 1, 3)
|
||||
assert Span(doc, 1, 3) > Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
|
||||
assert Span(doc, 1, 3) >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
|
||||
|
||||
# Different start & different end
|
||||
assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") != Span(doc, 1, 3, "LABEL", kb_id="KB_ID")
|
||||
|
||||
assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") < Span(doc, 1, 3)
|
||||
assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") <= Span(doc, 1, 3)
|
||||
assert Span(doc, 1, 3) > Span(doc, 0, 4, "LABEL", kb_id="KB_ID")
|
||||
assert Span(doc, 1, 3) >= Span(doc, 0, 4, "LABEL", kb_id="KB_ID")
|
||||
# fmt: on
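The assertions above are consistent with spans being ordered positionally first and then by their label and kb_id hashes. A hypothetical key function (not the actual implementation) that reproduces this ordering:

def span_sort_key(span):
    # position first, then label and kb_id (both stored as hash values)
    return (span.start_char, span.end_char, span.label, span.kb_id)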
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"start,end,expected_sentences,expected_sentences_with_hook",
|
||||
[
|
||||
|
@ -606,3 +655,16 @@ def test_span_sents(doc, start, end, expected_sentences, expected_sentences_with
|
|||
def test_span_sents_not_parsed(doc_not_parsed):
|
||||
with pytest.raises(ValueError):
|
||||
list(Span(doc_not_parsed, 0, 3).sents)
|
||||
|
||||
|
||||
def test_span_group_copy(doc):
|
||||
doc.spans["test"] = [doc[0:1], doc[2:4]]
|
||||
assert len(doc.spans["test"]) == 2
|
||||
doc_copy = doc.copy()
|
||||
# check that the spans were indeed copied
|
||||
assert len(doc_copy.spans["test"]) == 2
|
||||
# add a new span to the original doc
|
||||
doc.spans["test"].append(doc[3:4])
|
||||
assert len(doc.spans["test"]) == 3
|
||||
# check that the copy spans were not modified and this is an isolated doc
|
||||
assert len(doc_copy.spans["test"]) == 2
|
||||
|
|
242 spacy/tests/doc/test_span_group.py Normal file
|
@ -0,0 +1,242 @@
|
|||
import pytest
|
||||
from random import Random
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.tokens import Span, SpanGroup
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def doc(en_tokenizer):
|
||||
doc = en_tokenizer("0 1 2 3 4 5 6")
|
||||
matcher = Matcher(en_tokenizer.vocab, validate=True)
|
||||
|
||||
# fmt: off
|
||||
matcher.add("4", [[{}, {}, {}, {}]])
|
||||
matcher.add("2", [[{}, {}, ]])
|
||||
matcher.add("1", [[{}, ]])
|
||||
# fmt: on
|
||||
matches = matcher(doc)
|
||||
spans = []
|
||||
for match in matches:
|
||||
spans.append(
|
||||
Span(doc, match[1], match[2], en_tokenizer.vocab.strings[match[0]])
|
||||
)
|
||||
Random(42).shuffle(spans)
|
||||
doc.spans["SPANS"] = SpanGroup(
|
||||
doc, name="SPANS", attrs={"key": "value"}, spans=spans
|
||||
)
|
||||
return doc
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def other_doc(en_tokenizer):
|
||||
doc = en_tokenizer("0 1 2 3 4 5 6")
|
||||
matcher = Matcher(en_tokenizer.vocab, validate=True)
|
||||
|
||||
# fmt: off
|
||||
matcher.add("4", [[{}, {}, {}, {}]])
|
||||
matcher.add("2", [[{}, {}, ]])
|
||||
matcher.add("1", [[{}, ]])
|
||||
# fmt: on
|
||||
|
||||
matches = matcher(doc)
|
||||
spans = []
|
||||
for match in matches:
|
||||
spans.append(
|
||||
Span(doc, match[1], match[2], en_tokenizer.vocab.strings[match[0]])
|
||||
)
|
||||
Random(42).shuffle(spans)
|
||||
doc.spans["SPANS"] = SpanGroup(
|
||||
doc, name="SPANS", attrs={"key": "value"}, spans=spans
|
||||
)
|
||||
return doc
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def span_group(en_tokenizer):
|
||||
doc = en_tokenizer("0 1 2 3 4 5 6")
|
||||
matcher = Matcher(en_tokenizer.vocab, validate=True)
|
||||
|
||||
# fmt: off
|
||||
matcher.add("4", [[{}, {}, {}, {}]])
|
||||
matcher.add("2", [[{}, {}, ]])
|
||||
matcher.add("1", [[{}, ]])
|
||||
# fmt: on
|
||||
|
||||
matches = matcher(doc)
|
||||
spans = []
|
||||
for match in matches:
|
||||
spans.append(
|
||||
Span(doc, match[1], match[2], en_tokenizer.vocab.strings[match[0]])
|
||||
)
|
||||
Random(42).shuffle(spans)
|
||||
doc.spans["SPANS"] = SpanGroup(
|
||||
doc, name="SPANS", attrs={"key": "value"}, spans=spans
|
||||
)
|
||||
|
||||
|
||||
def test_span_group_copy(doc):
|
||||
span_group = doc.spans["SPANS"]
|
||||
clone = span_group.copy()
|
||||
assert clone != span_group
|
||||
assert clone.name == span_group.name
|
||||
assert clone.attrs == span_group.attrs
|
||||
assert len(clone) == len(span_group)
|
||||
assert list(span_group) == list(clone)
|
||||
clone.name = "new_name"
|
||||
clone.attrs["key"] = "new_value"
|
||||
clone.append(Span(doc, 0, 6, "LABEL"))
|
||||
assert clone.name != span_group.name
|
||||
assert clone.attrs != span_group.attrs
|
||||
assert span_group.attrs["key"] == "value"
|
||||
assert list(span_group) != list(clone)
|
||||
|
||||
|
||||
def test_span_group_set_item(doc, other_doc):
|
||||
span_group = doc.spans["SPANS"]
|
||||
|
||||
index = 5
|
||||
span = span_group[index]
|
||||
span.label_ = "NEW LABEL"
|
||||
span.kb_id = doc.vocab.strings["KB_ID"]
|
||||
|
||||
assert span_group[index].label != span.label
|
||||
assert span_group[index].kb_id != span.kb_id
|
||||
|
||||
span_group[index] = span
|
||||
assert span_group[index].start == span.start
|
||||
assert span_group[index].end == span.end
|
||||
assert span_group[index].label == span.label
|
||||
assert span_group[index].kb_id == span.kb_id
|
||||
assert span_group[index] == span
|
||||
|
||||
with pytest.raises(IndexError):
|
||||
span_group[-100] = span
|
||||
with pytest.raises(IndexError):
|
||||
span_group[100] = span
|
||||
|
||||
span = Span(other_doc, 0, 2)
|
||||
with pytest.raises(ValueError):
|
||||
span_group[index] = span
|
||||
|
||||
|
||||
def test_span_group_has_overlap(doc):
|
||||
span_group = doc.spans["SPANS"]
|
||||
assert span_group.has_overlap
|
||||
|
||||
|
||||
def test_span_group_concat(doc, other_doc):
|
||||
span_group_1 = doc.spans["SPANS"]
|
||||
spans = [doc[0:5], doc[0:6]]
|
||||
span_group_2 = SpanGroup(
|
||||
doc,
|
||||
name="MORE_SPANS",
|
||||
attrs={"key": "new_value", "new_key": "new_value"},
|
||||
spans=spans,
|
||||
)
|
||||
span_group_3 = span_group_1._concat(span_group_2)
|
||||
assert span_group_3.name == span_group_1.name
|
||||
assert span_group_3.attrs == {"key": "value", "new_key": "new_value"}
|
||||
span_list_expected = list(span_group_1) + list(span_group_2)
|
||||
assert list(span_group_3) == list(span_list_expected)
|
||||
|
||||
# Inplace
|
||||
span_list_expected = list(span_group_1) + list(span_group_2)
|
||||
span_group_3 = span_group_1._concat(span_group_2, inplace=True)
|
||||
assert span_group_3 == span_group_1
|
||||
assert span_group_3.name == span_group_1.name
|
||||
assert span_group_3.attrs == {"key": "value", "new_key": "new_value"}
|
||||
assert list(span_group_3) == list(span_list_expected)
|
||||
|
||||
span_group_2 = other_doc.spans["SPANS"]
|
||||
with pytest.raises(ValueError):
|
||||
span_group_1._concat(span_group_2)
|
||||
|
||||
|
||||
def test_span_doc_delitem(doc):
|
||||
span_group = doc.spans["SPANS"]
|
||||
length = len(span_group)
|
||||
index = 5
|
||||
span = span_group[index]
|
||||
next_span = span_group[index + 1]
|
||||
del span_group[index]
|
||||
assert len(span_group) == length - 1
|
||||
assert span_group[index] != span
|
||||
assert span_group[index] == next_span
|
||||
|
||||
with pytest.raises(IndexError):
|
||||
del span_group[-100]
|
||||
with pytest.raises(IndexError):
|
||||
del span_group[100]
|
||||
|
||||
|
||||
def test_span_group_add(doc):
|
||||
span_group_1 = doc.spans["SPANS"]
|
||||
spans = [doc[0:5], doc[0:6]]
|
||||
span_group_2 = SpanGroup(
|
||||
doc,
|
||||
name="MORE_SPANS",
|
||||
attrs={"key": "new_value", "new_key": "new_value"},
|
||||
spans=spans,
|
||||
)
|
||||
|
||||
span_group_3_expected = span_group_1._concat(span_group_2)
|
||||
|
||||
span_group_3 = span_group_1 + span_group_2
|
||||
assert len(span_group_3) == len(span_group_3_expected)
|
||||
assert span_group_3.attrs == {"key": "value", "new_key": "new_value"}
|
||||
assert list(span_group_3) == list(span_group_3_expected)
|
||||
|
||||
|
||||
def test_span_group_iadd(doc):
|
||||
span_group_1 = doc.spans["SPANS"].copy()
|
||||
spans = [doc[0:5], doc[0:6]]
|
||||
span_group_2 = SpanGroup(
|
||||
doc,
|
||||
name="MORE_SPANS",
|
||||
attrs={"key": "new_value", "new_key": "new_value"},
|
||||
spans=spans,
|
||||
)
|
||||
|
||||
span_group_1_expected = span_group_1._concat(span_group_2)
|
||||
|
||||
span_group_1 += span_group_2
|
||||
assert len(span_group_1) == len(span_group_1_expected)
|
||||
assert span_group_1.attrs == {"key": "value", "new_key": "new_value"}
|
||||
assert list(span_group_1) == list(span_group_1_expected)
|
||||
|
||||
span_group_1 = doc.spans["SPANS"].copy()
|
||||
span_group_1 += spans
|
||||
assert len(span_group_1) == len(span_group_1_expected)
|
||||
assert span_group_1.attrs == {
|
||||
"key": "value",
|
||||
}
|
||||
assert list(span_group_1) == list(span_group_1_expected)
|
||||
|
||||
|
||||
def test_span_group_extend(doc):
|
||||
span_group_1 = doc.spans["SPANS"].copy()
|
||||
spans = [doc[0:5], doc[0:6]]
|
||||
span_group_2 = SpanGroup(
|
||||
doc,
|
||||
name="MORE_SPANS",
|
||||
attrs={"key": "new_value", "new_key": "new_value"},
|
||||
spans=spans,
|
||||
)
|
||||
|
||||
span_group_1_expected = span_group_1._concat(span_group_2)
|
||||
|
||||
span_group_1.extend(span_group_2)
|
||||
assert len(span_group_1) == len(span_group_1_expected)
|
||||
assert span_group_1.attrs == {"key": "value", "new_key": "new_value"}
|
||||
assert list(span_group_1) == list(span_group_1_expected)
|
||||
|
||||
span_group_1 = doc.spans["SPANS"]
|
||||
span_group_1.extend(spans)
|
||||
assert len(span_group_1) == len(span_group_1_expected)
|
||||
assert span_group_1.attrs == {"key": "value"}
|
||||
assert list(span_group_1) == list(span_group_1_expected)
|
||||
|
||||
|
||||
def test_span_group_dealloc(span_group):
|
||||
with pytest.raises(AttributeError):
|
||||
print(span_group.doc)
|
|
@ -1,5 +1,5 @@
|
|||
import pytest
|
||||
from spacy.tokens import Doc
|
||||
from spacy.tokens import Doc, Span
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
|
@ -60,3 +60,13 @@ def test_doc_to_json_underscore_error_serialize(doc):
|
|||
Doc.set_extension("json_test4", method=lambda doc: doc.text)
|
||||
with pytest.raises(ValueError):
|
||||
doc.to_json(underscore=["json_test4"])
|
||||
|
||||
|
||||
def test_doc_to_json_span(doc):
|
||||
"""Test that Doc.to_json() includes spans"""
|
||||
doc.spans["test"] = [Span(doc, 0, 2, "test"), Span(doc, 0, 1, "test")]
|
||||
json_doc = doc.to_json()
|
||||
assert "spans" in json_doc
|
||||
assert len(json_doc["spans"]) == 1
|
||||
assert len(json_doc["spans"]["test"]) == 2
|
||||
assert json_doc["spans"]["test"][0]["start"] == 0
|
||||
|
|
0 spacy/tests/lang/dsb/__init__.py Normal file
25 spacy/tests/lang/dsb/test_text.py Normal file
|
@ -0,0 +1,25 @@
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,match",
|
||||
[
|
||||
("10", True),
|
||||
("1", True),
|
||||
("10,000", True),
|
||||
("10,00", True),
|
||||
("jadno", True),
|
||||
("dwanassćo", True),
|
||||
("milion", True),
|
||||
("sto", True),
|
||||
("ceła", False),
|
||||
("kopica", False),
|
||||
("narěcow", False),
|
||||
(",", False),
|
||||
("1/2", True),
|
||||
],
|
||||
)
|
||||
def test_lex_attrs_like_number(dsb_tokenizer, text, match):
|
||||
tokens = dsb_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
assert tokens[0].like_num == match
|
29 spacy/tests/lang/dsb/test_tokenizer.py Normal file
|
@ -0,0 +1,29 @@
|
|||
import pytest
|
||||
|
||||
DSB_BASIC_TOKENIZATION_TESTS = [
|
||||
(
|
||||
"Ale eksistěrujo mimo togo ceła kopica narěcow, ako na pśikład slěpjańska.",
|
||||
[
|
||||
"Ale",
|
||||
"eksistěrujo",
|
||||
"mimo",
|
||||
"togo",
|
||||
"ceła",
|
||||
"kopica",
|
||||
"narěcow",
|
||||
",",
|
||||
"ako",
|
||||
"na",
|
||||
"pśikład",
|
||||
"slěpjańska",
|
||||
".",
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text,expected_tokens", DSB_BASIC_TOKENIZATION_TESTS)
|
||||
def test_dsb_tokenizer_basic(dsb_tokenizer, text, expected_tokens):
|
||||
tokens = dsb_tokenizer(text)
|
||||
token_list = [token.text for token in tokens if not token.is_space]
|
||||
assert expected_tokens == token_list
|
189 spacy/tests/lang/fi/test_noun_chunks.py Normal file
|
@ -0,0 +1,189 @@
|
|||
import pytest
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
||||
FI_NP_TEST_EXAMPLES = [
|
||||
(
|
||||
"Kaksi tyttöä potkii punaista palloa",
|
||||
["NUM", "NOUN", "VERB", "ADJ", "NOUN"],
|
||||
["nummod", "nsubj", "ROOT", "amod", "obj"],
|
||||
[1, 1, 0, 1, -2],
|
||||
["Kaksi tyttöä", "punaista palloa"],
|
||||
),
|
||||
(
|
||||
"Erittäin vaarallinen leijona karkasi kiertävän sirkuksen eläintenkesyttäjältä",
|
||||
["ADV", "ADJ", "NOUN", "VERB", "ADJ", "NOUN", "NOUN"],
|
||||
["advmod", "amod", "nsubj", "ROOT", "amod", "nmod:poss", "obl"],
|
||||
[1, 1, 1, 0, 1, 1, -3],
|
||||
["Erittäin vaarallinen leijona", "kiertävän sirkuksen eläintenkesyttäjältä"],
|
||||
),
|
||||
(
|
||||
"Leijona raidallisine tassuineen piileksii Porin kaupungin lähellä",
|
||||
["NOUN", "ADJ", "NOUN", "VERB", "PROPN", "NOUN", "ADP"],
|
||||
["nsubj", "amod", "nmod", "ROOT", "nmod:poss", "obl", "case"],
|
||||
[3, 1, -2, 0, 1, -2, -1],
|
||||
["Leijona raidallisine tassuineen", "Porin kaupungin"],
|
||||
),
|
||||
(
|
||||
"Lounaalla nautittiin salaattia, maukasta kanaa ja raikasta vettä",
|
||||
["NOUN", "VERB", "NOUN", "PUNCT", "ADJ", "NOUN", "CCONJ", "ADJ", "NOUN"],
|
||||
["obl", "ROOT", "obj", "punct", "amod", "conj", "cc", "amod", "conj"],
|
||||
[1, 0, -1, 2, 1, -3, 2, 1, -6],
|
||||
["Lounaalla", "salaattia", "maukasta kanaa", "raikasta vettä"],
|
||||
),
|
||||
(
|
||||
"Minua houkuttaa maalle muuttaminen talven jälkeen",
|
||||
["PRON", "VERB", "NOUN", "NOUN", "NOUN", "ADP"],
|
||||
["obj", "ROOT", "nmod", "nsubj", "obl", "case"],
|
||||
[1, 0, 1, -2, -3, -1],
|
||||
["maalle muuttaminen", "talven"],
|
||||
),
|
||||
(
|
||||
"Päivän kohokohta oli vierailu museossa kummilasten kanssa",
|
||||
["NOUN", "NOUN", "AUX", "NOUN", "NOUN", "NOUN", "ADP"],
|
||||
["nmod:poss", "nsubj:cop", "cop", "ROOT", "nmod", "obl", "case"],
|
||||
[1, 2, 1, 0, -1, -2, -1],
|
||||
["Päivän kohokohta", "vierailu museossa", "kummilasten"],
|
||||
),
|
||||
(
|
||||
"Yrittäjät maksoivat tuomioistuimen määräämät korvaukset",
|
||||
["NOUN", "VERB", "NOUN", "VERB", "NOUN"],
|
||||
["nsubj", "ROOT", "nsubj", "acl", "obj"],
|
||||
[1, 0, 1, 1, -3],
|
||||
["Yrittäjät", "tuomioistuimen", "korvaukset"],
|
||||
),
|
||||
(
|
||||
"Julkisoikeudelliset tai niihin rinnastettavat saatavat ovat suoraan ulosottokelpoisia",
|
||||
["ADJ", "CCONJ", "PRON", "VERB", "NOUN", "AUX", "ADV", "NOUN"],
|
||||
["amod", "cc", "obl", "acl", "nsubj:cop", "cop", "advmod", "ROOT"],
|
||||
[4, 3, 1, 1, 3, 2, 1, 0],
|
||||
["Julkisoikeudelliset tai niihin rinnastettavat saatavat", "ulosottokelpoisia"],
|
||||
),
|
||||
(
|
||||
"Se oli ala-arvoista käytöstä kaikilta oppilailta, myös valvojaoppilailta",
|
||||
["PRON", "AUX", "ADJ", "NOUN", "PRON", "NOUN", "PUNCT", "ADV", "NOUN"],
|
||||
["nsubj:cop", "cop", "amod", "ROOT", "det", "nmod", "punct", "advmod", "appos"],
|
||||
[3, 2, 1, 0, 1, -2, 2, 1, -3],
|
||||
["ala-arvoista käytöstä kaikilta oppilailta", "valvojaoppilailta"],
|
||||
),
|
||||
(
|
||||
"Isä souti veneellä, jonka hän oli vuokrannut",
|
||||
["NOUN", "VERB", "NOUN", "PUNCT", "PRON", "PRON", "AUX", "VERB"],
|
||||
["nsubj", "ROOT", "obl", "punct", "obj", "nsubj", "aux", "acl:relcl"],
|
||||
[1, 0, -1, 4, 3, 2, 1, -5],
|
||||
["Isä", "veneellä"],
|
||||
),
|
||||
(
|
||||
"Kirja, jonka poimin hyllystä, kertoo norsuista",
|
||||
["NOUN", "PUNCT", "PRON", "VERB", "NOUN", "PUNCT", "VERB", "NOUN"],
|
||||
["nsubj", "punct", "obj", "acl:relcl", "obl", "punct", "ROOT", "obl"],
|
||||
[6, 2, 1, -3, -1, 1, 0, -1],
|
||||
["Kirja", "hyllystä", "norsuista"],
|
||||
),
|
||||
(
|
||||
"Huomenna on päivä, jota olemme odottaneet",
|
||||
["NOUN", "AUX", "NOUN", "PUNCT", "PRON", "AUX", "VERB"],
|
||||
["ROOT", "cop", "nsubj:cop", "punct", "obj", "aux", "acl:relcl"],
|
||||
[0, -1, -2, 3, 2, 1, -4],
|
||||
["Huomenna", "päivä"],
|
||||
),
|
||||
(
|
||||
"Liikkuvuuden lisääminen on yksi korkeakoulutuksen keskeisistä kehittämiskohteista",
|
||||
["NOUN", "NOUN", "AUX", "PRON", "NOUN", "ADJ", "NOUN"],
|
||||
["nmod:gobj", "nsubj:cop", "cop", "ROOT", "nmod:poss", "amod", "nmod"],
|
||||
[1, 2, 1, 0, 2, 1, -3],
|
||||
[
|
||||
"Liikkuvuuden lisääminen",
|
||||
"korkeakoulutuksen keskeisistä kehittämiskohteista",
|
||||
],
|
||||
),
|
||||
(
|
||||
"Kaupalliset palvelut jätetään yksityisten palveluntarjoajien tarjottavaksi",
|
||||
["ADJ", "NOUN", "VERB", "ADJ", "NOUN", "NOUN"],
|
||||
["amod", "obj", "ROOT", "amod", "nmod:gsubj", "obl"],
|
||||
[1, 1, 0, 1, 1, -3],
|
||||
["Kaupalliset palvelut", "yksityisten palveluntarjoajien tarjottavaksi"],
|
||||
),
|
||||
(
|
||||
"New York tunnetaan kaupunkina, joka ei koskaan nuku",
|
||||
["PROPN", "PROPN", "VERB", "NOUN", "PUNCT", "PRON", "AUX", "ADV", "VERB"],
|
||||
[
|
||||
"obj",
|
||||
"flat:name",
|
||||
"ROOT",
|
||||
"obl",
|
||||
"punct",
|
||||
"nsubj",
|
||||
"aux",
|
||||
"advmod",
|
||||
"acl:relcl",
|
||||
],
|
||||
[2, -1, 0, -1, 4, 3, 2, 1, -5],
|
||||
["New York", "kaupunkina"],
|
||||
),
|
||||
(
|
||||
"Loput vihjeet saat herra Möttöseltä",
|
||||
["NOUN", "NOUN", "VERB", "NOUN", "PROPN"],
|
||||
["compound:nn", "obj", "ROOT", "compound:nn", "obj"],
|
||||
[1, 1, 0, 1, -2],
|
||||
["Loput vihjeet", "herra Möttöseltä"],
|
||||
),
|
||||
(
|
||||
"mahdollisuus tukea muita päivystysyksiköitä",
|
||||
["NOUN", "VERB", "PRON", "NOUN"],
|
||||
["ROOT", "acl", "det", "obj"],
|
||||
[0, -1, 1, -2],
|
||||
["mahdollisuus", "päivystysyksiköitä"],
|
||||
),
|
||||
(
|
||||
"sairaanhoitopiirit harjoittavat leikkaustoimintaa alueellaan useammassa sairaalassa",
|
||||
["NOUN", "VERB", "NOUN", "NOUN", "ADJ", "NOUN"],
|
||||
["nsubj", "ROOT", "obj", "obl", "amod", "obl"],
|
||||
[1, 0, -1, -1, 1, -3],
|
||||
[
|
||||
"sairaanhoitopiirit",
|
||||
"leikkaustoimintaa",
|
||||
"alueellaan",
|
||||
"useammassa sairaalassa",
|
||||
],
|
||||
),
|
||||
(
|
||||
"Lain mukaan varhaiskasvatus on suunnitelmallista toimintaa",
|
||||
["NOUN", "ADP", "NOUN", "AUX", "ADJ", "NOUN"],
|
||||
["obl", "case", "nsubj:cop", "cop", "amod", "ROOT"],
|
||||
[5, -1, 3, 2, 1, 0],
|
||||
["Lain", "varhaiskasvatus", "suunnitelmallista toimintaa"],
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def test_noun_chunks_is_parsed(fi_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'fi' language if Doc is not parsed.
|
||||
The doc here is only tokenized, so it has no dependency parse
|
||||
and the noun chunk iterator should refuse to run.
|
||||
"""
|
||||
doc = fi_tokenizer("Tämä on testi")
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,pos,deps,heads,expected_noun_chunks", FI_NP_TEST_EXAMPLES
|
||||
)
|
||||
def test_fi_noun_chunks(fi_tokenizer, text, pos, deps, heads, expected_noun_chunks):
|
||||
tokens = fi_tokenizer(text)
|
||||
|
||||
assert len(heads) == len(pos)
|
||||
doc = Doc(
|
||||
tokens.vocab,
|
||||
words=[t.text for t in tokens],
|
||||
heads=[head + i for i, head in enumerate(heads)],
|
||||
deps=deps,
|
||||
pos=pos,
|
||||
)
|
||||
|
||||
noun_chunks = list(doc.noun_chunks)
|
||||
assert len(noun_chunks) == len(expected_noun_chunks)
|
||||
for i, np in enumerate(noun_chunks):
|
||||
assert np.text == expected_noun_chunks[i]
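Worked example (values taken from the first test case above): the `heads` in the test data are offsets relative to each token, so they are converted to absolute token indices before constructing the Doc.

relative_heads = [1, 1, 0, 1, -2]   # "Kaksi tyttöä potkii punaista palloa"
absolute_heads = [head + i for i, head in enumerate(relative_heads)]
assert absolute_heads == [1, 2, 2, 4, 2]   # e.g. the last token attaches two tokens back, to "potkii"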
|
|
@ -1,8 +1,230 @@
|
|||
from spacy.tokens import Doc
|
||||
import pytest
|
||||
|
||||
|
||||
# fmt: off
|
||||
@pytest.mark.parametrize(
|
||||
"words,heads,deps,pos,chunk_offsets",
|
||||
[
|
||||
# determiner + noun
|
||||
# un nom -> un nom
|
||||
(
|
||||
["un", "nom"],
|
||||
[1, 1],
|
||||
["det", "ROOT"],
|
||||
["DET", "NOUN"],
|
||||
[(0, 2)],
|
||||
),
|
||||
# determiner + noun starting with vowel
|
||||
# l'heure -> l'heure
|
||||
(
|
||||
["l'", "heure"],
|
||||
[1, 1],
|
||||
["det", "ROOT"],
|
||||
["DET", "NOUN"],
|
||||
[(0, 2)],
|
||||
),
|
||||
# determiner + plural noun
|
||||
# les romans -> les romans
|
||||
(
|
||||
["les", "romans"],
|
||||
[1, 1],
|
||||
["det", "ROOT"],
|
||||
["DET", "NOUN"],
|
||||
[(0, 2)],
|
||||
),
|
||||
# det + adj + noun
|
||||
# Le vieux Londres -> Le vieux Londres
|
||||
(
|
||||
['Les', 'vieux', 'Londres'],
|
||||
[2, 2, 2],
|
||||
["det", "amod", "ROOT"],
|
||||
["DET", "ADJ", "NOUN"],
|
||||
[(0,3)]
|
||||
),
|
||||
# det + noun + adj
|
||||
# le nom propre ("a proper noun") -> le nom propre
|
||||
(
|
||||
["le", "nom", "propre"],
|
||||
[1, 1, 1],
|
||||
["det", "ROOT", "amod"],
|
||||
["DET", "NOUN", "ADJ"],
|
||||
[(0, 3)],
|
||||
),
|
||||
# det + noun + adj plural
|
||||
# Les chiens bruns -> les chiens bruns
|
||||
(
|
||||
["Les", "chiens", "bruns"],
|
||||
[1, 1, 1],
|
||||
["det", "ROOT", "amod"],
|
||||
["DET", "NOUN", "ADJ"],
|
||||
[(0, 3)],
|
||||
),
|
||||
# multiple adjectives: one adj before the noun, one adj after the noun
|
||||
# un nouveau film intéressant -> un nouveau film intéressant
|
||||
(
|
||||
["un", "nouveau", "film", "intéressant"],
|
||||
[2, 2, 2, 2],
|
||||
["det", "amod", "ROOT", "amod"],
|
||||
["DET", "ADJ", "NOUN", "ADJ"],
|
||||
[(0,4)]
|
||||
),
|
||||
# multiple adjectives, both adjs after the noun
|
||||
# une personne intelligente et drôle -> une personne intelligente et drôle
|
||||
(
|
||||
["une", "personne", "intelligente", "et", "drôle"],
|
||||
[1, 1, 1, 4, 2],
|
||||
["det", "ROOT", "amod", "cc", "conj"],
|
||||
["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
|
||||
[(0,5)]
|
||||
),
|
||||
# relative pronoun
|
||||
# un bus qui va au ville -> un bus, qui, ville
|
||||
(
|
||||
['un', 'bus', 'qui', 'va', 'au', 'ville'],
|
||||
[1, 1, 3, 1, 5, 3],
|
||||
['det', 'ROOT', 'nsubj', 'acl:relcl', 'case', 'obl:arg'],
|
||||
['DET', 'NOUN', 'PRON', 'VERB', 'ADP', 'NOUN'],
|
||||
[(0,2), (2,3), (5,6)]
|
||||
),
|
||||
# relative subclause
|
||||
# Voilà la maison que nous voulons acheter ("That's the house that we want to buy.") -> la maison, nous
|
||||
(
|
||||
['Voilà', 'la', 'maison', 'que', 'nous', 'voulons', 'acheter'],
|
||||
[0, 2, 0, 5, 5, 2, 5],
|
||||
['ROOT', 'det', 'obj', 'mark', 'nsubj', 'acl:relcl', 'xcomp'],
|
||||
['VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'VERB', 'VERB'],
|
||||
[(1,3), (4,5)]
|
||||
),
|
||||
# Person name and title by flat
|
||||
# Louis XIV -> Louis XIV
|
||||
(
|
||||
["Louis", "XIV"],
|
||||
[0, 0],
|
||||
["ROOT", "flat:name"],
|
||||
["PROPN", "PROPN"],
|
||||
[(0,2)]
|
||||
),
|
||||
# Organization name by flat
|
||||
# Nations Unies -> Nations Unies
|
||||
(
|
||||
["Nations", "Unies"],
|
||||
[0, 0],
|
||||
["ROOT", "flat:name"],
|
||||
["PROPN", "PROPN"],
|
||||
[(0,2)]
|
||||
),
|
||||
# Noun compound, person name created by two flats
|
||||
# Louise de Bratagne -> Louise de Bratagne
|
||||
(
|
||||
["Louise", "de", "Bratagne"],
|
||||
[0, 0, 0],
|
||||
["ROOT", "flat:name", "flat:name"],
|
||||
["PROPN", "PROPN", "PROPN"],
|
||||
[(0,3)]
|
||||
),
|
||||
# Noun compound, person name created by two flats
|
||||
# Louis François Joseph -> Louis François Joseph
|
||||
(
|
||||
["Louis", "François", "Joseph"],
|
||||
[0, 0, 0],
|
||||
["ROOT", "flat:name", "flat:name"],
|
||||
["PROPN", "PROPN", "PROPN"],
|
||||
[(0,3)]
|
||||
),
|
||||
# one determiner + one noun + one adjective qualified by an adverb
|
||||
# quelques agriculteurs très riches -> quelques agriculteurs très riches
|
||||
(
|
||||
["quelques", "agriculteurs", "très", "riches"],
|
||||
[1, 1, 3, 1],
|
||||
['det', 'ROOT', 'advmod', 'amod'],
|
||||
['DET', 'NOUN', 'ADV', 'ADJ'],
|
||||
[(0,4)]
|
||||
),
|
||||
# Two NPs conjuncted
|
||||
# Il a un chien et un chat -> Il, un chien, un chat
|
||||
(
|
||||
['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'],
|
||||
[1, 1, 3, 1, 6, 6, 3],
|
||||
['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
|
||||
['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
|
||||
[(0,1), (2,4), (5,7)]
|
||||
|
||||
),
|
||||
# Two NPs together
|
||||
# l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado
|
||||
(
|
||||
["l'", 'écrivain', 'brésilien', 'Aníbal', 'Machado'],
|
||||
[1, 1, 1, 1, 3],
|
||||
['det', 'ROOT', 'amod', 'appos', 'flat:name'],
|
||||
['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
|
||||
[(0, 3), (3, 5)]
|
||||
),
|
||||
# nmod relation between NPs
|
||||
# la destruction de la ville -> la destruction, la ville
|
||||
(
|
||||
['la', 'destruction', 'de', 'la', 'ville'],
|
||||
[1, 1, 4, 4, 1],
|
||||
['det', 'ROOT', 'case', 'det', 'nmod'],
|
||||
['DET', 'NOUN', 'ADP', 'DET', 'NOUN'],
|
||||
[(0,2), (3,5)]
|
||||
),
|
||||
# nmod relation between NPs
|
||||
# Archiduchesse d’Autriche -> Archiduchesse, Autriche
|
||||
(
|
||||
['Archiduchesse', 'd’', 'Autriche'],
|
||||
[0, 2, 0],
|
||||
['ROOT', 'case', 'nmod'],
|
||||
['NOUN', 'ADP', 'PROPN'],
|
||||
[(0,1), (2,3)]
|
||||
),
|
||||
# Compounding by nmod, several NPs chained together
|
||||
# la première usine de drogue du gouvernement -> la première usine, drogue, gouvernement
|
||||
(
|
||||
["la", "première", "usine", "de", "drogue", "du", "gouvernement"],
|
||||
[2, 2, 2, 4, 2, 6, 2],
|
||||
['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
|
||||
['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
|
||||
[(0, 3), (4, 5), (6, 7)]
|
||||
),
|
||||
# several NPs
|
||||
# Traduction du rapport de Susana -> Traduction, rapport, Susana
|
||||
(
|
||||
['Traduction', 'du', 'rapport', 'de', 'Susana'],
|
||||
[0, 2, 0, 4, 2],
|
||||
['ROOT', 'case', 'nmod', 'case', 'nmod'],
|
||||
['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
|
||||
[(0,1), (2,3), (4,5)]
|
||||
|
||||
),
|
||||
# Several NPs
|
||||
# Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie
|
||||
(
|
||||
['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'],
|
||||
[2, 2, 2, 4, 2, 7, 7, 2],
|
||||
['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'],
|
||||
['DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'DET', 'NOUN'],
|
||||
[(0,3), (4,5), (6,8)]
|
||||
),
|
||||
# Passive subject
|
||||
# Les nouvelles dépenses sont alimentées par le grand compte bancaire de Clinton -> Les nouvelles dépenses, le grand compte bancaire, Clinton
|
||||
(
|
||||
['Les', 'nouvelles', 'dépenses', 'sont', 'alimentées', 'par', 'le', 'grand', 'compte', 'bancaire', 'de', 'Clinton'],
|
||||
[2, 2, 4, 4, 4, 8, 8, 8, 4, 8, 11, 8],
|
||||
['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'det', 'amod', 'obl:agent', 'amod', 'case', 'nmod'],
|
||||
['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'PROPN'],
|
||||
[(0, 3), (6, 10), (11, 12)]
|
||||
)
|
||||
],
|
||||
)
|
||||
# fmt: on
|
||||
def test_fr_noun_chunks(fr_vocab, words, heads, deps, pos, chunk_offsets):
|
||||
doc = Doc(fr_vocab, words=words, heads=heads, deps=deps, pos=pos)
|
||||
assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
|
||||
|
||||
|
||||
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed."""
|
||||
doc = fr_tokenizer("trouver des travaux antérieurs")
|
||||
doc = fr_tokenizer("Je suis allé à l'école")
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
0 spacy/tests/lang/hsb/__init__.py Normal file
25 spacy/tests/lang/hsb/test_text.py Normal file
|
@ -0,0 +1,25 @@
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,match",
|
||||
[
|
||||
("10", True),
|
||||
("1", True),
|
||||
("10,000", True),
|
||||
("10,00", True),
|
||||
("jedne", True),
|
||||
("dwanaće", True),
|
||||
("milion", True),
|
||||
("sto", True),
|
||||
("załožene", False),
|
||||
("wona", False),
|
||||
("powšitkownej", False),
|
||||
(",", False),
|
||||
("1/2", True),
|
||||
],
|
||||
)
|
||||
def test_lex_attrs_like_number(hsb_tokenizer, text, match):
|
||||
tokens = hsb_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
assert tokens[0].like_num == match
|
32 spacy/tests/lang/hsb/test_tokenizer.py Normal file
|
@ -0,0 +1,32 @@
|
|||
import pytest
|
||||
|
||||
HSB_BASIC_TOKENIZATION_TESTS = [
|
||||
(
|
||||
"Hornjoserbšćina wobsteji resp. wobsteješe z wjacorych dialektow, kotrež so zdźěla chětro wot so rozeznawachu.",
|
||||
[
|
||||
"Hornjoserbšćina",
|
||||
"wobsteji",
|
||||
"resp.",
|
||||
"wobsteješe",
|
||||
"z",
|
||||
"wjacorych",
|
||||
"dialektow",
|
||||
",",
|
||||
"kotrež",
|
||||
"so",
|
||||
"zdźěla",
|
||||
"chětro",
|
||||
"wot",
|
||||
"so",
|
||||
"rozeznawachu",
|
||||
".",
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text,expected_tokens", HSB_BASIC_TOKENIZATION_TESTS)
|
||||
def test_hsb_tokenizer_basic(hsb_tokenizer, text, expected_tokens):
|
||||
tokens = hsb_tokenizer(text)
|
||||
token_list = [token.text for token in tokens if not token.is_space]
|
||||
assert expected_tokens == token_list
|
221 spacy/tests/lang/it/test_noun_chunks.py Normal file
|
@ -0,0 +1,221 @@
|
|||
from spacy.tokens import Doc
|
||||
import pytest
|
||||
|
||||
|
||||
# fmt: off
|
||||
@pytest.mark.parametrize(
|
||||
"words,heads,deps,pos,chunk_offsets",
|
||||
[
|
||||
# determiner + noun
|
||||
# un pollo -> un pollo
|
||||
(
|
||||
["un", "pollo"],
|
||||
[1, 1],
|
||||
["det", "ROOT"],
|
||||
["DET", "NOUN"],
|
||||
[(0,2)],
|
||||
),
|
||||
# two determiners + noun
|
||||
# il mio cane -> il mio cane
|
||||
(
|
||||
["il", "mio", "cane"],
|
||||
[2, 2, 2],
|
||||
["det", "det:poss", "ROOT"],
|
||||
["DET", "DET", "NOUN"],
|
||||
[(0,3)],
|
||||
),
|
||||
# two determiners, one is after noun. rare usage but still testing
|
||||
# il cane mio-> il cane mio
|
||||
(
|
||||
["il", "cane", "mio"],
|
||||
[1, 1, 1],
|
||||
["det", "ROOT", "det:poss"],
|
||||
["DET", "NOUN", "DET"],
|
||||
[(0,3)],
|
||||
),
|
||||
# relative pronoun
|
||||
# È molto bello il vestito che hai acquistato ("The dress that you bought is very pretty.") -> il vestito, che
|
||||
(
|
||||
["È", "molto", "bello", "il", "vestito", "che", "hai", "acquistato"],
|
||||
[2, 2, 2, 4, 2, 7, 7, 4],
|
||||
['cop', 'advmod', 'ROOT', 'det', 'nsubj', 'obj', 'aux', 'acl:relcl'],
|
||||
['AUX', 'ADV', 'ADJ', 'DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
|
||||
[(3,5), (5,6)]
|
||||
),
|
||||
# relative subclause
|
||||
# il computer che hai comprato ("the computer that you bought") -> il computer, che
|
||||
(
|
||||
['il', 'computer', 'che', 'hai', 'comprato'],
|
||||
[1, 1, 4, 4, 1],
|
||||
['det', 'ROOT', 'nsubj', 'aux', 'acl:relcl'],
|
||||
['DET', 'NOUN', 'PRON', 'AUX', 'VERB'],
|
||||
[(0,2), (2,3)]
|
||||
),
|
||||
# det + noun + adj
|
||||
# Una macchina grande -> Una macchina grande
|
||||
(
|
||||
["Una", "macchina", "grande"],
|
||||
[1, 1, 1],
|
||||
["det", "ROOT", "amod"],
|
||||
["DET", "NOUN", "ADJ"],
|
||||
[(0,3)],
|
||||
),
|
||||
# noun + adj plural
|
||||
# mucche bianche
|
||||
(
|
||||
["mucche", "bianche"],
|
||||
[0, 0],
|
||||
["ROOT", "amod"],
|
||||
["NOUN", "ADJ"],
|
||||
[(0,2)],
|
||||
),
|
||||
# det + adj + noun
|
||||
# Una grande macchina -> Una grande macchina
|
||||
(
|
||||
['Una', 'grande', 'macchina'],
|
||||
[2, 2, 2],
|
||||
["det", "amod", "ROOT"],
|
||||
["DET", "ADJ", "NOUN"],
|
||||
[(0,3)]
|
||||
),
|
||||
# det + adj + noun, det with apostrophe
|
||||
# un'importante associazione -> un'importante associazione
|
||||
(
|
||||
["Un'", 'importante', 'associazione'],
|
||||
[2, 2, 2],
|
||||
["det", "amod", "ROOT"],
|
||||
["DET", "ADJ", "NOUN"],
|
||||
[(0,3)]
|
||||
),
|
||||
# multiple adjectives
|
||||
# Un cane piccolo e marrone -> Un cane piccolo e marrone
|
||||
(
|
||||
["Un", "cane", "piccolo", "e", "marrone"],
|
||||
[1, 1, 1, 4, 2],
|
||||
["det", "ROOT", "amod", "cc", "conj"],
|
||||
["DET", "NOUN", "ADJ", "CCONJ", "ADJ"],
|
||||
[(0,5)]
|
||||
),
|
||||
# determiner, adjective, compound created by flat
|
||||
# le Nazioni Unite -> le Nazioni Unite
|
||||
(
|
||||
["le", "Nazioni", "Unite"],
|
||||
[1, 1, 1],
|
||||
["det", "ROOT", "flat:name"],
|
||||
["DET", "PROPN", "PROPN"],
|
||||
[(0,3)]
|
||||
),
|
||||
# one determiner + one noun + one adjective qualified by an adverb
|
||||
# alcuni contadini molto ricchi ("some very rich farmers") -> alcuni contadini molto ricchi
|
||||
(
|
||||
['alcuni', 'contadini', 'molto', 'ricchi'],
|
||||
[1, 1, 3, 1],
|
||||
['det', 'ROOT', 'advmod', 'amod'],
|
||||
['DET', 'NOUN', 'ADV', 'ADJ'],
|
||||
[(0,4)]
|
||||
),
|
||||
# Two NPs conjuncted
|
||||
# Ho un cane e un gatto -> un cane, un gatto
|
||||
(
|
||||
['Ho', 'un', 'cane', 'e', 'un', 'gatto'],
|
||||
[0, 2, 0, 5, 5, 0],
|
||||
['ROOT', 'det', 'obj', 'cc', 'det', 'conj'],
|
||||
['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'],
|
||||
[(1,3), (4,6)]
|
||||
|
||||
),
|
||||
# Two NPs together
|
||||
# lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado
|
||||
(
|
||||
['lo', 'scrittore', 'brasiliano', 'Aníbal', 'Machado'],
|
||||
[1, 1, 1, 1, 3],
|
||||
['det', 'ROOT', 'amod', 'nmod', 'flat:name'],
|
||||
['DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN'],
|
||||
[(0, 3), (3, 5)]
|
||||
),
|
||||
# Noun compound, person name and titles
|
||||
# Dom Pedro II -> Dom Pedro II
|
||||
(
|
||||
["Dom", "Pedro", "II"],
|
||||
[0, 0, 0],
|
||||
["ROOT", "flat:name", "flat:name"],
|
||||
["PROPN", "PROPN", "PROPN"],
|
||||
[(0,3)]
|
||||
),
|
||||
# Noun compound created by flat
|
||||
# gli Stati Uniti
|
||||
(
|
||||
["gli", "Stati", "Uniti"],
|
||||
[1, 1, 1],
|
||||
["det", "ROOT", "flat:name"],
|
||||
["DET", "PROPN", "PROPN"],
|
||||
[(0,3)]
|
||||
),
|
||||
# nmod relation between NPs
|
||||
# la distruzione della città -> la distruzione, città
|
||||
(
|
||||
['la', 'distruzione', 'della', 'città'],
|
||||
[1, 1, 3, 1],
|
||||
['det', 'ROOT', 'case', 'nmod'],
|
||||
['DET', 'NOUN', 'ADP', 'NOUN'],
|
||||
[(0,2), (3,4)]
|
||||
),
|
||||
# Compounding by nmod, several NPs chained together
|
||||
# la prima fabbrica di droga del governo -> la prima fabbrica, droga, governo
|
||||
(
|
||||
["la", "prima", "fabbrica", "di", "droga", "del", "governo"],
|
||||
[2, 2, 2, 4, 2, 6, 2],
|
||||
['det', 'amod', 'ROOT', 'case', 'nmod', 'case', 'nmod'],
|
||||
['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'],
|
||||
[(0, 3), (4, 5), (6, 7)]
|
||||
),
|
||||
# several NPs
|
||||
# Traduzione del rapporto di Susana -> Traduzione, rapporto, Susana
|
||||
(
|
||||
['Traduzione', 'del', 'rapporto', 'di', 'Susana'],
|
||||
[0, 2, 0, 4, 2],
|
||||
['ROOT', 'case', 'nmod', 'case', 'nmod'],
|
||||
['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
|
||||
[(0,1), (2,3), (4,5)]
|
||||
|
||||
),
|
||||
# Several NPs
|
||||
# Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua amica
|
||||
(
|
||||
['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'],
|
||||
[1, 1, 1, 4, 1, 8, 8, 8, 1],
|
||||
['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'],
|
||||
['DET', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'CCONJ', 'DET', 'DET', 'NOUN'],
|
||||
[(0,3), (4,5), (6,9)]
|
||||
),
|
||||
# Passive subject
|
||||
# La nuova spesa è alimentata dal grande conto in banca di Clinton -> La nuova spesa, grande conto, banca, Clinton
|
||||
(
|
||||
['La', 'nuova', 'spesa', 'è', 'alimentata', 'dal', 'grande', 'conto', 'in', 'banca', 'di', 'Clinton'],
|
||||
[2, 2, 4, 4, 4, 7, 7, 4, 9, 7, 11, 9],
|
||||
['det', 'amod', 'nsubj:pass', 'aux:pass', 'ROOT', 'case', 'amod', 'obl:agent', 'case', 'nmod', 'case', 'nmod'],
|
||||
['DET', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'],
|
||||
[(0, 3), (6, 8), (9, 10), (11,12)]
|
||||
),
|
||||
# Misc
|
||||
# Ma mentre questo prestito possa ora sembrare gestibile, un improvviso cambiamento delle circostanze potrebbe portare a problemi di debiti -> questo prestito, un improvviso cambiamento, circostanze, problemi, debiti
|
||||
(
|
||||
['Ma', 'mentre', 'questo', 'prestito', 'possa', 'ora', 'sembrare', 'gestibile', ',', 'un', 'improvviso', 'cambiamento', 'delle', 'circostanze', 'potrebbe', 'portare', 'a', 'problemi', 'di', 'debitii'],
|
||||
[15, 6, 3, 6, 6, 6, 15, 6, 6, 11, 11, 15, 13, 11, 15, 15, 17, 15, 19, 17],
|
||||
['cc', 'mark', 'det', 'nsubj', 'aux', 'advmod', 'advcl', 'xcomp', 'punct', 'det', 'amod', 'nsubj', 'case', 'nmod', 'aux', 'ROOT', 'case', 'obl', 'case', 'nmod'],
|
||||
['CCONJ', 'SCONJ', 'DET', 'NOUN', 'AUX', 'ADV', 'VERB', 'ADJ', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN'],
|
||||
[(2,4), (9,12), (13,14), (17,18), (19,20)]
|
||||
)
|
||||
],
|
||||
)
|
||||
# fmt: on
|
||||
def test_it_noun_chunks(it_vocab, words, heads, deps, pos, chunk_offsets):
|
||||
doc = Doc(it_vocab, words=words, heads=heads, deps=deps, pos=pos)
|
||||
assert [(c.start, c.end) for c in doc.noun_chunks] == chunk_offsets
|
||||
|
||||
|
||||
def test_noun_chunks_is_parsed_it(it_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'it' language if Doc is not parsed."""
|
||||
doc = it_tokenizer("Sei andato a Oxford")
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
17 spacy/tests/lang/it/test_stopwords.py Normal file
|
@ -0,0 +1,17 @@
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"word", ["un", "lo", "dell", "dall", "si", "ti", "mi", "quest", "quel", "quello"]
|
||||
)
|
||||
def test_stopwords_basic(it_tokenizer, word):
|
||||
tok = it_tokenizer(word)[0]
|
||||
assert tok.is_stop
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"word", ["quest'uomo", "l'ho", "un'amica", "dell'olio", "s'arrende", "m'ascolti"]
|
||||
)
|
||||
def test_stopwords_elided(it_tokenizer, word):
|
||||
tok = it_tokenizer(word)[0]
|
||||
assert tok.is_stop
|
Some files were not shown because too many files have changed in this diff.