mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Merge branch 'master' into rfd-robot-slowtests
This commit is contained in:
commit
66b474ce05
8
.github/azure-steps.yml
vendored
8
.github/azure-steps.yml
vendored
|
@ -25,6 +25,9 @@ steps:
|
||||||
${{ parameters.prefix }} python setup.py sdist --formats=gztar
|
${{ parameters.prefix }} python setup.py sdist --formats=gztar
|
||||||
displayName: "Compile and build sdist"
|
displayName: "Compile and build sdist"
|
||||||
|
|
||||||
|
- script: python -m mypy spacy
|
||||||
|
displayName: 'Run mypy'
|
||||||
|
|
||||||
- task: DeleteFiles@1
|
- task: DeleteFiles@1
|
||||||
inputs:
|
inputs:
|
||||||
contents: "spacy"
|
contents: "spacy"
|
||||||
|
@ -100,3 +103,8 @@ steps:
|
||||||
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
||||||
displayName: 'Test assemble CLI vectors warning'
|
displayName: 'Test assemble CLI vectors warning'
|
||||||
condition: eq(variables['python_version'], '3.8')
|
condition: eq(variables['python_version'], '3.8')
|
||||||
|
|
||||||
|
- script: |
|
||||||
|
python .github/validate_universe_json.py website/meta/universe.json
|
||||||
|
displayName: 'Test website/meta/universe.json'
|
||||||
|
condition: eq(variables['python_version'], '3.8')
|
||||||
|
|
106
.github/contributors/connorbrinton.md
vendored
Normal file
106
.github/contributors/connorbrinton.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------- |
|
||||||
|
| Name | Connor Brinton |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | July 20th, 2021 |
|
||||||
|
| GitHub username | connorbrinton |
|
||||||
|
| Website (optional) | |
|
19
.github/lock.yml
vendored
19
.github/lock.yml
vendored
|
@ -1,19 +0,0 @@
|
||||||
# Configuration for lock-threads - https://github.com/dessant/lock-threads
|
|
||||||
|
|
||||||
# Number of days of inactivity before a closed issue or pull request is locked
|
|
||||||
daysUntilLock: 30
|
|
||||||
|
|
||||||
# Issues and pull requests with these labels will not be locked. Set to `[]` to disable
|
|
||||||
exemptLabels: []
|
|
||||||
|
|
||||||
# Label to add before locking, such as `outdated`. Set to `false` to disable
|
|
||||||
lockLabel: false
|
|
||||||
|
|
||||||
# Comment to post before locking. Set to `false` to disable
|
|
||||||
lockComment: >
|
|
||||||
This thread has been automatically locked since there has not been
|
|
||||||
any recent activity after it was closed. Please open a new issue for
|
|
||||||
related bugs.
|
|
||||||
|
|
||||||
# Limit to only `issues` or `pulls`
|
|
||||||
only: issues
|
|
|
@ -1,13 +1,11 @@
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
def test_universe_json():
|
def validate_json(document):
|
||||||
|
universe_file = Path(document)
|
||||||
root_dir = Path(__file__).parent
|
|
||||||
universe_file = root_dir / "universe.json"
|
|
||||||
|
|
||||||
with universe_file.open() as f:
|
with universe_file.open() as f:
|
||||||
universe_data = json.load(f)
|
universe_data = json.load(f)
|
||||||
for entry in universe_data["resources"]:
|
for entry in universe_data["resources"]:
|
||||||
|
@ -15,3 +13,7 @@ def test_universe_json():
|
||||||
assert not re.match(
|
assert not re.match(
|
||||||
r"^(http:)|^(https:)", entry["github"]
|
r"^(http:)|^(https:)", entry["github"]
|
||||||
), "Github field should be user/repo, not a url"
|
), "Github field should be user/repo, not a url"
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
validate_json(str(sys.argv[1]))
|
3
.github/workflows/explosionbot.yml
vendored
3
.github/workflows/explosionbot.yml
vendored
|
@ -23,4 +23,5 @@ jobs:
|
||||||
env:
|
env:
|
||||||
INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }}
|
INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }}
|
||||||
INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
|
INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
|
||||||
ENABLED_COMMANDS: "test_gpu,test_slow"
|
ENABLED_COMMANDS: "test_gpu,test_slow"
|
||||||
|
ALLOWED_TEAMS: "spaCy"
|
||||||
|
|
25
.github/workflows/lock.yml
vendored
Normal file
25
.github/workflows/lock.yml
vendored
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
name: 'Lock Threads'
|
||||||
|
|
||||||
|
on:
|
||||||
|
schedule:
|
||||||
|
- cron: '0 0 * * *' # check every day
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
issues: write
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: lock
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
action:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: dessant/lock-threads@v3
|
||||||
|
with:
|
||||||
|
process-only: 'issues'
|
||||||
|
issue-inactive-days: '30'
|
||||||
|
issue-comment: >
|
||||||
|
This thread has been automatically locked since there
|
||||||
|
has not been any recent activity after it was closed.
|
||||||
|
Please open a new issue for related bugs.
|
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -9,7 +9,6 @@ keys/
|
||||||
spacy/tests/package/setup.cfg
|
spacy/tests/package/setup.cfg
|
||||||
spacy/tests/package/pyproject.toml
|
spacy/tests/package/pyproject.toml
|
||||||
spacy/tests/package/requirements.txt
|
spacy/tests/package/requirements.txt
|
||||||
spacy/tests/universe/universe.json
|
|
||||||
|
|
||||||
# Website
|
# Website
|
||||||
website/.cache/
|
website/.cache/
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
recursive-include include *.h
|
recursive-include include *.h
|
||||||
recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja *.toml
|
recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml
|
||||||
include LICENSE
|
include LICENSE
|
||||||
include README.md
|
include README.md
|
||||||
include pyproject.toml
|
include pyproject.toml
|
||||||
|
|
|
@ -12,10 +12,11 @@ trigger:
|
||||||
- "website/*"
|
- "website/*"
|
||||||
- "*.md"
|
- "*.md"
|
||||||
pr:
|
pr:
|
||||||
paths:
|
paths:
|
||||||
exclude:
|
exclude:
|
||||||
- "website/*"
|
|
||||||
- "*.md"
|
- "*.md"
|
||||||
|
- "website/docs/*"
|
||||||
|
- "website/src/*"
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
# Perform basic checks for most important errors (syntax etc.) Uses the config
|
# Perform basic checks for most important errors (syntax etc.) Uses the config
|
||||||
|
|
|
@ -5,7 +5,7 @@ requires = [
|
||||||
"cymem>=2.0.2,<2.1.0",
|
"cymem>=2.0.2,<2.1.0",
|
||||||
"preshed>=3.0.2,<3.1.0",
|
"preshed>=3.0.2,<3.1.0",
|
||||||
"murmurhash>=0.28.0,<1.1.0",
|
"murmurhash>=0.28.0,<1.1.0",
|
||||||
"thinc>=8.0.10,<8.1.0",
|
"thinc>=8.0.11,<8.1.0",
|
||||||
"blis>=0.4.0,<0.8.0",
|
"blis>=0.4.0,<0.8.0",
|
||||||
"pathy",
|
"pathy",
|
||||||
"numpy>=1.15.0",
|
"numpy>=1.15.0",
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
spacy-legacy>=3.0.8,<3.1.0
|
spacy-legacy>=3.0.8,<3.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.10,<8.1.0
|
thinc>=8.0.11,<8.1.0
|
||||||
blis>=0.4.0,<0.8.0
|
blis>=0.4.0,<0.8.0
|
||||||
ml_datasets>=0.2.0,<0.3.0
|
ml_datasets>=0.2.0,<0.3.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
|
@ -29,3 +29,7 @@ pytest-timeout>=1.3.0,<2.0.0
|
||||||
mock>=2.0.0,<3.0.0
|
mock>=2.0.0,<3.0.0
|
||||||
flake8>=3.8.0,<3.10.0
|
flake8>=3.8.0,<3.10.0
|
||||||
hypothesis>=3.27.0,<7.0.0
|
hypothesis>=3.27.0,<7.0.0
|
||||||
|
mypy>=0.910
|
||||||
|
types-dataclasses>=0.1.3; python_version < "3.7"
|
||||||
|
types-mock>=0.1.1
|
||||||
|
types-requests
|
||||||
|
|
10
setup.cfg
10
setup.cfg
|
@ -37,14 +37,14 @@ setup_requires =
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
thinc>=8.0.10,<8.1.0
|
thinc>=8.0.11,<8.1.0
|
||||||
install_requires =
|
install_requires =
|
||||||
# Our libraries
|
# Our libraries
|
||||||
spacy-legacy>=3.0.8,<3.1.0
|
spacy-legacy>=3.0.8,<3.1.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.10,<8.1.0
|
thinc>=8.0.11,<8.1.0
|
||||||
blis>=0.4.0,<0.8.0
|
blis>=0.4.0,<0.8.0
|
||||||
wasabi>=0.8.1,<1.1.0
|
wasabi>=0.8.1,<1.1.0
|
||||||
srsly>=2.4.1,<3.0.0
|
srsly>=2.4.1,<3.0.0
|
||||||
|
@ -70,7 +70,7 @@ console_scripts =
|
||||||
lookups =
|
lookups =
|
||||||
spacy_lookups_data>=1.0.2,<1.1.0
|
spacy_lookups_data>=1.0.2,<1.1.0
|
||||||
transformers =
|
transformers =
|
||||||
spacy_transformers>=1.0.1,<1.1.0
|
spacy_transformers>=1.0.1,<1.2.0
|
||||||
ray =
|
ray =
|
||||||
spacy_ray>=0.1.0,<1.0.0
|
spacy_ray>=0.1.0,<1.0.0
|
||||||
cuda =
|
cuda =
|
||||||
|
@ -122,9 +122,11 @@ exclude =
|
||||||
|
|
||||||
[tool:pytest]
|
[tool:pytest]
|
||||||
markers =
|
markers =
|
||||||
slow
|
slow: mark a test as slow
|
||||||
|
issue: reference specific issue
|
||||||
|
|
||||||
[mypy]
|
[mypy]
|
||||||
ignore_missing_imports = True
|
ignore_missing_imports = True
|
||||||
no_implicit_optional = True
|
no_implicit_optional = True
|
||||||
plugins = pydantic.mypy, thinc.mypy
|
plugins = pydantic.mypy, thinc.mypy
|
||||||
|
allow_redefinition = True
|
||||||
|
|
1
setup.py
1
setup.py
|
@ -81,7 +81,6 @@ COPY_FILES = {
|
||||||
ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package",
|
ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package",
|
||||||
ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package",
|
ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package",
|
||||||
ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package",
|
ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package",
|
||||||
ROOT / "website" / "meta" / "universe.json": PACKAGE_ROOT / "tests" / "universe",
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, TYPE_CHECKING
|
from typing import Dict, Any, Union, List, Optional, Tuple, Iterable
|
||||||
|
from typing import TYPE_CHECKING, overload
|
||||||
import sys
|
import sys
|
||||||
import shutil
|
import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -15,6 +16,7 @@ from thinc.util import has_cupy, gpu_is_available
|
||||||
from configparser import InterpolationError
|
from configparser import InterpolationError
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
from ..compat import Literal
|
||||||
from ..schemas import ProjectConfigSchema, validate
|
from ..schemas import ProjectConfigSchema, validate
|
||||||
from ..util import import_file, run_command, make_tempdir, registry, logger
|
from ..util import import_file, run_command, make_tempdir, registry, logger
|
||||||
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
|
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
|
||||||
|
@ -260,15 +262,16 @@ def get_checksum(path: Union[Path, str]) -> str:
|
||||||
RETURNS (str): The checksum.
|
RETURNS (str): The checksum.
|
||||||
"""
|
"""
|
||||||
path = Path(path)
|
path = Path(path)
|
||||||
|
if not (path.is_file() or path.is_dir()):
|
||||||
|
msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
|
||||||
if path.is_file():
|
if path.is_file():
|
||||||
return hashlib.md5(Path(path).read_bytes()).hexdigest()
|
return hashlib.md5(Path(path).read_bytes()).hexdigest()
|
||||||
if path.is_dir():
|
else:
|
||||||
# TODO: this is currently pretty slow
|
# TODO: this is currently pretty slow
|
||||||
dir_checksum = hashlib.md5()
|
dir_checksum = hashlib.md5()
|
||||||
for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
|
for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
|
||||||
dir_checksum.update(sub_file.read_bytes())
|
dir_checksum.update(sub_file.read_bytes())
|
||||||
return dir_checksum.hexdigest()
|
return dir_checksum.hexdigest()
|
||||||
msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
|
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
|
@ -468,12 +471,15 @@ def get_git_version(
|
||||||
RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
|
RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
|
||||||
(0, 0) if the version couldn't be determined.
|
(0, 0) if the version couldn't be determined.
|
||||||
"""
|
"""
|
||||||
ret = run_command("git --version", capture=True)
|
try:
|
||||||
|
ret = run_command("git --version", capture=True)
|
||||||
|
except:
|
||||||
|
raise RuntimeError(error)
|
||||||
stdout = ret.stdout.strip()
|
stdout = ret.stdout.strip()
|
||||||
if not stdout or not stdout.startswith("git version"):
|
if not stdout or not stdout.startswith("git version"):
|
||||||
return (0, 0)
|
return 0, 0
|
||||||
version = stdout[11:].strip().split(".")
|
version = stdout[11:].strip().split(".")
|
||||||
return (int(version[0]), int(version[1]))
|
return int(version[0]), int(version[1])
|
||||||
|
|
||||||
|
|
||||||
def _http_to_git(repo: str) -> str:
|
def _http_to_git(repo: str) -> str:
|
||||||
|
@ -500,6 +506,16 @@ def is_subpath_of(parent, child):
|
||||||
return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath
|
return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath
|
||||||
|
|
||||||
|
|
||||||
|
@overload
|
||||||
|
def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]:
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
|
@overload
|
||||||
|
def string_to_list(value: str, intify: Literal[True]) -> List[int]:
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
|
def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
|
||||||
"""Parse a comma-separated string to a list and account for various
|
"""Parse a comma-separated string to a list and account for various
|
||||||
formatting options. Mostly used to handle CLI arguments that take a list of
|
formatting options. Mostly used to handle CLI arguments that take a list of
|
||||||
|
@ -510,7 +526,7 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
|
||||||
RETURNS (Union[List[str], List[int]]): A list of strings or ints.
|
RETURNS (Union[List[str], List[int]]): A list of strings or ints.
|
||||||
"""
|
"""
|
||||||
if not value:
|
if not value:
|
||||||
return []
|
return [] # type: ignore[return-value]
|
||||||
if value.startswith("[") and value.endswith("]"):
|
if value.startswith("[") and value.endswith("]"):
|
||||||
value = value[1:-1]
|
value = value[1:-1]
|
||||||
result = []
|
result = []
|
||||||
|
@ -522,7 +538,7 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[in
|
||||||
p = p[1:-1]
|
p = p[1:-1]
|
||||||
p = p.strip()
|
p = p.strip()
|
||||||
if intify:
|
if intify:
|
||||||
p = int(p)
|
p = int(p) # type: ignore[assignment]
|
||||||
result.append(p)
|
result.append(p)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional, Any, List, Union
|
from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
|
@ -9,7 +9,7 @@ import itertools
|
||||||
|
|
||||||
from ._util import app, Arg, Opt
|
from ._util import app, Arg, Opt
|
||||||
from ..training import docs_to_json
|
from ..training import docs_to_json
|
||||||
from ..tokens import DocBin
|
from ..tokens import Doc, DocBin
|
||||||
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
|
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
|
||||||
from ..training.converters import conllu_to_docs
|
from ..training.converters import conllu_to_docs
|
||||||
|
|
||||||
|
@ -19,7 +19,7 @@ from ..training.converters import conllu_to_docs
|
||||||
# entry to this dict with the file extension mapped to the converter function
|
# entry to this dict with the file extension mapped to the converter function
|
||||||
# imported from /converters.
|
# imported from /converters.
|
||||||
|
|
||||||
CONVERTERS = {
|
CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = {
|
||||||
"conllubio": conllu_to_docs,
|
"conllubio": conllu_to_docs,
|
||||||
"conllu": conllu_to_docs,
|
"conllu": conllu_to_docs,
|
||||||
"conll": conll_ner_to_docs,
|
"conll": conll_ner_to_docs,
|
||||||
|
@ -66,19 +66,16 @@ def convert_cli(
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/cli#convert
|
DOCS: https://spacy.io/api/cli#convert
|
||||||
"""
|
"""
|
||||||
if isinstance(file_type, FileTypes):
|
|
||||||
# We get an instance of the FileTypes from the CLI so we need its string value
|
|
||||||
file_type = file_type.value
|
|
||||||
input_path = Path(input_path)
|
input_path = Path(input_path)
|
||||||
output_dir = "-" if output_dir == Path("-") else output_dir
|
output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
|
||||||
silent = output_dir == "-"
|
silent = output_dir == "-"
|
||||||
msg = Printer(no_print=silent)
|
msg = Printer(no_print=silent)
|
||||||
verify_cli_args(msg, input_path, output_dir, file_type, converter, ner_map)
|
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
|
||||||
converter = _get_converter(msg, converter, input_path)
|
converter = _get_converter(msg, converter, input_path)
|
||||||
convert(
|
convert(
|
||||||
input_path,
|
input_path,
|
||||||
output_dir,
|
output_dir,
|
||||||
file_type=file_type,
|
file_type=file_type.value,
|
||||||
n_sents=n_sents,
|
n_sents=n_sents,
|
||||||
seg_sents=seg_sents,
|
seg_sents=seg_sents,
|
||||||
model=model,
|
model=model,
|
||||||
|
@ -94,7 +91,7 @@ def convert_cli(
|
||||||
|
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
input_path: Union[str, Path],
|
input_path: Path,
|
||||||
output_dir: Union[str, Path],
|
output_dir: Union[str, Path],
|
||||||
*,
|
*,
|
||||||
file_type: str = "json",
|
file_type: str = "json",
|
||||||
|
@ -108,13 +105,14 @@ def convert(
|
||||||
lang: Optional[str] = None,
|
lang: Optional[str] = None,
|
||||||
concatenate: bool = False,
|
concatenate: bool = False,
|
||||||
silent: bool = True,
|
silent: bool = True,
|
||||||
msg: Optional[Printer],
|
msg: Optional[Printer] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
input_path = Path(input_path)
|
||||||
if not msg:
|
if not msg:
|
||||||
msg = Printer(no_print=silent)
|
msg = Printer(no_print=silent)
|
||||||
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
|
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
|
||||||
doc_files = []
|
doc_files = []
|
||||||
for input_loc in walk_directory(Path(input_path), converter):
|
for input_loc in walk_directory(input_path, converter):
|
||||||
with input_loc.open("r", encoding="utf-8") as infile:
|
with input_loc.open("r", encoding="utf-8") as infile:
|
||||||
input_data = infile.read()
|
input_data = infile.read()
|
||||||
# Use converter function to convert data
|
# Use converter function to convert data
|
||||||
|
@ -141,7 +139,7 @@ def convert(
|
||||||
else:
|
else:
|
||||||
db = DocBin(docs=docs, store_user_data=True)
|
db = DocBin(docs=docs, store_user_data=True)
|
||||||
len_docs = len(db)
|
len_docs = len(db)
|
||||||
data = db.to_bytes()
|
data = db.to_bytes() # type: ignore[assignment]
|
||||||
if output_dir == "-":
|
if output_dir == "-":
|
||||||
_print_docs_to_stdout(data, file_type)
|
_print_docs_to_stdout(data, file_type)
|
||||||
else:
|
else:
|
||||||
|
@ -220,13 +218,12 @@ def walk_directory(path: Path, converter: str) -> List[Path]:
|
||||||
|
|
||||||
def verify_cli_args(
|
def verify_cli_args(
|
||||||
msg: Printer,
|
msg: Printer,
|
||||||
input_path: Union[str, Path],
|
input_path: Path,
|
||||||
output_dir: Union[str, Path],
|
output_dir: Union[str, Path],
|
||||||
file_type: FileTypes,
|
file_type: str,
|
||||||
converter: str,
|
converter: str,
|
||||||
ner_map: Optional[Path],
|
ner_map: Optional[Path],
|
||||||
):
|
):
|
||||||
input_path = Path(input_path)
|
|
||||||
if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
|
if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
|
||||||
msg.fail(
|
msg.fail(
|
||||||
f"Can't write .{file_type} data to stdout. Please specify an output directory.",
|
f"Can't write .{file_type} data to stdout. Please specify an output directory.",
|
||||||
|
@ -244,13 +241,13 @@ def verify_cli_args(
|
||||||
msg.fail("No input files in directory", input_path, exits=1)
|
msg.fail("No input files in directory", input_path, exits=1)
|
||||||
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
|
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
|
||||||
if converter == "auto" and len(file_types) >= 2:
|
if converter == "auto" and len(file_types) >= 2:
|
||||||
file_types = ",".join(file_types)
|
file_types_str = ",".join(file_types)
|
||||||
msg.fail("All input files must be same type", file_types, exits=1)
|
msg.fail("All input files must be same type", file_types_str, exits=1)
|
||||||
if converter != "auto" and converter not in CONVERTERS:
|
if converter != "auto" and converter not in CONVERTERS:
|
||||||
msg.fail(f"Can't find converter for {converter}", exits=1)
|
msg.fail(f"Can't find converter for {converter}", exits=1)
|
||||||
|
|
||||||
|
|
||||||
def _get_converter(msg, converter, input_path):
|
def _get_converter(msg, converter, input_path: Path):
|
||||||
if input_path.is_dir():
|
if input_path.is_dir():
|
||||||
input_path = walk_directory(input_path, converter)[0]
|
input_path = walk_directory(input_path, converter)[0]
|
||||||
if converter == "auto":
|
if converter == "auto":
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
from typing import List, Sequence, Dict, Any, Tuple, Optional, Set
|
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
|
||||||
|
from typing import cast, overload
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
import sys
|
import sys
|
||||||
|
@ -17,6 +18,7 @@ from ..pipeline import Morphologizer
|
||||||
from ..morphology import Morphology
|
from ..morphology import Morphology
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..util import registry, resolve_dot_names
|
from ..util import registry, resolve_dot_names
|
||||||
|
from ..compat import Literal
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
|
@ -378,10 +380,11 @@ def debug_data(
|
||||||
|
|
||||||
if "tagger" in factory_names:
|
if "tagger" in factory_names:
|
||||||
msg.divider("Part-of-speech Tagging")
|
msg.divider("Part-of-speech Tagging")
|
||||||
labels = [label for label in gold_train_data["tags"]]
|
label_list = [label for label in gold_train_data["tags"]]
|
||||||
model_labels = _get_labels_from_model(nlp, "tagger")
|
model_labels = _get_labels_from_model(nlp, "tagger")
|
||||||
msg.info(f"{len(labels)} label(s) in train data")
|
msg.info(f"{len(label_list)} label(s) in train data")
|
||||||
missing_labels = model_labels - set(labels)
|
labels = set(label_list)
|
||||||
|
missing_labels = model_labels - labels
|
||||||
if missing_labels:
|
if missing_labels:
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"Some model labels are not present in the train data. The "
|
"Some model labels are not present in the train data. The "
|
||||||
|
@ -395,10 +398,11 @@ def debug_data(
|
||||||
|
|
||||||
if "morphologizer" in factory_names:
|
if "morphologizer" in factory_names:
|
||||||
msg.divider("Morphologizer (POS+Morph)")
|
msg.divider("Morphologizer (POS+Morph)")
|
||||||
labels = [label for label in gold_train_data["morphs"]]
|
label_list = [label for label in gold_train_data["morphs"]]
|
||||||
model_labels = _get_labels_from_model(nlp, "morphologizer")
|
model_labels = _get_labels_from_model(nlp, "morphologizer")
|
||||||
msg.info(f"{len(labels)} label(s) in train data")
|
msg.info(f"{len(label_list)} label(s) in train data")
|
||||||
missing_labels = model_labels - set(labels)
|
labels = set(label_list)
|
||||||
|
missing_labels = model_labels - labels
|
||||||
if missing_labels:
|
if missing_labels:
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"Some model labels are not present in the train data. The "
|
"Some model labels are not present in the train data. The "
|
||||||
|
@ -565,7 +569,7 @@ def _compile_gold(
|
||||||
nlp: Language,
|
nlp: Language,
|
||||||
make_proj: bool,
|
make_proj: bool,
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
data = {
|
data: Dict[str, Any] = {
|
||||||
"ner": Counter(),
|
"ner": Counter(),
|
||||||
"cats": Counter(),
|
"cats": Counter(),
|
||||||
"tags": Counter(),
|
"tags": Counter(),
|
||||||
|
@ -670,10 +674,28 @@ def _compile_gold(
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
def _format_labels(labels: List[Tuple[str, int]], counts: bool = False) -> str:
|
@overload
|
||||||
|
def _format_labels(labels: Iterable[str], counts: Literal[False] = False) -> str:
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
|
@overload
|
||||||
|
def _format_labels(
|
||||||
|
labels: Iterable[Tuple[str, int]],
|
||||||
|
counts: Literal[True],
|
||||||
|
) -> str:
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
|
def _format_labels(
|
||||||
|
labels: Union[Iterable[str], Iterable[Tuple[str, int]]],
|
||||||
|
counts: bool = False,
|
||||||
|
) -> str:
|
||||||
if counts:
|
if counts:
|
||||||
return ", ".join([f"'{l}' ({c})" for l, c in labels])
|
return ", ".join(
|
||||||
return ", ".join([f"'{l}'" for l in labels])
|
[f"'{l}' ({c})" for l, c in cast(Iterable[Tuple[str, int]], labels)]
|
||||||
|
)
|
||||||
|
return ", ".join([f"'{l}'" for l in cast(Iterable[str], labels)])
|
||||||
|
|
||||||
|
|
||||||
def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
|
def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
|
||||||
|
|
|
@ -136,7 +136,7 @@ def evaluate(
|
||||||
|
|
||||||
|
|
||||||
def handle_scores_per_type(
|
def handle_scores_per_type(
|
||||||
scores: Union[Scorer, Dict[str, Any]],
|
scores: Dict[str, Any],
|
||||||
data: Dict[str, Any] = {},
|
data: Dict[str, Any] = {},
|
||||||
*,
|
*,
|
||||||
spans_key: str = "sc",
|
spans_key: str = "sc",
|
||||||
|
|
|
@ -15,7 +15,7 @@ def info_cli(
|
||||||
model: Optional[str] = Arg(None, help="Optional loadable spaCy pipeline"),
|
model: Optional[str] = Arg(None, help="Optional loadable spaCy pipeline"),
|
||||||
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
|
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
|
||||||
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
|
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
|
||||||
exclude: Optional[str] = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"),
|
exclude: str = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
@ -61,7 +61,7 @@ def info(
|
||||||
return raw_data
|
return raw_data
|
||||||
|
|
||||||
|
|
||||||
def info_spacy() -> Dict[str, any]:
|
def info_spacy() -> Dict[str, Any]:
|
||||||
"""Generate info about the current spaCy intallation.
|
"""Generate info about the current spaCy intallation.
|
||||||
|
|
||||||
RETURNS (dict): The spaCy info.
|
RETURNS (dict): The spaCy info.
|
||||||
|
|
|
@ -28,8 +28,8 @@ class Optimizations(str, Enum):
|
||||||
def init_config_cli(
|
def init_config_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
output_file: Path = Arg(..., help="File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
|
output_file: Path = Arg(..., help="File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
|
||||||
lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
|
lang: str = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
|
||||||
pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
|
pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
|
||||||
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||||
gpu: bool = Opt(False, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
gpu: bool = Opt(False, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||||
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
|
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
|
||||||
|
@ -44,8 +44,6 @@ def init_config_cli(
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/cli#init-config
|
DOCS: https://spacy.io/api/cli#init-config
|
||||||
"""
|
"""
|
||||||
if isinstance(optimize, Optimizations): # instance of enum from the CLI
|
|
||||||
optimize = optimize.value
|
|
||||||
pipeline = string_to_list(pipeline)
|
pipeline = string_to_list(pipeline)
|
||||||
is_stdout = str(output_file) == "-"
|
is_stdout = str(output_file) == "-"
|
||||||
if not is_stdout and output_file.exists() and not force_overwrite:
|
if not is_stdout and output_file.exists() and not force_overwrite:
|
||||||
|
@ -57,7 +55,7 @@ def init_config_cli(
|
||||||
config = init_config(
|
config = init_config(
|
||||||
lang=lang,
|
lang=lang,
|
||||||
pipeline=pipeline,
|
pipeline=pipeline,
|
||||||
optimize=optimize,
|
optimize=optimize.value,
|
||||||
gpu=gpu,
|
gpu=gpu,
|
||||||
pretraining=pretraining,
|
pretraining=pretraining,
|
||||||
silent=is_stdout,
|
silent=is_stdout,
|
||||||
|
@ -175,8 +173,8 @@ def init_config(
|
||||||
"Pipeline": ", ".join(pipeline),
|
"Pipeline": ", ".join(pipeline),
|
||||||
"Optimize for": optimize,
|
"Optimize for": optimize,
|
||||||
"Hardware": variables["hardware"].upper(),
|
"Hardware": variables["hardware"].upper(),
|
||||||
"Transformer": template_vars.transformer.get("name")
|
"Transformer": template_vars.transformer.get("name") # type: ignore[attr-defined]
|
||||||
if template_vars.use_transformer
|
if template_vars.use_transformer # type: ignore[attr-defined]
|
||||||
else None,
|
else None,
|
||||||
}
|
}
|
||||||
msg.info("Generated config template specific for your use case")
|
msg.info("Generated config template specific for your use case")
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional, Union, Any, Dict, List, Tuple
|
from typing import Optional, Union, Any, Dict, List, Tuple, cast
|
||||||
import shutil
|
import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import Printer, MarkdownRenderer, get_raw_input
|
from wasabi import Printer, MarkdownRenderer, get_raw_input
|
||||||
|
@ -215,9 +215,9 @@ def get_third_party_dependencies(
|
||||||
for reg_name, func_names in funcs.items():
|
for reg_name, func_names in funcs.items():
|
||||||
for func_name in func_names:
|
for func_name in func_names:
|
||||||
func_info = util.registry.find(reg_name, func_name)
|
func_info = util.registry.find(reg_name, func_name)
|
||||||
module_name = func_info.get("module")
|
module_name = func_info.get("module") # type: ignore[attr-defined]
|
||||||
if module_name: # the code is part of a module, not a --code file
|
if module_name: # the code is part of a module, not a --code file
|
||||||
modules.add(func_info["module"].split(".")[0])
|
modules.add(func_info["module"].split(".")[0]) # type: ignore[index]
|
||||||
dependencies = []
|
dependencies = []
|
||||||
for module_name in modules:
|
for module_name in modules:
|
||||||
if module_name in distributions:
|
if module_name in distributions:
|
||||||
|
@ -227,7 +227,7 @@ def get_third_party_dependencies(
|
||||||
if pkg in own_packages or pkg in exclude:
|
if pkg in own_packages or pkg in exclude:
|
||||||
continue
|
continue
|
||||||
version = util.get_package_version(pkg)
|
version = util.get_package_version(pkg)
|
||||||
version_range = util.get_minor_version_range(version)
|
version_range = util.get_minor_version_range(version) # type: ignore[arg-type]
|
||||||
dependencies.append(f"{pkg}{version_range}")
|
dependencies.append(f"{pkg}{version_range}")
|
||||||
return dependencies
|
return dependencies
|
||||||
|
|
||||||
|
@ -252,7 +252,7 @@ def create_file(file_path: Path, contents: str) -> None:
|
||||||
def get_meta(
|
def get_meta(
|
||||||
model_path: Union[str, Path], existing_meta: Dict[str, Any]
|
model_path: Union[str, Path], existing_meta: Dict[str, Any]
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
meta = {
|
meta: Dict[str, Any] = {
|
||||||
"lang": "en",
|
"lang": "en",
|
||||||
"name": "pipeline",
|
"name": "pipeline",
|
||||||
"version": "0.0.0",
|
"version": "0.0.0",
|
||||||
|
@ -324,8 +324,8 @@ def generate_readme(meta: Dict[str, Any]) -> str:
|
||||||
license_name = meta.get("license")
|
license_name = meta.get("license")
|
||||||
sources = _format_sources(meta.get("sources"))
|
sources = _format_sources(meta.get("sources"))
|
||||||
description = meta.get("description")
|
description = meta.get("description")
|
||||||
label_scheme = _format_label_scheme(meta.get("labels"))
|
label_scheme = _format_label_scheme(cast(Dict[str, Any], meta.get("labels")))
|
||||||
accuracy = _format_accuracy(meta.get("performance"))
|
accuracy = _format_accuracy(cast(Dict[str, Any], meta.get("performance")))
|
||||||
table_data = [
|
table_data = [
|
||||||
(md.bold("Name"), md.code(name)),
|
(md.bold("Name"), md.code(name)),
|
||||||
(md.bold("Version"), md.code(version)),
|
(md.bold("Version"), md.code(version)),
|
||||||
|
|
|
@ -32,7 +32,7 @@ def profile_cli(
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/cli#debug-profile
|
DOCS: https://spacy.io/api/cli#debug-profile
|
||||||
"""
|
"""
|
||||||
if ctx.parent.command.name == NAME: # called as top-level command
|
if ctx.parent.command.name == NAME: # type: ignore[union-attr] # called as top-level command
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"The profile command is now available via the 'debug profile' "
|
"The profile command is now available via the 'debug profile' "
|
||||||
"subcommand. You can run python -m spacy debug --help for an "
|
"subcommand. You can run python -m spacy debug --help for an "
|
||||||
|
@ -42,9 +42,9 @@ def profile_cli(
|
||||||
|
|
||||||
|
|
||||||
def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None:
|
def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None:
|
||||||
|
|
||||||
if inputs is not None:
|
if inputs is not None:
|
||||||
inputs = _read_inputs(inputs, msg)
|
texts = _read_inputs(inputs, msg)
|
||||||
|
texts = list(itertools.islice(texts, n_texts))
|
||||||
if inputs is None:
|
if inputs is None:
|
||||||
try:
|
try:
|
||||||
import ml_datasets
|
import ml_datasets
|
||||||
|
@ -56,16 +56,13 @@ def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) ->
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
|
|
||||||
n_inputs = 25000
|
with msg.loading("Loading IMDB dataset via ml_datasets..."):
|
||||||
with msg.loading("Loading IMDB dataset via Thinc..."):
|
imdb_train, _ = ml_datasets.imdb(train_limit=n_texts, dev_limit=0)
|
||||||
imdb_train, _ = ml_datasets.imdb()
|
texts, _ = zip(*imdb_train)
|
||||||
inputs, _ = zip(*imdb_train)
|
msg.info(f"Loaded IMDB dataset and using {n_texts} examples")
|
||||||
msg.info(f"Loaded IMDB dataset and using {n_inputs} examples")
|
|
||||||
inputs = inputs[:n_inputs]
|
|
||||||
with msg.loading(f"Loading pipeline '{model}'..."):
|
with msg.loading(f"Loading pipeline '{model}'..."):
|
||||||
nlp = load_model(model)
|
nlp = load_model(model)
|
||||||
msg.good(f"Loaded pipeline '{model}'")
|
msg.good(f"Loaded pipeline '{model}'")
|
||||||
texts = list(itertools.islice(inputs, n_texts))
|
|
||||||
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
|
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
|
||||||
s = pstats.Stats("Profile.prof")
|
s = pstats.Stats("Profile.prof")
|
||||||
msg.divider("Profile stats")
|
msg.divider("Profile stats")
|
||||||
|
@ -87,7 +84,7 @@ def _read_inputs(loc: Union[Path, str], msg: Printer) -> Iterator[str]:
|
||||||
if not input_path.exists() or not input_path.is_file():
|
if not input_path.exists() or not input_path.is_file():
|
||||||
msg.fail("Not a valid input data file", loc, exits=1)
|
msg.fail("Not a valid input data file", loc, exits=1)
|
||||||
msg.info(f"Using data from {input_path.parts[-1]}")
|
msg.info(f"Using data from {input_path.parts[-1]}")
|
||||||
file_ = input_path.open()
|
file_ = input_path.open() # type: ignore[assignment]
|
||||||
for line in file_:
|
for line in file_:
|
||||||
data = srsly.json_loads(line)
|
data = srsly.json_loads(line)
|
||||||
text = data["text"]
|
text = data["text"]
|
||||||
|
|
|
@ -133,7 +133,6 @@ def fetch_asset(
|
||||||
# If there's already a file, check for checksum
|
# If there's already a file, check for checksum
|
||||||
if checksum == get_checksum(dest_path):
|
if checksum == get_checksum(dest_path):
|
||||||
msg.good(f"Skipping download with matching checksum: {dest}")
|
msg.good(f"Skipping download with matching checksum: {dest}")
|
||||||
return dest_path
|
|
||||||
# We might as well support the user here and create parent directories in
|
# We might as well support the user here and create parent directories in
|
||||||
# case the asset dir isn't listed as a dir to create in the project.yml
|
# case the asset dir isn't listed as a dir to create in the project.yml
|
||||||
if not dest_path.parent.exists():
|
if not dest_path.parent.exists():
|
||||||
|
@ -150,7 +149,6 @@ def fetch_asset(
|
||||||
msg.good(f"Copied local asset {dest}")
|
msg.good(f"Copied local asset {dest}")
|
||||||
else:
|
else:
|
||||||
msg.fail(f"Download failed: {dest}", e)
|
msg.fail(f"Download failed: {dest}", e)
|
||||||
return
|
|
||||||
if checksum and checksum != get_checksum(dest_path):
|
if checksum and checksum != get_checksum(dest_path):
|
||||||
msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
|
msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
|
||||||
|
|
||||||
|
|
|
@ -80,9 +80,9 @@ def check_clone(name: str, dest: Path, repo: str) -> None:
|
||||||
repo (str): URL of the repo to clone from.
|
repo (str): URL of the repo to clone from.
|
||||||
"""
|
"""
|
||||||
git_err = (
|
git_err = (
|
||||||
f"Cloning spaCy project templates requires Git and the 'git' command. ",
|
f"Cloning spaCy project templates requires Git and the 'git' command. "
|
||||||
f"To clone a project without Git, copy the files from the '{name}' "
|
f"To clone a project without Git, copy the files from the '{name}' "
|
||||||
f"directory in the {repo} to {dest} manually.",
|
f"directory in the {repo} to {dest} manually."
|
||||||
)
|
)
|
||||||
get_git_version(error=git_err)
|
get_git_version(error=git_err)
|
||||||
if not dest:
|
if not dest:
|
||||||
|
|
|
@ -143,8 +143,8 @@ def run_dvc_commands(
|
||||||
easier to pass flags like --quiet that depend on a variable or
|
easier to pass flags like --quiet that depend on a variable or
|
||||||
command-line setting while avoiding lots of nested conditionals.
|
command-line setting while avoiding lots of nested conditionals.
|
||||||
"""
|
"""
|
||||||
for command in commands:
|
for c in commands:
|
||||||
command = split_command(command)
|
command = split_command(c)
|
||||||
dvc_command = ["dvc", *command]
|
dvc_command = ["dvc", *command]
|
||||||
# Add the flags if they are set to True
|
# Add the flags if they are set to True
|
||||||
for flag, is_active in flags.items():
|
for flag, is_active in flags.items():
|
||||||
|
|
|
@ -41,7 +41,7 @@ class RemoteStorage:
|
||||||
raise IOError(f"Cannot push {loc}: does not exist.")
|
raise IOError(f"Cannot push {loc}: does not exist.")
|
||||||
url = self.make_url(path, command_hash, content_hash)
|
url = self.make_url(path, command_hash, content_hash)
|
||||||
if url.exists():
|
if url.exists():
|
||||||
return None
|
return url
|
||||||
tmp: Path
|
tmp: Path
|
||||||
with make_tempdir() as tmp:
|
with make_tempdir() as tmp:
|
||||||
tar_loc = tmp / self.encode_name(str(path))
|
tar_loc = tmp / self.encode_name(str(path))
|
||||||
|
@ -131,8 +131,10 @@ def get_command_hash(
|
||||||
currently installed packages, whatever environment variables have been marked
|
currently installed packages, whatever environment variables have been marked
|
||||||
as relevant, and the command.
|
as relevant, and the command.
|
||||||
"""
|
"""
|
||||||
check_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
|
if check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION):
|
||||||
spacy_v = GIT_VERSION if check_commit else get_minor_version(about.__version__)
|
spacy_v = GIT_VERSION
|
||||||
|
else:
|
||||||
|
spacy_v = str(get_minor_version(about.__version__) or "")
|
||||||
dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
|
dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
|
||||||
hashes = [spacy_v, site_hash, env_hash] + dep_checksums
|
hashes = [spacy_v, site_hash, env_hash] + dep_checksums
|
||||||
hashes.extend(cmd)
|
hashes.extend(cmd)
|
||||||
|
|
|
@ -70,7 +70,7 @@ def project_run(
|
||||||
config = load_project_config(project_dir, overrides=overrides)
|
config = load_project_config(project_dir, overrides=overrides)
|
||||||
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
||||||
workflows = config.get("workflows", {})
|
workflows = config.get("workflows", {})
|
||||||
validate_subcommand(commands.keys(), workflows.keys(), subcommand)
|
validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
|
||||||
if subcommand in workflows:
|
if subcommand in workflows:
|
||||||
msg.info(f"Running workflow '{subcommand}'")
|
msg.info(f"Running workflow '{subcommand}'")
|
||||||
for cmd in workflows[subcommand]:
|
for cmd in workflows[subcommand]:
|
||||||
|
@ -116,7 +116,7 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
|
||||||
workflows = config.get("workflows", {})
|
workflows = config.get("workflows", {})
|
||||||
project_loc = "" if is_cwd(project_dir) else project_dir
|
project_loc = "" if is_cwd(project_dir) else project_dir
|
||||||
if subcommand:
|
if subcommand:
|
||||||
validate_subcommand(commands.keys(), workflows.keys(), subcommand)
|
validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
|
||||||
print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
|
print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
|
||||||
if subcommand in commands:
|
if subcommand in commands:
|
||||||
help_text = commands[subcommand].get("help")
|
help_text = commands[subcommand].get("help")
|
||||||
|
@ -164,8 +164,8 @@ def run_commands(
|
||||||
when you want to turn over execution to the command, and capture=True
|
when you want to turn over execution to the command, and capture=True
|
||||||
when you want to run the command more like a function.
|
when you want to run the command more like a function.
|
||||||
"""
|
"""
|
||||||
for command in commands:
|
for c in commands:
|
||||||
command = split_command(command)
|
command = split_command(c)
|
||||||
# Not sure if this is needed or a good idea. Motivation: users may often
|
# Not sure if this is needed or a good idea. Motivation: users may often
|
||||||
# use commands in their config that reference "python" and we want to
|
# use commands in their config that reference "python" and we want to
|
||||||
# make sure that it's always executing the same Python that spaCy is
|
# make sure that it's always executing the same Python that spaCy is
|
||||||
|
@ -294,7 +294,7 @@ def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, str]]:
|
def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]:
|
||||||
"""Generate the file information for a list of paths (dependencies, outputs).
|
"""Generate the file information for a list of paths (dependencies, outputs).
|
||||||
Includes the file path and the file's checksum.
|
Includes the file path and the file's checksum.
|
||||||
|
|
||||||
|
|
|
@ -32,7 +32,7 @@ batch_size = {{ 128 if hardware == "gpu" else 1000 }}
|
||||||
factory = "transformer"
|
factory = "transformer"
|
||||||
|
|
||||||
[components.transformer.model]
|
[components.transformer.model]
|
||||||
@architectures = "spacy-transformers.TransformerModel.v1"
|
@architectures = "spacy-transformers.TransformerModel.v3"
|
||||||
name = "{{ transformer["name"] }}"
|
name = "{{ transformer["name"] }}"
|
||||||
tokenizer_config = {"use_fast": true}
|
tokenizer_config = {"use_fast": true}
|
||||||
|
|
||||||
|
|
|
@ -99,7 +99,7 @@ def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
|
||||||
warnings.filterwarnings("ignore", message="\\[W09[45]")
|
warnings.filterwarnings("ignore", message="\\[W09[45]")
|
||||||
model_meta = get_model_meta(model_path)
|
model_meta = get_model_meta(model_path)
|
||||||
spacy_version = model_meta.get("spacy_version", "n/a")
|
spacy_version = model_meta.get("spacy_version", "n/a")
|
||||||
is_compat = is_compatible_version(about.__version__, spacy_version)
|
is_compat = is_compatible_version(about.__version__, spacy_version) # type: ignore[assignment]
|
||||||
pkgs[pkg_name] = {
|
pkgs[pkg_name] = {
|
||||||
"name": package,
|
"name": package,
|
||||||
"version": version,
|
"version": version,
|
||||||
|
|
|
@ -5,12 +5,12 @@ from thinc.util import copy_array
|
||||||
try:
|
try:
|
||||||
import cPickle as pickle
|
import cPickle as pickle
|
||||||
except ImportError:
|
except ImportError:
|
||||||
import pickle
|
import pickle # type: ignore[no-redef]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import copy_reg
|
import copy_reg
|
||||||
except ImportError:
|
except ImportError:
|
||||||
import copyreg as copy_reg
|
import copyreg as copy_reg # type: ignore[no-redef]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from cupy.cuda.stream import Stream as CudaStream
|
from cupy.cuda.stream import Stream as CudaStream
|
||||||
|
@ -22,9 +22,9 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
cupy = None
|
cupy = None
|
||||||
|
|
||||||
try: # Python 3.8+
|
if sys.version_info[:2] >= (3, 8): # Python 3.8+
|
||||||
from typing import Literal
|
from typing import Literal
|
||||||
except ImportError:
|
else:
|
||||||
from typing_extensions import Literal # noqa: F401
|
from typing_extensions import Literal # noqa: F401
|
||||||
|
|
||||||
# Important note: The importlib_metadata "backport" includes functionality
|
# Important note: The importlib_metadata "backport" includes functionality
|
||||||
|
@ -33,7 +33,7 @@ except ImportError:
|
||||||
try: # Python 3.8+
|
try: # Python 3.8+
|
||||||
import importlib.metadata as importlib_metadata
|
import importlib.metadata as importlib_metadata
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from catalogue import _importlib_metadata as importlib_metadata # noqa: F401
|
from catalogue import _importlib_metadata as importlib_metadata # type: ignore[no-redef] # noqa: F401
|
||||||
|
|
||||||
from thinc.api import Optimizer # noqa: F401
|
from thinc.api import Optimizer # noqa: F401
|
||||||
|
|
||||||
|
|
|
@ -18,7 +18,7 @@ RENDER_WRAPPER = None
|
||||||
|
|
||||||
|
|
||||||
def render(
|
def render(
|
||||||
docs: Union[Iterable[Union[Doc, Span]], Doc, Span],
|
docs: Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict],
|
||||||
style: str = "dep",
|
style: str = "dep",
|
||||||
page: bool = False,
|
page: bool = False,
|
||||||
minify: bool = False,
|
minify: bool = False,
|
||||||
|
@ -28,7 +28,8 @@ def render(
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Render displaCy visualisation.
|
"""Render displaCy visualisation.
|
||||||
|
|
||||||
docs (Union[Iterable[Doc], Doc]): Document(s) to visualise.
|
docs (Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict]]): Document(s) to visualise.
|
||||||
|
a 'dict' is only allowed here when 'manual' is set to True
|
||||||
style (str): Visualisation style, 'dep' or 'ent'.
|
style (str): Visualisation style, 'dep' or 'ent'.
|
||||||
page (bool): Render markup as full HTML page.
|
page (bool): Render markup as full HTML page.
|
||||||
minify (bool): Minify HTML markup.
|
minify (bool): Minify HTML markup.
|
||||||
|
@ -53,8 +54,8 @@ def render(
|
||||||
raise ValueError(Errors.E096)
|
raise ValueError(Errors.E096)
|
||||||
renderer_func, converter = factories[style]
|
renderer_func, converter = factories[style]
|
||||||
renderer = renderer_func(options=options)
|
renderer = renderer_func(options=options)
|
||||||
parsed = [converter(doc, options) for doc in docs] if not manual else docs
|
parsed = [converter(doc, options) for doc in docs] if not manual else docs # type: ignore
|
||||||
_html["parsed"] = renderer.render(parsed, page=page, minify=minify).strip()
|
_html["parsed"] = renderer.render(parsed, page=page, minify=minify).strip() # type: ignore
|
||||||
html = _html["parsed"]
|
html = _html["parsed"]
|
||||||
if RENDER_WRAPPER is not None:
|
if RENDER_WRAPPER is not None:
|
||||||
html = RENDER_WRAPPER(html)
|
html = RENDER_WRAPPER(html)
|
||||||
|
@ -133,7 +134,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
||||||
"lemma": np.root.lemma_,
|
"lemma": np.root.lemma_,
|
||||||
"ent_type": np.root.ent_type_,
|
"ent_type": np.root.ent_type_,
|
||||||
}
|
}
|
||||||
retokenizer.merge(np, attrs=attrs)
|
retokenizer.merge(np, attrs=attrs) # type: ignore[arg-type]
|
||||||
if options.get("collapse_punct", True):
|
if options.get("collapse_punct", True):
|
||||||
spans = []
|
spans = []
|
||||||
for word in doc[:-1]:
|
for word in doc[:-1]:
|
||||||
|
@ -148,7 +149,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
for span, tag, lemma, ent_type in spans:
|
for span, tag, lemma, ent_type in spans:
|
||||||
attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type}
|
attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type}
|
||||||
retokenizer.merge(span, attrs=attrs)
|
retokenizer.merge(span, attrs=attrs) # type: ignore[arg-type]
|
||||||
fine_grained = options.get("fine_grained")
|
fine_grained = options.get("fine_grained")
|
||||||
add_lemma = options.get("add_lemma")
|
add_lemma = options.get("add_lemma")
|
||||||
words = [
|
words = [
|
||||||
|
|
|
@ -25,7 +25,7 @@ def setup_default_warnings():
|
||||||
filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
|
filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
|
||||||
|
|
||||||
# warn once about lemmatizer without required POS
|
# warn once about lemmatizer without required POS
|
||||||
filter_warning("once", error_msg="[W108]")
|
filter_warning("once", error_msg=Warnings.W108)
|
||||||
|
|
||||||
|
|
||||||
def filter_warning(action: str, error_msg: str):
|
def filter_warning(action: str, error_msg: str):
|
||||||
|
@ -170,8 +170,8 @@ class Warnings:
|
||||||
"call the {matcher} on each Doc object.")
|
"call the {matcher} on each Doc object.")
|
||||||
W107 = ("The property `Doc.{prop}` is deprecated. Use "
|
W107 = ("The property `Doc.{prop}` is deprecated. Use "
|
||||||
"`Doc.has_annotation(\"{attr}\")` instead.")
|
"`Doc.has_annotation(\"{attr}\")` instead.")
|
||||||
W108 = ("The rule-based lemmatizer did not find POS annotation for the "
|
W108 = ("The rule-based lemmatizer did not find POS annotation for one or "
|
||||||
"token '{text}'. Check that your pipeline includes components that "
|
"more tokens. Check that your pipeline includes components that "
|
||||||
"assign token.pos, typically 'tagger'+'attribute_ruler' or "
|
"assign token.pos, typically 'tagger'+'attribute_ruler' or "
|
||||||
"'morphologizer'.")
|
"'morphologizer'.")
|
||||||
W109 = ("Unable to save user hooks while serializing the doc. Re-add any "
|
W109 = ("Unable to save user hooks while serializing the doc. Re-add any "
|
||||||
|
|
13
spacy/kb.pyx
13
spacy/kb.pyx
|
@ -1,5 +1,5 @@
|
||||||
# cython: infer_types=True, profile=True
|
# cython: infer_types=True, profile=True
|
||||||
from typing import Iterator, Iterable
|
from typing import Iterator, Iterable, Callable, Dict, Any
|
||||||
|
|
||||||
import srsly
|
import srsly
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
@ -96,6 +96,8 @@ cdef class KnowledgeBase:
|
||||||
def initialize_entities(self, int64_t nr_entities):
|
def initialize_entities(self, int64_t nr_entities):
|
||||||
self._entry_index = PreshMap(nr_entities + 1)
|
self._entry_index = PreshMap(nr_entities + 1)
|
||||||
self._entries = entry_vec(nr_entities + 1)
|
self._entries = entry_vec(nr_entities + 1)
|
||||||
|
|
||||||
|
def initialize_vectors(self, int64_t nr_entities):
|
||||||
self._vectors_table = float_matrix(nr_entities + 1)
|
self._vectors_table = float_matrix(nr_entities + 1)
|
||||||
|
|
||||||
def initialize_aliases(self, int64_t nr_aliases):
|
def initialize_aliases(self, int64_t nr_aliases):
|
||||||
|
@ -154,6 +156,7 @@ cdef class KnowledgeBase:
|
||||||
|
|
||||||
nr_entities = len(set(entity_list))
|
nr_entities = len(set(entity_list))
|
||||||
self.initialize_entities(nr_entities)
|
self.initialize_entities(nr_entities)
|
||||||
|
self.initialize_vectors(nr_entities)
|
||||||
|
|
||||||
i = 0
|
i = 0
|
||||||
cdef KBEntryC entry
|
cdef KBEntryC entry
|
||||||
|
@ -172,8 +175,8 @@ cdef class KnowledgeBase:
|
||||||
entry.entity_hash = entity_hash
|
entry.entity_hash = entity_hash
|
||||||
entry.freq = freq_list[i]
|
entry.freq = freq_list[i]
|
||||||
|
|
||||||
vector_index = self.c_add_vector(entity_vector=vector_list[i])
|
self._vectors_table[i] = entity_vector
|
||||||
entry.vector_index = vector_index
|
entry.vector_index = i
|
||||||
|
|
||||||
entry.feats_row = -1 # Features table currently not implemented
|
entry.feats_row = -1 # Features table currently not implemented
|
||||||
|
|
||||||
|
@ -386,6 +389,7 @@ cdef class KnowledgeBase:
|
||||||
nr_aliases = header[1]
|
nr_aliases = header[1]
|
||||||
entity_vector_length = header[2]
|
entity_vector_length = header[2]
|
||||||
self.initialize_entities(nr_entities)
|
self.initialize_entities(nr_entities)
|
||||||
|
self.initialize_vectors(nr_entities)
|
||||||
self.initialize_aliases(nr_aliases)
|
self.initialize_aliases(nr_aliases)
|
||||||
self.entity_vector_length = entity_vector_length
|
self.entity_vector_length = entity_vector_length
|
||||||
|
|
||||||
|
@ -446,7 +450,7 @@ cdef class KnowledgeBase:
|
||||||
raise ValueError(Errors.E929.format(loc=path))
|
raise ValueError(Errors.E929.format(loc=path))
|
||||||
if not path.is_dir():
|
if not path.is_dir():
|
||||||
raise ValueError(Errors.E928.format(loc=path))
|
raise ValueError(Errors.E928.format(loc=path))
|
||||||
deserialize = {}
|
deserialize: Dict[str, Callable[[Any], Any]] = {}
|
||||||
deserialize["contents"] = lambda p: self.read_contents(p)
|
deserialize["contents"] = lambda p: self.read_contents(p)
|
||||||
deserialize["strings.json"] = lambda p: self.vocab.strings.from_disk(p)
|
deserialize["strings.json"] = lambda p: self.vocab.strings.from_disk(p)
|
||||||
util.from_disk(path, deserialize, exclude)
|
util.from_disk(path, deserialize, exclude)
|
||||||
|
@ -509,6 +513,7 @@ cdef class KnowledgeBase:
|
||||||
reader.read_header(&nr_entities, &entity_vector_length)
|
reader.read_header(&nr_entities, &entity_vector_length)
|
||||||
|
|
||||||
self.initialize_entities(nr_entities)
|
self.initialize_entities(nr_entities)
|
||||||
|
self.initialize_vectors(nr_entities)
|
||||||
self.entity_vector_length = entity_vector_length
|
self.entity_vector_length = entity_vector_length
|
||||||
|
|
||||||
# STEP 1: load entity vectors
|
# STEP 1: load entity vectors
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class AfrikaansDefaults(Language.Defaults):
|
class AfrikaansDefaults(BaseDefaults):
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -4,12 +4,12 @@ from .punctuation import TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
from ...attrs import LANG
|
from ...attrs import LANG
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
|
||||||
|
|
||||||
class AmharicDefaults(Language.Defaults):
|
class AmharicDefaults(BaseDefaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters.update(LEX_ATTRS)
|
lex_attr_getters.update(LEX_ATTRS)
|
||||||
lex_attr_getters[LANG] = lambda text: "am"
|
lex_attr_getters[LANG] = lambda text: "am"
|
||||||
|
|
|
@ -2,10 +2,10 @@ from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .punctuation import TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class ArabicDefaults(Language.Defaults):
|
class ArabicDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class AzerbaijaniDefaults(Language.Defaults):
|
class AzerbaijaniDefaults(BaseDefaults):
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
|
@ -3,12 +3,12 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
from ...attrs import LANG
|
from ...attrs import LANG
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
|
||||||
|
|
||||||
class BulgarianDefaults(Language.Defaults):
|
class BulgarianDefaults(BaseDefaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters[LANG] = lambda text: "bg"
|
lex_attr_getters[LANG] = lambda text: "bg"
|
||||||
|
|
||||||
|
|
|
@ -3,11 +3,11 @@ from thinc.api import Model
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
from ...pipeline import Lemmatizer
|
from ...pipeline import Lemmatizer
|
||||||
|
|
||||||
|
|
||||||
class BengaliDefaults(Language.Defaults):
|
class BengaliDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
prefixes = TOKENIZER_PREFIXES
|
prefixes = TOKENIZER_PREFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
|
|
@ -7,11 +7,11 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
from .lemmatizer import CatalanLemmatizer
|
from .lemmatizer import CatalanLemmatizer
|
||||||
|
|
||||||
|
|
||||||
class CatalanDefaults(Language.Defaults):
|
class CatalanDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
|
from typing import Union, Iterator, Tuple
|
||||||
|
from ...tokens import Doc, Span
|
||||||
from ...symbols import NOUN, PROPN
|
from ...symbols import NOUN, PROPN
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike):
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||||
# fmt: off
|
# fmt: off
|
||||||
labels = ["nsubj", "nsubj:pass", "obj", "obl", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
labels = ["nsubj", "nsubj:pass", "obj", "obl", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class CzechDefaults(Language.Defaults):
|
class CzechDefaults(BaseDefaults):
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
|
@ -3,10 +3,10 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class DanishDefaults(Language.Defaults):
|
class DanishDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
|
from typing import Union, Iterator, Tuple
|
||||||
|
from ...tokens import Doc, Span
|
||||||
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike):
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
def is_verb_token(tok):
|
def is_verb_token(tok):
|
||||||
return tok.pos in [VERB, AUX]
|
return tok.pos in [VERB, AUX]
|
||||||
|
|
||||||
|
@ -32,7 +34,7 @@ def noun_chunks(doclike):
|
||||||
def get_bounds(doc, root):
|
def get_bounds(doc, root):
|
||||||
return get_left_bound(doc, root), get_right_bound(doc, root)
|
return get_left_bound(doc, root), get_right_bound(doc, root)
|
||||||
|
|
||||||
doc = doclike.doc
|
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||||
|
|
||||||
if not doc.has_annotation("DEP"):
|
if not doc.has_annotation("DEP"):
|
||||||
raise ValueError(Errors.E029)
|
raise ValueError(Errors.E029)
|
||||||
|
|
|
@ -2,10 +2,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class GermanDefaults(Language.Defaults):
|
class GermanDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
prefixes = TOKENIZER_PREFIXES
|
prefixes = TOKENIZER_PREFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
from typing import Union, Iterator
|
from typing import Union, Iterator, Tuple
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
from ...symbols import NOUN, PROPN, PRON
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
from ...tokens import Doc, Span
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||||
# this iterator extracts spans headed by NOUNs starting from the left-most
|
# this iterator extracts spans headed by NOUNs starting from the left-most
|
||||||
# syntactic dependent until the NOUN itself for close apposition and
|
# syntactic dependent until the NOUN itself for close apposition and
|
||||||
|
|
|
@ -7,10 +7,10 @@ from .lex_attrs import LEX_ATTRS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||||
from .lemmatizer import GreekLemmatizer
|
from .lemmatizer import GreekLemmatizer
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class GreekDefaults(Language.Defaults):
|
class GreekDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
prefixes = TOKENIZER_PREFIXES
|
prefixes = TOKENIZER_PREFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
from typing import Union, Iterator
|
from typing import Union, Iterator, Tuple
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
from ...symbols import NOUN, PROPN, PRON
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
from ...tokens import Doc, Span
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||||
# It follows the logic of the noun chunks finder of English language,
|
# It follows the logic of the noun chunks finder of English language,
|
||||||
# adjusted to some Greek language special characteristics.
|
# adjusted to some Greek language special characteristics.
|
||||||
|
|
|
@ -7,10 +7,10 @@ from .lex_attrs import LEX_ATTRS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from .punctuation import TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_INFIXES
|
||||||
from .lemmatizer import EnglishLemmatizer
|
from .lemmatizer import EnglishLemmatizer
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class EnglishDefaults(Language.Defaults):
|
class EnglishDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
|
|
|
@ -19,7 +19,7 @@ _ordinal_words = [
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
def like_num(text: str) -> bool:
|
def like_num(text):
|
||||||
if text.startswith(("+", "-", "±", "~")):
|
if text.startswith(("+", "-", "±", "~")):
|
||||||
text = text[1:]
|
text = text[1:]
|
||||||
text = text.replace(",", "").replace(".", "")
|
text = text.replace(",", "").replace(".", "")
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
from typing import Union, Iterator
|
from typing import Union, Iterator, Tuple
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
from ...symbols import NOUN, PROPN, PRON
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
from ...tokens import Doc, Span
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
"""
|
"""
|
||||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -1,9 +1,10 @@
|
||||||
|
from typing import Dict, List
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ...symbols import ORTH, NORM
|
from ...symbols import ORTH, NORM
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
|
||||||
|
|
||||||
_exc = {}
|
_exc: Dict[str, List[Dict]] = {}
|
||||||
_exclude = [
|
_exclude = [
|
||||||
"Ill",
|
"Ill",
|
||||||
"ill",
|
"ill",
|
||||||
|
@ -294,9 +295,9 @@ for verb_data in [
|
||||||
{ORTH: "has", NORM: "has"},
|
{ORTH: "has", NORM: "has"},
|
||||||
{ORTH: "dare", NORM: "dare"},
|
{ORTH: "dare", NORM: "dare"},
|
||||||
]:
|
]:
|
||||||
verb_data_tc = dict(verb_data)
|
verb_data_tc = dict(verb_data) # type: ignore[call-overload]
|
||||||
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
||||||
for data in [verb_data, verb_data_tc]:
|
for data in [verb_data, verb_data_tc]: # type: ignore[assignment]
|
||||||
_exc[data[ORTH] + "n't"] = [
|
_exc[data[ORTH] + "n't"] = [
|
||||||
dict(data),
|
dict(data),
|
||||||
{ORTH: "n't", NORM: "not"},
|
{ORTH: "n't", NORM: "not"},
|
||||||
|
|
|
@ -6,10 +6,10 @@ from .lex_attrs import LEX_ATTRS
|
||||||
from .lemmatizer import SpanishLemmatizer
|
from .lemmatizer import SpanishLemmatizer
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class SpanishDefaults(Language.Defaults):
|
class SpanishDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
|
|
@ -52,7 +52,7 @@ class SpanishLemmatizer(Lemmatizer):
|
||||||
rule_pos = "verb"
|
rule_pos = "verb"
|
||||||
else:
|
else:
|
||||||
rule_pos = pos
|
rule_pos = pos
|
||||||
rule = self.select_rule(rule_pos, features)
|
rule = self.select_rule(rule_pos, list(features))
|
||||||
index = self.lookups.get_table("lemma_index").get(rule_pos, [])
|
index = self.lookups.get_table("lemma_index").get(rule_pos, [])
|
||||||
lemmas = getattr(self, "lemmatize_" + rule_pos)(
|
lemmas = getattr(self, "lemmatize_" + rule_pos)(
|
||||||
string, features, rule, index
|
string, features, rule, index
|
||||||
|
@ -191,6 +191,8 @@ class SpanishLemmatizer(Lemmatizer):
|
||||||
return selected_lemmas
|
return selected_lemmas
|
||||||
else:
|
else:
|
||||||
return possible_lemmas
|
return possible_lemmas
|
||||||
|
else:
|
||||||
|
return []
|
||||||
|
|
||||||
def lemmatize_noun(
|
def lemmatize_noun(
|
||||||
self, word: str, features: List[str], rule: str, index: List[str]
|
self, word: str, features: List[str], rule: str, index: List[str]
|
||||||
|
@ -268,7 +270,7 @@ class SpanishLemmatizer(Lemmatizer):
|
||||||
return [word]
|
return [word]
|
||||||
|
|
||||||
def lemmatize_pron(
|
def lemmatize_pron(
|
||||||
self, word: str, features: List[str], rule: str, index: List[str]
|
self, word: str, features: List[str], rule: Optional[str], index: List[str]
|
||||||
) -> List[str]:
|
) -> List[str]:
|
||||||
"""
|
"""
|
||||||
Lemmatize a pronoun.
|
Lemmatize a pronoun.
|
||||||
|
@ -319,9 +321,11 @@ class SpanishLemmatizer(Lemmatizer):
|
||||||
return selected_lemmas
|
return selected_lemmas
|
||||||
else:
|
else:
|
||||||
return possible_lemmas
|
return possible_lemmas
|
||||||
|
else:
|
||||||
|
return []
|
||||||
|
|
||||||
def lemmatize_verb(
|
def lemmatize_verb(
|
||||||
self, word: str, features: List[str], rule: str, index: List[str]
|
self, word: str, features: List[str], rule: Optional[str], index: List[str]
|
||||||
) -> List[str]:
|
) -> List[str]:
|
||||||
"""
|
"""
|
||||||
Lemmatize a verb.
|
Lemmatize a verb.
|
||||||
|
@ -342,6 +346,7 @@ class SpanishLemmatizer(Lemmatizer):
|
||||||
selected_lemmas = []
|
selected_lemmas = []
|
||||||
|
|
||||||
# Apply lemmatization rules
|
# Apply lemmatization rules
|
||||||
|
rule = str(rule or "")
|
||||||
for old, new in self.lookups.get_table("lemma_rules").get(rule, []):
|
for old, new in self.lookups.get_table("lemma_rules").get(rule, []):
|
||||||
possible_lemma = re.sub(old + "$", new, word)
|
possible_lemma = re.sub(old + "$", new, word)
|
||||||
if possible_lemma != word:
|
if possible_lemma != word:
|
||||||
|
@ -389,11 +394,11 @@ class SpanishLemmatizer(Lemmatizer):
|
||||||
return [word]
|
return [word]
|
||||||
|
|
||||||
def lemmatize_verb_pron(
|
def lemmatize_verb_pron(
|
||||||
self, word: str, features: List[str], rule: str, index: List[str]
|
self, word: str, features: List[str], rule: Optional[str], index: List[str]
|
||||||
) -> List[str]:
|
) -> List[str]:
|
||||||
# Strip and collect pronouns
|
# Strip and collect pronouns
|
||||||
pron_patt = "^(.*?)([mts]e|l[aeo]s?|n?os)$"
|
pron_patt = "^(.*?)([mts]e|l[aeo]s?|n?os)$"
|
||||||
prons = []
|
prons: List[str] = []
|
||||||
verb = word
|
verb = word
|
||||||
m = re.search(pron_patt, verb)
|
m = re.search(pron_patt, verb)
|
||||||
while m is not None and len(prons) <= 3:
|
while m is not None and len(prons) <= 3:
|
||||||
|
@ -410,7 +415,7 @@ class SpanishLemmatizer(Lemmatizer):
|
||||||
else:
|
else:
|
||||||
rule = self.select_rule("verb", features)
|
rule = self.select_rule("verb", features)
|
||||||
verb_lemma = self.lemmatize_verb(
|
verb_lemma = self.lemmatize_verb(
|
||||||
verb, features - {"PronType=Prs"}, rule, index
|
verb, features - {"PronType=Prs"}, rule, index # type: ignore[operator]
|
||||||
)[0]
|
)[0]
|
||||||
pron_lemmas = []
|
pron_lemmas = []
|
||||||
for pron in prons:
|
for pron in prons:
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
from typing import Union, Iterator
|
from typing import Union, Iterator, Tuple
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
from ...tokens import Doc, Span, Token
|
from ...tokens import Doc, Span, Token
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||||
doc = doclike.doc
|
doc = doclike.doc
|
||||||
if not doc.has_annotation("DEP"):
|
if not doc.has_annotation("DEP"):
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class EstonianDefaults(Language.Defaults):
|
class EstonianDefaults(BaseDefaults):
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .punctuation import TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class BasqueDefaults(Language.Defaults):
|
class BasqueDefaults(BaseDefaults):
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
|
|
|
@ -5,11 +5,11 @@ from .lex_attrs import LEX_ATTRS
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .punctuation import TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
from ...pipeline import Lemmatizer
|
from ...pipeline import Lemmatizer
|
||||||
|
|
||||||
|
|
||||||
class PersianDefaults(Language.Defaults):
|
class PersianDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
|
|
|
@ -639,10 +639,12 @@ for verb_root in verb_roots:
|
||||||
)
|
)
|
||||||
|
|
||||||
if past.startswith("آ"):
|
if past.startswith("آ"):
|
||||||
conjugations = set(
|
conjugations = list(
|
||||||
map(
|
set(
|
||||||
lambda item: item.replace("بآ", "بیا").replace("نآ", "نیا"),
|
map(
|
||||||
conjugations,
|
lambda item: item.replace("بآ", "بیا").replace("نآ", "نیا"),
|
||||||
|
conjugations,
|
||||||
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
|
from typing import Union, Iterator, Tuple
|
||||||
|
from ...tokens import Doc, Span
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
from ...symbols import NOUN, PROPN, PRON
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike):
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
"""
|
"""
|
||||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -2,10 +2,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class FinnishDefaults(Language.Defaults):
|
class FinnishDefaults(BaseDefaults):
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
|
|
|
@ -9,10 +9,10 @@ from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from .lemmatizer import FrenchLemmatizer
|
from .lemmatizer import FrenchLemmatizer
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class FrenchDefaults(Language.Defaults):
|
class FrenchDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
prefixes = TOKENIZER_PREFIXES
|
prefixes = TOKENIZER_PREFIXES
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
from typing import Union, Iterator
|
from typing import Union, Iterator, Tuple
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
from ...symbols import NOUN, PROPN, PRON
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
from ...tokens import Doc, Span
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||||
# fmt: off
|
# fmt: off
|
||||||
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
||||||
|
|
|
@ -115,7 +115,7 @@ for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]:
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
_infixes_exc = []
|
_infixes_exc = [] # type: ignore[var-annotated]
|
||||||
orig_elision = "'"
|
orig_elision = "'"
|
||||||
orig_hyphen = "-"
|
orig_hyphen = "-"
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class IrishDefaults(Language.Defaults):
|
class IrishDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class AncientGreekDefaults(Language.Defaults):
|
class AncientGreekDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
|
@ -108,8 +108,4 @@ _other_exc = {
|
||||||
|
|
||||||
_exc.update(_other_exc)
|
_exc.update(_other_exc)
|
||||||
|
|
||||||
_exc_data = {}
|
|
||||||
|
|
||||||
_exc.update(_exc_data)
|
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class GujaratiDefaults(Language.Defaults):
|
class GujaratiDefaults(BaseDefaults):
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class HebrewDefaults(Language.Defaults):
|
class HebrewDefaults(BaseDefaults):
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
|
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class HindiDefaults(Language.Defaults):
|
class HindiDefaults(BaseDefaults):
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class CroatianDefaults(Language.Defaults):
|
class CroatianDefaults(BaseDefaults):
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class HungarianDefaults(Language.Defaults):
|
class HungarianDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
prefixes = TOKENIZER_PREFIXES
|
prefixes = TOKENIZER_PREFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class ArmenianDefaults(Language.Defaults):
|
class ArmenianDefaults(BaseDefaults):
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
|
@ -3,10 +3,10 @@ from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIX
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class IndonesianDefaults(Language.Defaults):
|
class IndonesianDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
prefixes = TOKENIZER_PREFIXES
|
prefixes = TOKENIZER_PREFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
from typing import Union, Iterator
|
from typing import Union, Iterator, Tuple
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
from ...symbols import NOUN, PROPN, PRON
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
from ...tokens import Doc, Span
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
"""
|
"""
|
||||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class IcelandicDefaults(Language.Defaults):
|
class IcelandicDefaults(BaseDefaults):
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -4,11 +4,11 @@ from thinc.api import Model
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
from .lemmatizer import ItalianLemmatizer
|
from .lemmatizer import ItalianLemmatizer
|
||||||
|
|
||||||
|
|
||||||
class ItalianDefaults(Language.Defaults):
|
class ItalianDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
prefixes = TOKENIZER_PREFIXES
|
prefixes = TOKENIZER_PREFIXES
|
||||||
|
|
|
@ -10,7 +10,7 @@ from .tag_orth_map import TAG_ORTH_MAP
|
||||||
from .tag_bigram_map import TAG_BIGRAM_MAP
|
from .tag_bigram_map import TAG_BIGRAM_MAP
|
||||||
from ...compat import copy_reg
|
from ...compat import copy_reg
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
from ...scorer import Scorer
|
from ...scorer import Scorer
|
||||||
from ...symbols import POS
|
from ...symbols import POS
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
|
@ -154,7 +154,7 @@ class JapaneseTokenizer(DummyTokenizer):
|
||||||
def to_disk(self, path: Union[str, Path], **kwargs) -> None:
|
def to_disk(self, path: Union[str, Path], **kwargs) -> None:
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
|
serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
|
||||||
return util.to_disk(path, serializers, [])
|
util.to_disk(path, serializers, [])
|
||||||
|
|
||||||
def from_disk(self, path: Union[str, Path], **kwargs) -> "JapaneseTokenizer":
|
def from_disk(self, path: Union[str, Path], **kwargs) -> "JapaneseTokenizer":
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
|
@ -164,7 +164,7 @@ class JapaneseTokenizer(DummyTokenizer):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
class JapaneseDefaults(Language.Defaults):
|
class JapaneseDefaults(BaseDefaults):
|
||||||
config = load_config_from_str(DEFAULT_CONFIG)
|
config = load_config_from_str(DEFAULT_CONFIG)
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
syntax_iterators = SYNTAX_ITERATORS
|
syntax_iterators = SYNTAX_ITERATORS
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Union, Iterator
|
from typing import Union, Iterator, Tuple, Set
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON, VERB
|
from ...symbols import NOUN, PROPN, PRON, VERB
|
||||||
from ...tokens import Doc, Span
|
from ...tokens import Doc, Span
|
||||||
|
@ -10,13 +10,13 @@ labels = ["nsubj", "nmod", "ddoclike", "nsubjpass", "pcomp", "pdoclike", "doclik
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||||
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||||
doc.vocab.strings.add("conj")
|
doc.vocab.strings.add("conj")
|
||||||
np_label = doc.vocab.strings.add("NP")
|
np_label = doc.vocab.strings.add("NP")
|
||||||
seen = set()
|
seen: Set[int] = set()
|
||||||
for i, word in enumerate(doclike):
|
for i, word in enumerate(doclike):
|
||||||
if word.pos not in (NOUN, PROPN, PRON):
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
continue
|
continue
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class KannadaDefaults(Language.Defaults):
|
class KannadaDefaults(BaseDefaults):
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
from typing import Optional, Any, Dict
|
from typing import Iterator, Any, Dict
|
||||||
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .tag_map import TAG_MAP
|
from .tag_map import TAG_MAP
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
from ...compat import copy_reg
|
from ...compat import copy_reg
|
||||||
from ...scorer import Scorer
|
from ...scorer import Scorer
|
||||||
|
@ -29,9 +29,9 @@ def create_tokenizer():
|
||||||
|
|
||||||
|
|
||||||
class KoreanTokenizer(DummyTokenizer):
|
class KoreanTokenizer(DummyTokenizer):
|
||||||
def __init__(self, nlp: Optional[Language] = None):
|
def __init__(self, nlp: Language):
|
||||||
self.vocab = nlp.vocab
|
self.vocab = nlp.vocab
|
||||||
MeCab = try_mecab_import()
|
MeCab = try_mecab_import() # type: ignore[func-returns-value]
|
||||||
self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
|
self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
|
||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
|
@ -49,7 +49,7 @@ class KoreanTokenizer(DummyTokenizer):
|
||||||
doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
|
doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def detailed_tokens(self, text: str) -> Dict[str, Any]:
|
def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
|
||||||
# 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
|
# 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
|
||||||
# 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
|
# 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
|
||||||
for node in self.mecab_tokenizer.parse(text, as_nodes=True):
|
for node in self.mecab_tokenizer.parse(text, as_nodes=True):
|
||||||
|
@ -68,7 +68,7 @@ class KoreanTokenizer(DummyTokenizer):
|
||||||
return Scorer.score_tokenization(examples)
|
return Scorer.score_tokenization(examples)
|
||||||
|
|
||||||
|
|
||||||
class KoreanDefaults(Language.Defaults):
|
class KoreanDefaults(BaseDefaults):
|
||||||
config = load_config_from_str(DEFAULT_CONFIG)
|
config = load_config_from_str(DEFAULT_CONFIG)
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
|
@ -2,10 +2,10 @@ from .lex_attrs import LEX_ATTRS
|
||||||
from .punctuation import TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_INFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class KyrgyzDefaults(Language.Defaults):
|
class KyrgyzDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
|
|
|
@ -2,10 +2,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .punctuation import TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_INFIXES
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class LuxembourgishDefaults(Language.Defaults):
|
class LuxembourgishDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .punctuation import TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_INFIXES
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class LigurianDefaults(Language.Defaults):
|
class LigurianDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
|
@ -2,10 +2,10 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class LithuanianDefaults(Language.Defaults):
|
class LithuanianDefaults(BaseDefaults):
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class LatvianDefaults(Language.Defaults):
|
class LatvianDefaults(BaseDefaults):
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -6,13 +6,13 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
from ...attrs import LANG
|
from ...attrs import LANG
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
from ...lookups import Lookups
|
from ...lookups import Lookups
|
||||||
|
|
||||||
|
|
||||||
class MacedonianDefaults(Language.Defaults):
|
class MacedonianDefaults(BaseDefaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters[LANG] = lambda text: "mk"
|
lex_attr_getters[LANG] = lambda text: "mk"
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class MalayalamDefaults(Language.Defaults):
|
class MalayalamDefaults(BaseDefaults):
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class MarathiDefaults(Language.Defaults):
|
class MarathiDefaults(BaseDefaults):
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -5,11 +5,11 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||||
from .punctuation import TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
from ...pipeline import Lemmatizer
|
from ...pipeline import Lemmatizer
|
||||||
|
|
||||||
|
|
||||||
class NorwegianDefaults(Language.Defaults):
|
class NorwegianDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
prefixes = TOKENIZER_PREFIXES
|
prefixes = TOKENIZER_PREFIXES
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
from typing import Union, Iterator
|
from typing import Union, Iterator, Tuple
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
from ...symbols import NOUN, PROPN, PRON
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
from ...tokens import Doc, Span
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||||
# fmt: off
|
# fmt: off
|
||||||
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class NepaliDefaults(Language.Defaults):
|
class NepaliDefaults(BaseDefaults):
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
|
|
||||||
|
|
|
@ -9,10 +9,10 @@ from .punctuation import TOKENIZER_SUFFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class DutchDefaults(Language.Defaults):
|
class DutchDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
prefixes = TOKENIZER_PREFIXES
|
prefixes = TOKENIZER_PREFIXES
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
from typing import Union, Iterator
|
from typing import Union, Iterator, Tuple
|
||||||
|
|
||||||
from ...symbols import NOUN, PRON
|
from ...symbols import NOUN, PRON
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
from ...tokens import Doc, Span
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
"""
|
"""
|
||||||
Detect base noun phrases from a dependency parse. Works on Doc and Span.
|
Detect base noun phrases from a dependency parse. Works on Doc and Span.
|
||||||
The definition is inspired by https://www.nltk.org/book/ch07.html
|
The definition is inspired by https://www.nltk.org/book/ch07.html
|
||||||
|
|
|
@ -8,7 +8,7 @@ from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .lemmatizer import PolishLemmatizer
|
from .lemmatizer import PolishLemmatizer
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = {
|
TOKENIZER_EXCEPTIONS = {
|
||||||
|
@ -16,7 +16,7 @@ TOKENIZER_EXCEPTIONS = {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class PolishDefaults(Language.Defaults):
|
class PolishDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
prefixes = TOKENIZER_PREFIXES
|
prefixes = TOKENIZER_PREFIXES
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
|
|
|
@ -2,10 +2,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class PortugueseDefaults(Language.Defaults):
|
class PortugueseDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
prefixes = TOKENIZER_PREFIXES
|
prefixes = TOKENIZER_PREFIXES
|
||||||
|
|
|
@ -3,14 +3,14 @@ from .stop_words import STOP_WORDS
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||||
from .punctuation import TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
# Lemma data note:
|
# Lemma data note:
|
||||||
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
|
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
|
||||||
# Replaced characters using cedillas with the correct ones (ș and ț)
|
# Replaced characters using cedillas with the correct ones (ș and ț)
|
||||||
|
|
||||||
|
|
||||||
class RomanianDefaults(Language.Defaults):
|
class RomanianDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
prefixes = TOKENIZER_PREFIXES
|
prefixes = TOKENIZER_PREFIXES
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
suffixes = TOKENIZER_SUFFIXES
|
||||||
|
|
|
@ -5,10 +5,10 @@ from .stop_words import STOP_WORDS
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .lemmatizer import RussianLemmatizer
|
from .lemmatizer import RussianLemmatizer
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class RussianDefaults(Language.Defaults):
|
class RussianDefaults(BaseDefaults):
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
|
@ -56,7 +56,9 @@ class RussianLemmatizer(Lemmatizer):
|
||||||
if not len(filtered_analyses):
|
if not len(filtered_analyses):
|
||||||
return [string.lower()]
|
return [string.lower()]
|
||||||
if morphology is None or (len(morphology) == 1 and POS in morphology):
|
if morphology is None or (len(morphology) == 1 and POS in morphology):
|
||||||
return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]))
|
return list(
|
||||||
|
dict.fromkeys([analysis.normal_form for analysis in filtered_analyses])
|
||||||
|
)
|
||||||
if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
|
if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
|
||||||
features_to_compare = ["Case", "Number", "Gender"]
|
features_to_compare = ["Case", "Number", "Gender"]
|
||||||
elif univ_pos == "NUM":
|
elif univ_pos == "NUM":
|
||||||
|
@ -87,7 +89,9 @@ class RussianLemmatizer(Lemmatizer):
|
||||||
filtered_analyses.append(analysis)
|
filtered_analyses.append(analysis)
|
||||||
if not len(filtered_analyses):
|
if not len(filtered_analyses):
|
||||||
return [string.lower()]
|
return [string.lower()]
|
||||||
return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]))
|
return list(
|
||||||
|
dict.fromkeys([analysis.normal_form for analysis in filtered_analyses])
|
||||||
|
)
|
||||||
|
|
||||||
def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
|
def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
|
||||||
string = token.text
|
string = token.text
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class SanskritDefaults(Language.Defaults):
|
class SanskritDefaults(BaseDefaults):
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language
|
from ...language import Language, BaseDefaults
|
||||||
|
|
||||||
|
|
||||||
class SinhalaDefaults(Language.Defaults):
|
class SinhalaDefaults(BaseDefaults):
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user