Merge pull request #9540 from adrianeboyd/chore/update-develop-from-master-v3.2-1

Update develop from master for v3.2
Commit 4170110ce7 by Adriane Boyd, 2021-10-27 08:23:57 +02:00 (committed by GitHub)
202 changed files with 1635 additions and 968 deletions


@@ -25,6 +25,9 @@ steps:
       ${{ parameters.prefix }} python setup.py sdist --formats=gztar
     displayName: "Compile and build sdist"
+  - script: python -m mypy spacy
+    displayName: 'Run mypy'
   - task: DeleteFiles@1
     inputs:
       contents: "spacy"
@@ -109,3 +112,9 @@ steps:
       python .github/validate_universe_json.py website/meta/universe.json
     displayName: 'Test website/meta/universe.json'
     condition: eq(variables['python_version'], '3.8')
+  - script: |
+      ${{ parameters.prefix }} python -m pip install thinc-apple-ops
+      ${{ parameters.prefix }} python -m pytest --pyargs spacy
+    displayName: "Run CPU tests with thinc-apple-ops"
+    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.9'))

.github/contributors/connorbrinton.md (new file, 106 lines)

@@ -0,0 +1,106 @@
# spaCy contributor agreement
This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.
If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.
## Contributor Agreement
1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:
* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;
* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;
* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;
* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and
* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:
* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and
* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.
4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.
5. You covenant, represent, warrant and agree that:
* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;
* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and
* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.
6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
* [ ] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Connor Brinton |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | July 20th, 2021 |
| GitHub username | connorbrinton |
| Website (optional) | |

.github/lock.yml (deleted, 19 lines)

@@ -1,19 +0,0 @@
# Configuration for lock-threads - https://github.com/dessant/lock-threads

# Number of days of inactivity before a closed issue or pull request is locked
daysUntilLock: 30

# Issues and pull requests with these labels will not be locked. Set to `[]` to disable
exemptLabels: []

# Label to add before locking, such as `outdated`. Set to `false` to disable
lockLabel: false

# Comment to post before locking. Set to `false` to disable
lockComment: >
  This thread has been automatically locked since there has not been
  any recent activity after it was closed. Please open a new issue for
  related bugs.

# Limit to only `issues` or `pulls`
only: issues


@@ -23,5 +23,5 @@ jobs:
       env:
         INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }}
         INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
-        ENABLED_COMMANDS: "test_gpu"
+        ENABLED_COMMANDS: "test_gpu,test_slow"
         ALLOWED_TEAMS: "spaCy"

.github/workflows/lock.yml (new file, 25 lines)

@@ -0,0 +1,25 @@
name: 'Lock Threads'

on:
  schedule:
    - cron: '0 0 * * *' # check every day
  workflow_dispatch:

permissions:
  issues: write

concurrency:
  group: lock

jobs:
  action:
    runs-on: ubuntu-latest
    steps:
      - uses: dessant/lock-threads@v3
        with:
          process-only: 'issues'
          issue-inactive-days: '30'
          issue-comment: >
            This thread has been automatically locked since there
            has not been any recent activity after it was closed.
            Please open a new issue for related bugs.


@@ -419,7 +419,7 @@ simply click on the "Suggest edits" button at the bottom of a page.
 ## Publishing spaCy extensions and plugins
 We're very excited about all the new possibilities for **community extensions**
-and plugins in spaCy v2.0, and we can't wait to see what you build with it!
+and plugins in spaCy v3.0, and we can't wait to see what you build with it!
 - An extension or plugin should add substantial functionality, be
   **well-documented** and **open-source**. It should be available for users to download


@@ -1,5 +1,5 @@
 recursive-include include *.h
-recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja *.toml
+recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml
 include LICENSE
 include README.md
 include pyproject.toml


@@ -12,15 +12,11 @@ trigger:
       - "website/*"
       - "*.md"
 pr:
   paths:
-    include:
-      - "*.cfg"
-      - "*.py"
-      - "*.toml"
-      - "*.yml"
-      - ".github/azure-steps.yml"
-      - "spacy/*"
-      - "website/meta/universe.json"
+    exclude:
+      - "*.md"
+      - "website/docs/*"
+      - "website/src/*"
 jobs:
 # Perform basic checks for most important errors (syntax etc.) Uses the config


@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.10,<8.1.0",
+    "thinc>=8.0.11,<8.1.0",
     "blis>=0.4.0,<0.8.0",
     "pathy",
     "numpy>=1.15.0",


@@ -3,7 +3,7 @@ spacy-legacy>=3.0.8,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.10,<8.1.0
+thinc>=8.0.11,<8.1.0
 blis>=0.4.0,<0.8.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
@@ -18,6 +18,7 @@ requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
 pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0
 jinja2
+langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
@@ -30,4 +31,7 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<3.10.0
 hypothesis>=3.27.0,<7.0.0
-langcodes>=3.2.0,<4.0.0
+mypy>=0.910
+types-dataclasses>=0.1.3; python_version < "3.7"
+types-mock>=0.1.1
+types-requests


@@ -37,7 +37,7 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.10,<8.1.0
+    thinc>=8.0.11,<8.1.0
 install_requires =
     # Our libraries
     spacy-legacy>=3.0.8,<3.1.0
@@ -45,7 +45,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.10,<8.1.0
+    thinc>=8.0.11,<8.1.0
     blis>=0.4.0,<0.8.0
     wasabi>=0.8.1,<1.1.0
     srsly>=2.4.1,<3.0.0
@@ -72,7 +72,7 @@ console_scripts =
 lookups =
     spacy_lookups_data>=1.0.3,<1.1.0
 transformers =
-    spacy_transformers>=1.0.1,<1.1.0
+    spacy_transformers>=1.0.1,<1.2.0
 ray =
     spacy_ray>=0.1.0,<1.0.0
 cuda =
@@ -131,3 +131,4 @@ markers =
 ignore_missing_imports = True
 no_implicit_optional = True
 plugins = pydantic.mypy, thinc.mypy
+allow_redefinition = True
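
For context, mypy's `allow_redefinition` option permits rebinding a name to a different type as long as the rebinding happens in the same block and nesting level. A minimal illustrative sketch (hypothetical code, not taken from this commit):

```python
from typing import List


def parse_ids(raw: str) -> List[int]:
    # With allow_redefinition = True, mypy accepts rebinding `parts` from
    # List[str] to List[int]; without the flag this second assignment is an
    # "incompatible types in assignment" error.
    parts = raw.split(",")
    parts = [int(p) for p in parts]
    return parts
```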


@@ -1,4 +1,5 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, TYPE_CHECKING
+from typing import Dict, Any, Union, List, Optional, Tuple, Iterable
+from typing import TYPE_CHECKING, overload
 import sys
 import shutil
 from pathlib import Path
@@ -15,6 +16,7 @@ from thinc.util import has_cupy, gpu_is_available
 from configparser import InterpolationError
 import os
 
+from ..compat import Literal
 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
 from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
@@ -260,15 +262,16 @@ def get_checksum(path: Union[Path, str]) -> str:
     RETURNS (str): The checksum.
     """
     path = Path(path)
+    if not (path.is_file() or path.is_dir()):
+        msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
     if path.is_file():
         return hashlib.md5(Path(path).read_bytes()).hexdigest()
-    if path.is_dir():
+    else:
         # TODO: this is currently pretty slow
         dir_checksum = hashlib.md5()
         for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
             dir_checksum.update(sub_file.read_bytes())
         return dir_checksum.hexdigest()
-    msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
 
 
 @contextmanager
@@ -468,12 +471,15 @@ def get_git_version(
     RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
         (0, 0) if the version couldn't be determined.
     """
-    ret = run_command("git --version", capture=True)
+    try:
+        ret = run_command("git --version", capture=True)
+    except:
+        raise RuntimeError(error)
     stdout = ret.stdout.strip()
     if not stdout or not stdout.startswith("git version"):
-        return (0, 0)
+        return 0, 0
     version = stdout[11:].strip().split(".")
-    return (int(version[0]), int(version[1]))
+    return int(version[0]), int(version[1])
 
 
 def _http_to_git(repo: str) -> str:
@@ -500,6 +506,16 @@ def is_subpath_of(parent, child):
     return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath
 
 
+@overload
+def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]:
+    ...
+
+
+@overload
+def string_to_list(value: str, intify: Literal[True]) -> List[int]:
+    ...
+
+
 def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
     """Parse a comma-separated string to a list and account for various
     formatting options. Mostly used to handle CLI arguments that take a list of
@@ -510,7 +526,7 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
     RETURNS (Union[List[str], List[int]]): A list of strings or ints.
     """
     if not value:
-        return []
+        return []  # type: ignore[return-value]
     if value.startswith("[") and value.endswith("]"):
         value = value[1:-1]
     result = []
@@ -522,7 +538,7 @@ def string_to_list(value: str, intify: bool = False) -> Union[List[str], List[int]]:
             p = p[1:-1]
         p = p.strip()
         if intify:
-            p = int(p)
+            p = int(p)  # type: ignore[assignment]
         result.append(p)
     return result
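
A hypothetical caller, to show what the new `@overload` signatures buy: when `intify` is passed as a literal, a type checker can narrow the return type to `List[str]` or `List[int]` instead of the union (assumes spaCy is installed):

```python
from spacy.cli._util import string_to_list

names = string_to_list("en,de,fr")                   # checker infers List[str]
sizes = string_to_list("100,200,300", intify=True)   # checker infers List[int]
print(sum(sizes), names)
```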


@@ -1,4 +1,4 @@
-from typing import Optional, Any, List, Union
+from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
 from enum import Enum
 from pathlib import Path
 from wasabi import Printer
@@ -9,7 +9,7 @@ import itertools
 from ._util import app, Arg, Opt
 from ..training import docs_to_json
-from ..tokens import DocBin
+from ..tokens import Doc, DocBin
 from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
 from ..training.converters import conllu_to_docs
@@ -19,7 +19,7 @@ from ..training.converters import conllu_to_docs
 # entry to this dict with the file extension mapped to the converter function
 # imported from /converters.
-CONVERTERS = {
+CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = {
     "conllubio": conllu_to_docs,
     "conllu": conllu_to_docs,
     "conll": conll_ner_to_docs,
@@ -66,19 +66,16 @@ def convert_cli(
     DOCS: https://spacy.io/api/cli#convert
     """
-    if isinstance(file_type, FileTypes):
-        # We get an instance of the FileTypes from the CLI so we need its string value
-        file_type = file_type.value
     input_path = Path(input_path)
-    output_dir = "-" if output_dir == Path("-") else output_dir
+    output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
     silent = output_dir == "-"
     msg = Printer(no_print=silent)
-    verify_cli_args(msg, input_path, output_dir, file_type, converter, ner_map)
+    verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
     converter = _get_converter(msg, converter, input_path)
     convert(
         input_path,
         output_dir,
-        file_type=file_type,
+        file_type=file_type.value,
         n_sents=n_sents,
         seg_sents=seg_sents,
         model=model,
@@ -94,7 +91,7 @@ def convert_cli(
 def convert(
-    input_path: Union[str, Path],
+    input_path: Path,
     output_dir: Union[str, Path],
     *,
     file_type: str = "json",
@@ -108,13 +105,14 @@ def convert(
     lang: Optional[str] = None,
     concatenate: bool = False,
     silent: bool = True,
-    msg: Optional[Printer],
+    msg: Optional[Printer] = None,
 ) -> None:
+    input_path = Path(input_path)
     if not msg:
         msg = Printer(no_print=silent)
     ner_map = srsly.read_json(ner_map) if ner_map is not None else None
     doc_files = []
-    for input_loc in walk_directory(Path(input_path), converter):
+    for input_loc in walk_directory(input_path, converter):
         with input_loc.open("r", encoding="utf-8") as infile:
             input_data = infile.read()
         # Use converter function to convert data
@@ -141,7 +139,7 @@ def convert(
     else:
         db = DocBin(docs=docs, store_user_data=True)
         len_docs = len(db)
-        data = db.to_bytes()
+        data = db.to_bytes()  # type: ignore[assignment]
     if output_dir == "-":
         _print_docs_to_stdout(data, file_type)
     else:
@@ -220,13 +218,12 @@ def walk_directory(path: Path, converter: str) -> List[Path]:
 def verify_cli_args(
     msg: Printer,
-    input_path: Union[str, Path],
+    input_path: Path,
     output_dir: Union[str, Path],
-    file_type: FileTypes,
+    file_type: str,
     converter: str,
     ner_map: Optional[Path],
 ):
-    input_path = Path(input_path)
     if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
         msg.fail(
             f"Can't write .{file_type} data to stdout. Please specify an output directory.",
@@ -244,13 +241,13 @@ def verify_cli_args(
         msg.fail("No input files in directory", input_path, exits=1)
     file_types = list(set([loc.suffix[1:] for loc in input_locs]))
     if converter == "auto" and len(file_types) >= 2:
-        file_types = ",".join(file_types)
-        msg.fail("All input files must be same type", file_types, exits=1)
+        file_types_str = ",".join(file_types)
+        msg.fail("All input files must be same type", file_types_str, exits=1)
     if converter != "auto" and converter not in CONVERTERS:
         msg.fail(f"Can't find converter for {converter}", exits=1)
 
 
-def _get_converter(msg, converter, input_path):
+def _get_converter(msg, converter, input_path: Path):
     if input_path.is_dir():
         input_path = walk_directory(input_path, converter)[0]
     if converter == "auto":


@@ -1,4 +1,5 @@
-from typing import List, Sequence, Dict, Any, Tuple, Optional, Set
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
+from typing import cast, overload
 from pathlib import Path
 from collections import Counter
 import sys
@@ -17,6 +18,7 @@ from ..pipeline import Morphologizer
 from ..morphology import Morphology
 from ..language import Language
 from ..util import registry, resolve_dot_names
+from ..compat import Literal
 from .. import util
@@ -201,7 +203,6 @@ def debug_data(
         has_low_data_warning = False
         has_no_neg_warning = False
         has_ws_ents_error = False
-        has_punct_ents_warning = False
 
         msg.divider("Named Entity Recognition")
         msg.info(f"{len(model_labels)} label(s)")
@@ -228,10 +229,6 @@ def debug_data(
             msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans")
             has_ws_ents_error = True
 
-        if gold_train_data["punct_ents"]:
-            msg.warn(f"{gold_train_data['punct_ents']} entity span(s) with punctuation")
-            has_punct_ents_warning = True
-
         for label in labels:
             if label_counts[label] <= NEW_LABEL_THRESHOLD:
                 msg.warn(
@@ -251,8 +248,6 @@ def debug_data(
             msg.good("Examples without occurrences available for all labels")
         if not has_ws_ents_error:
             msg.good("No entities consisting of or starting/ending with whitespace")
-        if not has_punct_ents_warning:
-            msg.good("No entities consisting of or starting/ending with punctuation")
 
         if has_low_data_warning:
             msg.text(
@@ -268,15 +263,9 @@ def debug_data(
                 show=verbose,
             )
         if has_ws_ents_error:
-            msg.text(
-                "As of spaCy v2.1.0, entity spans consisting of or starting/ending "
-                "with whitespace characters are considered invalid."
-            )
-        if has_punct_ents_warning:
             msg.text(
                 "Entity spans consisting of or starting/ending "
-                "with punctuation can not be trained with a noise level > 0."
+                "with whitespace characters are considered invalid."
             )
 
     if "textcat" in factory_names:
@@ -378,10 +367,11 @@ def debug_data(
     if "tagger" in factory_names:
         msg.divider("Part-of-speech Tagging")
-        labels = [label for label in gold_train_data["tags"]]
+        label_list = [label for label in gold_train_data["tags"]]
         model_labels = _get_labels_from_model(nlp, "tagger")
-        msg.info(f"{len(labels)} label(s) in train data")
-        missing_labels = model_labels - set(labels)
+        msg.info(f"{len(label_list)} label(s) in train data")
+        labels = set(label_list)
+        missing_labels = model_labels - labels
         if missing_labels:
             msg.warn(
                 "Some model labels are not present in the train data. The "
@@ -395,10 +385,11 @@ def debug_data(
     if "morphologizer" in factory_names:
         msg.divider("Morphologizer (POS+Morph)")
-        labels = [label for label in gold_train_data["morphs"]]
+        label_list = [label for label in gold_train_data["morphs"]]
         model_labels = _get_labels_from_model(nlp, "morphologizer")
-        msg.info(f"{len(labels)} label(s) in train data")
-        missing_labels = model_labels - set(labels)
+        msg.info(f"{len(label_list)} label(s) in train data")
+        labels = set(label_list)
+        missing_labels = model_labels - labels
         if missing_labels:
             msg.warn(
                 "Some model labels are not present in the train data. The "
@@ -565,7 +556,7 @@ def _compile_gold(
     nlp: Language,
     make_proj: bool,
 ) -> Dict[str, Any]:
-    data = {
+    data: Dict[str, Any] = {
         "ner": Counter(),
         "cats": Counter(),
         "tags": Counter(),
@@ -574,7 +565,6 @@ def _compile_gold(
         "words": Counter(),
         "roots": Counter(),
         "ws_ents": 0,
-        "punct_ents": 0,
         "n_words": 0,
         "n_misaligned_words": 0,
         "words_missing_vectors": Counter(),
@@ -609,16 +599,6 @@ def _compile_gold(
                 if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
                     # "Illegal" whitespace entity
                     data["ws_ents"] += 1
-                if label.startswith(("B-", "U-", "L-")) and doc[i].text in [
-                    ".",
-                    "'",
-                    "!",
-                    "?",
-                    ",",
-                ]:
-                    # punctuation entity: could be replaced by whitespace when training with noise,
-                    # so add a warning to alert the user to this unexpected side effect.
-                    data["punct_ents"] += 1
                 if label.startswith(("B-", "U-")):
                     combined_label = label.split("-")[1]
                     data["ner"][combined_label] += 1
@@ -670,10 +650,28 @@
     return data
 
 
-def _format_labels(labels: List[Tuple[str, int]], counts: bool = False) -> str:
+@overload
+def _format_labels(labels: Iterable[str], counts: Literal[False] = False) -> str:
+    ...
+
+
+@overload
+def _format_labels(
+    labels: Iterable[Tuple[str, int]],
+    counts: Literal[True],
+) -> str:
+    ...
+
+
+def _format_labels(
+    labels: Union[Iterable[str], Iterable[Tuple[str, int]]],
+    counts: bool = False,
+) -> str:
     if counts:
-        return ", ".join([f"'{l}' ({c})" for l, c in labels])
-    return ", ".join([f"'{l}'" for l in labels])
+        return ", ".join(
+            [f"'{l}' ({c})" for l, c in cast(Iterable[Tuple[str, int]], labels)]
+        )
+    return ", ".join([f"'{l}'" for l in cast(Iterable[str], labels)])
 
 
 def _get_examples_without_label(data: Sequence[Example], label: str) -> int:


@@ -136,7 +136,7 @@ def evaluate(
 def handle_scores_per_type(
-    scores: Union[Scorer, Dict[str, Any]],
+    scores: Dict[str, Any],
     data: Dict[str, Any] = {},
     *,
     spans_key: str = "sc",


@@ -15,7 +15,7 @@ def info_cli(
     model: Optional[str] = Arg(None, help="Optional loadable spaCy pipeline"),
     markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
     silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
-    exclude: Optional[str] = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"),
+    exclude: str = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"),
     # fmt: on
 ):
     """
@@ -61,7 +61,7 @@ def info(
     return raw_data
 
 
-def info_spacy() -> Dict[str, any]:
+def info_spacy() -> Dict[str, Any]:
     """Generate info about the current spaCy intallation.
 
     RETURNS (dict): The spaCy info.


@@ -28,8 +28,8 @@ class Optimizations(str, Enum):
 def init_config_cli(
     # fmt: off
     output_file: Path = Arg(..., help="File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
-    lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
-    pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
+    lang: str = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"),
+    pipeline: str = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
     optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
     gpu: bool = Opt(False, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
     pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
@@ -44,8 +44,6 @@ def init_config_cli(
     DOCS: https://spacy.io/api/cli#init-config
     """
-    if isinstance(optimize, Optimizations):  # instance of enum from the CLI
-        optimize = optimize.value
     pipeline = string_to_list(pipeline)
     is_stdout = str(output_file) == "-"
     if not is_stdout and output_file.exists() and not force_overwrite:
@@ -57,7 +55,7 @@ def init_config_cli(
     config = init_config(
         lang=lang,
         pipeline=pipeline,
-        optimize=optimize,
+        optimize=optimize.value,
         gpu=gpu,
         pretraining=pretraining,
         silent=is_stdout,
@@ -175,8 +173,8 @@ def init_config(
         "Pipeline": ", ".join(pipeline),
         "Optimize for": optimize,
         "Hardware": variables["hardware"].upper(),
-        "Transformer": template_vars.transformer.get("name")
-        if template_vars.use_transformer
+        "Transformer": template_vars.transformer.get("name")  # type: ignore[attr-defined]
+        if template_vars.use_transformer  # type: ignore[attr-defined]
         else None,
     }
     msg.info("Generated config template specific for your use case")


@@ -1,4 +1,4 @@
-from typing import Optional, Union, Any, Dict, List, Tuple
+from typing import Optional, Union, Any, Dict, List, Tuple, cast
 import shutil
 from pathlib import Path
 from wasabi import Printer, MarkdownRenderer, get_raw_input
@@ -215,9 +215,9 @@ def get_third_party_dependencies(
     for reg_name, func_names in funcs.items():
         for func_name in func_names:
             func_info = util.registry.find(reg_name, func_name)
-            module_name = func_info.get("module")
+            module_name = func_info.get("module")  # type: ignore[attr-defined]
             if module_name:  # the code is part of a module, not a --code file
-                modules.add(func_info["module"].split(".")[0])
+                modules.add(func_info["module"].split(".")[0])  # type: ignore[index]
     dependencies = []
     for module_name in modules:
         if module_name in distributions:
@@ -227,7 +227,7 @@ def get_third_party_dependencies(
             if pkg in own_packages or pkg in exclude:
                 continue
             version = util.get_package_version(pkg)
-            version_range = util.get_minor_version_range(version)
+            version_range = util.get_minor_version_range(version)  # type: ignore[arg-type]
             dependencies.append(f"{pkg}{version_range}")
     return dependencies
@@ -252,7 +252,7 @@ def create_file(file_path: Path, contents: str) -> None:
 def get_meta(
     model_path: Union[str, Path], existing_meta: Dict[str, Any]
 ) -> Dict[str, Any]:
-    meta = {
+    meta: Dict[str, Any] = {
         "lang": "en",
         "name": "pipeline",
         "version": "0.0.0",
@@ -324,8 +324,8 @@ def generate_readme(meta: Dict[str, Any]) -> str:
     license_name = meta.get("license")
     sources = _format_sources(meta.get("sources"))
     description = meta.get("description")
-    label_scheme = _format_label_scheme(meta.get("labels"))
-    accuracy = _format_accuracy(meta.get("performance"))
+    label_scheme = _format_label_scheme(cast(Dict[str, Any], meta.get("labels")))
+    accuracy = _format_accuracy(cast(Dict[str, Any], meta.get("performance")))
     table_data = [
         (md.bold("Name"), md.code(name)),
         (md.bold("Version"), md.code(version)),


@@ -32,7 +32,7 @@ def profile_cli(
     DOCS: https://spacy.io/api/cli#debug-profile
     """
-    if ctx.parent.command.name == NAME:  # called as top-level command
+    if ctx.parent.command.name == NAME:  # type: ignore[union-attr] # called as top-level command
         msg.warn(
             "The profile command is now available via the 'debug profile' "
             "subcommand. You can run python -m spacy debug --help for an "
@@ -42,9 +42,9 @@ def profile_cli(
 def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None:
     if inputs is not None:
-        inputs = _read_inputs(inputs, msg)
+        texts = _read_inputs(inputs, msg)
+        texts = list(itertools.islice(texts, n_texts))
     if inputs is None:
         try:
             import ml_datasets
@@ -56,16 +56,13 @@ def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None:
                 exits=1,
             )
-        n_inputs = 25000
-        with msg.loading("Loading IMDB dataset via Thinc..."):
-            imdb_train, _ = ml_datasets.imdb()
-            inputs, _ = zip(*imdb_train)
-        msg.info(f"Loaded IMDB dataset and using {n_inputs} examples")
-        inputs = inputs[:n_inputs]
+        with msg.loading("Loading IMDB dataset via ml_datasets..."):
+            imdb_train, _ = ml_datasets.imdb(train_limit=n_texts, dev_limit=0)
+            texts, _ = zip(*imdb_train)
+        msg.info(f"Loaded IMDB dataset and using {n_texts} examples")
     with msg.loading(f"Loading pipeline '{model}'..."):
         nlp = load_model(model)
     msg.good(f"Loaded pipeline '{model}'")
-    texts = list(itertools.islice(inputs, n_texts))
     cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
     s = pstats.Stats("Profile.prof")
     msg.divider("Profile stats")
@@ -87,7 +84,7 @@ def _read_inputs(loc: Union[Path, str], msg: Printer) -> Iterator[str]:
     if not input_path.exists() or not input_path.is_file():
         msg.fail("Not a valid input data file", loc, exits=1)
     msg.info(f"Using data from {input_path.parts[-1]}")
-    file_ = input_path.open()
+    file_ = input_path.open()  # type: ignore[assignment]
     for line in file_:
         data = srsly.json_loads(line)
         text = data["text"]


@@ -133,7 +133,6 @@ def fetch_asset(
     # If there's already a file, check for checksum
     if checksum == get_checksum(dest_path):
         msg.good(f"Skipping download with matching checksum: {dest}")
-        return dest_path
     # We might as well support the user here and create parent directories in
     # case the asset dir isn't listed as a dir to create in the project.yml
     if not dest_path.parent.exists():
@@ -150,7 +149,6 @@ def fetch_asset(
             msg.good(f"Copied local asset {dest}")
         else:
             msg.fail(f"Download failed: {dest}", e)
-        return
     if checksum and checksum != get_checksum(dest_path):
         msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")


@@ -80,9 +80,9 @@ def check_clone(name: str, dest: Path, repo: str) -> None:
     repo (str): URL of the repo to clone from.
     """
     git_err = (
-        f"Cloning spaCy project templates requires Git and the 'git' command. ",
+        f"Cloning spaCy project templates requires Git and the 'git' command. "
         f"To clone a project without Git, copy the files from the '{name}' "
-        f"directory in the {repo} to {dest} manually.",
+        f"directory in the {repo} to {dest} manually."
     )
     get_git_version(error=git_err)
     if not dest:


@@ -143,8 +143,8 @@ def run_dvc_commands(
     easier to pass flags like --quiet that depend on a variable or
     command-line setting while avoiding lots of nested conditionals.
     """
-    for command in commands:
-        command = split_command(command)
+    for c in commands:
+        command = split_command(c)
         dvc_command = ["dvc", *command]
         # Add the flags if they are set to True
         for flag, is_active in flags.items():


@@ -41,7 +41,7 @@ class RemoteStorage:
             raise IOError(f"Cannot push {loc}: does not exist.")
         url = self.make_url(path, command_hash, content_hash)
         if url.exists():
-            return None
+            return url
         tmp: Path
         with make_tempdir() as tmp:
             tar_loc = tmp / self.encode_name(str(path))
@@ -131,8 +131,10 @@ def get_command_hash(
     currently installed packages, whatever environment variables have been marked
     as relevant, and the command.
     """
-    check_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
-    spacy_v = GIT_VERSION if check_commit else get_minor_version(about.__version__)
+    if check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION):
+        spacy_v = GIT_VERSION
+    else:
+        spacy_v = str(get_minor_version(about.__version__) or "")
     dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
     hashes = [spacy_v, site_hash, env_hash] + dep_checksums
     hashes.extend(cmd)


@@ -70,7 +70,7 @@ def project_run(
     config = load_project_config(project_dir, overrides=overrides)
     commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
     workflows = config.get("workflows", {})
-    validate_subcommand(commands.keys(), workflows.keys(), subcommand)
+    validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
     if subcommand in workflows:
         msg.info(f"Running workflow '{subcommand}'")
         for cmd in workflows[subcommand]:
@@ -116,7 +116,7 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
     workflows = config.get("workflows", {})
     project_loc = "" if is_cwd(project_dir) else project_dir
     if subcommand:
-        validate_subcommand(commands.keys(), workflows.keys(), subcommand)
+        validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
         print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
         if subcommand in commands:
             help_text = commands[subcommand].get("help")
@@ -164,8 +164,8 @@ def run_commands(
     when you want to turn over execution to the command, and capture=True
     when you want to run the command more like a function.
     """
-    for command in commands:
-        command = split_command(command)
+    for c in commands:
+        command = split_command(c)
         # Not sure if this is needed or a good idea. Motivation: users may often
         # use commands in their config that reference "python" and we want to
         # make sure that it's always executing the same Python that spaCy is
@@ -294,7 +294,7 @@ def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]:
     }
 
 
-def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, str]]:
+def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]:
     """Generate the file information for a list of paths (dependencies, outputs).
     Includes the file path and the file's checksum.


@@ -16,7 +16,8 @@ gpu_allocator = null
 [nlp]
 lang = "{{ lang }}"
-{%- if "tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "entity_linker" in components or (("textcat" in components or "textcat_multilabel" in components) and optimize == "accuracy") -%}
+{%- set no_tok2vec = components|length == 1 and (("textcat" in components or "textcat_multilabel" in components) and optimize == "efficiency")-%}
+{%- if not no_tok2vec and ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "entity_linker" in components or "textcat" in components or "textcat_multilabel" in components) -%}
 {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
 {%- else -%}
 {%- set full_pipeline = components %}
@@ -32,7 +33,7 @@ batch_size = {{ 128 if hardware == "gpu" else 1000 }}
 factory = "transformer"
 
 [components.transformer.model]
-@architectures = "spacy-transformers.TransformerModel.v1"
+@architectures = "spacy-transformers.TransformerModel.v3"
 name = "{{ transformer["name"] }}"
 tokenizer_config = {"use_fast": true}
@@ -198,7 +199,7 @@ no_output_layer = false
 {# NON-TRANSFORMER PIPELINE #}
 {% else -%}
+{% if not no_tok2vec-%}
 [components.tok2vec]
 factory = "tok2vec"
@@ -223,6 +224,7 @@ width = {{ 96 if optimize == "efficiency" else 256 }}
 depth = {{ 4 if optimize == "efficiency" else 8 }}
 window_size = 1
 maxout_pieces = 3
+{% endif -%}
 
 {% if "morphologizer" in components %}
 [components.morphologizer]


@@ -99,7 +99,7 @@ def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
             warnings.filterwarnings("ignore", message="\\[W09[45]")
             model_meta = get_model_meta(model_path)
         spacy_version = model_meta.get("spacy_version", "n/a")
-        is_compat = is_compatible_version(about.__version__, spacy_version)
+        is_compat = is_compatible_version(about.__version__, spacy_version)  # type: ignore[assignment]
         pkgs[pkg_name] = {
             "name": package,
            "version": version,


@@ -5,12 +5,12 @@ from thinc.util import copy_array
 try:
     import cPickle as pickle
 except ImportError:
-    import pickle
+    import pickle  # type: ignore[no-redef]
 
 try:
     import copy_reg
 except ImportError:
-    import copyreg as copy_reg
+    import copyreg as copy_reg  # type: ignore[no-redef]
 
 try:
     from cupy.cuda.stream import Stream as CudaStream
@@ -22,10 +22,10 @@ try:
 except ImportError:
     cupy = None
 
-try:  # Python 3.8+
-    from typing import Literal
-except ImportError:
-    from typing_extensions import Literal  # noqa: F401
+if sys.version_info[:2] >= (3, 8):  # Python 3.8+
+    from typing import Literal, Protocol, runtime_checkable
+else:
+    from typing_extensions import Literal, Protocol, runtime_checkable  # noqa: F401
 
 # Important note: The importlib_metadata "backport" includes functionality
 # that's not part of the built-in importlib.metadata. We should treat this
@@ -33,7 +33,7 @@ except ImportError:
 try:  # Python 3.8+
     import importlib.metadata as importlib_metadata
 except ImportError:
-    from catalogue import _importlib_metadata as importlib_metadata  # noqa: F401
+    from catalogue import _importlib_metadata as importlib_metadata  # type: ignore[no-redef] # noqa: F401
 
 from thinc.api import Optimizer  # noqa: F401
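
Since `Protocol` and `runtime_checkable` are now re-exported alongside `Literal`, code that still supports Python 3.7 can pull all three from `spacy.compat`. A small hypothetical sketch (the class and function names are illustrative, not from this commit):

```python
from spacy.compat import Literal, Protocol, runtime_checkable


@runtime_checkable
class SupportsToDisk(Protocol):
    def to_disk(self, path: str) -> None:
        ...


def describe(obj: object) -> Literal["serializable", "plain"]:
    # isinstance() works with Protocol subclasses marked @runtime_checkable
    return "serializable" if isinstance(obj, SupportsToDisk) else "plain"
```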


@@ -18,7 +18,7 @@ RENDER_WRAPPER = None
 def render(
-    docs: Union[Iterable[Union[Doc, Span]], Doc, Span],
+    docs: Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict],
     style: str = "dep",
     page: bool = False,
     minify: bool = False,
@@ -28,7 +28,8 @@ def render(
 ) -> str:
     """Render displaCy visualisation.
 
-    docs (Union[Iterable[Doc], Doc]): Document(s) to visualise.
+    docs (Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict]]): Document(s) to visualise.
+        a 'dict' is only allowed here when 'manual' is set to True
     style (str): Visualisation style, 'dep' or 'ent'.
     page (bool): Render markup as full HTML page.
     minify (bool): Minify HTML markup.
@@ -53,8 +54,8 @@ def render(
         raise ValueError(Errors.E096)
     renderer_func, converter = factories[style]
     renderer = renderer_func(options=options)
-    parsed = [converter(doc, options) for doc in docs] if not manual else docs
-    _html["parsed"] = renderer.render(parsed, page=page, minify=minify).strip()
+    parsed = [converter(doc, options) for doc in docs] if not manual else docs  # type: ignore
+    _html["parsed"] = renderer.render(parsed, page=page, minify=minify).strip()  # type: ignore
     html = _html["parsed"]
     if RENDER_WRAPPER is not None:
         html = RENDER_WRAPPER(html)
@@ -133,7 +134,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
                 "lemma": np.root.lemma_,
                 "ent_type": np.root.ent_type_,
             }
-            retokenizer.merge(np, attrs=attrs)
+            retokenizer.merge(np, attrs=attrs)  # type: ignore[arg-type]
     if options.get("collapse_punct", True):
         spans = []
         for word in doc[:-1]:
@@ -148,7 +149,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
         with doc.retokenize() as retokenizer:
             for span, tag, lemma, ent_type in spans:
                 attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type}
-                retokenizer.merge(span, attrs=attrs)
+                retokenizer.merge(span, attrs=attrs)  # type: ignore[arg-type]
     fine_grained = options.get("fine_grained")
     add_lemma = options.get("add_lemma")
     words = [
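
The widened `docs` type reflects that pre-formatted dicts are accepted when `manual=True`. A brief sketch of that usage (example data adapted from the displaCy docs):

```python
from spacy import displacy

# With manual=True, render() accepts an already-parsed dict instead of a Doc.
ent_data = {
    "text": "But Google is starting from behind.",
    "ents": [{"start": 4, "end": 10, "label": "ORG"}],
    "title": None,
}
html = displacy.render(ent_data, style="ent", manual=True, page=True)
print(html[:60])
```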


@@ -190,6 +190,8 @@ class Warnings:
             "vectors. This is almost certainly a mistake.")
     W113 = ("Sourced component '{name}' may not work as expected: source "
             "vectors are not identical to current pipeline vectors.")
+    W114 = ("Using multiprocessing with GPU models is not recommended and may "
+            "lead to errors.")
 
 
 @add_codes
View File

@@ -1,5 +1,5 @@
 # cython: infer_types=True, profile=True
-from typing import Iterator, Iterable
+from typing import Iterator, Iterable, Callable, Dict, Any
 import srsly
 from cymem.cymem cimport Pool
@@ -96,6 +96,8 @@ cdef class KnowledgeBase:
     def initialize_entities(self, int64_t nr_entities):
         self._entry_index = PreshMap(nr_entities + 1)
         self._entries = entry_vec(nr_entities + 1)
+
+    def initialize_vectors(self, int64_t nr_entities):
         self._vectors_table = float_matrix(nr_entities + 1)

     def initialize_aliases(self, int64_t nr_aliases):
@@ -154,6 +156,7 @@ cdef class KnowledgeBase:
         nr_entities = len(set(entity_list))
         self.initialize_entities(nr_entities)
+        self.initialize_vectors(nr_entities)
         i = 0
         cdef KBEntryC entry
@@ -172,8 +175,8 @@ cdef class KnowledgeBase:
             entry.entity_hash = entity_hash
             entry.freq = freq_list[i]
-            vector_index = self.c_add_vector(entity_vector=vector_list[i])
-            entry.vector_index = vector_index
+            self._vectors_table[i] = entity_vector
+            entry.vector_index = i
             entry.feats_row = -1  # Features table currently not implemented
@@ -386,6 +389,7 @@ cdef class KnowledgeBase:
         nr_aliases = header[1]
         entity_vector_length = header[2]
         self.initialize_entities(nr_entities)
+        self.initialize_vectors(nr_entities)
         self.initialize_aliases(nr_aliases)
         self.entity_vector_length = entity_vector_length
@@ -446,7 +450,7 @@ cdef class KnowledgeBase:
             raise ValueError(Errors.E929.format(loc=path))
         if not path.is_dir():
             raise ValueError(Errors.E928.format(loc=path))
-        deserialize = {}
+        deserialize: Dict[str, Callable[[Any], Any]] = {}
         deserialize["contents"] = lambda p: self.read_contents(p)
         deserialize["strings.json"] = lambda p: self.vocab.strings.from_disk(p)
         util.from_disk(path, deserialize, exclude)
@@ -509,6 +513,7 @@ cdef class KnowledgeBase:
         reader.read_header(&nr_entities, &entity_vector_length)
         self.initialize_entities(nr_entities)
+        self.initialize_vectors(nr_entities)
         self.entity_vector_length = entity_vector_length
         # STEP 1: load entity vectors
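From the outside, the split into initialize_entities / initialize_vectors / initialize_aliases is invisible: the knowledge base is still filled through set_entities and add_alias. A hedged usage sketch, assuming spaCy v3.x; the entity IDs, frequencies, vectors and path are made up:

    # Sketch only: entity IDs, counts, vectors and the path are illustrative.
    import spacy
    from spacy.kb import KnowledgeBase

    nlp = spacy.blank("en")
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)

    # set_entities() is the public call that now runs through
    # initialize_entities() and the new initialize_vectors().
    kb.set_entities(
        entity_list=["Q42", "Q3107329"],
        freq_list=[342, 17],
        vector_list=[[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
    )
    kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[0.9])

    kb.to_disk("./my_kb")   # loading back goes through the typed deserialize dict above
    kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
    kb2.from_disk("./my_kb")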

View File

@@ -1,8 +1,8 @@
 from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class AfrikaansDefaults(Language.Defaults):
+class AfrikaansDefaults(BaseDefaults):
     stop_words = STOP_WORDS
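This Language.Defaults to BaseDefaults swap repeats mechanically for every language package below; BaseDefaults is now importable directly from spacy.language. For user code that defines a custom language, the pattern looks like the sketch here, assuming spaCy v3.x (the "custom_en" code and class names are invented for illustration):

    # Sketch only: "custom_en" and the class names are illustrative.
    import spacy
    from spacy.lang.en import English

    class CustomEnglishDefaults(English.Defaults):  # English.Defaults is a BaseDefaults subclass
        stop_words = {"custom", "stop"}

    @spacy.registry.languages("custom_en")
    class CustomEnglish(English):
        lang = "custom_en"
        Defaults = CustomEnglishDefaults

    nlp = spacy.blank("custom_en")
    print("custom" in nlp.Defaults.stop_words)  # True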

View File

@@ -4,12 +4,12 @@ from .punctuation import TOKENIZER_SUFFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language
+from ...language import Language, BaseDefaults
 from ...attrs import LANG
 from ...util import update_exc

-class AmharicDefaults(Language.Defaults):
+class AmharicDefaults(BaseDefaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "am"

View File

@@ -2,10 +2,10 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class ArabicDefaults(Language.Defaults):
+class ArabicDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     suffixes = TOKENIZER_SUFFIXES
     stop_words = STOP_WORDS

View File

@@ -1,9 +1,9 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class AzerbaijaniDefaults(Language.Defaults):
+class AzerbaijaniDefaults(BaseDefaults):
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

View File

@@ -3,12 +3,12 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language
+from ...language import Language, BaseDefaults
 from ...attrs import LANG
 from ...util import update_exc

-class BulgarianDefaults(Language.Defaults):
+class BulgarianDefaults(BaseDefaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: "bg"

View File

@@ -3,11 +3,11 @@ from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults
 from ...pipeline import Lemmatizer

-class BengaliDefaults(Language.Defaults):
+class BengaliDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES

View File

@@ -7,11 +7,11 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIX
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language
+from ...language import Language, BaseDefaults
 from .lemmatizer import CatalanLemmatizer

-class CatalanDefaults(Language.Defaults):
+class CatalanDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES

View File

@@ -1,8 +1,10 @@
+from typing import Union, Iterator, Tuple
+from ...tokens import Doc, Span
 from ...symbols import NOUN, PROPN
 from ...errors import Errors

-def noun_chunks(doclike):
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
     """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
     # fmt: off
     labels = ["nsubj", "nsubj:pass", "obj", "obl", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]

View File

@@ -1,9 +1,9 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class CzechDefaults(Language.Defaults):
+class CzechDefaults(BaseDefaults):
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

View File

@@ -3,10 +3,10 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class DanishDefaults(Language.Defaults):
+class DanishDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES

View File

@@ -1,8 +1,10 @@
+from typing import Union, Iterator, Tuple
+from ...tokens import Doc, Span
 from ...symbols import NOUN, PROPN, PRON, VERB, AUX
 from ...errors import Errors

-def noun_chunks(doclike):
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
     def is_verb_token(tok):
         return tok.pos in [VERB, AUX]
@@ -32,7 +34,7 @@ def noun_chunks(doclike):
     def get_bounds(doc, root):
         return get_left_bound(doc, root), get_right_bound(doc, root)

-    doc = doclike.doc
+    doc = doclike.doc  # Ensure works on both Doc and Span.
     if not doc.has_annotation("DEP"):
         raise ValueError(Errors.E029)

View File

@@ -2,10 +2,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class GermanDefaults(Language.Defaults):
+class GermanDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES

View File

@@ -1,11 +1,11 @@
-from typing import Union, Iterator
+from typing import Union, Iterator, Tuple
 from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
 from ...tokens import Doc, Span

-def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
     """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
     # this iterator extracts spans headed by NOUNs starting from the left-most
     # syntactic dependent until the NOUN itself for close apposition and

View File

@@ -7,10 +7,10 @@ from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .lemmatizer import GreekLemmatizer
-from ...language import Language
+from ...language import Language, BaseDefaults

-class GreekDefaults(Language.Defaults):
+class GreekDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES

View File

@@ -1,11 +1,11 @@
-from typing import Union, Iterator
+from typing import Union, Iterator, Tuple
 from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
 from ...tokens import Doc, Span

-def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
     """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
     # It follows the logic of the noun chunks finder of English language,
     # adjusted to some Greek language special characteristics.

View File

@@ -7,10 +7,10 @@ from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_INFIXES
 from .lemmatizer import EnglishLemmatizer
-from ...language import Language
+from ...language import Language, BaseDefaults

-class EnglishDefaults(Language.Defaults):
+class EnglishDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     lex_attr_getters = LEX_ATTRS

View File

@@ -19,7 +19,7 @@ _ordinal_words = [
 # fmt: on

-def like_num(text: str) -> bool:
+def like_num(text):
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
     text = text.replace(",", "").replace(".", "")

View File

@@ -1,11 +1,11 @@
-from typing import Union, Iterator
+from typing import Union, Iterator, Tuple
 from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
 from ...tokens import Doc, Span

-def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
     """
     Detect base noun phrases from a dependency parse. Works on both Doc and Span.
     """

View File

@@ -1,9 +1,10 @@
+from typing import Dict, List
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, NORM
 from ...util import update_exc

-_exc = {}
+_exc: Dict[str, List[Dict]] = {}
 _exclude = [
     "Ill",
     "ill",
@@ -294,9 +295,9 @@ for verb_data in [
     {ORTH: "has", NORM: "has"},
     {ORTH: "dare", NORM: "dare"},
 ]:
-    verb_data_tc = dict(verb_data)
+    verb_data_tc = dict(verb_data)  # type: ignore[call-overload]
     verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
-    for data in [verb_data, verb_data_tc]:
+    for data in [verb_data, verb_data_tc]:  # type: ignore[assignment]
         _exc[data[ORTH] + "n't"] = [
             dict(data),
             {ORTH: "n't", NORM: "not"},

View File

@@ -6,10 +6,10 @@ from .lex_attrs import LEX_ATTRS
 from .lemmatizer import SpanishLemmatizer
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-from ...language import Language
+from ...language import Language, BaseDefaults

-class SpanishDefaults(Language.Defaults):
+class SpanishDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES

View File

@@ -52,7 +52,7 @@ class SpanishLemmatizer(Lemmatizer):
             rule_pos = "verb"
         else:
             rule_pos = pos
-        rule = self.select_rule(rule_pos, features)
+        rule = self.select_rule(rule_pos, list(features))
         index = self.lookups.get_table("lemma_index").get(rule_pos, [])
         lemmas = getattr(self, "lemmatize_" + rule_pos)(
             string, features, rule, index
@@ -191,6 +191,8 @@ class SpanishLemmatizer(Lemmatizer):
                 return selected_lemmas
             else:
                 return possible_lemmas
+        else:
+            return []

     def lemmatize_noun(
         self, word: str, features: List[str], rule: str, index: List[str]
@@ -268,7 +270,7 @@ class SpanishLemmatizer(Lemmatizer):
         return [word]

     def lemmatize_pron(
-        self, word: str, features: List[str], rule: str, index: List[str]
+        self, word: str, features: List[str], rule: Optional[str], index: List[str]
     ) -> List[str]:
         """
         Lemmatize a pronoun.
@@ -319,9 +321,11 @@ class SpanishLemmatizer(Lemmatizer):
                 return selected_lemmas
             else:
                 return possible_lemmas
+        else:
+            return []

     def lemmatize_verb(
-        self, word: str, features: List[str], rule: str, index: List[str]
+        self, word: str, features: List[str], rule: Optional[str], index: List[str]
     ) -> List[str]:
         """
         Lemmatize a verb.
@@ -342,6 +346,7 @@ class SpanishLemmatizer(Lemmatizer):
         selected_lemmas = []

         # Apply lemmatization rules
+        rule = str(rule or "")
         for old, new in self.lookups.get_table("lemma_rules").get(rule, []):
             possible_lemma = re.sub(old + "$", new, word)
             if possible_lemma != word:
@@ -389,11 +394,11 @@ class SpanishLemmatizer(Lemmatizer):
         return [word]

     def lemmatize_verb_pron(
-        self, word: str, features: List[str], rule: str, index: List[str]
+        self, word: str, features: List[str], rule: Optional[str], index: List[str]
     ) -> List[str]:
         # Strip and collect pronouns
         pron_patt = "^(.*?)([mts]e|l[aeo]s?|n?os)$"
-        prons = []
+        prons: List[str] = []
         verb = word
         m = re.search(pron_patt, verb)
         while m is not None and len(prons) <= 3:
@@ -410,7 +415,7 @@ class SpanishLemmatizer(Lemmatizer):
         else:
             rule = self.select_rule("verb", features)
         verb_lemma = self.lemmatize_verb(
-            verb, features - {"PronType=Prs"}, rule, index
+            verb, features - {"PronType=Prs"}, rule, index  # type: ignore[operator]
         )[0]
         pron_lemmas = []
         for pron in prons:
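Most of these edits exist to satisfy mypy: select_rule can return None, which the new Optional[str] parameters and the rule = str(rule or "") fallback make explicit. A generic, self-contained illustration of that pattern (not spaCy code, just the shape of the fix):

    # Generic sketch of the Optional-rule pattern; names are illustrative.
    from typing import Dict, List, Optional

    def select_rule(pos: str, rules: Dict[str, str]) -> Optional[str]:
        return rules.get(pos)          # None when no rule is known for this POS

    def apply_rules(word: str, rule: Optional[str], tables: Dict[str, List[str]]) -> List[str]:
        rule = str(rule or "")         # None collapses to "", which matches no table
        return tables.get(rule, [])    # total lookup: always returns a list

    print(apply_rules("cantaba", select_rule("verb", {}), {}))  # []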

View File

@@ -1,11 +1,11 @@
-from typing import Union, Iterator
+from typing import Union, Iterator, Tuple
 from ...symbols import NOUN, PROPN, PRON, VERB, AUX
 from ...errors import Errors
 from ...tokens import Doc, Span, Token

-def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
     """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
     doc = doclike.doc
     if not doc.has_annotation("DEP"):

View File

@@ -1,8 +1,8 @@
 from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class EstonianDefaults(Language.Defaults):
+class EstonianDefaults(BaseDefaults):
     stop_words = STOP_WORDS

View File

@@ -1,10 +1,10 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
-from ...language import Language
+from ...language import Language, BaseDefaults

-class BasqueDefaults(Language.Defaults):
+class BasqueDefaults(BaseDefaults):
     suffixes = TOKENIZER_SUFFIXES
     stop_words = STOP_WORDS
     lex_attr_getters = LEX_ATTRS

View File

@@ -5,11 +5,11 @@ from .lex_attrs import LEX_ATTRS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_SUFFIXES
 from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language
+from ...language import Language, BaseDefaults
 from ...pipeline import Lemmatizer

-class PersianDefaults(Language.Defaults):
+class PersianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     suffixes = TOKENIZER_SUFFIXES
     lex_attr_getters = LEX_ATTRS

View File

@@ -639,10 +639,12 @@ for verb_root in verb_roots:
         )

     if past.startswith("آ"):
-        conjugations = set(
-            map(
-                lambda item: item.replace("بآ", "بیا").replace("نآ", "نیا"),
-                conjugations,
-            )
-        )
+        conjugations = list(
+            set(
+                map(
+                    lambda item: item.replace("بآ", "بیا").replace("نآ", "نیا"),
+                    conjugations,
+                )
+            )
+        )

View File

@@ -1,8 +1,10 @@
+from typing import Union, Iterator, Tuple
+from ...tokens import Doc, Span
 from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors

-def noun_chunks(doclike):
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
     """
     Detect base noun phrases from a dependency parse. Works on both Doc and Span.
     """

View File

@@ -2,10 +2,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-from ...language import Language
+from ...language import Language, BaseDefaults

-class FinnishDefaults(Language.Defaults):
+class FinnishDefaults(BaseDefaults):
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS

View File

@@ -9,10 +9,10 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .lemmatizer import FrenchLemmatizer
-from ...language import Language
+from ...language import Language, BaseDefaults

-class FrenchDefaults(Language.Defaults):
+class FrenchDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES

View File

@@ -1,11 +1,11 @@
-from typing import Union, Iterator
+from typing import Union, Iterator, Tuple
 from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
 from ...tokens import Doc, Span

-def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
     """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
     # fmt: off
     labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]

View File

@@ -115,7 +115,7 @@ for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]:
     ]

-_infixes_exc = []
+_infixes_exc = []  # type: ignore[var-annotated]
 orig_elision = "'"
 orig_hyphen = "-"

View File

@@ -4,11 +4,11 @@ from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults
 from .lemmatizer import IrishLemmatizer

-class IrishDefaults(Language.Defaults):
+class IrishDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     stop_words = STOP_WORDS

View File

@@ -1,10 +1,10 @@
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class AncientGreekDefaults(Language.Defaults):
+class AncientGreekDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

View File

@@ -108,8 +108,4 @@ _other_exc = {
 _exc.update(_other_exc)

-_exc_data = {}
-_exc.update(_exc_data)

 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@@ -1,8 +1,8 @@
 from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class GujaratiDefaults(Language.Defaults):
+class GujaratiDefaults(BaseDefaults):
     stop_words = STOP_WORDS

View File

@@ -1,9 +1,9 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class HebrewDefaults(Language.Defaults):
+class HebrewDefaults(BaseDefaults):
     stop_words = STOP_WORDS
     lex_attr_getters = LEX_ATTRS
     writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}

View File

@@ -1,9 +1,9 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class HindiDefaults(Language.Defaults):
+class HindiDefaults(BaseDefaults):
     stop_words = STOP_WORDS
     lex_attr_getters = LEX_ATTRS

View File

@@ -1,8 +1,8 @@
 from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class CroatianDefaults(Language.Defaults):
+class CroatianDefaults(BaseDefaults):
     stop_words = STOP_WORDS

View File

@@ -1,10 +1,10 @@
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class HungarianDefaults(Language.Defaults):
+class HungarianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES

View File

@@ -1,9 +1,9 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class ArmenianDefaults(Language.Defaults):
+class ArmenianDefaults(BaseDefaults):
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

View File

@@ -3,10 +3,10 @@ from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIX
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class IndonesianDefaults(Language.Defaults):
+class IndonesianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES

View File

@@ -1,11 +1,11 @@
-from typing import Union, Iterator
+from typing import Union, Iterator, Tuple
 from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
 from ...tokens import Doc, Span

-def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
     """
     Detect base noun phrases from a dependency parse. Works on both Doc and Span.
     """

View File

@@ -1,8 +1,8 @@
 from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class IcelandicDefaults(Language.Defaults):
+class IcelandicDefaults(BaseDefaults):
     stop_words = STOP_WORDS

View File

@@ -4,11 +4,11 @@ from thinc.api import Model
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from ...language import Language
+from ...language import Language, BaseDefaults
 from .lemmatizer import ItalianLemmatizer

-class ItalianDefaults(Language.Defaults):
+class ItalianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     stop_words = STOP_WORDS
     prefixes = TOKENIZER_PREFIXES

View File

@@ -11,7 +11,7 @@ from .tag_map import TAG_MAP
 from .tag_orth_map import TAG_ORTH_MAP
 from .tag_bigram_map import TAG_BIGRAM_MAP
 from ...errors import Errors
-from ...language import Language
+from ...language import Language, BaseDefaults
 from ...pipeline import Morphologizer
 from ...pipeline.morphologizer import DEFAULT_MORPH_MODEL
 from ...scorer import Scorer
@@ -172,7 +172,7 @@ class JapaneseTokenizer(DummyTokenizer):
     def to_disk(self, path: Union[str, Path], **kwargs) -> None:
         path = util.ensure_path(path)
         serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
-        return util.to_disk(path, serializers, [])
+        util.to_disk(path, serializers, [])

     def from_disk(self, path: Union[str, Path], **kwargs) -> "JapaneseTokenizer":
         path = util.ensure_path(path)
@@ -182,7 +182,7 @@ class JapaneseTokenizer(DummyTokenizer):
         return self

-class JapaneseDefaults(Language.Defaults):
+class JapaneseDefaults(BaseDefaults):
     config = load_config_from_str(DEFAULT_CONFIG)
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS

View File

@@ -1,4 +1,4 @@
-from typing import Union, Iterator
+from typing import Union, Iterator, Tuple, Set
 from ...symbols import NOUN, PROPN, PRON, VERB
 from ...tokens import Doc, Span
@@ -10,13 +10,13 @@ labels = ["nsubj", "nmod", "ddoclike", "nsubjpass", "pcomp", "pdoclike", "doclik
 # fmt: on

-def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
     """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
     doc = doclike.doc  # Ensure works on both Doc and Span.
     np_deps = [doc.vocab.strings.add(label) for label in labels]
     doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    seen: Set[int] = set()
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue

View File

@@ -1,8 +1,8 @@
 from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class KannadaDefaults(Language.Defaults):
+class KannadaDefaults(BaseDefaults):
     stop_words = STOP_WORDS

View File

@@ -1,9 +1,9 @@
-from typing import Any, Dict
+from typing import Iterator, Any, Dict
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults
 from ...tokens import Doc
 from ...scorer import Scorer
 from ...symbols import POS
@@ -31,7 +31,7 @@ def create_tokenizer():
 class KoreanTokenizer(DummyTokenizer):
     def __init__(self, vocab: Vocab):
         self.vocab = vocab
-        MeCab = try_mecab_import()
+        MeCab = try_mecab_import()  # type: ignore[func-returns-value]
         self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")

     def __reduce__(self):
@@ -52,7 +52,7 @@ class KoreanTokenizer(DummyTokenizer):
         doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
         return doc

-    def detailed_tokens(self, text: str) -> Dict[str, Any]:
+    def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
         # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
         # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
         for node in self.mecab_tokenizer.parse(text, as_nodes=True):
@@ -71,7 +71,7 @@ class KoreanTokenizer(DummyTokenizer):
         return Scorer.score_tokenization(examples)

-class KoreanDefaults(Language.Defaults):
+class KoreanDefaults(BaseDefaults):
     config = load_config_from_str(DEFAULT_CONFIG)
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

View File

@@ -2,10 +2,10 @@ from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class KyrgyzDefaults(Language.Defaults):
+class KyrgyzDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     lex_attr_getters = LEX_ATTRS

View File

@@ -2,10 +2,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class LuxembourgishDefaults(Language.Defaults):
+class LuxembourgishDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     lex_attr_getters = LEX_ATTRS

View File

@@ -1,10 +1,10 @@
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES
-from ...language import Language
+from ...language import Language, BaseDefaults

-class LigurianDefaults(Language.Defaults):
+class LigurianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     stop_words = STOP_WORDS

View File

@@ -2,10 +2,10 @@ from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class LithuanianDefaults(Language.Defaults):
+class LithuanianDefaults(BaseDefaults):
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS

View File

@@ -1,8 +1,8 @@
 from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class LatvianDefaults(Language.Defaults):
+class LatvianDefaults(BaseDefaults):
     stop_words = STOP_WORDS

View File

@@ -6,13 +6,13 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language
+from ...language import Language, BaseDefaults
 from ...attrs import LANG
 from ...util import update_exc
 from ...lookups import Lookups

-class MacedonianDefaults(Language.Defaults):
+class MacedonianDefaults(BaseDefaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: "mk"

View File

@@ -1,9 +1,9 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class MalayalamDefaults(Language.Defaults):
+class MalayalamDefaults(BaseDefaults):
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

View File

@@ -1,8 +1,8 @@
 from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class MarathiDefaults(Language.Defaults):
+class MarathiDefaults(BaseDefaults):
     stop_words = STOP_WORDS

View File

@@ -5,11 +5,11 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language
+from ...language import Language, BaseDefaults
 from ...pipeline import Lemmatizer

-class NorwegianDefaults(Language.Defaults):
+class NorwegianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES

View File

@@ -1,11 +1,11 @@
-from typing import Union, Iterator
+from typing import Union, Iterator, Tuple
 from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
 from ...tokens import Doc, Span

-def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
     """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
     # fmt: off
     labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]

View File

@@ -1,9 +1,9 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class NepaliDefaults(Language.Defaults):
+class NepaliDefaults(BaseDefaults):
     stop_words = STOP_WORDS
     lex_attr_getters = LEX_ATTRS

View File

@@ -9,10 +9,10 @@ from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class DutchDefaults(Language.Defaults):
+class DutchDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES

View File

@@ -1,11 +1,11 @@
-from typing import Union, Iterator
+from typing import Union, Iterator, Tuple
 from ...symbols import NOUN, PRON
 from ...errors import Errors
 from ...tokens import Doc, Span

-def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
     """
     Detect base noun phrases from a dependency parse. Works on Doc and Span.
     The definition is inspired by https://www.nltk.org/book/ch07.html

View File

@@ -8,7 +8,7 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import PolishLemmatizer
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language
+from ...language import Language, BaseDefaults

 TOKENIZER_EXCEPTIONS = {
@@ -16,7 +16,7 @@ TOKENIZER_EXCEPTIONS = {
 }

-class PolishDefaults(Language.Defaults):
+class PolishDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES

View File

@@ -2,10 +2,10 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
-from ...language import Language
+from ...language import Language, BaseDefaults

-class PortugueseDefaults(Language.Defaults):
+class PortugueseDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
     prefixes = TOKENIZER_PREFIXES

View File

@@ -3,14 +3,14 @@ from .stop_words import STOP_WORDS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults

 # Lemma data note:
 # Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
 # Replaced characters using cedillas with the correct ones (ș and ț)

-class RomanianDefaults(Language.Defaults):
+class RomanianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     prefixes = TOKENIZER_PREFIXES
     suffixes = TOKENIZER_SUFFIXES

View File

@@ -5,10 +5,10 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
-from ...language import Language
+from ...language import Language, BaseDefaults

-class RussianDefaults(Language.Defaults):
+class RussianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

View File

@@ -58,7 +58,9 @@ class RussianLemmatizer(Lemmatizer):
         if not len(filtered_analyses):
             return [string.lower()]
         if morphology is None or (len(morphology) == 1 and POS in morphology):
-            return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]))
+            return list(
+                dict.fromkeys([analysis.normal_form for analysis in filtered_analyses])
+            )
         if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
             features_to_compare = ["Case", "Number", "Gender"]
         elif univ_pos == "NUM":
@@ -89,7 +91,9 @@ class RussianLemmatizer(Lemmatizer):
             filtered_analyses.append(analysis)
         if not len(filtered_analyses):
             return [string.lower()]
-        return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]))
+        return list(
+            dict.fromkeys([analysis.normal_form for analysis in filtered_analyses])
+        )

     def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
         string = token.text
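The re-wrapped return statements are purely cosmetic; the underlying trick is that dict.fromkeys deduplicates while keeping first-seen order (guaranteed since Python 3.7), which a plain set would not. A standalone illustration:

    # Order-preserving deduplication, as used for the lemmatizer's candidate forms.
    forms = ["стали", "сталь", "стали", "стать"]
    print(list(set(forms)))            # deduplicated, but order is arbitrary
    print(list(dict.fromkeys(forms)))  # ['стали', 'сталь', 'стать'], first-seen order kept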

View File

@@ -1,9 +1,9 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class SanskritDefaults(Language.Defaults):
+class SanskritDefaults(BaseDefaults):
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

View File

@@ -1,9 +1,9 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class SinhalaDefaults(Language.Defaults):
+class SinhalaDefaults(BaseDefaults):
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

View File

@@ -1,9 +1,9 @@
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class SlovakDefaults(Language.Defaults):
+class SlovakDefaults(BaseDefaults):
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS

View File

@@ -1,8 +1,8 @@
 from .stop_words import STOP_WORDS
-from ...language import Language
+from ...language import Language, BaseDefaults

-class SlovenianDefaults(Language.Defaults):
+class SlovenianDefaults(BaseDefaults):
     stop_words = STOP_WORDS

Some files were not shown because too many files have changed in this diff.