| **spaCy Tailored Pipelines** | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
|
| **spaCy Tailored Analysis** | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |
@@ -57,13 +58,13 @@ open-source software, released under the [MIT license](https://github.com/explos
[api reference]: https://spacy.io/api/
[models]: https://spacy.io/models
[universe]: https://spacy.io/universe
+[spaCy VS Code Extension]: https://github.com/explosion/spacy-vscode
[videos]: https://www.youtube.com/c/ExplosionAI
[online course]: https://course.spacy.io
[project templates]: https://github.com/explosion/projects
[changelog]: https://spacy.io/usage#changelog
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
-
## 💬 Where to ask questions
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
deleted file mode 100644
index 83c57a164..000000000
--- a/azure-pipelines.yml
+++ /dev/null
@@ -1,120 +0,0 @@
-trigger:
- batch: true
- branches:
- include:
- - "*"
- exclude:
- - "spacy.io"
- - "nightly.spacy.io"
- - "v2.spacy.io"
- paths:
- exclude:
- - "website/*"
- - "*.md"
- - "*.mdx"
- - ".github/workflows/*"
-pr:
- paths:
- exclude:
- - "*.md"
- - "*.mdx"
- - "website/docs/*"
- - "website/src/*"
- - "website/meta/*.tsx"
- - "website/meta/*.mjs"
- - "website/meta/languages.json"
- - "website/meta/site.json"
- - "website/meta/sidebars.json"
- - "website/meta/type-annotations.json"
- - "website/pages/*"
- - ".github/workflows/*"
-
-jobs:
- # Check formatting and linting. Perform basic checks for most important errors
- # (syntax etc.) Uses the config defined in setup.cfg and overwrites the
- # selected codes.
- - job: "Validate"
- pool:
- vmImage: "ubuntu-latest"
- steps:
- - task: UsePythonVersion@0
- inputs:
- versionSpec: "3.7"
- - script: |
- pip install black -c requirements.txt
- python -m black spacy --check
- displayName: "black"
- - script: |
- pip install flake8==5.0.4
- python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
- displayName: "flake8"
- - script: |
- python .github/validate_universe_json.py website/meta/universe.json
- displayName: 'Validate website/meta/universe.json'
-
- - job: "Test"
- dependsOn: "Validate"
- strategy:
- matrix:
- # We're only running one platform per Python version to speed up builds
- Python36Linux:
- imageName: "ubuntu-20.04"
- python.version: "3.6"
- # Python36Windows:
- # imageName: "windows-latest"
- # python.version: "3.6"
- # Python36Mac:
- # imageName: "macos-latest"
- # python.version: "3.6"
- # Python37Linux:
- # imageName: "ubuntu-20.04"
- # python.version: "3.7"
- Python37Windows:
- imageName: "windows-latest"
- python.version: "3.7"
- # Python37Mac:
- # imageName: "macos-latest"
- # python.version: "3.7"
- # Python38Linux:
- # imageName: "ubuntu-latest"
- # python.version: "3.8"
- # Python38Windows:
- # imageName: "windows-latest"
- # python.version: "3.8"
- Python38Mac:
- imageName: "macos-latest"
- python.version: "3.8"
- Python39Linux:
- imageName: "ubuntu-latest"
- python.version: "3.9"
- # Python39Windows:
- # imageName: "windows-latest"
- # python.version: "3.9"
- # Python39Mac:
- # imageName: "macos-latest"
- # python.version: "3.9"
- # Python310Linux:
- # imageName: "ubuntu-latest"
- # python.version: "3.10"
- Python310Windows:
- imageName: "windows-latest"
- python.version: "3.10"
- # Python310Mac:
- # imageName: "macos-latest"
- # python.version: "3.10"
- Python311Linux:
- imageName: 'ubuntu-latest'
- python.version: '3.11'
- Python311Windows:
- imageName: 'windows-latest'
- python.version: '3.11'
- Python311Mac:
- imageName: 'macos-latest'
- python.version: '3.11'
- maxParallel: 4
- pool:
- vmImage: $(imageName)
- steps:
- - template: .github/azure-steps.yml
- parameters:
- python_version: '$(python.version)'
diff --git a/pyproject.toml b/pyproject.toml
index 9cd96ac2d..dcb5cf10d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,3 +9,6 @@ requires = [
"numpy>=1.15.0",
]
build-backend = "setuptools.build_meta"
+
+[tool.isort]
+profile = "black"
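The new `[tool.isort]` block explains the large import-reordering hunks throughout this diff: with the `black` profile, isort groups imports into stdlib, third-party, and first-party sections and formats long `from`-imports with Black-compatible parentheses. A minimal sketch of the resulting style, with module names taken from the `spacy/__init__.py` hunk below:

```python
# Standard library imports first, alphabetized.
import sys
from pathlib import Path
from typing import Any, Dict, Iterable, Union

# Third-party packages next.
from thinc.api import Config, prefer_gpu  # noqa: F401

# First-party (relative) imports last; long lists get one name per line with
# trailing commas, e.g. (shown commented, since relative imports need a package):
# from ._util import (
#     Arg,
#     Opt,
#     app,
# )
```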
diff --git a/requirements.txt b/requirements.txt
index 94d6f23f4..f5050fee2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
-typer>=0.3.0,<0.8.0
+typer>=0.3.0,<0.10.0
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
weasel>=0.1.0,<0.2.0
@@ -39,3 +39,4 @@ types-setuptools>=57.0.0
types-requests
types-setuptools>=57.0.0
black==22.3.0
+isort>=5.0,<6.0
diff --git a/setup.cfg b/setup.cfg
index 6d3c2f12c..048bb3719 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -53,7 +53,7 @@ install_requires =
catalogue>=2.0.6,<2.1.0
weasel>=0.1.0,<0.2.0
# Third-party dependencies
- typer>=0.3.0,<0.8.0
+ typer>=0.3.0,<0.10.0
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
tqdm>=4.38.0,<5.0.0
@@ -79,41 +79,41 @@ transformers =
ray =
spacy_ray>=0.1.0,<1.0.0
cuda =
- cupy>=5.0.0b4,<12.0.0
+ cupy>=5.0.0b4,<13.0.0
cuda80 =
- cupy-cuda80>=5.0.0b4,<12.0.0
+ cupy-cuda80>=5.0.0b4,<13.0.0
cuda90 =
- cupy-cuda90>=5.0.0b4,<12.0.0
+ cupy-cuda90>=5.0.0b4,<13.0.0
cuda91 =
- cupy-cuda91>=5.0.0b4,<12.0.0
+ cupy-cuda91>=5.0.0b4,<13.0.0
cuda92 =
- cupy-cuda92>=5.0.0b4,<12.0.0
+ cupy-cuda92>=5.0.0b4,<13.0.0
cuda100 =
- cupy-cuda100>=5.0.0b4,<12.0.0
+ cupy-cuda100>=5.0.0b4,<13.0.0
cuda101 =
- cupy-cuda101>=5.0.0b4,<12.0.0
+ cupy-cuda101>=5.0.0b4,<13.0.0
cuda102 =
- cupy-cuda102>=5.0.0b4,<12.0.0
+ cupy-cuda102>=5.0.0b4,<13.0.0
cuda110 =
- cupy-cuda110>=5.0.0b4,<12.0.0
+ cupy-cuda110>=5.0.0b4,<13.0.0
cuda111 =
- cupy-cuda111>=5.0.0b4,<12.0.0
+ cupy-cuda111>=5.0.0b4,<13.0.0
cuda112 =
- cupy-cuda112>=5.0.0b4,<12.0.0
+ cupy-cuda112>=5.0.0b4,<13.0.0
cuda113 =
- cupy-cuda113>=5.0.0b4,<12.0.0
+ cupy-cuda113>=5.0.0b4,<13.0.0
cuda114 =
- cupy-cuda114>=5.0.0b4,<12.0.0
+ cupy-cuda114>=5.0.0b4,<13.0.0
cuda115 =
- cupy-cuda115>=5.0.0b4,<12.0.0
+ cupy-cuda115>=5.0.0b4,<13.0.0
cuda116 =
- cupy-cuda116>=5.0.0b4,<12.0.0
+ cupy-cuda116>=5.0.0b4,<13.0.0
cuda117 =
- cupy-cuda117>=5.0.0b4,<12.0.0
+ cupy-cuda117>=5.0.0b4,<13.0.0
cuda11x =
- cupy-cuda11x>=11.0.0,<12.0.0
+ cupy-cuda11x>=11.0.0,<13.0.0
cuda-autodetect =
- cupy-wheel>=11.0.0,<12.0.0
+ cupy-wheel>=11.0.0,<13.0.0
apple =
thinc-apple-ops>=0.1.0.dev0,<1.0.0
# Language tokenizers with external dependencies
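The widened CuPy pins mean the CUDA extras (e.g. `pip install "spacy[cuda11x]"`) now accept CuPy 12.x. A quick sanity check after upgrading, assuming a CUDA build of CuPy is installed:

```python
import cupy
from thinc.api import prefer_gpu

print(cupy.__version__)  # anything >=11.0.0,<13.0.0 satisfies the new pin
print(prefer_gpu())      # True if thinc can allocate on the GPU
```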
diff --git a/spacy/__init__.py b/spacy/__init__.py
index c3568bc5c..1a18ad0d5 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -1,6 +1,6 @@
-from typing import Union, Iterable, Dict, Any
-from pathlib import Path
import sys
+from pathlib import Path
+from typing import Any, Dict, Iterable, Union
# set library-specific custom warning handling before doing anything else
from .errors import setup_default_warnings
@@ -8,20 +8,17 @@ from .errors import setup_default_warnings
setup_default_warnings() # noqa: E402
# These are imported as part of the API
-from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401
-from thinc.api import Config
+from thinc.api import Config, prefer_gpu, require_cpu, require_gpu # noqa: F401
from . import pipeline # noqa: F401
-from .cli.info import info # noqa: F401
-from .glossary import explain # noqa: F401
-from .about import __version__ # noqa: F401
-from .util import registry, logger # noqa: F401
-
-from .errors import Errors
-from .language import Language
-from .vocab import Vocab
from . import util
-
+from .about import __version__ # noqa: F401
+from .cli.info import info # noqa: F401
+from .errors import Errors
+from .glossary import explain # noqa: F401
+from .language import Language
+from .util import logger, registry # noqa: F401
+from .vocab import Vocab
if sys.maxunicode == 65535:
raise SystemError(Errors.E130)
diff --git a/spacy/about.py b/spacy/about.py
index 4748d655c..3319860f1 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,5 +1,5 @@
# fmt: off
__title__ = "spacy"
-__version__ = "3.5.0"
+__version__ = "3.6.0"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
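The version bump marks this as the v3.6.0 release. After upgrading, a quick check that the expected version is installed:

```python
import spacy

print(spacy.__version__)      # expected: "3.6.0"
print(spacy.about.__title__)  # "spacy"
```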
diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd
index 33d5372de..6dc9ecaee 100644
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@@ -1,6 +1,7 @@
# Reserve 64 values for flag features
from . cimport symbols
+
cdef enum attr_id_t:
NULL_ATTR
IS_ALPHA
diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index efabcb9cf..549a27616 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -1,28 +1,35 @@
from wasabi import msg
from ._util import app, setup_cli # noqa: F401
+from .apply import apply # noqa: F401
+from .assemble import assemble_cli # noqa: F401
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
# are registered automatically and won't have to be imported here.
from .benchmark_speed import benchmark_speed_cli # noqa: F401
-from .download import download # noqa: F401
-from .info import info # noqa: F401
-from .package import package # noqa: F401
-from .profile import profile # noqa: F401
-from .train import train_cli # noqa: F401
-from .assemble import assemble_cli # noqa: F401
-from .pretrain import pretrain # noqa: F401
-from .debug_data import debug_data # noqa: F401
-from .debug_config import debug_config # noqa: F401
-from .debug_model import debug_model # noqa: F401
-from .debug_diff import debug_diff # noqa: F401
-from .evaluate import evaluate # noqa: F401
-from .apply import apply # noqa: F401
from .convert import convert # noqa: F401
-from .init_pipeline import init_pipeline_cli # noqa: F401
-from .init_config import init_config, fill_config # noqa: F401
-from .validate import validate # noqa: F401
+from .debug_config import debug_config # noqa: F401
+from .debug_data import debug_data # noqa: F401
+from .debug_diff import debug_diff # noqa: F401
+from .debug_model import debug_model # noqa: F401
+from .download import download # noqa: F401
+from .evaluate import evaluate # noqa: F401
from .find_threshold import find_threshold # noqa: F401
+from .info import info # noqa: F401
+from .init_config import fill_config, init_config # noqa: F401
+from .init_pipeline import init_pipeline_cli # noqa: F401
+from .package import package # noqa: F401
+from .pretrain import pretrain # noqa: F401
+from .profile import profile # noqa: F401
+from .project.assets import project_assets # noqa: F401
+from .project.clone import project_clone # noqa: F401
+from .project.document import project_document # noqa: F401
+from .project.dvc import project_update_dvc # noqa: F401
+from .project.pull import project_pull # noqa: F401
+from .project.push import project_push # noqa: F401
+from .project.run import project_run # noqa: F401
+from .train import train_cli # noqa: F401
+from .validate import validate # noqa: F401
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 4e8102e3d..b48e928f5 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -1,25 +1,45 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, Iterable
-from typing import TYPE_CHECKING, overload
-import sys
-import shutil
-from pathlib import Path
-from wasabi import msg, Printer
-import srsly
import hashlib
+import os
+import shutil
+import sys
+from configparser import InterpolationError
+from contextlib import contextmanager
+from pathlib import Path
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ Dict,
+ Iterable,
+ List,
+ Optional,
+ Tuple,
+ Union,
+ overload,
+)
+
+import srsly
import typer
from click import NoSuchOption
from click.parser import split_arg_string
-from typer.main import get_command
-from contextlib import contextmanager
-from thinc.api import ConfigValidationError, require_gpu
+from thinc.api import Config, ConfigValidationError, require_gpu
from thinc.util import gpu_is_available
-from configparser import InterpolationError
-import os
-
+from typer.main import get_command
+from wasabi import Printer, msg
from weasel import app as project_cli
+from .. import about
from ..compat import Literal
-from ..util import import_file, run_command, registry, logger, ENV_VARS
+from ..schemas import ProjectConfigSchema, validate
+from ..util import (
+ ENV_VARS,
+ SimpleFrozenDict,
+ import_file,
+ is_compatible_version,
+ logger,
+ make_tempdir,
+ registry,
+ run_command,
+)
if TYPE_CHECKING:
from pathy import FluidPath # noqa: F401
diff --git a/spacy/cli/apply.py b/spacy/cli/apply.py
index f0df4e757..8c4b4c8bf 100644
--- a/spacy/cli/apply.py
+++ b/spacy/cli/apply.py
@@ -1,18 +1,15 @@
-import tqdm
-import srsly
-
from itertools import chain
from pathlib import Path
-from typing import Optional, List, Iterable, cast, Union
+from typing import Iterable, List, Optional, Union, cast
+import srsly
+import tqdm
from wasabi import msg
-from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
-
from ..tokens import Doc, DocBin
-from ..vocab import Vocab
from ..util import ensure_path, load_model
-
+from ..vocab import Vocab
+from ._util import Arg, Opt, app, import_code, setup_gpu, walk_directory
path_help = """Location of the documents to predict on.
Can be a single file in .spacy format or a .jsonl file.
diff --git a/spacy/cli/assemble.py b/spacy/cli/assemble.py
index 1cfa290a3..ee2500b27 100644
--- a/spacy/cli/assemble.py
+++ b/spacy/cli/assemble.py
@@ -1,13 +1,20 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import typer
import logging
+from pathlib import Path
+from typing import Optional
+
+import typer
+from wasabi import msg
-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code
from .. import util
from ..util import get_sourced_components, load_model_from_config
+from ._util import (
+ Arg,
+ Opt,
+ app,
+ import_code,
+ parse_config_overrides,
+ show_validation_error,
+)
@app.command(
diff --git a/spacy/cli/benchmark_speed.py b/spacy/cli/benchmark_speed.py
index 4eb20a5fa..a683d1591 100644
--- a/spacy/cli/benchmark_speed.py
+++ b/spacy/cli/benchmark_speed.py
@@ -1,11 +1,12 @@
-from typing import Iterable, List, Optional
import random
-from itertools import islice
-import numpy
-from pathlib import Path
import time
-from tqdm import tqdm
+from itertools import islice
+from pathlib import Path
+from typing import Iterable, List, Optional
+
+import numpy
import typer
+from tqdm import tqdm
from wasabi import msg
from .. import util
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 68d454b3e..a66a68133 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -1,18 +1,22 @@
-from typing import Callable, Iterable, Mapping, Optional, Any, Union
-from enum import Enum
-from pathlib import Path
-from wasabi import Printer
-import srsly
+import itertools
import re
import sys
-import itertools
+from enum import Enum
+from pathlib import Path
+from typing import Any, Callable, Iterable, Mapping, Optional, Union
+
+import srsly
+from wasabi import Printer
-from ._util import app, Arg, Opt, walk_directory
-from ..training import docs_to_json
from ..tokens import Doc, DocBin
-from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
-from ..training.converters import conllu_to_docs
-
+from ..training import docs_to_json
+from ..training.converters import (
+ conll_ner_to_docs,
+ conllu_to_docs,
+ iob_to_docs,
+ json_to_docs,
+)
+from ._util import Arg, Opt, app, walk_directory
# Converters are matched by file extension except for ner/iob, which are
# matched by file extension and content. To add a converter, add a new
diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py
index 409fac4ed..0e5382cd9 100644
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@@ -1,15 +1,22 @@
-from typing import Optional, Dict, Any, Union, List
from pathlib import Path
-from wasabi import msg, table
+from typing import Any, Dict, List, Optional, Union
+
+import typer
from thinc.api import Config
from thinc.config import VARIABLE_RE
-import typer
+from wasabi import msg, table
-from ._util import Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli
+from .. import util
from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
from ..util import registry
-from .. import util
+from ._util import (
+ Arg,
+ Opt,
+ debug_cli,
+ import_code,
+ parse_config_overrides,
+ show_validation_error,
+)
@debug_cli.command(
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 97b4db285..af3c24f3b 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1,31 +1,49 @@
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
-from typing import cast, overload
-from pathlib import Path
-from collections import Counter
-import sys
-import srsly
-from wasabi import Printer, MESSAGES, msg
-import typer
import math
-import numpy
+import sys
+from collections import Counter
+from pathlib import Path
+from typing import (
+ Any,
+ Dict,
+ Iterable,
+ List,
+ Optional,
+ Sequence,
+ Set,
+ Tuple,
+ Union,
+ cast,
+ overload,
+)
-from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli, _format_number
-from ..training import Example, remove_bilu_prefix
-from ..training.initialize import get_sourced_components
-from ..schemas import ConfigSchemaTraining
-from ..pipeline import TrainablePipe
+import numpy
+import srsly
+import typer
+from wasabi import MESSAGES, Printer, msg
+
+from .. import util
+from ..compat import Literal
+from ..language import Language
+from ..morphology import Morphology
+from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe
+from ..pipeline._edit_tree_internals.edit_trees import EditTrees
from ..pipeline._parser_internals import nonproj
from ..pipeline._parser_internals.nonproj import DELIMITER
-from ..pipeline import Morphologizer, SpanCategorizer
-from ..pipeline._edit_tree_internals.edit_trees import EditTrees
-from ..morphology import Morphology
-from ..language import Language
+from ..schemas import ConfigSchemaTraining
+from ..training import Example, remove_bilu_prefix
+from ..training.initialize import get_sourced_components
from ..util import registry, resolve_dot_names
-from ..compat import Literal
from ..vectors import Mode as VectorsMode
-from .. import util
-
+from ._util import (
+ Arg,
+ Opt,
+ _format_number,
+ app,
+ debug_cli,
+ import_code,
+ parse_config_overrides,
+ show_validation_error,
+)
# Minimum number of expected occurrences of NER label in data to train new label
NEW_LABEL_THRESHOLD = 50
@@ -212,7 +230,7 @@ def debug_data(
else:
msg.info("No word vectors present in the package")
- if "spancat" in factory_names:
+ if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
model_labels_spancat = _get_labels_from_spancat(nlp)
has_low_data_warning = False
has_no_neg_warning = False
@@ -337,7 +355,7 @@ def debug_data(
show=verbose,
)
else:
- msg.good("Examples without ocurrences available for all labels")
+ msg.good("Examples without occurrences available for all labels")
if "ner" in factory_names:
# Get all unique NER labels present in the data
@@ -830,7 +848,7 @@ def _compile_gold(
data["boundary_cross_ents"] += 1
elif label == "-":
data["ner"]["-"] += 1
- if "spancat" in factory_names:
+ if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
for spans_key in list(eg.reference.spans.keys()):
# Obtain the span frequency
if spans_key not in data["spancat"]:
@@ -1028,7 +1046,7 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
pipe_names = [
pipe_name
for pipe_name in nlp.pipe_names
- if nlp.get_pipe_meta(pipe_name).factory == "spancat"
+ if nlp.get_pipe_meta(pipe_name).factory in ("spancat", "spancat_singlelabel")
]
labels: Dict[str, Set[str]] = {}
for pipe_name in pipe_names:
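`debug data` now treats `spancat_singlelabel` pipes as span categorizers as well. A small sketch of the broadened factory check, assuming `spancat_singlelabel` can be added with its default config in this version:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("spancat_singlelabel")

# Mirrors the updated check in _get_labels_from_spancat above:
pipe_names = [
    name
    for name in nlp.pipe_names
    if nlp.get_pipe_meta(name).factory in ("spancat", "spancat_singlelabel")
]
print(pipe_names)  # ["spancat_singlelabel"]
```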
diff --git a/spacy/cli/debug_diff.py b/spacy/cli/debug_diff.py
index 6697c38ae..c53b0acab 100644
--- a/spacy/cli/debug_diff.py
+++ b/spacy/cli/debug_diff.py
@@ -1,13 +1,13 @@
+from pathlib import Path
from typing import Optional
import typer
-from wasabi import Printer, diff_strings, MarkdownRenderer
-from pathlib import Path
from thinc.api import Config
+from wasabi import MarkdownRenderer, Printer, diff_strings
-from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
from ..util import load_config
-from .init_config import init_config, Optimizations
+from ._util import Arg, Opt, debug_cli, parse_config_overrides, show_validation_error
+from .init_config import Optimizations, init_config
@debug_cli.command(
diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index 190094d81..8a0fd4889 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -1,19 +1,32 @@
-from typing import Dict, Any, Optional
-from pathlib import Path
import itertools
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+import typer
+from thinc.api import (
+ Model,
+ data_validation,
+ fix_random_seed,
+ set_dropout_rate,
+ set_gpu_allocator,
+)
+from wasabi import msg
from spacy.training import Example
from spacy.util import resolve_dot_names
-from wasabi import msg
-from thinc.api import fix_random_seed, set_dropout_rate
-from thinc.api import Model, data_validation, set_gpu_allocator
-import typer
-from ._util import Arg, Opt, debug_cli, show_validation_error
-from ._util import parse_config_overrides, string_to_list, setup_gpu
+from .. import util
from ..schemas import ConfigSchemaTraining
from ..util import registry
-from .. import util
+from ._util import (
+ Arg,
+ Opt,
+ debug_cli,
+ parse_config_overrides,
+ setup_gpu,
+ show_validation_error,
+ string_to_list,
+)
@debug_cli.command(
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 0c9a32b93..de731b0fd 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -1,14 +1,14 @@
-from typing import Optional, Sequence
-import requests
import sys
-from wasabi import msg
-import typer
+from typing import Optional, Sequence
+
+import requests
+import typer
+from wasabi import msg
-from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
from .. import about
-from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version
from ..errors import OLD_MODEL_SHORTCUTS
+from ..util import get_minor_version, is_package, is_prerelease_version, run_command
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
@app.command(
@@ -81,11 +81,8 @@ def download(
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
dl_tpl = "{m}-{v}/{m}-{v}{s}"
- egg_tpl = "#egg={m}=={v}"
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
filename = dl_tpl.format(m=model_name, v=version, s=suffix)
- if sdist:
- filename += egg_tpl.format(m=model_name, v=version)
return filename
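With the `#egg=` fragment removed, the download filename is just the template plus suffix. A self-contained sketch of the updated function; the suffix constants are assumptions about the values in `spacy/cli/_util.py`:

```python
SDIST_SUFFIX = ".tar.gz"            # assumed value from spacy/cli/_util.py
WHEEL_SUFFIX = "-py3-none-any.whl"  # assumed value from spacy/cli/_util.py

def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
    dl_tpl = "{m}-{v}/{m}-{v}{s}"
    suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
    return dl_tpl.format(m=model_name, v=version, s=suffix)

print(get_model_filename("en_core_web_sm", "3.6.0"))
# -> en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl
```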
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 8f3d6b859..6235b658d 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -1,16 +1,16 @@
-from typing import Optional, List, Dict, Any, Union
-from wasabi import Printer
-from pathlib import Path
import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
import srsly
from thinc.api import fix_random_seed
+from wasabi import Printer
-from ..training import Corpus
-from ..tokens import Doc
-from ._util import app, Arg, Opt, setup_gpu, import_code, benchmark_cli
+from .. import displacy, util
from ..scorer import Scorer
-from .. import util
-from .. import displacy
+from ..tokens import Doc
+from ..training import Corpus
+from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu
@benchmark_cli.command(
@@ -27,6 +27,7 @@ def evaluate_cli(
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
+ per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
# fmt: on
):
"""
@@ -50,6 +51,7 @@ def evaluate_cli(
gold_preproc=gold_preproc,
displacy_path=displacy_path,
displacy_limit=displacy_limit,
+ per_component=per_component,
silent=False,
)
@@ -64,6 +66,7 @@ def evaluate(
displacy_limit: int = 25,
silent: bool = True,
spans_key: str = "sc",
+ per_component: bool = False,
) -> Dict[str, Any]:
msg = Printer(no_print=silent, pretty=not silent)
fix_random_seed()
@@ -78,50 +81,61 @@ def evaluate(
corpus = Corpus(data_path, gold_preproc=gold_preproc)
nlp = util.load_model(model)
dev_dataset = list(corpus(nlp))
- scores = nlp.evaluate(dev_dataset)
- metrics = {
- "TOK": "token_acc",
- "TAG": "tag_acc",
- "POS": "pos_acc",
- "MORPH": "morph_acc",
- "LEMMA": "lemma_acc",
- "UAS": "dep_uas",
- "LAS": "dep_las",
- "NER P": "ents_p",
- "NER R": "ents_r",
- "NER F": "ents_f",
- "TEXTCAT": "cats_score",
- "SENT P": "sents_p",
- "SENT R": "sents_r",
- "SENT F": "sents_f",
- "SPAN P": f"spans_{spans_key}_p",
- "SPAN R": f"spans_{spans_key}_r",
- "SPAN F": f"spans_{spans_key}_f",
- "SPEED": "speed",
- }
- results = {}
- data = {}
- for metric, key in metrics.items():
- if key in scores:
- if key == "cats_score":
- metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
- if isinstance(scores[key], (int, float)):
- if key == "speed":
- results[metric] = f"{scores[key]:.0f}"
+ scores = nlp.evaluate(dev_dataset, per_component=per_component)
+ if per_component:
+ data = scores
+ if output is None:
+ msg.warn(
+ "The per-component option is enabled but there is no output JSON file provided to save the scores to."
+ )
+ else:
+ msg.info("Per-component scores will be saved to output JSON file.")
+ else:
+ metrics = {
+ "TOK": "token_acc",
+ "TAG": "tag_acc",
+ "POS": "pos_acc",
+ "MORPH": "morph_acc",
+ "LEMMA": "lemma_acc",
+ "UAS": "dep_uas",
+ "LAS": "dep_las",
+ "NER P": "ents_p",
+ "NER R": "ents_r",
+ "NER F": "ents_f",
+ "TEXTCAT": "cats_score",
+ "SENT P": "sents_p",
+ "SENT R": "sents_r",
+ "SENT F": "sents_f",
+ "SPAN P": f"spans_{spans_key}_p",
+ "SPAN R": f"spans_{spans_key}_r",
+ "SPAN F": f"spans_{spans_key}_f",
+ "SPEED": "speed",
+ }
+ results = {}
+ data = {}
+ for metric, key in metrics.items():
+ if key in scores:
+ if key == "cats_score":
+ metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
+ if isinstance(scores[key], (int, float)):
+ if key == "speed":
+ results[metric] = f"{scores[key]:.0f}"
+ else:
+ results[metric] = f"{scores[key]*100:.2f}"
else:
- results[metric] = f"{scores[key]*100:.2f}"
- else:
- results[metric] = "-"
- data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
+ results[metric] = "-"
+ data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
- msg.table(results, title="Results")
- data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
+ msg.table(results, title="Results")
+ data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
if displacy_path:
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit]))
render_deps = "parser" in factory_names
render_ents = "ner" in factory_names
+ render_spans = "spancat" in factory_names
+
render_parses(
docs,
displacy_path,
@@ -129,6 +143,7 @@ def evaluate(
limit=displacy_limit,
deps=render_deps,
ents=render_ents,
+ spans=render_spans,
)
msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
@@ -182,6 +197,7 @@ def render_parses(
limit: int = 250,
deps: bool = True,
ents: bool = True,
+ spans: bool = True,
):
docs[0].user_data["title"] = model_name
if ents:
@@ -195,6 +211,11 @@ def render_parses(
with (output_path / "parses.html").open("w", encoding="utf8") as file_:
file_.write(html)
+ if spans:
+ html = displacy.render(docs[:limit], style="span", page=True)
+ with (output_path / "spans.html").open("w", encoding="utf8") as file_:
+ file_.write(html)
+
def print_prf_per_type(
msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str
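The new `--per-component` flag (`-P`) returns scores keyed by pipe name instead of the flat metric table, and is meant to be paired with `--output`. Calling the updated function directly, with hypothetical model and data paths:

```python
from pathlib import Path

from spacy.cli.evaluate import evaluate

scores = evaluate(
    "en_core_web_sm",           # model name or path
    Path("dev.spacy"),          # evaluation data (hypothetical path)
    output=Path("scores.json"),
    per_component=True,         # dict keyed by component instead of flat metrics
)
```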
diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py
index 6d591053d..7aa32c0c6 100644
--- a/spacy/cli/find_threshold.py
+++ b/spacy/cli/find_threshold.py
@@ -1,17 +1,17 @@
import functools
+import logging
import operator
from pathlib import Path
-import logging
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
import numpy
import wasabi.tables
-from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer
-from ..errors import Errors
-from ..training import Corpus
-from ._util import app, Arg, Opt, import_code, setup_gpu
from .. import util
+from ..errors import Errors
+from ..pipeline import MultiLabel_TextCategorizer, TextCategorizer
+from ..training import Corpus
+from ._util import Arg, Opt, app, import_code, setup_gpu
_DEFAULTS = {
"n_trials": 11,
diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index d82bf3fbc..8bfc6b54f 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -1,15 +1,15 @@
-from typing import Optional, Dict, Any, Union, List
-import platform
import json
+import platform
from pathlib import Path
-from wasabi import Printer, MarkdownRenderer
-import srsly
+from typing import Any, Dict, List, Optional, Union
-from ._util import app, Arg, Opt, string_to_list
-from .download import get_model_filename, get_latest_version
-from .. import util
-from .. import about
+import srsly
+from wasabi import MarkdownRenderer, Printer
+
+from .. import about, util
from ..compat import importlib_metadata
+from ._util import Arg, Opt, app, string_to_list
+from .download import get_latest_version, get_model_filename
@app.command("info")
diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index b634caa4c..a7c03d00f 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -1,19 +1,26 @@
-from typing import Optional, List, Tuple
+import re
from enum import Enum
from pathlib import Path
-from wasabi import Printer, diff_strings
-from thinc.api import Config
+from typing import List, Optional, Tuple
+
import srsly
-import re
from jinja2 import Template
+from thinc.api import Config
+from wasabi import Printer, diff_strings
from .. import util
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
from ..schemas import RecommendationSchema
from ..util import SimpleFrozenList
-from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
-from ._util import string_to_list, import_code
-
+from ._util import (
+ COMMAND,
+ Arg,
+ Opt,
+ import_code,
+ init_cli,
+ show_validation_error,
+ string_to_list,
+)
ROOT = Path(__file__).parent / "templates"
TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index d53a61b8e..13202cb60 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -1,15 +1,23 @@
-from typing import Optional
import logging
from pathlib import Path
-from wasabi import msg
-import typer
+from typing import Optional
+
import srsly
+import typer
+from wasabi import msg
from .. import util
-from ..training.initialize import init_nlp, convert_vectors
from ..language import Language
-from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
+from ..training.initialize import convert_vectors, init_nlp
+from ._util import (
+ Arg,
+ Opt,
+ import_code,
+ init_cli,
+ parse_config_overrides,
+ setup_gpu,
+ show_validation_error,
+)
@init_cli.command("vectors")
@@ -24,6 +32,7 @@ def init_vectors_cli(
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
+ attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
# fmt: on
):
"""Convert word vectors for use with spaCy. Will export an nlp object that
@@ -42,6 +51,7 @@ def init_vectors_cli(
prune=prune,
name=name,
mode=mode,
+ attr=attr,
)
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
nlp.to_disk(output_dir)
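The new `--attr` option lets vectors be keyed on a token attribute other than `ORTH` (for example `LOWER` for case-insensitive lookups). A sketch of the equivalent direct call to the underlying helper, assuming `convert_vectors` takes the keyword arguments suggested by the call site above; paths are illustrative:

```python
from pathlib import Path

import spacy
from spacy.training.initialize import convert_vectors

nlp = spacy.blank("en")
# Equivalent of: python -m spacy init vectors en vectors.txt ./vectors_model --attr LOWER
convert_vectors(nlp, Path("vectors.txt"), truncate=0, prune=-1, mode="default", attr="LOWER")
nlp.to_disk("./vectors_model")
```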
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 6351f28eb..4545578e6 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -1,18 +1,18 @@
-from typing import Optional, Union, Any, Dict, List, Tuple, cast
-import shutil
-from pathlib import Path
-from wasabi import Printer, MarkdownRenderer, get_raw_input
-from thinc.api import Config
-from collections import defaultdict
-from catalogue import RegistryError
-import srsly
-import sys
import re
+import shutil
+import sys
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
-from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
-from ..schemas import validate, ModelMetaSchema
-from .. import util
-from .. import about
+import srsly
+from catalogue import RegistryError
+from thinc.api import Config
+from wasabi import MarkdownRenderer, Printer, get_raw_input
+
+from .. import about, util
+from ..schemas import ModelMetaSchema, validate
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
@app.command("package")
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 381d589cf..446c40510 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -1,13 +1,21 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import typer
import re
+from pathlib import Path
+from typing import Optional
+
+import typer
+from wasabi import msg
-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
from ..training.pretrain import pretrain
from ..util import load_config
+from ._util import (
+ Arg,
+ Opt,
+ app,
+ import_code,
+ parse_config_overrides,
+ setup_gpu,
+ show_validation_error,
+)
@app.command(
@@ -23,6 +31,7 @@ def pretrain_cli(
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+ skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
# fmt: on
):
"""
@@ -74,6 +83,7 @@ def pretrain_cli(
epoch_resume=epoch_resume,
use_gpu=use_gpu,
silent=False,
+ skip_last=skip_last,
)
msg.good("Successfully finished pretrain")
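`--skip-last` (`-L`) suppresses writing `model-last.bin` at the end of pretraining, keeping only the periodic checkpoints. A sketch of the equivalent direct call, assuming `pretrain`'s keyword arguments match the call site above; paths are illustrative:

```python
from pathlib import Path

from spacy.training.pretrain import pretrain
from spacy.util import load_config

config = load_config(Path("config.cfg"), interpolate=False)
pretrain(
    config,
    Path("pretrain_output"),
    use_gpu=-1,
    silent=False,
    skip_last=True,  # don't save model-last.bin
)
```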
diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py
index 3c282c73d..e1f720327 100644
--- a/spacy/cli/profile.py
+++ b/spacy/cli/profile.py
@@ -1,17 +1,18 @@
-from typing import Optional, Sequence, Union, Iterator
-import tqdm
-from pathlib import Path
-import srsly
import cProfile
+import itertools
import pstats
import sys
-import itertools
-from wasabi import msg, Printer
-import typer
+from pathlib import Path
+from typing import Iterator, Optional, Sequence, Union
+
+import srsly
+import tqdm
+import typer
+from wasabi import Printer, msg
-from ._util import app, debug_cli, Arg, Opt, NAME
from ..language import Language
from ..util import load_model
+from ._util import NAME, Arg, Opt, app, debug_cli
@debug_cli.command("profile")
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 9481e53be..e3ca73cfb 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #}
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
-{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
+{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
[paths]
train = null
dev = null
@@ -28,7 +28,7 @@ lang = "{{ lang }}"
tok2vec/transformer. #}
{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
-{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
+{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "span_finder" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
{%- else -%}
{%- set full_pipeline = components -%}
@@ -127,6 +127,30 @@ grad_factor = 1.0
@layers = "reduce_mean.v1"
{% endif -%}
+{% if "span_finder" in components -%}
+[components.span_finder]
+factory = "span_finder"
+max_length = null
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.span_finder.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{% endif -%}
+
{% if "spancat" in components -%}
[components.spancat]
factory = "spancat"
@@ -392,6 +416,27 @@ nO = null
width = ${components.tok2vec.model.encode.width}
{% endif %}
+{% if "span_finder" in components %}
+[components.span_finder]
+factory = "span_finder"
+max_length = null
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+{% endif %}
+
{% if "spancat" in components %}
[components.spancat]
factory = "spancat"
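The template now wires up the new `span_finder` component in both the transformer and CPU branches. Outside the quickstart, it can be added to a pipeline directly, assuming the `span_finder` factory ships with a usable default config in v3.6:

```python
import spacy

nlp = spacy.blank("en")
# span_finder learns to propose candidate spans, stored under spans_key
# ("sc" by default), for a downstream span categorizer to label.
nlp.add_pipe("span_finder", config={"spans_key": "sc"})
print(nlp.pipe_names)  # ["span_finder"]
```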
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index cc22cbba6..8bdabd39c 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -1,15 +1,23 @@
-from typing import Optional, Dict, Any, Union
-from pathlib import Path
-from wasabi import msg
-import typer
import logging
import sys
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+import typer
+from wasabi import msg
-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
-from ..training.loop import train as train_nlp
-from ..training.initialize import init_nlp
from .. import util
+from ..training.initialize import init_nlp
+from ..training.loop import train as train_nlp
+from ._util import (
+ Arg,
+ Opt,
+ app,
+ import_code,
+ parse_config_overrides,
+ setup_gpu,
+ show_validation_error,
+)
@app.command(
diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py
index a918e9a39..0426f05fd 100644
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@@ -1,14 +1,21 @@
-from typing import Tuple
-from pathlib import Path
import sys
-import requests
-from wasabi import msg, Printer
import warnings
+from pathlib import Path
+from typing import Tuple
+
+import requests
+from wasabi import Printer, msg
-from ._util import app
from .. import about
-from ..util import get_package_version, get_installed_models, get_minor_version
-from ..util import get_package_path, get_model_meta, is_compatible_version
+from ..util import (
+ get_installed_models,
+ get_minor_version,
+ get_model_meta,
+ get_package_path,
+ get_package_version,
+ is_compatible_version,
+)
+from ._util import app
@app.command("validate")
diff --git a/spacy/compat.py b/spacy/compat.py
index 89132735d..522fa30dd 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -1,5 +1,6 @@
"""Helpers for Python and platform compatibility."""
import sys
+
from thinc.util import copy_array
try:
diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py
index ea6bba2c9..bde2d04fe 100644
--- a/spacy/displacy/__init__.py
+++ b/spacy/displacy/__init__.py
@@ -4,15 +4,13 @@ spaCy's built in visualization suite for dependencies and named entities.
DOCS: https://spacy.io/api/top-level#displacy
USAGE: https://spacy.io/usage/visualizers
"""
-from typing import Union, Iterable, Optional, Dict, Any, Callable
import warnings
+from typing import Any, Callable, Dict, Iterable, Optional, Union
-from .render import DependencyRenderer, EntityRenderer, SpanRenderer
-from ..tokens import Doc, Span
from ..errors import Errors, Warnings
-from ..util import is_in_jupyter
-from ..util import find_available_port
-
+from ..tokens import Doc, Span
+from ..util import find_available_port, is_in_jupyter
+from .render import DependencyRenderer, EntityRenderer, SpanRenderer
_html = {}
RENDER_WRAPPER = None
@@ -68,7 +66,7 @@ def render(
if jupyter or (jupyter is None and is_in_jupyter()):
# return HTML rendered by IPython display()
# See #4840 for details on span wrapper to disable mathjax
- from IPython.core.display import display, HTML
+ from IPython.core.display import HTML, display
return display(HTML('{}'.format(html)))
return html
@@ -125,13 +123,17 @@ def app(environ, start_response):
return [res]
-def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
+def parse_deps(
+ orig_doc: Union[Doc, Span], options: Dict[str, Any] = {}
+) -> Dict[str, Any]:
"""Generate dependency parse in {'words': [], 'arcs': []} format.
- orig_doc (Doc): Document to parse.
+ orig_doc (Union[Doc, Span]): Document to parse.
options (Dict[str, Any]): Dependency parse specific visualisation options.
RETURNS (dict): Generated dependency parse keyed by words and arcs.
"""
+ if isinstance(orig_doc, Span):
+ orig_doc = orig_doc.as_doc()
doc = Doc(orig_doc.vocab).from_bytes(
orig_doc.to_bytes(exclude=["user_data", "user_hooks"])
)
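`parse_deps` now accepts a `Span` and converts it with `Span.as_doc()`, so a single sentence can be parsed for visualization without manually rebuilding a `Doc`. For example, assuming `en_core_web_sm` is installed:

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("She ate the pizza. Then she left.")
sent = next(doc.sents)              # a Span, not a Doc
parsed = displacy.parse_deps(sent)  # previously required a Doc
print(parsed["words"][:3])
```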
diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py
index f74222dc2..86869e3b8 100644
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@@ -1,15 +1,29 @@
-from typing import Any, Dict, List, Optional, Tuple, Union
-import uuid
import itertools
+import uuid
+from typing import Any, Dict, List, Optional, Tuple, Union
from ..errors import Errors
from ..util import escape_html, minify_html, registry
-from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS
-from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS
-from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN
-from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL
-from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS
-from .templates import TPL_TITLE
+from .templates import (
+ TPL_DEP_ARCS,
+ TPL_DEP_SVG,
+ TPL_DEP_WORDS,
+ TPL_DEP_WORDS_LEMMA,
+ TPL_ENT,
+ TPL_ENT_RTL,
+ TPL_ENTS,
+ TPL_FIGURE,
+ TPL_KB_LINK,
+ TPL_PAGE,
+ TPL_SPAN,
+ TPL_SPAN_RTL,
+ TPL_SPAN_SLICE,
+ TPL_SPAN_SLICE_RTL,
+ TPL_SPAN_START,
+ TPL_SPAN_START_RTL,
+ TPL_SPANS,
+ TPL_TITLE,
+)
DEFAULT_LANG = "en"
DEFAULT_DIR = "ltr"
diff --git a/spacy/errors.py b/spacy/errors.py
index 526c4d0d6..24b60f8a3 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,4 +1,5 @@
import warnings
+
from .compat import Literal
@@ -215,6 +216,9 @@ class Warnings(metaclass=ErrorsWithCodes):
W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
+ W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
+ "key attribute for vectors, configure it through Vectors(attr=) or "
+ "'spacy init vectors --attr'")
class Errors(metaclass=ErrorsWithCodes):
@@ -549,8 +553,8 @@ class Errors(metaclass=ErrorsWithCodes):
"during training, make sure to include it in 'annotating components'")
# New errors added in v3.x
- E850 = ("The PretrainVectors objective currently only supports default "
- "vectors, not {mode} vectors.")
+ E850 = ("The PretrainVectors objective currently only supports default or "
+ "floret vectors, not {mode} vectors.")
E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
"but found value of '{val}'.")
E853 = ("Unsupported component factory name '{name}'. The character '.' is "
@@ -736,8 +740,8 @@ class Errors(metaclass=ErrorsWithCodes):
"model from a shortcut, which is obsolete as of spaCy v3.0. To "
"load the model, use its full name instead:\n\n"
"nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
- "models, see the models directory: https://spacy.io/models. If you "
- "want to create a blank model, use spacy.blank: "
+ "models, see the models directory: https://spacy.io/models and if "
+ "you want to create a blank model, use spacy.blank: "
"nlp = spacy.blank(\"{name}\")")
E942 = ("Executing `after_{name}` callback failed. Expected the function to "
"return an initialized nlp object but got: {value}. Maybe "
@@ -968,6 +972,13 @@ class Errors(metaclass=ErrorsWithCodes):
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
"or use `auto_select_port=True` to pick an available port automatically.")
E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
+ E1052 = ("Unable to copy spans: the character offsets for the span at "
+ "index {i} in the span group do not align with the tokenization "
+ "in the target doc.")
+ E1053 = ("Both 'min_length' and 'max_length' should be larger than 0, but found"
+ " 'min_length': {min_length}, 'max_length': {max_length}")
+ E1054 = ("The text, including whitespace, must match between reference and "
+ "predicted docs when training {component}.")
# Deprecated model shortcuts, only used in errors and warnings
diff --git a/spacy/glossary.py b/spacy/glossary.py
index d2240fbba..1f628698b 100644
--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@@ -1,4 +1,5 @@
import warnings
+
from .errors import Warnings
diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py
index 1d70a9b34..3ce3e4c33 100644
--- a/spacy/kb/__init__.py
+++ b/spacy/kb/__init__.py
@@ -1,3 +1,3 @@
+from .candidate import Candidate, get_candidates, get_candidates_batch
from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB
-from .candidate import Candidate, get_candidates, get_candidates_batch
diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd
index 942ce9dd0..9fc4c4e9d 100644
--- a/spacy/kb/candidate.pxd
+++ b/spacy/kb/candidate.pxd
@@ -1,6 +1,8 @@
-from .kb cimport KnowledgeBase
from libcpp.vector cimport vector
+
from ..typedefs cimport hash_t
+from .kb cimport KnowledgeBase
+
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
cdef class Candidate:
diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx
index c89efeb03..4cd734f43 100644
--- a/spacy/kb/candidate.pyx
+++ b/spacy/kb/candidate.pyx
@@ -1,9 +1,12 @@
# cython: infer_types=True, profile=True
from typing import Iterable
+
from .kb cimport KnowledgeBase
+
from ..tokens import Span
+
cdef class Candidate:
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
diff --git a/spacy/kb/kb.pxd b/spacy/kb/kb.pxd
index 1adeef8ae..263469546 100644
--- a/spacy/kb/kb.pxd
+++ b/spacy/kb/kb.pxd
@@ -2,8 +2,10 @@
from cymem.cymem cimport Pool
from libc.stdint cimport int64_t
+
from ..vocab cimport Vocab
+
cdef class KnowledgeBase:
cdef Pool mem
cdef readonly Vocab vocab
diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx
index ce4bc0138..a88e18e1f 100644
--- a/spacy/kb/kb.pyx
+++ b/spacy/kb/kb.pyx
@@ -2,12 +2,13 @@
from pathlib import Path
from typing import Iterable, Tuple, Union
+
from cymem.cymem cimport Pool
-from .candidate import Candidate
+from ..errors import Errors
from ..tokens import Span
from ..util import SimpleFrozenList
-from ..errors import Errors
+from .candidate import Candidate
cdef class KnowledgeBase:
diff --git a/spacy/kb/kb_in_memory.pxd b/spacy/kb/kb_in_memory.pxd
index 825a6bde9..08ec6b2a3 100644
--- a/spacy/kb/kb_in_memory.pxd
+++ b/spacy/kb/kb_in_memory.pxd
@@ -1,11 +1,11 @@
"""Knowledge-base for entity or concept linking."""
-from preshed.maps cimport PreshMap
-from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE
+from libcpp.vector cimport vector
+from preshed.maps cimport PreshMap
+from ..structs cimport AliasC, KBEntryC
from ..typedefs cimport hash_t
-from ..structs cimport KBEntryC, AliasC
from .kb cimport KnowledgeBase
ctypedef vector[KBEntryC] entry_vec
diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx
index 2a74d047b..e991f7720 100644
--- a/spacy/kb/kb_in_memory.pyx
+++ b/spacy/kb/kb_in_memory.pyx
@@ -1,23 +1,28 @@
# cython: infer_types=True, profile=True
-from typing import Iterable, Callable, Dict, Any, Union
+from typing import Any, Callable, Dict, Iterable, Union
import srsly
-from preshed.maps cimport PreshMap
-from cpython.exc cimport PyErr_SetFromErrno
-from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
-from libc.stdint cimport int32_t, int64_t
-from libcpp.vector cimport vector
-from pathlib import Path
+from cpython.exc cimport PyErr_SetFromErrno
+from libc.stdint cimport int32_t, int64_t
+from libc.stdio cimport fclose, feof, fopen, fread, fseek, fwrite
+from libcpp.vector cimport vector
+from preshed.maps cimport PreshMap
+
import warnings
+from pathlib import Path
from ..tokens import Span
+
from ..typedefs cimport hash_t
-from ..errors import Errors, Warnings
+
from .. import util
+from ..errors import Errors, Warnings
from ..util import SimpleFrozenList, ensure_path
+
from ..vocab cimport Vocab
from .kb cimport KnowledgeBase
+
from .candidate import Candidate as Candidate
diff --git a/spacy/lang/af/__init__.py b/spacy/lang/af/__init__.py
index 553fcbf4c..8bd73c7ad 100644
--- a/spacy/lang/af/__init__.py
+++ b/spacy/lang/af/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class AfrikaansDefaults(BaseDefaults):
diff --git a/spacy/lang/am/__init__.py b/spacy/lang/am/__init__.py
index ddae556d6..284823eaa 100644
--- a/spacy/lang/am/__init__.py
+++ b/spacy/lang/am/__init__.py
@@ -1,12 +1,11 @@
-from .stop_words import STOP_WORDS
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
-
+from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc
class AmharicDefaults(BaseDefaults):
diff --git a/spacy/lang/am/punctuation.py b/spacy/lang/am/punctuation.py
index 555a179fa..87447b054 100644
--- a/spacy/lang/am/punctuation.py
+++ b/spacy/lang/am/punctuation.py
@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA_UPPER,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ UNITS,
+)
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
diff --git a/spacy/lang/am/tokenizer_exceptions.py b/spacy/lang/am/tokenizer_exceptions.py
index 9472fe918..1ccf996ca 100644
--- a/spacy/lang/am/tokenizer_exceptions.py
+++ b/spacy/lang/am/tokenizer_exceptions.py
@@ -1,5 +1,4 @@
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH
_exc = {}
diff --git a/spacy/lang/ar/__init__.py b/spacy/lang/ar/__init__.py
index 18c1f90ed..d50b0722c 100644
--- a/spacy/lang/ar/__init__.py
+++ b/spacy/lang/ar/__init__.py
@@ -1,8 +1,8 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults
class ArabicDefaults(BaseDefaults):
diff --git a/spacy/lang/ar/punctuation.py b/spacy/lang/ar/punctuation.py
index f30204c02..cf03fc68e 100644
--- a/spacy/lang/ar/punctuation.py
+++ b/spacy/lang/ar/punctuation.py
@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA_UPPER,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ UNITS,
+)
_suffixes = (
LIST_PUNCT
diff --git a/spacy/lang/ar/tokenizer_exceptions.py b/spacy/lang/ar/tokenizer_exceptions.py
index 7c385bef8..eb16876f5 100644
--- a/spacy/lang/ar/tokenizer_exceptions.py
+++ b/spacy/lang/ar/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/az/__init__.py b/spacy/lang/az/__init__.py
index 476898364..32949aa3e 100644
--- a/spacy/lang/az/__init__.py
+++ b/spacy/lang/az/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class AzerbaijaniDefaults(BaseDefaults):
diff --git a/spacy/lang/az/lex_attrs.py b/spacy/lang/az/lex_attrs.py
index 73a5e2762..96fb7f020 100644
--- a/spacy/lang/az/lex_attrs.py
+++ b/spacy/lang/az/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
# Eleven, twelve etc. are written separate: on bir, on iki
_num_words = [
diff --git a/spacy/lang/bg/__init__.py b/spacy/lang/bg/__init__.py
index c9176b946..acca63ba1 100644
--- a/spacy/lang/bg/__init__.py
+++ b/spacy/lang/bg/__init__.py
@@ -1,12 +1,14 @@
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...util import update_exc
+from ..punctuation import (
+ COMBINING_DIACRITICS_TOKENIZER_INFIXES,
+ COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
+)
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc
class BulgarianDefaults(BaseDefaults):
diff --git a/spacy/lang/bg/lex_attrs.py b/spacy/lang/bg/lex_attrs.py
index bba3c74cd..0b7942aec 100644
--- a/spacy/lang/bg/lex_attrs.py
+++ b/spacy/lang/bg/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"нула",
"едно",
diff --git a/spacy/lang/bg/tokenizer_exceptions.py b/spacy/lang/bg/tokenizer_exceptions.py
index 0f484b778..89d466daf 100644
--- a/spacy/lang/bg/tokenizer_exceptions.py
+++ b/spacy/lang/bg/tokenizer_exceptions.py
@@ -4,8 +4,7 @@ References:
(countries, occupations, fields of studies and more).
"""
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH
_exc = {}
diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py
index 6d0331e00..6a5d37f5b 100644
--- a/spacy/lang/bn/__init__.py
+++ b/spacy/lang/bn/__init__.py
@@ -1,10 +1,12 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
-from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
+
+from ...language import BaseDefaults, Language
from ...pipeline import Lemmatizer
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class BengaliDefaults(BaseDefaults):
diff --git a/spacy/lang/bn/punctuation.py b/spacy/lang/bn/punctuation.py
index becfe8d2a..ddb91cef1 100644
--- a/spacy/lang/bn/punctuation.py
+++ b/spacy/lang/bn/punctuation.py
@@ -1,6 +1,14 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ UNITS,
+)
_currency = r"\$¢£€¥฿৳"
_quotes = CONCAT_QUOTES.replace("'", "")
diff --git a/spacy/lang/bn/tokenizer_exceptions.py b/spacy/lang/bn/tokenizer_exceptions.py
index e666522b8..016bf0fc5 100644
--- a/spacy/lang/bn/tokenizer_exceptions.py
+++ b/spacy/lang/bn/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py
index a3def660d..8b2f3e85a 100755
--- a/spacy/lang/ca/__init__.py
+++ b/spacy/lang/ca/__init__.py
@@ -1,14 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from ...language import BaseDefaults, Language
from .lemmatizer import CatalanLemmatizer
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class CatalanDefaults(BaseDefaults):
diff --git a/spacy/lang/ca/lex_attrs.py b/spacy/lang/ca/lex_attrs.py
index be8b7a6ea..3e99da0e0 100644
--- a/spacy/lang/ca/lex_attrs.py
+++ b/spacy/lang/ca/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"zero",
"un",
diff --git a/spacy/lang/ca/punctuation.py b/spacy/lang/ca/punctuation.py
index 8e2f09828..6914f67a7 100755
--- a/spacy/lang/ca/punctuation.py
+++ b/spacy/lang/ca/punctuation.py
@@ -1,9 +1,18 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import LIST_CURRENCY
-from ..char_classes import CURRENCY
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
-from ..char_classes import merge_chars, _units
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ PUNCT,
+ _units,
+ merge_chars,
+)
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
diff --git a/spacy/lang/ca/syntax_iterators.py b/spacy/lang/ca/syntax_iterators.py
index 917e07c93..16a4c6a81 100644
--- a/spacy/lang/ca/syntax_iterators.py
+++ b/spacy/lang/ca/syntax_iterators.py
@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN
+from typing import Iterator, Tuple, Union
+
from ...errors import Errors
+from ...symbols import NOUN, PROPN
+from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
diff --git a/spacy/lang/ca/tokenizer_exceptions.py b/spacy/lang/ca/tokenizer_exceptions.py
index b261b3498..67165780e 100755
--- a/spacy/lang/ca/tokenizer_exceptions.py
+++ b/spacy/lang/ca/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/cs/__init__.py b/spacy/lang/cs/__init__.py
index 3e70e4078..9ea60afdf 100644
--- a/spacy/lang/cs/__init__.py
+++ b/spacy/lang/cs/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class CzechDefaults(BaseDefaults):
diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py
index e148a7b4f..372f372dd 100644
--- a/spacy/lang/da/__init__.py
+++ b/spacy/lang/da/__init__.py
@@ -1,9 +1,9 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from ...language import BaseDefaults, Language
+from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class DanishDefaults(BaseDefaults):
diff --git a/spacy/lang/da/lex_attrs.py b/spacy/lang/da/lex_attrs.py
index 403af686c..8e0420912 100644
--- a/spacy/lang/da/lex_attrs.py
+++ b/spacy/lang/da/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
# Source http://fjern-uv.dk/tal.php
_num_words = """nul
en et to tre fire fem seks syv otte ni ti
diff --git a/spacy/lang/da/punctuation.py b/spacy/lang/da/punctuation.py
index e050ab7aa..f70fe3d64 100644
--- a/spacy/lang/da/punctuation.py
+++ b/spacy/lang/da/punctuation.py
@@ -1,8 +1,13 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
from ..punctuation import TOKENIZER_SUFFIXES
-
_quotes = CONCAT_QUOTES.replace("'", "")
_infixes = (
diff --git a/spacy/lang/da/syntax_iterators.py b/spacy/lang/da/syntax_iterators.py
index a0b70f004..60224f0b1 100644
--- a/spacy/lang/da/syntax_iterators.py
+++ b/spacy/lang/da/syntax_iterators.py
@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN, PRON, VERB, AUX
+from typing import Iterator, Tuple, Union
+
from ...errors import Errors
+from ...symbols import AUX, NOUN, PRON, PROPN, VERB
+from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py
index ce25c546b..649d12022 100644
--- a/spacy/lang/da/tokenizer_exceptions.py
+++ b/spacy/lang/da/tokenizer_exceptions.py
@@ -2,10 +2,9 @@
Tokenizer Exceptions.
Source: https://forkortelse.dk/ and various others.
"""
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index 65863c098..4f45b2357 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -1,8 +1,8 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class GermanDefaults(BaseDefaults):
diff --git a/spacy/lang/de/punctuation.py b/spacy/lang/de/punctuation.py
index 69d402237..862207649 100644
--- a/spacy/lang/de/punctuation.py
+++ b/spacy/lang/de/punctuation.py
@@ -1,9 +1,18 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
-from ..char_classes import CURRENCY, UNITS, PUNCT
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ PUNCT,
+ UNITS,
+)
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
-
_prefixes = ["``"] + BASE_TOKENIZER_PREFIXES
_suffixes = (
diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py
index e80504998..544fe299c 100644
--- a/spacy/lang/de/syntax_iterators.py
+++ b/spacy/lang/de/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py
index 21d99cffe..3f1aeeccd 100644
--- a/spacy/lang/de/tokenizer_exceptions.py
+++ b/spacy/lang/de/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {
"auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}],
diff --git a/spacy/lang/dsb/__init__.py b/spacy/lang/dsb/__init__.py
index c66092a0c..096eced19 100644
--- a/spacy/lang/dsb/__init__.py
+++ b/spacy/lang/dsb/__init__.py
@@ -1,6 +1,6 @@
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class LowerSorbianDefaults(BaseDefaults):
diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py
index 53dd9be8e..00e52bd97 100644
--- a/spacy/lang/el/__init__.py
+++ b/spacy/lang/el/__init__.py
@@ -1,13 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
from .lemmatizer import GreekLemmatizer
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class GreekDefaults(BaseDefaults):
diff --git a/spacy/lang/el/get_pos_from_wiktionary.py b/spacy/lang/el/get_pos_from_wiktionary.py
index 369973cc0..10b54d112 100644
--- a/spacy/lang/el/get_pos_from_wiktionary.py
+++ b/spacy/lang/el/get_pos_from_wiktionary.py
@@ -1,5 +1,6 @@
def get_pos_from_wiktionary():
import re
+
from gensim.corpora.wikicorpus import extract_pages
regex = re.compile(r"==={{(\w+)\|el}}===")
diff --git a/spacy/lang/el/punctuation.py b/spacy/lang/el/punctuation.py
index 2d5690407..b8b717bac 100644
--- a/spacy/lang/el/punctuation.py
+++ b/spacy/lang/el/punctuation.py
@@ -1,6 +1,16 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
-from ..char_classes import CONCAT_QUOTES, CURRENCY
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ HYPHENS,
+ LIST_CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+)
_units = (
"km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py
index 18fa46695..31c7dccf7 100644
--- a/spacy/lang/el/syntax_iterators.py
+++ b/spacy/lang/el/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/el/tokenizer_exceptions.py b/spacy/lang/el/tokenizer_exceptions.py
index 0a36d5d2b..41317ba97 100644
--- a/spacy/lang/el/tokenizer_exceptions.py
+++ b/spacy/lang/el/tokenizer_exceptions.py
@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index 876186979..c4bcfb938 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -1,13 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from .punctuation import TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
from .lemmatizer import EnglishLemmatizer
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class EnglishDefaults(BaseDefaults):
diff --git a/spacy/lang/en/punctuation.py b/spacy/lang/en/punctuation.py
index 5d3eb792e..775c6b001 100644
--- a/spacy/lang/en/punctuation.py
+++ b/spacy/lang/en/punctuation.py
@@ -1,5 +1,12 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
_infixes = (
LIST_ELLIPSES
diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py
index 7904e5621..140ae0a5c 100644
--- a/spacy/lang/en/syntax_iterators.py
+++ b/spacy/lang/en/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py
index 7886e28cb..dd3650c18 100644
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@@ -1,8 +1,8 @@
from typing import Dict, List
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
-from ...util import update_exc
+from ...symbols import NORM, ORTH
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc: Dict[str, List[Dict]] = {}
_exclude = [
diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py
index e75955202..bcaed8672 100644
--- a/spacy/lang/es/__init__.py
+++ b/spacy/lang/es/__init__.py
@@ -1,12 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
+
+from ...language import BaseDefaults, Language
from .lemmatizer import SpanishLemmatizer
-from .syntax_iterators import SYNTAX_ITERATORS
+from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class SpanishDefaults(BaseDefaults):
diff --git a/spacy/lang/es/lemmatizer.py b/spacy/lang/es/lemmatizer.py
index ca5fc08c8..44f968347 100644
--- a/spacy/lang/es/lemmatizer.py
+++ b/spacy/lang/es/lemmatizer.py
@@ -1,5 +1,5 @@
-from typing import List, Optional, Tuple
import re
+from typing import List, Optional, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
diff --git a/spacy/lang/es/lex_attrs.py b/spacy/lang/es/lex_attrs.py
index 9d1fa93b8..4c477eaee 100644
--- a/spacy/lang/es/lex_attrs.py
+++ b/spacy/lang/es/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"cero",
"uno",
diff --git a/spacy/lang/es/punctuation.py b/spacy/lang/es/punctuation.py
index e9552371e..3d20518cd 100644
--- a/spacy/lang/es/punctuation.py
+++ b/spacy/lang/es/punctuation.py
@@ -1,8 +1,17 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
-from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
-from ..char_classes import merge_chars
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ LIST_UNITS,
+ PUNCT,
+ merge_chars,
+)
_list_units = [u for u in LIST_UNITS if u != "%"]
_units = merge_chars(" ".join(_list_units))
diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py
index f2ca2a678..96df444a3 100644
--- a/spacy/lang/es/syntax_iterators.py
+++ b/spacy/lang/es/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py
index 74cdc143d..2ea0ed8b7 100644
--- a/spacy/lang/es/tokenizer_exceptions.py
+++ b/spacy/lang/es/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {
"pal": [{ORTH: "pa"}, {ORTH: "l", NORM: "el"}],
diff --git a/spacy/lang/et/__init__.py b/spacy/lang/et/__init__.py
index 274bc1309..9ec7e6006 100644
--- a/spacy/lang/et/__init__.py
+++ b/spacy/lang/et/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class EstonianDefaults(BaseDefaults):
diff --git a/spacy/lang/eu/__init__.py b/spacy/lang/eu/__init__.py
index 3346468bd..81f9c4a18 100644
--- a/spacy/lang/eu/__init__.py
+++ b/spacy/lang/eu/__init__.py
@@ -1,7 +1,7 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class BasqueDefaults(BaseDefaults):
diff --git a/spacy/lang/eu/punctuation.py b/spacy/lang/eu/punctuation.py
index 5d35d0a25..382bfc75c 100644
--- a/spacy/lang/eu/punctuation.py
+++ b/spacy/lang/eu/punctuation.py
@@ -1,4 +1,3 @@
from ..punctuation import TOKENIZER_SUFFIXES
-
_suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py
index 914e4c27d..e5baa8b4a 100644
--- a/spacy/lang/fa/__init__.py
+++ b/spacy/lang/fa/__init__.py
@@ -1,12 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_SUFFIXES
-from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+
+from ...language import BaseDefaults, Language
from ...pipeline import Lemmatizer
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class PersianDefaults(BaseDefaults):
diff --git a/spacy/lang/fa/lex_attrs.py b/spacy/lang/fa/lex_attrs.py
index 99b8e2787..065e81bd6 100644
--- a/spacy/lang/fa/lex_attrs.py
+++ b/spacy/lang/fa/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
MIM = "م"
ZWNJ_O_MIM = "ام"
YE_NUN = "ین"
diff --git a/spacy/lang/fa/punctuation.py b/spacy/lang/fa/punctuation.py
index 4b258c13d..c1ee570ce 100644
--- a/spacy/lang/fa/punctuation.py
+++ b/spacy/lang/fa/punctuation.py
@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA_UPPER,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ UNITS,
+)
_suffixes = (
LIST_PUNCT
diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py
index 8207884b0..3052369a7 100644
--- a/spacy/lang/fa/syntax_iterators.py
+++ b/spacy/lang/fa/syntax_iterators.py
@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN, PRON
+from typing import Iterator, Tuple, Union
+
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
+from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
diff --git a/spacy/lang/fa/tokenizer_exceptions.py b/spacy/lang/fa/tokenizer_exceptions.py
index 30df798ab..3b31b7f67 100644
--- a/spacy/lang/fa/tokenizer_exceptions.py
+++ b/spacy/lang/fa/tokenizer_exceptions.py
@@ -1,5 +1,4 @@
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH
TOKENIZER_EXCEPTIONS = {
".ق ": [{ORTH: ".ق "}],
diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py
index c3a0cf451..3e371b9b5 100644
--- a/spacy/lang/fi/__init__.py
+++ b/spacy/lang/fi/__init__.py
@@ -1,9 +1,9 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class FinnishDefaults(BaseDefaults):
diff --git a/spacy/lang/fi/lex_attrs.py b/spacy/lang/fi/lex_attrs.py
index 4d500cead..9eec41b3d 100644
--- a/spacy/lang/fi/lex_attrs.py
+++ b/spacy/lang/fi/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"nolla",
"yksi",
diff --git a/spacy/lang/fi/punctuation.py b/spacy/lang/fi/punctuation.py
index 6e14dde38..29ddc3111 100644
--- a/spacy/lang/fi/punctuation.py
+++ b/spacy/lang/fi/punctuation.py
@@ -1,8 +1,14 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_HYPHENS
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ LIST_ELLIPSES,
+ LIST_HYPHENS,
+ LIST_ICONS,
+)
from ..punctuation import TOKENIZER_SUFFIXES
-
_quotes = CONCAT_QUOTES.replace("'", "")
DASHES = "|".join(x for x in LIST_HYPHENS if x != "-")
diff --git a/spacy/lang/fi/syntax_iterators.py b/spacy/lang/fi/syntax_iterators.py
index 6b481e51f..6e2216713 100644
--- a/spacy/lang/fi/syntax_iterators.py
+++ b/spacy/lang/fi/syntax_iterators.py
@@ -1,7 +1,8 @@
from typing import Iterator, Tuple, Union
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN, PRON
+
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
+from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py
index 465333b0a..881d5b91d 100644
--- a/spacy/lang/fi/tokenizer_exceptions.py
+++ b/spacy/lang/fi/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py
index 27d2a915e..a8bc7f53e 100644
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -1,15 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from .punctuation import TOKENIZER_SUFFIXES
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
+from ...language import BaseDefaults, Language
from .lemmatizer import FrenchLemmatizer
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKEN_MATCH, TOKENIZER_EXCEPTIONS
class FrenchDefaults(BaseDefaults):
diff --git a/spacy/lang/fr/lex_attrs.py b/spacy/lang/fr/lex_attrs.py
index 811312ad7..9cf508a07 100644
--- a/spacy/lang/fr/lex_attrs.py
+++ b/spacy/lang/fr/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = set(
"""
zero un une deux trois quatre cinq six sept huit neuf dix
diff --git a/spacy/lang/fr/punctuation.py b/spacy/lang/fr/punctuation.py
index 873d01d87..a3b178a2f 100644
--- a/spacy/lang/fr/punctuation.py
+++ b/spacy/lang/fr/punctuation.py
@@ -1,8 +1,16 @@
-from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import CONCAT_QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
-from ..char_classes import merge_chars
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ UNITS,
+ merge_chars,
+)
+from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
ELISION = "' ’".replace(" ", "")
HYPHENS = r"- – — ‐ ‑".replace(" ", "")
diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py
index 5849c40b3..a6bf3d3ca 100644
--- a/spacy/lang/fr/syntax_iterators.py
+++ b/spacy/lang/fr/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py
index 2e88b58cf..fa2062ef9 100644
--- a/spacy/lang/fr/tokenizer_exceptions.py
+++ b/spacy/lang/fr/tokenizer_exceptions.py
@@ -1,11 +1,10 @@
import re
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from .punctuation import ELISION, HYPHENS
-from ..char_classes import ALPHA_LOWER, ALPHA
from ...symbols import ORTH
from ...util import update_exc
-
+from ..char_classes import ALPHA, ALPHA_LOWER
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .punctuation import ELISION, HYPHENS
# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py
index 3be53bc7a..6f9a27a14 100644
--- a/spacy/lang/ga/__init__.py
+++ b/spacy/lang/ga/__init__.py
@@ -2,10 +2,10 @@ from typing import Optional
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
+from ...language import BaseDefaults, Language
from .lemmatizer import IrishLemmatizer
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class IrishDefaults(BaseDefaults):
diff --git a/spacy/lang/ga/lemmatizer.py b/spacy/lang/ga/lemmatizer.py
index 47aec8fd4..c9fbfbc19 100644
--- a/spacy/lang/ga/lemmatizer.py
+++ b/spacy/lang/ga/lemmatizer.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Tuple
+from typing import Dict, List, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py
index 63af65fe9..eb4b413fb 100644
--- a/spacy/lang/ga/tokenizer_exceptions.py
+++ b/spacy/lang/ga/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {
"'acha'n": [{ORTH: "'ach", NORM: "gach"}, {ORTH: "a'n", NORM: "aon"}],
diff --git a/spacy/lang/grc/__init__.py b/spacy/lang/grc/__init__.py
index 019b3802e..ed742f4c5 100644
--- a/spacy/lang/grc/__init__.py
+++ b/spacy/lang/grc/__init__.py
@@ -1,8 +1,8 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
-from ...language import Language, BaseDefaults
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class AncientGreekDefaults(BaseDefaults):
diff --git a/spacy/lang/grc/lex_attrs.py b/spacy/lang/grc/lex_attrs.py
index 0ab15e6fd..33cfca05b 100644
--- a/spacy/lang/grc/lex_attrs.py
+++ b/spacy/lang/grc/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
# CARDINALS
"εἷς",
diff --git a/spacy/lang/grc/punctuation.py b/spacy/lang/grc/punctuation.py
index 8f3589e9a..8e9fc8bf2 100644
--- a/spacy/lang/grc/punctuation.py
+++ b/spacy/lang/grc/punctuation.py
@@ -1,6 +1,15 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
-from ..char_classes import CONCAT_QUOTES
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+)
_prefixes = (
[
diff --git a/spacy/lang/grc/tokenizer_exceptions.py b/spacy/lang/grc/tokenizer_exceptions.py
index bcee70f32..86527ff61 100644
--- a/spacy/lang/grc/tokenizer_exceptions.py
+++ b/spacy/lang/grc/tokenizer_exceptions.py
@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/gu/__init__.py b/spacy/lang/gu/__init__.py
index e6fbc9d18..2f22034c1 100644
--- a/spacy/lang/gu/__init__.py
+++ b/spacy/lang/gu/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class GujaratiDefaults(BaseDefaults):
diff --git a/spacy/lang/he/__init__.py b/spacy/lang/he/__init__.py
index dd2ee478d..07084acf1 100644
--- a/spacy/lang/he/__init__.py
+++ b/spacy/lang/he/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class HebrewDefaults(BaseDefaults):
diff --git a/spacy/lang/hi/__init__.py b/spacy/lang/hi/__init__.py
index 4c8ae446d..980dc31c1 100644
--- a/spacy/lang/hi/__init__.py
+++ b/spacy/lang/hi/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class HindiDefaults(BaseDefaults):
diff --git a/spacy/lang/hi/lex_attrs.py b/spacy/lang/hi/lex_attrs.py
index ee845e8b1..4ecd1db66 100644
--- a/spacy/lang/hi/lex_attrs.py
+++ b/spacy/lang/hi/lex_attrs.py
@@ -1,6 +1,5 @@
+from ...attrs import LIKE_NUM, NORM
from ..norm_exceptions import BASE_NORMS
-from ...attrs import NORM, LIKE_NUM
-
# fmt: off
_stem_suffixes = [
diff --git a/spacy/lang/hr/__init__.py b/spacy/lang/hr/__init__.py
index 30870b522..fd7622a3d 100644
--- a/spacy/lang/hr/__init__.py
+++ b/spacy/lang/hr/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class CroatianDefaults(BaseDefaults):
diff --git a/spacy/lang/hsb/__init__.py b/spacy/lang/hsb/__init__.py
index 034d82319..e8b2ffc9f 100644
--- a/spacy/lang/hsb/__init__.py
+++ b/spacy/lang/hsb/__init__.py
@@ -1,7 +1,7 @@
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults
class UpperSorbianDefaults(BaseDefaults):
diff --git a/spacy/lang/hsb/tokenizer_exceptions.py b/spacy/lang/hsb/tokenizer_exceptions.py
index 4b9a4f98a..cd3bac913 100644
--- a/spacy/lang/hsb/tokenizer_exceptions.py
+++ b/spacy/lang/hsb/tokenizer_exceptions.py
@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = dict()
for exc_data in [
diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py
index 9426bacea..799e6d230 100644
--- a/spacy/lang/hu/__init__.py
+++ b/spacy/lang/hu/__init__.py
@@ -1,7 +1,7 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKEN_MATCH, TOKENIZER_EXCEPTIONS
class HungarianDefaults(BaseDefaults):
diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py
index f827cd677..dbf93c622 100644
--- a/spacy/lang/hu/punctuation.py
+++ b/spacy/lang/hu/punctuation.py
@@ -1,6 +1,14 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CONCAT_QUOTES
-from ..char_classes import CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_ICONS,
+ CONCAT_QUOTES,
+ LIST_ELLIPSES,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ UNITS,
+)
# removing ° from the special icons to keep e.g. 99° as one token
_concat_icons = CONCAT_ICONS.replace("\u00B0", "")
diff --git a/spacy/lang/hu/tokenizer_exceptions.py b/spacy/lang/hu/tokenizer_exceptions.py
index ffaa74f50..3f79b02d2 100644
--- a/spacy/lang/hu/tokenizer_exceptions.py
+++ b/spacy/lang/hu/tokenizer_exceptions.py
@@ -1,10 +1,9 @@
import re
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..punctuation import ALPHA_LOWER, CURRENCY
from ...symbols import ORTH
from ...util import update_exc
-
+from ..punctuation import ALPHA_LOWER, CURRENCY
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/hy/__init__.py b/spacy/lang/hy/__init__.py
index 481eaae0a..e00d4fd11 100644
--- a/spacy/lang/hy/__init__.py
+++ b/spacy/lang/hy/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class ArmenianDefaults(BaseDefaults):
diff --git a/spacy/lang/hy/lex_attrs.py b/spacy/lang/hy/lex_attrs.py
index 9c9c0380c..4c96b8ab5 100644
--- a/spacy/lang/hy/lex_attrs.py
+++ b/spacy/lang/hy/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"զրո",
"մեկ",
diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py
index 0d72cfa9d..93eb3214a 100644
--- a/spacy/lang/id/__init__.py
+++ b/spacy/lang/id/__init__.py
@@ -1,9 +1,9 @@
-from .stop_words import STOP_WORDS
-from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class IndonesianDefaults(BaseDefaults):
diff --git a/spacy/lang/id/lex_attrs.py b/spacy/lang/id/lex_attrs.py
index 3167f4659..5952c4d06 100644
--- a/spacy/lang/id/lex_attrs.py
+++ b/spacy/lang/id/lex_attrs.py
@@ -1,8 +1,7 @@
import unicodedata
-from .punctuation import LIST_CURRENCY
from ...attrs import IS_CURRENCY, LIKE_NUM
-
+from .punctuation import LIST_CURRENCY
_num_words = [
"nol",
diff --git a/spacy/lang/id/punctuation.py b/spacy/lang/id/punctuation.py
index f6c2387d8..8303b8eaa 100644
--- a/spacy/lang/id/punctuation.py
+++ b/spacy/lang/id/punctuation.py
@@ -1,6 +1,5 @@
-from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
-from ..char_classes import ALPHA, merge_chars, split_chars, _currency, _units
-
+from ..char_classes import ALPHA, _currency, _units, merge_chars, split_chars
+from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
_units = (
_units + "s bit Gbps Mbps mbps Kbps kbps ƒ ppi px "
diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py
index fa984d411..027798687 100644
--- a/spacy/lang/id/syntax_iterators.py
+++ b/spacy/lang/id/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/id/tokenizer_exceptions.py b/spacy/lang/id/tokenizer_exceptions.py
index ff77ede9f..8dea4e97f 100644
--- a/spacy/lang/id/tokenizer_exceptions.py
+++ b/spacy/lang/id/tokenizer_exceptions.py
@@ -1,8 +1,7 @@
+from ...symbols import NORM, ORTH
+from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
-from ...util import update_exc
-
# List of abbreviations and acronyms from:
# https://id.wiktionary.org/wiki/Wiktionary:Daftar_singkatan_dan_akronim_bahasa_Indonesia#A
diff --git a/spacy/lang/is/__init__.py b/spacy/lang/is/__init__.py
index 318363beb..af1260045 100644
--- a/spacy/lang/is/__init__.py
+++ b/spacy/lang/is/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class IcelandicDefaults(BaseDefaults):
diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py
index ecf322bd7..14458d811 100644
--- a/spacy/lang/it/__init__.py
+++ b/spacy/lang/it/__init__.py
@@ -1,12 +1,13 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from ...language import Language, BaseDefaults
+from ...language import BaseDefaults, Language
from .lemmatizer import ItalianLemmatizer
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
+from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class ItalianDefaults(BaseDefaults):
diff --git a/spacy/lang/it/lemmatizer.py b/spacy/lang/it/lemmatizer.py
index e44e64e3a..bf869166d 100644
--- a/spacy/lang/it/lemmatizer.py
+++ b/spacy/lang/it/lemmatizer.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Tuple
+from typing import Dict, List, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
diff --git a/spacy/lang/it/punctuation.py b/spacy/lang/it/punctuation.py
index f01ab4f0d..51318b22d 100644
--- a/spacy/lang/it/punctuation.py
+++ b/spacy/lang/it/punctuation.py
@@ -1,8 +1,13 @@
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import ALPHA, HYPHENS, CONCAT_QUOTES
-from ..char_classes import ALPHA_LOWER, ALPHA_UPPER
-
ELISION = "'’"
diff --git a/spacy/lang/it/syntax_iterators.py b/spacy/lang/it/syntax_iterators.py
index f63df3fad..924627648 100644
--- a/spacy/lang/it/syntax_iterators.py
+++ b/spacy/lang/it/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/it/tokenizer_exceptions.py b/spacy/lang/it/tokenizer_exceptions.py
index 42883863b..2e7a5a1a3 100644
--- a/spacy/lang/it/tokenizer_exceptions.py
+++ b/spacy/lang/it/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {
"all'art.": [{ORTH: "all'"}, {ORTH: "art."}],
diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index bf86305fb..0d5f97ac8 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -1,27 +1,27 @@
-from typing import Optional, Union, Dict, Any, Callable
-from pathlib import Path
-import srsly
-from collections import namedtuple
-from thinc.api import Model
import re
+from collections import namedtuple
+from pathlib import Path
+from typing import Any, Callable, Dict, Optional, Union
-from .stop_words import STOP_WORDS
-from .syntax_iterators import SYNTAX_ITERATORS
-from .tag_map import TAG_MAP
-from .tag_orth_map import TAG_ORTH_MAP
-from .tag_bigram_map import TAG_BIGRAM_MAP
+import srsly
+from thinc.api import Model
+
+from ... import util
from ...errors import Errors
-from ...language import Language, BaseDefaults
+from ...language import BaseDefaults, Language
from ...pipeline import Morphologizer
from ...pipeline.morphologizer import DEFAULT_MORPH_MODEL
from ...scorer import Scorer
from ...symbols import POS
from ...tokens import Doc, MorphAnalysis
from ...training import validate_examples
-from ...util import DummyTokenizer, registry, load_config_from_str
+from ...util import DummyTokenizer, load_config_from_str, registry
from ...vocab import Vocab
-from ... import util
-
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tag_bigram_map import TAG_BIGRAM_MAP
+from .tag_map import TAG_MAP
+from .tag_orth_map import TAG_ORTH_MAP
DEFAULT_CONFIG = """
[nlp]
diff --git a/spacy/lang/ja/syntax_iterators.py b/spacy/lang/ja/syntax_iterators.py
index 588a9ba03..34670083e 100644
--- a/spacy/lang/ja/syntax_iterators.py
+++ b/spacy/lang/ja/syntax_iterators.py
@@ -1,9 +1,8 @@
-from typing import Union, Iterator, Tuple, Set
+from typing import Iterator, Set, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON, VERB
+from ...symbols import NOUN, PRON, PROPN, VERB
from ...tokens import Doc, Span
-
# TODO: this can probably be pruned a bit
# fmt: off
labels = ["nsubj", "nmod", "ddoclike", "nsubjpass", "pcomp", "pdoclike", "doclike", "obl", "dative", "appos", "attr", "ROOT"]
diff --git a/spacy/lang/ja/tag_map.py b/spacy/lang/ja/tag_map.py
index c6de3831a..5c14f41bf 100644
--- a/spacy/lang/ja/tag_map.py
+++ b/spacy/lang/ja/tag_map.py
@@ -1,6 +1,23 @@
-from ...symbols import POS, PUNCT, INTJ, ADJ, AUX, ADP, PART, SCONJ, NOUN
-from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE, CCONJ
-
+from ...symbols import (
+ ADJ,
+ ADP,
+ ADV,
+ AUX,
+ CCONJ,
+ DET,
+ INTJ,
+ NOUN,
+ NUM,
+ PART,
+ POS,
+ PRON,
+ PROPN,
+ PUNCT,
+ SCONJ,
+ SPACE,
+ SYM,
+ VERB,
+)
TAG_MAP = {
# Explanation of Unidic tags:
diff --git a/spacy/lang/kn/__init__.py b/spacy/lang/kn/__init__.py
index ccd46a394..44d53f6b7 100644
--- a/spacy/lang/kn/__init__.py
+++ b/spacy/lang/kn/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class KannadaDefaults(BaseDefaults):
diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
index 0e02e4a2d..e2c860f7d 100644
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -1,17 +1,16 @@
-from typing import Iterator, Any, Dict
+from typing import Any, Dict, Iterator
+from ...language import BaseDefaults, Language
+from ...scorer import Scorer
+from ...symbols import POS, X
+from ...tokens import Doc
+from ...training import validate_examples
+from ...util import DummyTokenizer, load_config_from_str, registry
+from ...vocab import Vocab
+from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tag_map import TAG_MAP
-from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
-from ...tokens import Doc
-from ...scorer import Scorer
-from ...symbols import POS, X
-from ...training import validate_examples
-from ...util import DummyTokenizer, registry, load_config_from_str
-from ...vocab import Vocab
-
DEFAULT_CONFIG = """
[nlp]
diff --git a/spacy/lang/ko/lex_attrs.py b/spacy/lang/ko/lex_attrs.py
index ac5bc7e48..2c49aa389 100644
--- a/spacy/lang/ko/lex_attrs.py
+++ b/spacy/lang/ko/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"영",
"공",
diff --git a/spacy/lang/ko/punctuation.py b/spacy/lang/ko/punctuation.py
index f5f1c51da..c3c32ea1f 100644
--- a/spacy/lang/ko/punctuation.py
+++ b/spacy/lang/ko/punctuation.py
@@ -1,7 +1,6 @@
from ..char_classes import LIST_QUOTES
from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
-
_infixes = (
["·", "ㆍ", r"\(", r"\)"]
+ [r"(?<=[0-9])~(?=[0-9-])"]
diff --git a/spacy/lang/ko/tag_map.py b/spacy/lang/ko/tag_map.py
index 26a8c56b9..85598c3ef 100644
--- a/spacy/lang/ko/tag_map.py
+++ b/spacy/lang/ko/tag_map.py
@@ -1,5 +1,21 @@
-from ...symbols import POS, PUNCT, INTJ, X, SYM, ADJ, AUX, ADP, CONJ, NOUN, PRON
-from ...symbols import VERB, ADV, PROPN, NUM, DET
+from ...symbols import (
+ ADJ,
+ ADP,
+ ADV,
+ AUX,
+ CONJ,
+ DET,
+ INTJ,
+ NOUN,
+ NUM,
+ POS,
+ PRON,
+ PROPN,
+ PUNCT,
+ SYM,
+ VERB,
+ X,
+)
# Map the 은전한닢 (mecab-ko-dic) part-of-speech tags to Universal POS tags
# https://docs.google.com/spreadsheets/d/1-9blXKjtjeKZqsf4NzHeYJCrr49-nXeRF6D80udfcwY/edit#gid=589544265
diff --git a/spacy/lang/ky/__init__.py b/spacy/lang/ky/__init__.py
index ccca384bd..fafc0f020 100644
--- a/spacy/lang/ky/__init__.py
+++ b/spacy/lang/ky/__init__.py
@@ -1,8 +1,8 @@
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults
class KyrgyzDefaults(BaseDefaults):
diff --git a/spacy/lang/ky/punctuation.py b/spacy/lang/ky/punctuation.py
index fa9819f80..6d89da2f7 100644
--- a/spacy/lang/ky/punctuation.py
+++ b/spacy/lang/ky/punctuation.py
@@ -1,5 +1,12 @@
-from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
_hyphens_no_dash = HYPHENS.replace("-", "").strip("|").replace("||", "")
_infixes = (
diff --git a/spacy/lang/ky/tokenizer_exceptions.py b/spacy/lang/ky/tokenizer_exceptions.py
index 8ec727ac1..c93e3dac3 100644
--- a/spacy/lang/ky/tokenizer_exceptions.py
+++ b/spacy/lang/ky/tokenizer_exceptions.py
@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/la/__init__.py b/spacy/lang/la/__init__.py
index 15b87c5b9..d77ae267e 100644
--- a/spacy/lang/la/__init__.py
+++ b/spacy/lang/la/__init__.py
@@ -1,13 +1,15 @@
-from ...language import Language, BaseDefaults
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class LatinDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
+ syntax_iterators = SYNTAX_ITERATORS
class Latin(Language):
diff --git a/spacy/lang/la/examples.py b/spacy/lang/la/examples.py
new file mode 100644
index 000000000..db8550070
--- /dev/null
+++ b/spacy/lang/la/examples.py
@@ -0,0 +1,22 @@
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.la.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+# > Caes. BG 1.1
+# > Cic. De Amic. 1
+# > V. Georg. 1.1-5
+# > Gen. 1:1
+# > Galileo, Sid. Nunc.
+# > van Schurman, Opusc. arg. 1
+
+sentences = [
+ "Gallia est omnis divisa in partes tres, quarum unam incolunt Belgae, aliam Aquitani, tertiam qui ipsorum lingua Celtae, nostra Galli appellantur.",
+ "Q. Mucius augur multa narrare de C. Laelio socero suo memoriter et iucunde solebat nec dubitare illum in omni sermone appellare sapientem.",
+ "Quid faciat laetas segetes, quo sidere terram uertere, Maecenas, ulmisque adiungere uitis conueniat, quae cura boum, qui cultus habendo sit pecori, apibus quanta experientia parcis, hinc canere incipiam",
+ "In principio creavit Deus caelum et terram.",
+ "Quo sumpto, intelligatur lunaris globus, cuius maximus circulus CAF, centrum vero E, dimetiens CF, qui ad Terre diametrum est ut duo ad septem.",
+ "Cuicunque natura indita sunt principia, seu potentiae principiorum omnium artium, ac scientiarum, ei conveniunt omnes artes ac scientiae.",
+]
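
The new `spacy/lang/la/examples.py` follows the convention of the other per-language `examples.py` modules: a handful of attested sentences (the sources are noted in the comments above each) for smoke-testing the language. A quick way to run them, assuming a spaCy install that includes this file:

```python
import spacy
from spacy.lang.la.examples import sentences

# The blank Latin pipeline is enough to exercise tokenization.
nlp = spacy.blank("la")
for doc in nlp.pipe(sentences):
    print([token.text for token in doc[:8]])
```
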
diff --git a/spacy/lang/la/lex_attrs.py b/spacy/lang/la/lex_attrs.py
index 9efb4dd3c..fcb35defc 100644
--- a/spacy/lang/la/lex_attrs.py
+++ b/spacy/lang/la/lex_attrs.py
@@ -1,22 +1,22 @@
-from ...attrs import LIKE_NUM
import re
+from ...attrs import LIKE_NUM
+
# cf. Goyvaerts/Levithan 2009; case-insensitive, allow 4
roman_numerals_compile = re.compile(
r"(?i)^(?=[MDCLXVI])M*(C[MD]|D?C{0,4})(X[CL]|L?X{0,4})(I[XV]|V?I{0,4})$"
)
-_num_words = set(
- """
-unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem
+_num_words = """unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem undecim duodecim tredecim quattuordecim quindecim sedecim septendecim duodeviginti undeviginti viginti triginta quadraginta quinquaginta sexaginta septuaginta octoginta nonaginta centum ducenti ducentae ducenta trecenti trecentae trecenta quadringenti quadringentae quadringenta quingenti quingentae quingenta sescenti sescentae sescenta septingenti septingentae septingenta octingenti octingentae octingenta nongenti nongentae nongenta mille
""".split()
-)
-_ordinal_words = set(
- """
-primus prima primum secundus secunda secundum tertius tertia tertium
-""".split()
-)
+_num_words += [item.replace("v", "u") for item in _num_words]
+_num_words = set(_num_words)
+
+_ordinal_words = """primus prima primum secundus secunda secundum tertius tertia tertium quartus quarta quartum quintus quinta quintum sextus sexta sextum septimus septima septimum octavus octava octavum nonus nona nonum decimus decima decimum undecimus undecima undecimum duodecimus duodecima duodecimum duodevicesimus duodevicesima duodevicesimum undevicesimus undevicesima undevicesimum vicesimus vicesima vicesimum tricesimus tricesima tricesimum quadragesimus quadragesima quadragesimum quinquagesimus quinquagesima quinquagesimum sexagesimus sexagesima sexagesimum septuagesimus septuagesima septuagesimum octogesimus octogesima octogesimum nonagesimus nonagesima nonagesimum centesimus centesima centesimum ducentesimus ducentesima ducentesimum trecentesimus trecentesima trecentesimum quadringentesimus quadringentesima quadringentesimum quingentesimus quingentesima quingentesimum sescentesimus sescentesima sescentesimum septingentesimus septingentesima septingentesimum octingentesimus octingentesima octingentesimum nongentesimus nongentesima nongentesimum millesimus millesima millesimum""".split()
+
+_ordinal_words += [item.replace("v", "u") for item in _ordinal_words]
+_ordinal_words = set(_ordinal_words)
def like_num(text):
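
The rewritten `lex_attrs.py` expands the cardinal and ordinal vocabularies from a few sample words to the full series up to `mille`/`millesimus`, and the `item.replace("v", "u")` lines add the u-for-v spelling variants (`novem`/`nouem`) to both sets. Together with the case-insensitive Roman-numeral regex, `LIKE_NUM` should now cover all of the following; a sketch, assuming `like_num` checks the lowercased token against both sets, as the surrounding code suggests:

```python
import spacy

nlp = spacy.blank("la")
doc = nlp("duo nouem XXVI tertius")
# All four tokens should be number-like: a cardinal, its u-spelling
# variant, a Roman numeral, and an ordinal.
print([(token.text, token.like_num) for token in doc])
```
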
diff --git a/spacy/lang/la/syntax_iterators.py b/spacy/lang/la/syntax_iterators.py
new file mode 100644
index 000000000..39b4fb39d
--- /dev/null
+++ b/spacy/lang/la/syntax_iterators.py
@@ -0,0 +1,86 @@
+from typing import Iterator, Tuple, Union
+
+from ...errors import Errors
+from ...symbols import AUX, NOUN, PRON, PROPN, VERB
+from ...tokens import Doc, Span
+
+# NB: Modified from da on suggestion from https://github.com/explosion/spaCy/issues/7457#issuecomment-800349751 [PJB]
+
+
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
+ def is_verb_token(tok):
+ return tok.pos in [VERB, AUX]
+
+ def get_left_bound(root):
+ left_bound = root
+ for tok in reversed(list(root.lefts)):
+ if tok.dep in np_left_deps:
+ left_bound = tok
+ return left_bound
+
+ def get_right_bound(doc, root):
+ right_bound = root
+ for tok in root.rights:
+ if tok.dep in np_right_deps:
+ right = get_right_bound(doc, tok)
+ if list(
+ filter(
+ lambda t: is_verb_token(t) or t.dep in stop_deps,
+ doc[root.i : right.i],
+ )
+ ):
+ break
+ else:
+ right_bound = right
+ return right_bound
+
+ def get_bounds(doc, root):
+ return get_left_bound(root), get_right_bound(doc, root)
+
+ doc = doclike.doc # Ensure works on both Doc and Span.
+
+ if not doc.has_annotation("DEP"):
+ raise ValueError(Errors.E029)
+
+ if not len(doc):
+ return
+
+ left_labels = [
+ "det",
+ "fixed",
+ "nmod:poss",
+ "amod",
+ "flat",
+ "goeswith",
+ "nummod",
+ "appos",
+ ]
+ right_labels = [
+ "fixed",
+ "nmod:poss",
+ "amod",
+ "flat",
+ "goeswith",
+ "nummod",
+ "appos",
+ "nmod",
+ "det",
+ ]
+ stop_labels = ["punct"]
+
+ np_label = doc.vocab.strings.add("NP")
+ np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
+ np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
+ stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
+
+ prev_right = -1
+ for token in doclike:
+ if token.pos in [PROPN, NOUN, PRON]:
+ left, right = get_bounds(doc, token)
+ if left.i <= prev_right:
+ continue
+ yield left.i, right.i + 1, np_label
+ prev_right = right.i
+
+
+SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
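
The new Latin `noun_chunks` iterator (adapted from the Danish rules, per the comment above) grows each nominal head outward over the allowed left and right dependency labels and yields non-overlapping `NP` spans. Because it raises `E029` without dependency annotation, it needs a trained parser; in this sketch, `la_core_web_sm` is a hypothetical placeholder, not a published model:

```python
import spacy

# "la_core_web_sm" is a placeholder name; substitute any Latin
# pipeline that includes a dependency parser.
nlp = spacy.load("la_core_web_sm")
doc = nlp("Gallia est omnis divisa in partes tres.")
for chunk in doc.noun_chunks:
    print(chunk.text)
```
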
diff --git a/spacy/lang/la/tokenizer_exceptions.py b/spacy/lang/la/tokenizer_exceptions.py
index 060f6e085..c0b98116f 100644
--- a/spacy/lang/la/tokenizer_exceptions.py
+++ b/spacy/lang/la/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
## TODO: Look into systematically handling u/v
_exc = {
@@ -12,65 +11,15 @@ _exc = {
"uobiscum": [{ORTH: "uobis"}, {ORTH: "cum"}],
}
-for orth in [
- "A.",
- "Agr.",
- "Ap.",
- "C.",
- "Cn.",
- "D.",
- "F.",
- "K.",
- "L.",
- "M'.",
- "M.",
- "Mam.",
- "N.",
- "Oct.",
- "Opet.",
- "P.",
- "Paul.",
- "Post.",
- "Pro.",
- "Q.",
- "S.",
- "Ser.",
- "Sert.",
- "Sex.",
- "St.",
- "Sta.",
- "T.",
- "Ti.",
- "V.",
- "Vol.",
- "Vop.",
- "U.",
- "Uol.",
- "Uop.",
- "Ian.",
- "Febr.",
- "Mart.",
- "Apr.",
- "Mai.",
- "Iun.",
- "Iul.",
- "Aug.",
- "Sept.",
- "Oct.",
- "Nov.",
- "Nou.",
- "Dec.",
- "Non.",
- "Id.",
- "A.D.",
- "Coll.",
- "Cos.",
- "Ord.",
- "Pl.",
- "S.C.",
- "Suff.",
- "Trib.",
-]:
+_abbrev_exc = """A. A.D. Aa. Aaa. Acc. Agr. Ap. Apr. April. A.U.C. Aug. C. Caes. Caess. Cc. Cn. Coll. Cons. Conss. Cos. Coss. D. D.N. Dat. Dd. Dec. Decemb. Decembr. F. Feb. Febr. Februar. Ian. Id. Imp. Impp. Imppp. Iul. Iun. K. Kal. L. M'. M. Mai. Mam. Mar. Mart. Med. N. Nn. Nob. Non. Nov. Novemb. Oct. Octob. Opet. Ord. P. Paul. Pf. Pl. Plur. Post. Pp. Prid. Pro. Procos. Q. Quint. S. S.C. Scr. Sept. Septemb. Ser. Sert. Sex. Sext. St. Sta. Suff. T. Ti. Trib. V. Vol. Vop. Vv.""".split()
+
+_abbrev_exc += [item.lower() for item in _abbrev_exc]
+_abbrev_exc += [item.upper() for item in _abbrev_exc]
+_abbrev_exc += [item.replace("v", "u").replace("V", "U") for item in _abbrev_exc]
+
+_abbrev_exc += ["d.N."]
+
+for orth in set(_abbrev_exc):
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
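A quick way to sanity-check the expanded exception table from a source checkout (the output in the comment is the expected behaviour, not a captured run):

```python
import spacy

nlp = spacy.blank("la")
# "M." is in the abbreviation table (in all casing and u/v variants),
# so its period is not split off; the sentence-final period still is.
print([t.text for t in nlp("M. Tullius Cicero scripsit.")])
# expected: ['M.', 'Tullius', 'Cicero', 'scripsit', '.']
```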
diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py
index 7827e7762..2386b4356 100644
--- a/spacy/lang/lb/__init__.py
+++ b/spacy/lang/lb/__init__.py
@@ -1,8 +1,8 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class LuxembourgishDefaults(BaseDefaults):
diff --git a/spacy/lang/lb/lex_attrs.py b/spacy/lang/lb/lex_attrs.py
index d2d50d9dc..119231374 100644
--- a/spacy/lang/lb/lex_attrs.py
+++ b/spacy/lang/lb/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = set(
"""
null eent zwee dräi véier fënnef sechs ziwen aacht néng zéng eelef zwielef dräizéng
diff --git a/spacy/lang/lb/punctuation.py b/spacy/lang/lb/punctuation.py
index e382c56c5..8bdbf9713 100644
--- a/spacy/lang/lb/punctuation.py
+++ b/spacy/lang/lb/punctuation.py
@@ -1,4 +1,4 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, LIST_ELLIPSES, LIST_ICONS
ELISION = " ' ’ ".strip().replace(" ", "")
diff --git a/spacy/lang/lb/tokenizer_exceptions.py b/spacy/lang/lb/tokenizer_exceptions.py
index d00dc9610..844826e27 100644
--- a/spacy/lang/lb/tokenizer_exceptions.py
+++ b/spacy/lang/lb/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
# TODO
# treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)
diff --git a/spacy/lang/lex_attrs.py b/spacy/lang/lex_attrs.py
index 6ed981a06..3ac20420d 100644
--- a/spacy/lang/lex_attrs.py
+++ b/spacy/lang/lex_attrs.py
@@ -1,11 +1,10 @@
-from typing import Set
-import unicodedata
import re
+import unicodedata
+from typing import Set
from .. import attrs
from .tokenizer_exceptions import URL_MATCH
-
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match
_tlds = set(
"com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|"
diff --git a/spacy/lang/lg/__init__.py b/spacy/lang/lg/__init__.py
index 6f7153fce..a87685375 100644
--- a/spacy/lang/lg/__init__.py
+++ b/spacy/lang/lg/__init__.py
@@ -1,7 +1,7 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class LugandaDefaults(BaseDefaults):
diff --git a/spacy/lang/lg/punctuation.py b/spacy/lang/lg/punctuation.py
index 5d3eb792e..775c6b001 100644
--- a/spacy/lang/lg/punctuation.py
+++ b/spacy/lang/lg/punctuation.py
@@ -1,5 +1,12 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
_infixes = (
LIST_ELLIPSES
diff --git a/spacy/lang/lij/__init__.py b/spacy/lang/lij/__init__.py
index b7e11f77e..3b8e972c6 100644
--- a/spacy/lang/lij/__init__.py
+++ b/spacy/lang/lij/__init__.py
@@ -1,7 +1,7 @@
+from ...language import BaseDefaults, Language
+from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
-from ...language import Language, BaseDefaults
class LigurianDefaults(BaseDefaults):
diff --git a/spacy/lang/lij/punctuation.py b/spacy/lang/lij/punctuation.py
index d50b75589..c5c150d0a 100644
--- a/spacy/lang/lij/punctuation.py
+++ b/spacy/lang/lij/punctuation.py
@@ -1,6 +1,5 @@
-from ..punctuation import TOKENIZER_INFIXES
from ..char_classes import ALPHA
-
+from ..punctuation import TOKENIZER_INFIXES
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
diff --git a/spacy/lang/lij/tokenizer_exceptions.py b/spacy/lang/lij/tokenizer_exceptions.py
index 52eae2c89..cf5a1af66 100644
--- a/spacy/lang/lij/tokenizer_exceptions.py
+++ b/spacy/lang/lij/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/lt/__init__.py b/spacy/lang/lt/__init__.py
index 3ae000e5f..f3ea257b1 100644
--- a/spacy/lang/lt/__init__.py
+++ b/spacy/lang/lt/__init__.py
@@ -1,8 +1,8 @@
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class LithuanianDefaults(BaseDefaults):
diff --git a/spacy/lang/lt/punctuation.py b/spacy/lang/lt/punctuation.py
index 22aee0941..deef24854 100644
--- a/spacy/lang/lt/punctuation.py
+++ b/spacy/lang/lt/punctuation.py
@@ -1,9 +1,14 @@
-from ..char_classes import LIST_ICONS, LIST_ELLIPSES
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
-from ..char_classes import HYPHENS
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
from ..punctuation import TOKENIZER_SUFFIXES
-
_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
diff --git a/spacy/lang/lt/tokenizer_exceptions.py b/spacy/lang/lt/tokenizer_exceptions.py
index 118fb2190..d39b86dfc 100644
--- a/spacy/lang/lt/tokenizer_exceptions.py
+++ b/spacy/lang/lt/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/lv/__init__.py b/spacy/lang/lv/__init__.py
index a05e5b939..fdfca5e97 100644
--- a/spacy/lang/lv/__init__.py
+++ b/spacy/lang/lv/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class LatvianDefaults(BaseDefaults):
diff --git a/spacy/lang/mk/__init__.py b/spacy/lang/mk/__init__.py
index fa07cfef9..413f0038d 100644
--- a/spacy/lang/mk/__init__.py
+++ b/spacy/lang/mk/__init__.py
@@ -1,15 +1,16 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
+
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...lookups import Lookups
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .lemmatizer import MacedonianLemmatizer
+from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc
-from ...lookups import Lookups
class MacedonianDefaults(BaseDefaults):
diff --git a/spacy/lang/mk/lemmatizer.py b/spacy/lang/mk/lemmatizer.py
index a792095e7..f5a5eca85 100644
--- a/spacy/lang/mk/lemmatizer.py
+++ b/spacy/lang/mk/lemmatizer.py
@@ -1,5 +1,5 @@
-from typing import List
from collections import OrderedDict
+from typing import List
from ...pipeline import Lemmatizer
from ...tokens import Token
diff --git a/spacy/lang/mk/tokenizer_exceptions.py b/spacy/lang/mk/tokenizer_exceptions.py
index 3b589b2a9..40f2c1d80 100644
--- a/spacy/lang/mk/tokenizer_exceptions.py
+++ b/spacy/lang/mk/tokenizer_exceptions.py
@@ -1,5 +1,4 @@
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH
_exc = {}
diff --git a/spacy/lang/ml/__init__.py b/spacy/lang/ml/__init__.py
index 9f90605f0..0b17b8a7a 100644
--- a/spacy/lang/ml/__init__.py
+++ b/spacy/lang/ml/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class MalayalamDefaults(BaseDefaults):
diff --git a/spacy/lang/ml/lex_attrs.py b/spacy/lang/ml/lex_attrs.py
index 9ac19b6a7..33a144f6b 100644
--- a/spacy/lang/ml/lex_attrs.py
+++ b/spacy/lang/ml/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
# reference 2: https://www.omniglot.com/language/numbers/malayalam.htm
_num_words = [
diff --git a/spacy/lang/mr/__init__.py b/spacy/lang/mr/__init__.py
index 3e172fa60..f980efbd0 100644
--- a/spacy/lang/mr/__init__.py
+++ b/spacy/lang/mr/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class MarathiDefaults(BaseDefaults):
diff --git a/spacy/lang/ms/__init__.py b/spacy/lang/ms/__init__.py
new file mode 100644
index 000000000..f53ebfcf2
--- /dev/null
+++ b/spacy/lang/ms/__init__.py
@@ -0,0 +1,24 @@
+from ...language import BaseDefaults, Language
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+
+
+class MalayDefaults(BaseDefaults):
+ tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+ prefixes = TOKENIZER_PREFIXES
+ suffixes = TOKENIZER_SUFFIXES
+ infixes = TOKENIZER_INFIXES
+ syntax_iterators = SYNTAX_ITERATORS
+ lex_attr_getters = LEX_ATTRS
+ stop_words = STOP_WORDS
+
+
+class Malay(Language):
+ lang = "ms"
+ Defaults = MalayDefaults
+
+
+__all__ = ["Malay"]
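With this module in place, a blank Malay pipeline can be created the usual way; a minimal sketch:

```python
import spacy

# spacy.blank("ms") wires up the defaults above: tokenizer exceptions,
# punctuation rules, stop words, lexical attribute getters and the
# noun-chunk iterator.
nlp = spacy.blank("ms")
doc = nlp("Syarikat itu menjual kupu-kupu pada 1 Jan. 2023.")
print([t.text for t in doc])
```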
diff --git a/spacy/lang/ms/_tokenizer_exceptions_list.py b/spacy/lang/ms/_tokenizer_exceptions_list.py
new file mode 100644
index 000000000..fba1dd70f
--- /dev/null
+++ b/spacy/lang/ms/_tokenizer_exceptions_list.py
@@ -0,0 +1,1943 @@
+# from https://prpm.dbp.gov.my/cari1?keyword=
+# dbp https://en.wikipedia.org/wiki/Dewan_Bahasa_dan_Pustaka
+MS_BASE_EXCEPTIONS = set(
+ """
+aba-aba
+abah-abah
+abar-abar
+abrit-abritan
+abu-abu
+abuk-abuk
+abun-abun
+acak-acak
+acak-acakan
+acang-acang
+aci-aci
+aci-acian
+aci-acinya
+adang-adang
+adap-adapan
+adik-beradik
+aduk-adukan
+agak-agak
+agar-agar
+agut-agut
+air-cooled
+ajar-ajar
+aji-aji
+akal-akal
+akhir-akhir
+aki-aki
+alah-mengalahi
+alan-alan
+alang-alang
+alang-alangan
+alap-alap
+ali-ali
+alih-alih
+aling-aling
+aling-alingan
+alip-alipan
+alon-alon
+alu-alu
+alu-aluan
+alun-alun
+alur-alur
+ambah-ambah
+ambai-ambai
+ambil-mengambil
+ambring-ambringan
+ambu-ambu
+ambung-ambung
+amin-amin
+ampai-ampai
+amung-amung
+anai-anai
+anak-anak
+anak-anakan
+anak-beranak
+ancak-ancak
+ancang-ancang
+andang-andang
+angan-angan
+anggar-anggar
+angin-angin
+angin-anginan
+angkul-angkul
+angkup-angkup
+angkut-angkut
+ani-ani
+aning-aning
+anjang-anjang
+anjing-anjing
+anjung-anjung
+anjung-anjungan
+antar-antar
+ante-mortem
+anting-anting
+antung-antung
+anyam-menganyam
+apa-apa
+api-api
+apit-apit
+aprit-apritan
+arah-arah
+arak-arakan
+aram-aram
+ari-ari
+aru-aru
+asa-asaan
+asam-asaman
+asuh-asuh
+atas-mengatasi
+ati-ati
+audio-visual
+avant-garde
+awang-awang
+awang-gemawang
+ayak-ayak
+ayam-ayam
+ayam-ayaman
+ayang-ayang
+ayeng-ayengan
+ayun-temayun
+back-up
+bahu-membahu
+baik-baik
+bajang-bajang
+baji-baji
+balai-balai
+balam-balam
+balas-membalas
+baling-baling
+balut-balut
+bangun-bangun
+bantal-bantal
+barat-barat
+barau-barau
+bari-bari
+barung-barung
+basa-basi
+bata-bata
+batir-batir
+bau-bauan
+bayang-bayang
+bedil-bedal
+begana-begini
+bekal-bekalan
+belat-belit
+belu-belai
+benggal-benggil
+bengkal-bengkil
+bengkang-bengkok
+bengkang-bengkong
+berabad-abad
+berabun-rabun
+berada-ada
+beragah-agah
+beragak-agak
+beragam-ragam
+beraja-raja
+berakit-rakit
+beraku-akuan
+beralun-alun
+beramah-ramahan
+beramah-tamah
+beramai-ramai
+berambai-ambai
+berambal-ambalan
+beramuk-amukan
+berandai-andai
+berandai-randai
+berang-berang
+berangan-angan
+beranggap-anggapan
+berangguk-angguk
+berangin-angin
+berangka-angka
+berangka-angkaan
+berangkai-rangkai
+beranja-anja
+berantai-rantai
+berapi-api
+berapung-apung
+berarak-arakan
+beras-beras
+berasing-asingan
+beratus-ratus
+berawas-awas
+berayal-ayalan
+berayun-ayun
+berbagai-bagai
+berbahas-bahasan
+berbalas-balasan
+berbalik-balik
+berbanjar-banjar
+berbantah-bantah
+berbanyak-banyak
+berbarik-barik
+berbasah-basah
+berbatu-batu
+berbayang-bayang
+berbecak-becak
+berbedil-bedilan
+berbeka-beka
+berbelakang-belakangan
+berbelang-belang
+berbeli-belian
+berbelit-belit
+berbelok-belok
+berbenar-benar
+berbencah-bencah
+berbesar-besar
+berbidai-bidai
+berbiku-biku
+berbilik-bilik
+berbinar-binar
+berbincang-bincang
+berbingkah-bingkah
+berbintang-bintang
+berbintik-bintik
+berbintil-bintil
+berbisik-bisik
+berbolak-balik
+berbolong-bolong
+berbondong-bondong
+berbongkah-bongkah
+berbuai-buai
+berbual-bual
+berbukit-bukit
+berbulan-bulan
+berbunga-bunga
+berbuntut-buntut
+berbunuh-bunuhan
+berburu-buru
+berburuk-buruk
+berbutir-butir
+bercabang-cabang
+bercaci-cacian
+bercakap-cakap
+bercakar-cakaran
+bercantik-cantik
+bercari-cari
+bercari-carian
+bercarik-carik
+bercepat-cepat
+bercerai-berai
+bercerai-cerai
+bercetai-cetai
+bercikun-cikun
+bercinta-cintaan
+bercita-cita
+berciut-ciut
+berconteng-conteng
+bercoreng-coreng
+bercoreng-moreng
+bercuit-cuit
+bercumbu-cumbu
+bercumbu-cumbuan
+bercura-bura
+bercura-cura
+berdada-dadaan
+berdahulu-dahuluan
+berdalam-dalam
+berdebar-debar
+berdecap-decap
+berdedai-dedai
+berdegap-degap
+berdegar-degar
+berdeham-deham
+berdekah-dekah
+berdekat-dekat
+berdelat-delat
+berdembun-dembun
+berdempang-dempang
+berdendam-dendaman
+berdengkang-dengkang
+berdentang-dentang
+berdentum-dentum
+berdentung-dentung
+berdepak-depak
+berdepan-depan
+berderai-derai
+berderak-derak
+berderau-derau
+berdering-dering
+berderung-derung
+berdesak-desakan
+berdesing-desing
+berdesus-desus
+berdikit-dikit
+berdingkit-dingkit
+berdua-dua
+berduri-duri
+berduru-duru
+berduyun-duyun
+berebut-rebut
+berebut-rebutan
+beregang-regang
+berek-berek
+berembut-rembut
+berempat-empat
+berenak-enak
+berenteng-renteng
+beresah-resah
+berfoya-foya
+bergagah-gagahan
+bergagap-gagap
+bergalur-galur
+berganda-ganda
+berganti-ganti
+bergarah-garah
+bergaruk-garuk
+bergegas-gegas
+bergelang-gelang
+bergelap-gelap
+bergelas-gelasan
+bergeleng-geleng
+bergemal-gemal
+bergembut-gembut
+bergerek-gerek
+bergesa-gesa
+bergilir-gilir
+bergolek-golek
+bergores-gores
+bergotong-royong
+bergugus-gugus
+bergulung-gulung
+bergulut-gulut
+bergumpal-gumpal
+bergunung-gunung
+berhadap-hadapan
+berhamun-hamun
+berhandai-handai
+berhanyut-hanyut
+berhari-hari
+berhati-hati
+berhilau-hilau
+berhujan-hujan
+beria-ia
+beria-ria
+beriak-riak
+beribu-ribu
+berigi-rigi
+bering-bering
+beringat-ingat
+beringgit-ringgit
+berintik-rintik
+beriring-iring
+beriring-iringan
+berjabir-jabir
+berjaga-jaga
+berjagung-jagung
+berjalan-jalan
+berjalar-jalar
+berjalin-jalin
+berjalur-jalur
+berjam-jam
+berjauh-jauhan
+berjejal-jejal
+berjela-jela
+berjenis-jenis
+berjenjang-jenjang
+berjilid-jilid
+berjinak-jinak
+berjingkat-jingkat
+berjingkrak-jingkrak
+berjongkok-jongkok
+berjubel-jubel
+berjujut-jujutan
+berjulai-julai
+berjumbai-jumbai
+berjurai-jurai
+berjurus-jurus
+berjuta-juta
+berkaca-kaca
+berkait-kaitan
+berkala-kala
+berkali-kali
+berkanjar-kanjar
+berkaok-kaok
+berkarung-karung
+berkasih-kasihan
+berkata-kata
+berkatak-katak
+berkecai-kecai
+berkecek-kecek
+berkecil-kecil
+berkecil-kecilan
+berkedip-kedip
+berkejang-kejang
+berkejap-kejap
+berkejar-kejaran
+berkelar-kelar
+berkelip-kelip
+berkelit-kelit
+berkelok-kelok
+berkelompok-kelompok
+berkelun-kelun
+berkembur-kembur
+berkempul-kempul
+berkena-kenaan
+berkenal-kenalan
+berkendur-kendur
+berkeok-keok
+berkepak-kepak
+berkepal-kepal
+berkeping-keping
+berkepul-kepul
+berkeras-kerasan
+berkeritik-keritik
+berkeruit-keruit
+berkerut-kerut
+berketak-ketak
+berketak-ketik
+berketi-keti
+berketil-ketil
+berketuk-ketak
+berketul-ketul
+berkial-kial
+berkian-kian
+berkias-kiasan
+berkibar-kibar
+berkilah-kilah
+berkilat-kilat
+berkilau-kilauan
+berkilo-kilo
+berkinja-kinja
+berkipas-kipas
+berkira-kira
+berkirim-kiriman
+berkobar-kobar
+berkobok-kobok
+berkocak-kocak
+berkodi-kodi
+berkolek-kolek
+berkopah-kopah
+berkotak-kotak
+berkuat-kuatan
+berkunang-kunang
+berkurun-kurun
+berkusau-kusau
+berkusu-kusu
+berkusut-kusut
+berkuting-kuting
+berkutu-kutuan
+berlabun-labun
+berlain-lainan
+berlalai-lalai
+berlama-lama
+berlambai-lambai
+berlambak-lambak
+berlampang-lampang
+berlapang-lapang
+berlapis-lapis
+berlapuk-lapuk
+berlarah-larah
+berlarat-larat
+berlari-larian
+berlarik-larik
+berlarut-larut
+berlawak-lawak
+berlayap-layapan
+berlebih-lebih
+berlebih-lebihan
+berlekas-lekas
+berlena-lena
+berlengah-lengah
+berlenggek-lenggek
+berlenggok-lenggok
+berleret-leret
+berliang-liuk
+berliku-liku
+berlimpah-limpah
+berlimpap-limpap
+berlimpit-limpit
+berlinang-linang
+berlindak-lindak
+berlipat-lipat
+berlompok-lompok
+berloncat-loncatan
+berlopak-lopak
+berlubang-lubang
+bermaaf-maafan
+bermacam-macam
+bermain-main
+bermalas-malas
+bermanik-manik
+bermanis-manis
+bermanja-manja
+bermasak-masak
+bermati-mati
+bermegah-megah
+bermemek-memek
+bermesra-mesraan
+bermewah-mewah
+berminggu-minggu
+berminta-minta
+bermuda-muda
+bermudah-mudah
+bermuka-muka
+bermula-mula
+bermulut-mulut
+bernafsi-nafsi
+bernaka-naka
+berniat-niat
+berogak-ogak
+beroleng-oleng
+berolok-olok
+beromong-omong
+beronggok-onggok
+berorang-orang
+beroyal-royal
+berpada-pada
+berpahit-pahit
+berpair-pair
+berpal-pal
+berpalu-palu
+berpalu-paluan
+berpalun-palun
+berpandai-pandai
+berpandang-pandangan
+berpangkat-pangkat
+berpanjang-panjang
+berpasang-pasang
+berpasang-pasangan
+berpayah-payah
+berpeluh-peluh
+berpeluk-pelukan
+berpenat-penat
+berpencar-pencar
+berpendar-pendar
+berpenggal-penggal
+berperai-perai
+berpesai-pesai
+berpesta-pesta
+berpesuk-pesuk
+berpetak-petak
+berpeti-peti
+berpihak-pihak
+berpijar-pijar
+berpikul-pikul
+berpilih-pilih
+berpilin-pilin
+berpindah-pindah
+berpintal-pintal
+berpirau-pirau
+berpisah-pisah
+berpolah-polah
+berpongah-pongah
+berpontang-panting
+berporah-porah
+berpotong-potong
+berpuak-puak
+berpual-pual
+berpugak-pugak
+berpuluh-puluh
+berpulun-pulun
+berpuntal-puntal
+berpura-pura
+berpusar-pusar
+berpusing-pusing
+berpusu-pusu
+berputar-putar
+bersaf-saf
+bersahut-sahutan
+bersakit-sakit
+bersalah-salahan
+bersalam-salaman
+bersalin-salin
+bersama-sama
+bersambut-sambutan
+bersampan-sampan
+bersantai-santai
+bersapa-sapaan
+bersarang-sarang
+bersedan-sedan
+bersedia-sedia
+bersedu-sedu
+bersekat-sekat
+berselang-selang
+berselang-seli
+bersembur-semburan
+bersempit-sempit
+bersenang-senang
+bersenang-senangkan
+bersenda-senda
+bersendi-sendi
+bersepah-sepah
+bersepi-sepi
+berserak-serak
+berseri-seri
+bersesak-sesak
+bersetai-setai
+bersia-sia
+bersiap-siap
+bersiar-siar
+bersilir-silir
+bersimbur-simburan
+bersinau-sinau
+bersorak-sorai
+bersuap-suapan
+bersudah-sudah
+bersuka-suka
+bersuka-sukaan
+bersuku-suku
+bersumpah-sumpahan
+bersungguh-sungguh
+bersungut-sungut
+bersunyi-sunyi
+bersusah-susah
+bersusuk-susuk
+bersusuk-susukan
+bersutan-sutan
+bertabur-tabur
+bertahu-tahu
+bertahun-tahun
+bertajuk-tajuk
+bertakik-takik
+bertala-tala
+bertali-tali
+bertalu-talu
+bertambah-tambah
+bertanda-tandaan
+bertangis-tangisan
+bertangkil-tangkil
+bertanya-tanya
+bertarik-tarikan
+bertatai-tatai
+bertatih-tatih
+bertawan-tawan
+bertawar-tawaran
+bertebu-tebu
+bertebu-tebukan
+berteguh-teguh
+berteguh-teguhan
+berteka-teki
+bertelau-telau
+bertele-tele
+bertempat-tempat
+bertempuh-tempuh
+bertenang-tenang
+bertenggang-tenggangan
+bertentu-tentu
+bertepek-tepek
+berterang-terang
+berterang-terangan
+bertikam-tikaman
+bertimbal-timbalan
+bertimbun-timbun
+bertimpa-timpa
+bertimpas-timpas
+bertingkah-tingkah
+bertingkat-tingkat
+bertinjau-tinjauan
+bertiras-tiras
+bertitar-titar
+bertoboh-toboh
+bertolak-tolak
+bertolak-tolakan
+bertolong-tolongan
+bertonjol-tonjol
+bertua-tua
+bertua-tuaan
+bertual-tual
+bertubi-tubi
+bertukar-tukar
+bertukar-tukaran
+bertukas-tukas
+bertumpak-tumpak
+bertunda-tunda
+bertunjuk-tunjukan
+bertura-tura
+berturut-turut
+bertutur-tutur
+beruas-ruas
+berubah-ubah
+berulang-alik
+berulang-ulang
+berumbai-rumbai
+berundung-undung
+berunggas-runggas
+berungkur-ungkuran
+beruntai-untai
+beruntun-runtun
+berunyai-unyai
+berupa-rupa
+berura-ura
+beruris-uris
+berurut-urutan
+berwarna-warna
+berwarna-warni
+berwindu-windu
+berwiru-wiru
+beryang-yang
+besar-besaran
+betak-betak
+beti-beti
+betul-betul
+biang-biang
+biar-biar
+biji-bijian
+bila-bila
+bilang-bilang
+bincang-bincut
+bini-binian
+biri-biri
+biru-biru
+bisik-bisik
+biti-biti
+bolak-balik
+bolang-baling
+bongkar-bangkir
+buah-buahan
+buat-buatan
+buaya-buaya
+bubun-bubun
+bugi-bugi
+built-in
+bukan-bukan
+bulan-bulan
+bulan-bulanan
+bulang-bulang
+bulat-bulat
+buli-buli
+bulu-bulu
+buluh-buluh
+bulus-bulus
+bunga-bungaan
+bunuh-membunuh
+bunyi-bunyian
+buru-buru
+burung-burungan
+bye-bye
+cabik-cabik
+caing-caing
+calar-balar
+cara-cara
+carut-marut
+cawi-cawi
+cebar-cebur
+celam-celum
+celangak-celinguk
+celas-celus
+celedang-celedok
+celengkak-celengkok
+cemas-cemas
+centang-perenang
+cepat-cepat
+cerai-berai
+ceruk-menceruk
+ceruk-meruk
+check-up
+chit-chat
+cirit-birit
+cita-cita
+close-up
+closed-circuit
+cobak-cabik
+cobar-cabir
+cola-cala
+compang-camping
+congak-cangit
+congkah-cangkih
+congkah-mangkih
+copak-capik
+corak-carik
+corat-coret
+coreng-moreng
+cuang-caing
+cubung-cubung
+culik-culik
+cuma-cuma
+cumi-cumi
+cungap-cangip
+cupu-cupu
+dahulu-mendahului
+dali-dali
+dapur-dapur
+dari-dari
+daru-daru
+datang-datang
+datang-mendatangi
+daun-daunan
+dawai-dawai
+dayang-dayang
+degap-degap
+dekak-dekak
+dekat-dekat
+dengar-dengaran
+desas-desus
+diam-diam
+do-it-yourself
+dokok-dokok
+dolak-dalik
+dorong-mendorong
+drive-in
+dua-dua
+dua-duanya
+duduk-duduk
+dulang-dulang
+ecek-ecek
+embuh-embuhan
+empek-empek
+empok-empok
+encal-encal
+endap-endap
+endut-endutan
+engah-engah
+enggan-enggan
+engkah-engkah
+entah-berentah
+erang-erot
+erong-erong
+fast-food
+fifty-fifty
+flip-flop
+follow-up
+foya-foya
+gaba-gaba
+gabai-gabai
+gada-gada
+gading-gading
+gado-gado
+gajah-gajahan
+gala-gala
+gali-galian
+galing-galing
+galu-galu
+gamit-gamitan
+gampang-gampangan
+ganal-ganal
+ganda-berganda
+gapah-gopoh
+gara-gara
+garah-garah
+gatal-gatal
+gawar-gawar
+gaya-gayanya
+gedebak-gedebuk
+gelang-gelang
+gelembung-gelembungan
+geli-geli
+geliang-geliut
+geliat-geliut
+gempul-gempul
+gendang-gendang
+genjang-genjot
+gerabak-gerubuk
+gerak-gerik
+gerbas-gerbus
+gerit-gerit
+geruh-gerah
+getak-getuk
+geti-geti
+gila-gila
+gila-gilaan
+gilang-gemilang
+gilap-gemilap
+gili-gili
+giling-giling
+ginang-ginang
+girik-girik
+giring-giring
+go-kart
+golak-galik
+gonta-ganti
+gotong-royong
+gual-gail
+gudu-gudu
+gula-gula
+gulang-gulang
+guna-guna
+guntang-guntang
+gunung-ganang
+gunung-gemunung
+gunung-gunungan
+habis-habis
+habis-habisan
+halai-balai
+half-time
+hampir-hampir
+harap-harapan
+harum-haruman
+hati-hati
+heavy-duty
+hebat-hebatan
+hidup-hidup
+hiru-biru
+hiruk-pikuk
+hubaya-hubaya
+hula-hula
+huru-hara
+ibar-ibar
+icak-icak
+igau-igauan
+ikut-ikut
+ikut-ikutan
+ilam-ilam
+imbang-imbangan
+inang-inang
+inca-binca
+incang-incut
+ingat-ingat
+ingat-ingatan
+ingau-ingauan
+inggang-inggung
+injak-injak
+iras-iras
+iring-iringan
+iseng-iseng
+jadi-jadian
+jala-jala
+jamah-jamahan
+jambu-jambu
+jangan-jangan
+jarang-jarang
+jari-jari
+jaring-jaring
+jarum-jarum
+jauh-jauh
+jawi-jawi
+jebat-jebatan
+jelur-jelir
+jendal-jendul
+jenggar-jenggur
+jentik-jentik
+jerah-jerih
+jolong-jolong
+jongkar-jangkir
+juak-juak
+juang-juang
+julung-julung
+jurai-jurai
+kabu-kabu
+kacang-kacang
+kacang-kacangan
+kacau-balau
+kadang-kadang
+kail-kail
+kait-kait
+kakek-kakek
+kalau-kalau
+kaleng-kalengan
+kalut-malut
+kambing-kambing
+kanak-kanak
+kapa-kapa
+kapan-kapan
+kapu-kapu
+karang-karangan
+karang-mengarang
+kareseh-peseh
+karut-marut
+katang-katang
+kawa-kawa
+kayu-kayuan
+keabu-abuan
+keasyik-asyikan
+kebarat-baratan
+kebasah-basahan
+kebat-kebit
+kebata-bataan
+kebelanda-belandaan
+kebiru-biruan
+kebudak-budakan
+kecil-kecilan
+kecil-mengecil
+kecuh-kecah
+kedek-kedek
+kegadis-gadisan
+kegelap-gelapan
+kegila-gilaan
+kegirang-girangan
+kehijau-hijauan
+kehitam-hitaman
+kejaga-jagaan
+kejingga-jinggaan
+kekabur-kaburan
+kekanak-kanakan
+kekoboi-koboian
+kekuning-kuningan
+kelak-kelik
+kelak-keluk
+kelaki-lakian
+kelang-kelok
+kelap-kelip
+kelek-kelek
+kelek-kelekan
+kelik-kelik
+kelip-kelip
+kelusuh-kelasah
+kelut-melut
+kemak-kemik
+kemalu-maluan
+kemanja-manjaan
+kemarah-marahan
+kemasam-masaman
+kemati-matian
+kemerah-merahan
+kempang-kempis
+kempas-kempis
+kemuda-mudaan
+kena-mengena
+kenal-mengenal
+kenang-kenangan
+kencang-kencung
+kendang-kendang
+kendang-kendangan
+kentung-kentung
+kenyat-kenyit
+kepandir-pandiran
+kepang-kepot
+keperak-perakan
+kepilu-piluan
+kepura-puraan
+keputih-putihan
+kerah-kerahan
+kerancak-rancakan
+kerang-kerangan
+kerang-keroh
+kerang-kerung
+kerap-kerap
+keras-mengerasi
+kercap-kercip
+kercap-kercup
+keriang-keriut
+kernyat-kernyut
+kerong-kerong
+keropas-kerapis
+kertak-kertuk
+keruntang-pungkang
+kesap-kesip
+kesenak-senakan
+kesewenang-wenangan
+kesia-siaan
+kesik-kesik
+kesipu-sipuan
+kesu-kesi
+kesuh-kesih
+kesuk-kesik
+ketergesa-gesaan
+keti-keti
+ketidur-tiduran
+ketiga-tiganya
+ketua-tuaan
+ketuan-tuanan
+keungu-unguan
+kia-kia
+kiak-kiak
+kial-kial
+kiang-kiut
+kibang-kibut
+kicang-kecoh
+kicang-kicu
+kida-kida
+kilau-mengilau
+kili-kili
+kira-kira
+kira-kiraan
+kisi-kisi
+kocah-kacih
+kodok-kodok
+kolang-kaling
+koleh-koleh
+kolong-kolong
+koma-koma
+komat-kamit
+kontal-kantil
+kontang-kanting
+kosak-kasik
+kotak-katik
+kotak-kotak
+kuat-kuat
+kucar-kacir
+kucing-kucing
+kucing-kucingan
+kuda-kuda
+kuda-kudaan
+kudap-kudap
+kulah-kulah
+kulak-kulak
+kulik-kulik
+kulum-kulum
+kumat-kamit
+kunang-kunang
+kupat-kapit
+kupu-kupu
+kura-kura
+kurang-kurang
+kusat-mesat
+kutat-kutet
+kuti-kuti
+labi-labi
+labu-labu
+lagi-lagi
+laguh-lagah
+laki-laki
+lalu-lalang
+lama-kelamaan
+lama-lama
+lamat-lamat
+lambat-lambat
+lancar-lancar
+langak-longok
+langit-langit
+lanja-lanjaan
+lapat-lapat
+large-scale
+lari-lari
+lauk-pauk
+lawah-lawah
+lawak-lawak
+lawi-lawi
+layang-layang
+layu-layuan
+lebih-lebih
+legak-legok
+lekak-lekuk
+lekap-lekup
+lekas-lekas
+lekuh-lekih
+lekup-lekap
+lenggak-lenggok
+lenggok-lenggok
+lengket-lengket
+lentam-lentum
+lentang-lentok
+lentang-lentung
+lepa-lepa
+lerang-lerang
+lereng-lereng
+letah-letai
+letup-letup
+liang-liuk
+lidah-lidah
+line-up
+liuk-liuk
+liung-liung
+lobi-lobi
+lock-up
+lopak-lapik
+lopak-lopak
+lumba-lumba
+lumi-lumi
+luntang-lantung
+lupa-lupa
+lupa-lupaan
+main-mainan
+makan-makanan
+make-up
+malai-malai
+malam-malam
+malar-malar
+mali-mali
+malu-malu
+mana-mana
+manik-manik
+manis-manisan
+mark-up
+masing-masing
+mata-mata
+mati-matian
+maya-maya
+megap-megap
+megrek-megrek
+melak-melak
+melambai-lambai
+melambai-lambaikan
+melambat-lambatkan
+melaun-laun
+melawak-lawak
+melayap-layap
+melayap-layapkan
+melebih-lebihi
+melebih-lebihkan
+melejang-lejangkan
+melengah-lengah
+melihat-lihat
+melimpah-limpah
+melincah-lincah
+meloncat-loncat
+melonco-lonco
+melonjak-lonjak
+memacak-macak
+memaki-maki
+memaksa-maksa
+memandai-mandai
+memanggil-manggil
+memanis-manis
+memanjut-manjut
+memasak-masak
+memata-matai
+mematah-matah
+mematut-matut
+memayah-mayahkan
+membagi-bagikan
+membalik-balik
+membangkit-bangkit
+membayang-bayangi
+membayang-bayangkan
+membelai-belai
+membenar-benar
+membenar-benari
+memberai-beraikan
+membesar-besarkan
+membolak-balikkan
+membuang-buang
+membuat-buat
+membunga-bungai
+memburu-buru
+memburu-burukan
+memburuk-burukkan
+memencak-mencak
+memencar-mencar
+memetak-metak
+memetang-metangkan
+memetir-metir
+memikir-mikirkan
+memilih-milih
+meminang-minang
+meminta-minta
+memisah-misahkan
+memontang-mantingkan
+memperamat-amat
+memperamat-amatkan
+memperbagai-bagaikan
+memperganda-gandakan
+memperganduh-ganduhkan
+mempermacam-macamkan
+memperolok-olokkan
+mempersama-samakan
+mempertubi-tubi
+mempertubi-tubikan
+memperturut-turutkan
+memuja-muja
+memukang-mukang
+memulun-mulun
+memundi-mundi
+memundi-mundikan
+memuyu-muyu
+menagak-nagak
+menakut-nakuti
+menanjur-nanjur
+menanti-nanti
+menari-nari
+mencabik-cabik
+mencabik-cabikkan
+mencaing-caing
+mencak-mencak
+mencakup-cakup
+mencapak-capak
+mencari-cari
+mencarik-carik
+mencarut-carut
+mencengis-cengis
+mencepak-cepak
+mencepuk-cepuk
+mencerai-beraikan
+mencetai-cetai
+menciap-ciap
+menciar-ciar
+mencita-citakan
+menciut-ciut
+mencoang-coang
+mencubit-cubit
+mencuri-curi
+mendecap-decap
+mendengking-dengking
+menderak-derakkan
+menderau-derau
+menderu-deru
+mendesas-desuskan
+mendesus-desus
+mendewa-dewakan
+mendudu-dudu
+menebu-nebu
+menegur-neguri
+mengabung-ngabung
+mengaci-acikan
+mengada-ada
+mengaduk-aduk
+mengagak-agak
+mengagak-agihkan
+mengagut-agut
+mengais-ngais
+mengali-ali
+mengalur-alur
+mengamang-amang
+mengamat-amati
+mengambai-ambaikan
+mengambang-ambang
+mengancak-ancak
+mengangan-angankan
+mengangguk-angguk
+mengangin-anginkan
+mengangkat-angkat
+mengap-mengap
+mengapa-apai
+mengapi-apikan
+mengarah-arahi
+mengata-ngatai
+mengaum-aumkan
+mengejan-ejan
+mengelai-ngelai
+mengelepik-ngelepik
+mengelus-elus
+mengembut-embut
+mengenap-enapkan
+mengenjak-enjak
+mengepak-ngepak
+mengepak-ngepakkan
+menggaba-gabai
+menggalur-galur
+menggamak-gamak
+menggapai-gapai
+menggapai-gapaikan
+menggelepar-gelepar
+menggelepar-geleparkan
+menggemak-gemak
+menggerecak-gerecak
+menggesa-gesakan
+menggili-gili
+menggorek-gorek
+menggosok-gosok
+mengguit-guit
+menghalai-balaikan
+menghinap-hinap
+mengiang-ngiang
+mengibas-ngibas
+mengidam-idamkan
+mengilah-ngilahkan
+mengilai-ilai
+mengilat-ngilatkan
+mengilik-ngilik
+mengimak-imak
+mengiming-iming
+menginjak-injak
+mengipas-ngipas
+mengira-ngira
+mengira-ngirakan
+mengiras-iras
+mengiras-irasi
+mengitar-ngitar
+mengitik-ngitik
+mengogok-ogok
+mengolak-alikkan
+mengoleng-oleng
+mengongkang-ongkang
+mengongkok-ongkok
+mengonyah-anyih
+mengotak-ngatikkan
+mengoyak-ngoyakkan
+mengoyak-oyak
+menguar-nguarkan
+menguar-uarkan
+menguber-uber
+mengubit-ubit
+mengubrak-abrik
+mengucar-ngacirkan
+mengucek-ngucek
+menguik-uik
+menguis-uis
+mengulit-ulit
+menguman-uman
+mengumbang-ambingkan
+mengumpak-umpak
+mengungkat-ungkat
+mengungkit-ungkit
+mengurik-urik
+mengutak-ngatikkan
+mengutik-ngutik
+menimang-nimang
+meningkat-ningkat
+meniru-niru
+meniup-niup
+menjadi-jadi
+menjengek-jengek
+menjengit-jengit
+menjilat-jilat
+mentah-mentah
+mentang-mentang
+menunda-nunda
+menusuk-nusuk
+menyama-nyama
+menyambar-nyambar
+menyanjung-nyanjung
+menyapu-nyapu
+menyarat-nyarat
+menyendi-nyendi
+menyeret-nyeret
+menyeru-nyerukan
+menyia-nyiakan
+menyungguh-nyungguhi
+meraba-raba
+merangkak-rangkak
+merasa-rasai
+meraung-raung
+meraung-raungkan
+merayau-rayau
+merayu-rayu
+mereka-reka
+merelap-relap
+meremah-remah
+meremeh-temehkan
+merempah-rempahi
+merengek-rengek
+merenik-renik
+merenta-renta
+merenyai-renyai
+merintang-rintang
+merintik-rintik
+merobek-robek
+meronta-ronta
+merungus-rungus
+merungut-rungut
+mewarna-warnikan
+meyakin-yakini
+miju-miju
+minta-minta
+moga-moga
+morat-marit
+muda-mudi
+mudah-mudahan
+muka-muka
+mula-mula
+muluk-muluk
+naga-naga
+nanti-nantian
+nasi-nasi
+nasib-nasiban
+nenek-nenek
+nyolong-nyolong
+ogah-ogahan
+ogak-ogak
+olak-alik
+olak-olak
+olang-aling
+olang-alingan
+oleh-oleh
+olok-olok
+olok-olokan
+olong-olong
+on-screen
+onde-onde
+one-to-one
+oneng-oneng
+ongkang-ongkang
+ongol-ongol
+onyah-anyih
+orak-arik
+orang-aring
+orang-orangan
+orok-orok
+orong-orong
+otak-otak
+otak-otakan
+padi-padian
+pagi-pagi
+palas-palas
+paling-paling
+palu-memalu
+panas-panas
+pandang-memandang
+panji-panji
+para-para
+paru-paru
+pasang-memasang
+pasu-pasu
+paya-paya
+pecah-pecah
+pelan-pelan
+pengundang-undang
+perang-perangan
+perintang-rintang
+perlahan-lahan
+perlip-perlipan
+pertama-tama
+perundang-undangan
+pesan-pesan
+piat-piut
+pick-up
+pijak-pijak
+pijar-pijar
+pijat-pijat
+pina-pina
+pisang-pisang
+play-off
+pohon-pohonan
+pokrol-pokrolan
+polang-paling
+poma-poma
+pontang-panting
+porak-parik
+porak-peranda
+potong-memotong
+puji-pujian
+pukang-pukang
+pukul-memukul
+pulang-pergi
+pulut-pulut
+pundi-pundi
+punggung-memunggung
+pura-pura
+pusar-pusar
+push-up
+pusing-pusing
+putus-putus
+rada-rada
+radio-frequency
+ragu-ragu
+rama-rama
+rambu-rambu
+rango-rango
+rasa-rasanya
+rata-rata
+real-time
+rebah-rebah
+rebah-rebahan
+redam-redam
+reka-reka
+reka-rekaan
+remah-remah
+remang-remang
+rembah-rembih
+remeh-temeh
+rempah-rempah
+repuh-repuh
+riang-riang
+ribu-ribu
+rigi-rigi
+robak-rabik
+robat-rabit
+role-play
+roll-on
+rombang-rambing
+ruak-ruak
+ruku-ruku
+rumah-rumah
+rumah-rumahan
+rumput-rumputan
+runding-merunding
+runggu-rangga
+runner-up
+rupa-rupa
+rupa-rupanya
+saban-saban
+sabung-menyabung
+saing-menyaing
+salah-salah
+sama-sama
+samar-samar
+sambar-menyambar
+sambung-bersambung
+sambung-menyambung
+sambut-menyambut
+sampai-sampai
+sandar-menyandar
+sangat-sangat
+sangkut-menyangkut
+sapa-menyapa
+sapu-sapu
+sarit-sarit
+satu-satu
+satu-satunya
+sayup-menyayup
+sayup-sayup
+sayur-mayur
+sayur-sayuran
+sci-fi
+seakal-akal
+seakan-akan
+sealak-alak
+sebaik-baiknya
+sebelah-menyebelah
+sebentar-sebentar
+seberang-menyeberang
+seboleh-bolehnya
+sedalam-dalamnya
+sedang-menyedang
+sedap-sedapan
+sedapat-dapatnya
+sedikit-dikitnya
+sedikit-sedikit
+sedikit-sedikitnya
+seelok-eloknya
+segala-galanya
+segan-menyegan
+segan-menyegani
+segan-segan
+sehari-hari
+sehari-harian
+sejadi-jadinya
+sekali-kali
+sekali-sekali
+sekira-kira
+sekonyong-konyong
+sekuasa-kuasanya
+sekurang-kurangnya
+sela-menyela
+sela-sela
+selama-lamanya
+selambat-lambatnya
+selang-seli
+selang-seling
+selar-belar
+selat-latnya
+selekas-lekasnya
+selepas-lepas
+self-esteem
+self-help
+sema-sema
+semah-semah
+semak-semak
+semalam-malaman
+semasa-masa
+semata-mata
+sembunyi-sembunyi
+sembunyi-sembunyian
+semena-mena
+semenda-menyemenda
+semengga-mengga
+sementang-mentang
+semu-semu
+semut-semutan
+sengal-sengal
+sengau-sengauan
+seolah-olah
+sepala-pala
+sepandai-pandai
+sepetang-petangan
+sepoi-sepoi
+sepuas-puasnya
+serang-menyerang
+seraya-menyeraya
+serba-serbi
+serbah-serbih
+serembah-serembih
+sering-sering
+serta-menyertai
+serta-serta
+sesal-menyesali
+sesudah-sudah
+sesudah-sudahnya
+sesuka-suka
+setempat-setempat
+setengah-setengah
+setidak-tidaknya
+seupaya-upaya
+seupaya-upayanya
+sewaktu-waktu
+sewenang-wenang
+short-term
+sia-sia
+siang-siang
+siapa-siapa
+sibar-sibar
+sibur-sibur
+sida-sida
+siku-siku
+silah-silah
+silang-menyilang
+silir-semilir
+sinar-seminar
+sindir-menyindir
+singgah-menyinggah
+sorak-sorai
+stand-by
+stand-up
+sudu-sudu
+sudung-sudung
+suka-suka
+sulang-menyulang
+sulur-suluran
+sumpah-sumpah
+sumpit-sumpit
+sungguh-sungguh
+sungut-sungut
+suram-suram
+surat-menyurat
+suruh-suruhan
+tabar-tabar
+tabir-mabir
+tabrak-tubruk
+tabuh-tabuhan
+tahu-menahu
+tahu-tahu
+takang-takik
+take-off
+takut-takut
+takut-takutan
+tali-bertali
+tali-tali
+tampak-tampak
+tanam-menanam
+tanam-tanaman
+tanda-tanda
+tangan-menangan
+tangan-tangan
+tanggung-menanggung
+tapa-tapa
+tapak-tapak
+tari-menari
+tari-tarian
+tarik-menarik
+tatah-tatah
+tawak-tawak
+tawang-tawang
+tawar-menawar
+tawar-tawar
+tayum-temayum
+tebu-tebu
+tegak-tegak
+teka-teki
+temas-temas
+tembak-menembak
+temut-temut
+tenggang-menenggang
+teraba-raba
+terambang-ambang
+terang-terang
+terang-terangan
+teranggar-anggar
+terangguk-angguk
+teranggul-anggul
+terangin-angin
+terangkup-angkup
+teranja-anja
+terapung-apung
+terayan-rayan
+terayap-rayap
+terbada-bada
+terbahak-bahak
+terbata-bata
+terbatuk-batuk
+terbayang-bayang
+terbengkil-bengkil
+terbirit-birit
+terbuai-buai
+terbuang-buang
+terburu-buru
+tercangak-cangak
+tercengang-cengang
+tercilap-cilap
+tercongget-congget
+tercungap-cungap
+terdangka-dangka
+terdengih-dengih
+terekeh-ekeh
+terembut-embut
+terembut-rembut
+terengah-engah
+teresak-esak
+tergagap-gagap
+tergagau-gagau
+tergaguk-gaguk
+tergapai-gapai
+tergegap-gegap
+tergegas-gegas
+tergelung-gelung
+tergerenyeng-gerenyeng
+tergesa-gesa
+tergila-gila
+tergontai-gontai
+tergudik-gudik
+terguling-guling
+tergulut-gulut
+terharak-harak
+terharap-harap
+terhengit-hengit
+terhinggut-hinggut
+terigau-igau
+terincut-incut
+teringa-inga
+teringat-ingat
+terinjak-injak
+terjembak-jembak
+terjerit-jerit
+terkadang-kadang
+terkakah-kakah
+terkakak-kakak
+terkanjar-kanjar
+terkapah-kapah
+terkapai-kapai
+terkapung-kapung
+terkatah-katah
+terkatung-katung
+terkecap-kecap
+terkedek-kedek
+terkedip-kedip
+terkejar-kejar
+terkekau-kekau
+terkekeh-kekeh
+terkekek-kekek
+terkelinjat-kelinjat
+terkelip-kelip
+terkempul-kempul
+terkemut-kemut
+terkencar-kencar
+terkepak-kepak
+terkesot-kesot
+terkesut-kesut
+terkial-kial
+terkincak-kincak
+terkindap-kindap
+terkinja-kinja
+terkirai-kirai
+terkitar-kitar
+terkocoh-kocoh
+terkokol-kokol
+terkosel-kosel
+terkoteng-koteng
+terkumpal-kumpal
+terlara-lara
+terlayang-layang
+terlebih-lebih
+terlincah-lincah
+terliuk-liuk
+terlolong-lolong
+terlongong-longong
+termangu-mangu
+termanja-manja
+termata-mata
+termengah-mengah
+termimpi-mimpi
+ternanti-nanti
+terngiang-ngiang
+teroleng-oleng
+terpandang-pandang
+terpecah-pecah
+terpekik-pekik
+terpereh-pereh
+terpikau-pikau
+terpinga-pinga
+terpingkal-pingkal
+terpontang-panting
+terpusing-pusing
+terputus-putus
+tersanga-sanga
+tersaruk-saruk
+tersedan-sedan
+tersedih-sedih
+tersedu-sedu
+tersendat-sendat
+tersendeng-sendeng
+tersengal-sengal
+tersengguk-sengguk
+tersengut-sengut
+tersera-sera
+terserak-serak
+tersetai-setai
+tersia-sia
+tersipu-sipu
+tersoja-soja
+tersungkuk-sungkuk
+tertagak-tagak
+tertahan-tahan
+tertatih-tatih
+tertegun-tegun
+tertekan-tekan
+terteleng-teleng
+terumbang-ambing
+terumbang-umbang
+terungkap-ungkap
+terus-menerus
+terus-terusan
+think-tank
+tiap-tiap
+tiba-tiba
+tidak-tidak
+tidur-tidur
+tie-dye
+tiga-tiganya
+tikam-menikam
+tilik-menilik
+timah-timah
+timang-timangan
+timbang-menimbang
+timu-timu
+tindih-bertindih
+tinjau-meninjau
+tip-off
+tiru-tiruan
+tiup-tiup
+tokak-takik
+tokok-menokok
+tolak-menolak
+tolong-menolong
+top-level
+trade-in
+tua-tua
+tuan-tuan
+tuang-tuang
+tuban-tuban
+tukang-menukang
+tukar-menukar
+tulang-tulangan
+tuli-tuli
+tulis-menulis
+tumbuh-tumbuhan
+tune-up
+tunggang-tunggit
+tupai-tupai
+turun-temurun
+turut-menurut
+turut-turutan
+two-tone
+uar-uar
+ubel-ubel
+ubun-ubun
+ubur-ubur
+uci-uci
+udap-udapan
+ugal-ugalan
+uir-uir
+ujar-ujar
+ukir-mengukir
+ula-ula
+ulak-ulak
+ulang-alik
+ulang-aling
+ulang-ulang
+ulap-ulap
+ular-ular
+ular-ularan
+ulung-ulung
+umang-umang
+umbang-ambing
+umbi-umbian
+umbul-umbul
+umbut-umbut
+uncang-uncit
+undak-undakan
+undang-undang
+unduk-unduk
+undung-undung
+undur-undur
+unggat-unggit
+ungkit-ungkit
+unting-unting
+untung-untung
+untung-untungan
+upside-down
+ura-ura
+uran-uran
+urat-urat
+uring-uringan
+urup-urup
+urup-urupan
+urus-urus
+user-user
+user-useran
+utar-utar
+voice-over
+walk-out
+wangi-wangian
+wanti-wanti
+wara-wara
+warna-warni
+water-cooled
+world-class
+yang-yang
+""".split()
+)
diff --git a/spacy/lang/ms/examples.py b/spacy/lang/ms/examples.py
new file mode 100644
index 000000000..97ab19b6e
--- /dev/null
+++ b/spacy/lang/ms/examples.py
@@ -0,0 +1,17 @@
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.ms.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+ "Malaysia ialah sebuah negara yang terletak di Asia Tenggara.",
+ "Berapa banyak pelajar yang akan menghadiri majlis perpisahan sekolah?",
+ "Pengeluaran makanan berasal dari beberapa lokasi termasuk Cameron Highlands, Johor Bahru, dan Kuching.",
+ "Syarikat XYZ telah menghasilkan 20,000 unit produk baharu dalam setahun terakhir",
+ "Kuala Lumpur merupakan ibu negara Malaysia." "Kau berada di mana semalam?",
+ "Siapa yang akan memimpin projek itu?",
+ "Siapa perdana menteri Malaysia sekarang?",
+]
diff --git a/spacy/lang/ms/lex_attrs.py b/spacy/lang/ms/lex_attrs.py
new file mode 100644
index 000000000..2088c9955
--- /dev/null
+++ b/spacy/lang/ms/lex_attrs.py
@@ -0,0 +1,65 @@
+import unicodedata
+
+from ...attrs import IS_CURRENCY, LIKE_NUM
+from .punctuation import LIST_CURRENCY
+
+_num_words = [
+ "kosong",
+ "satu",
+ "dua",
+ "tiga",
+ "empat",
+ "lima",
+ "enam",
+ "tujuh",
+ "lapan",
+ "sembilan",
+ "sepuluh",
+ "sebelas",
+ "belas",
+ "puluh",
+ "ratus",
+ "ribu",
+ "juta",
+ "billion",
+ "trillion",
+ "kuadrilion",
+ "kuintilion",
+ "sekstilion",
+ "septilion",
+ "oktilion",
+ "nonilion",
+ "desilion",
+]
+
+
+def like_num(text):
+ if text.startswith(("+", "-", "±", "~")):
+ text = text[1:]
+ text = text.replace(",", "").replace(".", "")
+ if text.isdigit():
+ return True
+ if text.count("/") == 1:
+ num, denom = text.split("/")
+ if num.isdigit() and denom.isdigit():
+ return True
+ if text.lower() in _num_words:
+ return True
+ if text.count("-") == 1:
+ _, num = text.split("-")
+ if num.isdigit() or num in _num_words:
+ return True
+ return False
+
+
+def is_currency(text):
+ if text in LIST_CURRENCY:
+ return True
+
+ for char in text:
+ if unicodedata.category(char) != "Sc":
+ return False
+ return True
+
+
+LEX_ATTRS = {IS_CURRENCY: is_currency, LIKE_NUM: like_num}
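The getters are plain functions, so they can be sanity-checked directly; a small sketch with expected values in the comments:

```python
from spacy.lang.ms.lex_attrs import is_currency, like_num

print(like_num("2,000"))    # True: separators are stripped before isdigit()
print(like_num("3/4"))      # True: numerator and denominator are digits
print(like_num("sepuluh"))  # True: listed number word
print(like_num("ke-10"))    # True: hyphenated form with a numeric tail
print(is_currency("RM"))    # True: listed in LIST_CURRENCY
```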
diff --git a/spacy/lang/ms/punctuation.py b/spacy/lang/ms/punctuation.py
new file mode 100644
index 000000000..a8d6c2e8e
--- /dev/null
+++ b/spacy/lang/ms/punctuation.py
@@ -0,0 +1,60 @@
+from ..char_classes import ALPHA, _currency, _units, merge_chars, split_chars
+from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+
+_units = (
+ _units + "s bit Gbps Mbps mbps Kbps kbps ƒ ppi px "
+ "Hz kHz MHz GHz mAh "
+ "ratus rb ribu ribuan "
+ "juta jt jutaan mill?iar million bil[l]?iun bilyun billion "
+)
+_currency = _currency + r" USD RM MYR Rp IDR RMB SGD S\$"
+_months = (
+ "Januari Februari Mac April Mei Jun Julai Ogos September "
+ "Oktober November Disember Januari Februari Mac Mei Jun "
+ "Julai Ogos Oktober Disember Jan Feb Mac Jun Julai Ogos Sept "
+ "Okt Nov Dis"
+)
+
+
+UNITS = merge_chars(_units)
+CURRENCY = merge_chars(_currency)
+HTML_PREFIX = r"<(b|strong|i|em|p|span|div|br)\s?/>|]+)>"
+HTML_SUFFIX = r"(b|strong|i|em|p|span|div|a)>"
+MONTHS = merge_chars(_months)
+LIST_CURRENCY = split_chars(_currency)
+
+_prefixes = list(TOKENIZER_PREFIXES)
+_prefixes.remove("#") # hashtag
+_prefixes = _prefixes + LIST_CURRENCY + [HTML_PREFIX] + ["/", "—"]
+
+_suffixes = (
+ TOKENIZER_SUFFIXES
+ + [r"\-[Nn]ya", "-[KkMm]u", "[—-]"]
+ + [
+ # disabled: variable width currency variable
+ # r"(?<={c})(?:[0-9]+)".format(c=CURRENCY),
+ r"(?<=[0-9])(?:{u})".format(u=UNITS),
+ r"(?<=[0-9])%",
+ # disabled: variable width HTML_SUFFIX variable
+ # r"(?<=[0-9{a}]{h})(?:[\.,:-])".format(a=ALPHA, h=HTML_SUFFIX),
+ r"(?<=[0-9{a}])(?:{h})".format(a=ALPHA, h=HTML_SUFFIX),
+ ]
+)
+
+_infixes = TOKENIZER_INFIXES + [
+ r"(?<=[0-9])[\\/](?=[0-9%-])",
+ r"(?<=[0-9])%(?=[{a}0-9/])".format(a=ALPHA),
+ # disabled: variable width units variable
+ # r"(?<={u})[\/-](?=[0-9])".format(u=UNITS),
+ # disabled: variable width months variable
+ # r"(?<={m})[\/-](?=[0-9])".format(m=MONTHS),
+ r'(?<=[0-9)][.,])"(?=[0-9])',
+ r'(?<=[{a})][.,\'])["—](?=[{a}])'.format(a=ALPHA),
+ r"(?<=[{a}])-(?=[0-9])".format(a=ALPHA),
+ r"(?<=[0-9])-(?=[{a}])".format(a=ALPHA),
+ r"(?<=[{a}])[\/-](?={c}|[{a}])".format(a=ALPHA, c=CURRENCY),
+]
+
+TOKENIZER_PREFIXES = _prefixes
+TOKENIZER_SUFFIXES = _suffixes
+TOKENIZER_INFIXES = _infixes
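`merge_chars` and `split_chars` come from `spacy.lang.char_classes`; as used above, the first builds a regex alternation from a space-separated string and the second a plain list. A tiny sketch:

```python
from spacy.lang.char_classes import merge_chars, split_chars

print(merge_chars("USD RM MYR"))  # 'USD|RM|MYR' -> usable inside a regex
print(split_chars("USD RM MYR"))  # ['USD', 'RM', 'MYR'] -> usable as prefixes
```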
diff --git a/spacy/lang/ms/stop_words.py b/spacy/lang/ms/stop_words.py
new file mode 100644
index 000000000..b1bfaea79
--- /dev/null
+++ b/spacy/lang/ms/stop_words.py
@@ -0,0 +1,118 @@
+STOP_WORDS = set(
+ """
+ada adalah adanya adapun agak agaknya agar akan akankah akhir akhiri akhirnya
+aku akulah amat amatlah anda andalah antar antara antaranya apa apaan apabila
+apakah apalagi apatah artinya asal asalkan atas atau ataukah ataupun awal
+awalnya
+
+bagai bagaikan bagaimana bagaimanakah bagaimanapun bagi bagian bahkan bahwa
+bahwasanya baik bakal bakalan balik banyak bapak baru bawah beberapa begini
+beginian beginikah beginilah begitu begitukah begitulah begitupun bekerja
+belakang belakangan belum belumlah benar benarkah benarlah berada berakhir
+berakhirlah berakhirnya berapa berapakah berapalah berapapun berarti berawal
+berbagai berdatangan beri berikan berikut berikutnya berjumlah berkali-kali
+berkata berkehendak berkeinginan berkenaan berlainan berlalu berlangsung
+berlebihan bermacam bermacam-macam bermaksud bermula bersama bersama-sama
+bersiap bersiap-siap bertanya bertanya-tanya berturut berturut-turut bertutur
+berujar berupa besar betul betulkah biasa biasanya bila bilakah bisa bisakah
+boleh bolehkah bolehlah buat bukan bukankah bukanlah bukannya bulan bung
+
+cara caranya cukup cukupkah cukuplah cuma
+
+dahulu dalam dan dapat dari daripada datang dekat demi demikian demikianlah
+dengan depan di dia diakhiri diakhirinya dialah diantara diantaranya diberi
+diberikan diberikannya dibuat dibuatnya didapat didatangkan digunakan
+diibaratkan diibaratkannya diingat diingatkan diinginkan dijawab dijelaskan
+dijelaskannya dikarenakan dikatakan dikatakannya dikerjakan diketahui
+diketahuinya dikira dilakukan dilalui dilihat dimaksud dimaksudkan
+dimaksudkannya dimaksudnya diminta dimintai dimisalkan dimulai dimulailah
+dimulainya dimungkinkan dini dipastikan diperbuat diperbuatnya dipergunakan
+diperkirakan diperlihatkan diperlukan diperlukannya dipersoalkan dipertanyakan
+dipunyai diri dirinya disampaikan disebut disebutkan disebutkannya disini
+disinilah ditambahkan ditandaskan ditanya ditanyai ditanyakan ditegaskan
+ditujukan ditunjuk ditunjuki ditunjukkan ditunjukkannya ditunjuknya dituturkan
+dituturkannya diucapkan diucapkannya diungkapkan dong dua dulu
+
+empat enggak enggaknya entah entahlah
+
+guna gunakan
+
+hal hampir hanya hanyalah hari harus haruslah harusnya hendak hendaklah
+hendaknya hingga
+
+ia ialah ibarat ibaratkan ibaratnya ibu ikut ingat ingat-ingat ingin inginkah
+inginkan ini inikah inilah itu itukah itulah
+
+jadi jadilah jadinya jangan jangankan janganlah jauh jawab jawaban jawabnya
+jelas jelaskan jelaslah jelasnya jika jikalau juga jumlah jumlahnya justru
+
+kala kalau kalaulah kalaupun kalian kami kamilah kamu kamulah kan kapan
+kapankah kapanpun karena karenanya kasus kata katakan katakanlah katanya ke
+keadaan kebetulan kecil kedua keduanya keinginan kelamaan kelihatan
+kelihatannya kelima keluar kembali kemudian kemungkinan kemungkinannya kenapa
+kepada kepadanya kesampaian keseluruhan keseluruhannya keterlaluan ketika
+khususnya kini kinilah kira kira-kira kiranya kita kitalah kok kurang
+
+lagi lagian lah lain lainnya lalu lama lamanya lanjut lanjutnya lebih lewat
+lima luar
+
+macam maka makanya makin malah malahan mampu mampukah mana manakala manalagi
+masa masalah masalahnya masih masihkah masing masing-masing mau maupun
+melainkan melakukan melalui melihat melihatnya memang memastikan memberi
+memberikan membuat memerlukan memihak meminta memintakan memisalkan memperbuat
+mempergunakan memperkirakan memperlihatkan mempersiapkan mempersoalkan
+mempertanyakan mempunyai memulai memungkinkan menaiki menambahkan menandaskan
+menanti menanti-nanti menantikan menanya menanyai menanyakan mendapat
+mendapatkan mendatang mendatangi mendatangkan menegaskan mengakhiri mengapa
+mengatakan mengatakannya mengenai mengerjakan mengetahui menggunakan
+menghendaki mengibaratkan mengibaratkannya mengingat mengingatkan menginginkan
+mengira mengucapkan mengucapkannya mengungkapkan menjadi menjawab menjelaskan
+menuju menunjuk menunjuki menunjukkan menunjuknya menurut menuturkan
+menyampaikan menyangkut menyatakan menyebutkan menyeluruh menyiapkan merasa
+mereka merekalah merupakan meski meskipun meyakini meyakinkan minta mirip
+misal misalkan misalnya mula mulai mulailah mulanya mungkin mungkinkah
+
+nah naik namun nanti nantinya nyaris nyatanya
+
+oleh olehnya
+
+pada padahal padanya pak paling panjang pantas para pasti pastilah penting
+pentingnya per percuma perlu perlukah perlunya pernah persoalan pertama
+pertama-tama pertanyaan pertanyakan pihak pihaknya pukul pula pun punya
+
+rasa rasanya rata rupanya
+
+saat saatnya saja sajalah saling sama sama-sama sambil sampai sampai-sampai
+sampaikan sana sangat sangatlah satu saya sayalah se sebab sebabnya sebagai
+sebagaimana sebagainya sebagian sebaik sebaik-baiknya sebaiknya sebaliknya
+sebanyak sebegini sebegitu sebelum sebelumnya sebenarnya seberapa sebesar
+sebetulnya sebisanya sebuah sebut sebutlah sebutnya secara secukupnya sedang
+sedangkan sedemikian sedikit sedikitnya seenaknya segala segalanya segera
+seharusnya sehingga seingat sejak sejauh sejenak sejumlah sekadar sekadarnya
+sekali sekali-kali sekalian sekaligus sekalipun sekarang sekarang sekecil
+seketika sekiranya sekitar sekitarnya sekurang-kurangnya sekurangnya sela
+selain selaku selalu selama selama-lamanya selamanya selanjutnya seluruh
+seluruhnya semacam semakin semampu semampunya semasa semasih semata semata-mata
+semaunya sementara semisal semisalnya sempat semua semuanya semula sendiri
+sendirian sendirinya seolah seolah-olah seorang sepanjang sepantasnya
+sepantasnyalah seperlunya seperti sepertinya sepihak sering seringnya serta
+serupa sesaat sesama sesampai sesegera sesekali seseorang sesuatu sesuatunya
+sesudah sesudahnya setelah setempat setengah seterusnya setiap setiba setibanya
+setidak-tidaknya setidaknya setinggi seusai sewaktu siap siapa siapakah
+siapapun sini sinilah soal soalnya suatu sudah sudahkah sudahlah supaya
+
+tadi tadinya tahu tahun tak tambah tambahnya tampak tampaknya tandas tandasnya
+tanpa tanya tanyakan tanyanya tapi tegas tegasnya telah tempat tengah tentang
+tentu tentulah tentunya tepat terakhir terasa terbanyak terdahulu terdapat
+terdiri terhadap terhadapnya teringat teringat-ingat terjadi terjadilah
+terjadinya terkira terlalu terlebih terlihat termasuk ternyata tersampaikan
+tersebut tersebutlah tertentu tertuju terus terutama tetap tetapi tiap tiba
+tiba-tiba tidak tidakkah tidaklah tiga tinggi toh tunjuk turut tutur tuturnya
+
+ucap ucapnya ujar ujarnya umum umumnya ungkap ungkapnya untuk usah usai
+
+waduh wah wahai waktu waktunya walau walaupun wong
+
+yaitu yakin yakni yang
+""".split()
+)
diff --git a/spacy/lang/ms/syntax_iterators.py b/spacy/lang/ms/syntax_iterators.py
new file mode 100644
index 000000000..027798687
--- /dev/null
+++ b/spacy/lang/ms/syntax_iterators.py
@@ -0,0 +1,41 @@
+from typing import Iterator, Tuple, Union
+
+from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
+from ...tokens import Doc, Span
+
+
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
+ """
+ Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+ """
+ # fmt: off
+ labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
+ # fmt: on
+ doc = doclike.doc # Ensure works on both Doc and Span.
+ if not doc.has_annotation("DEP"):
+ raise ValueError(Errors.E029)
+ np_deps = [doc.vocab.strings[label] for label in labels]
+ conj = doc.vocab.strings.add("conj")
+ np_label = doc.vocab.strings.add("NP")
+ prev_end = -1
+ for i, word in enumerate(doclike):
+ if word.pos not in (NOUN, PROPN, PRON):
+ continue
+ # Prevent nested chunks from being produced
+ if word.left_edge.i <= prev_end:
+ continue
+ if word.dep in np_deps:
+ prev_end = word.right_edge.i
+ yield word.left_edge.i, word.right_edge.i + 1, np_label
+ elif word.dep == conj:
+ head = word.head
+ while head.dep == conj and head.head.i < head.i:
+ head = head.head
+ # If the head is an NP, and we're coordinated to it, we're an NP
+ if head.dep in np_deps:
+ prev_end = word.right_edge.i
+ yield word.left_edge.i, word.right_edge.i + 1, np_label
+
+
+SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
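As with the Latin iterator, the behaviour is easiest to see on a hand-annotated toy parse; the sentence and labels below are illustrative assumptions:

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("ms")
# Toy UD-style parse of "Ali dan Abu makan" ("Ali and Abu eat"):
# "Abu" is a conjunct of "Ali", "dan" its coordinator.
doc = Doc(
    nlp.vocab,
    words=["Ali", "dan", "Abu", "makan"],
    pos=["PROPN", "CCONJ", "PROPN", "VERB"],
    heads=[3, 2, 0, 3],
    deps=["nsubj", "cc", "conj", "ROOT"],
)
# "Ali"'s right edge spans the whole coordination, and the nested-chunk
# guard then skips "Abu", so the coordination surfaces as one chunk.
print([c.text for c in doc.noun_chunks])  # ['Ali dan Abu']
```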
diff --git a/spacy/lang/ms/tokenizer_exceptions.py b/spacy/lang/ms/tokenizer_exceptions.py
new file mode 100644
index 000000000..e8b53fed8
--- /dev/null
+++ b/spacy/lang/ms/tokenizer_exceptions.py
@@ -0,0 +1,1532 @@
+from ...symbols import NORM, ORTH
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ._tokenizer_exceptions_list import MS_BASE_EXCEPTIONS
+
+# List of abbreviations and acronyms from:
+# https://ms.wiktionary.org/wiki/Wiktionary:Senarai_akronim_dan_singkatan
+
+_exc = {}
+
+for orth in MS_BASE_EXCEPTIONS:
+ _exc[orth] = [{ORTH: orth}]
+ orth_title = orth.title()
+ _exc[orth_title] = [{ORTH: orth_title}]
+ orth_caps = orth.upper()
+ _exc[orth_caps] = [{ORTH: orth_caps}]
+ orth_lower = orth.lower()
+ _exc[orth_lower] = [{ORTH: orth_lower}]
+ orth_first_upper = orth[0].upper() + orth[1:]
+ _exc[orth_first_upper] = [{ORTH: orth_first_upper}]
+ if "-" in orth:
+ orth_title = "-".join([part.title() for part in orth.split("-")])
+ _exc[orth_title] = [{ORTH: orth_title}]
+ orth_caps = "-".join([part.upper() for part in orth.split("-")])
+ _exc[orth_caps] = [{ORTH: orth_caps}]
+
+for exc_data in [
+ {ORTH: "Jan.", NORM: "Januari"},
+ {ORTH: "Feb.", NORM: "Februari"},
+ {ORTH: "Mac.", NORM: "Mac"},
+ {ORTH: "Apr.", NORM: "April"},
+ {ORTH: "Jun.", NORM: "Jun"},
+ {ORTH: "Jul.", NORM: "Julai"},
+ {ORTH: "Ogos.", NORM: "Ogos"},
+ {ORTH: "Sep.", NORM: "September"},
+ {ORTH: "Okt.", NORM: "Oktober"},
+ {ORTH: "Nov.", NORM: "November"},
+ {ORTH: "Dis.", NORM: "Disember"},
+]:
+ _exc[exc_data[ORTH]] = [exc_data]
+
+_other_exc = {
+ "do'a": [{ORTH: "do'a", NORM: "doa"}],
+ "jum'at": [{ORTH: "jum'at", NORM: "Jumat"}],
+ "Jum'at": [{ORTH: "Jum'at", NORM: "Jumat"}],
+ "la'nat": [{ORTH: "la'nat", NORM: "laknat"}],
+ "ma'af": [{ORTH: "ma'af", NORM: "maaf"}],
+ "mu'jizat": [{ORTH: "mu'jizat", NORM: "mukjizat"}],
+ "Mu'jizat": [{ORTH: "Mu'jizat", NORM: "mukjizat"}],
+ "ni'mat": [{ORTH: "ni'mat", NORM: "nikmat"}],
+ "raka'at": [{ORTH: "raka'at", NORM: "rakaat"}],
+ "ta'at": [{ORTH: "ta'at", NORM: "taat"}],
+}
+
+_exc.update(_other_exc)
+
+for orth in [
+ "1 Kor.",
+ "1 Ptr.",
+ "1 Raj.",
+ "1 Sam.",
+ "1 Taw.",
+ "1 Tes.",
+ "1 Tim.",
+ "1 Yoh.",
+ "1Ch.",
+ "1Co.",
+ "1Jo.",
+ "1Ki.",
+ "1Pe.",
+ "1Sa.",
+ "1Th.",
+ "1Ti.",
+ "2 Kor.",
+ "2 Ptr.",
+ "2 Raj.",
+ "2 Sam.",
+ "2 Taw.",
+ "2 Tes.",
+ "2 Tim.",
+ "2 Yoh.",
+ "2Ch.",
+ "2Co.",
+ "2Jo.",
+ "2Ki.",
+ "2Pe.",
+ "2Sa.",
+ "2Th.",
+ "2Ti.",
+ "3 Yoh.",
+ "3D",
+ "3F",
+ "3Jo.",
+ "3M",
+ "8MP",
+ "AA",
+ "AAAAAA",
+ "AB",
+ "Abd.",
+ "ABC",
+ "ABIM",
+ "ABM",
+ "ABMI",
+ "ABS",
+ "AC",
+ "Ac",
+ "ACAPLPL",
+ "Act.",
+ "AD",
+ "AD LIB",
+ "ADAM",
+ "ADB",
+ "ADD",
+ "ADIL",
+ "ADN",
+ "ADR",
+ "ADRI",
+ "ADSL",
+ "ADUN",
+ "AFAS",
+ "AFTA",
+ "Ag",
+ "AGMARIS",
+ "AH",
+ "AI",
+ "AIA",
+ "AIDS",
+ "AIJV",
+ "AIM",
+ "a/k",
+ "ak",
+ "AKN",
+ "Al",
+ "a/l",
+ "AM",
+ "Am",
+ "Am.",
+ "AMN",
+ "Amo.",
+ "AMPS",
+ "Ams.",
+ "AMWA",
+ "AN",
+ "a.n.",
+ "ANGKASA",
+ "ANM",
+ "ANSI",
+ "Ant.",
+ "AOL",
+ "AP",
+ "a/p",
+ "APD",
+ "APEC",
+ "API",
+ "APIK",
+ "APM",
+ "APN",
+ "APP",
+ "Apr.",
+ "APRI",
+ "Ar",
+ "Ar.",
+ "ark.",
+ "A.S.",
+ "As",
+ "a.s.",
+ "ASA",
+ "ASAS 50",
+ "ASB",
+ "ASCII",
+ "ASEAN",
+ "ASEAN+3",
+ "ASEM",
+ "a.s.f.",
+ "ASN",
+ "a.s.o.",
+ "ASP",
+ "Ast.",
+ "A.T.",
+ "At",
+ "ATM",
+ "a.t.r.",
+ "ATUR",
+ "Au",
+ "AURI",
+ "Aug.",
+ "AWOL",
+ "Ayb.",
+ "B",
+ "BA",
+ "Ba",
+ "BAC",
+ "BAFIA",
+ "BAM",
+ "BANANA",
+ "BAPP",
+ "BASF",
+ "BATA",
+ "BB",
+ "BBC",
+ "BBE",
+ "BBS",
+ "BC",
+ "BCG",
+ "BCIC",
+ "b.d.",
+ "BDSSHAM",
+ "Be",
+ "BEER",
+ "BERNAMA",
+ "Bh",
+ "b.h.",
+ "Bhd.",
+ "Bi",
+ "BIDS",
+ "Bil.",
+ "bil.",
+ "BIMP-EAGA",
+ "Bio.",
+ "BIOS",
+ "BITMB",
+ "BJ",
+ "Bk",
+ "b.k.",
+ "BKAL",
+ "bkn.",
+ "BKP",
+ "BL",
+ "BLR",
+ "BM",
+ "BMI",
+ "BMW",
+ "BN",
+ "BNM",
+ "BO",
+ "BOJ",
+ "BOO",
+ "BOP",
+ "BOT",
+ "BP",
+ "b.p.",
+ "BPA",
+ "BPAs",
+ "bpd.",
+ "BPIMB",
+ "BPM",
+ "BPO",
+ "BPPH",
+ "Br",
+ "Br.",
+ "BSA",
+ "B.Sc.",
+ "B.Sh.",
+ "b.s.j.",
+ "BSN",
+ "Bt.",
+ "bt.",
+ "BWT",
+ "BYOB",
+ "C",
+ "C.",
+ "C/E",
+ "Ca",
+ "CAAM",
+ "CAD",
+ "CAM",
+ "CATV",
+ "CBS",
+ "CBT",
+ "CC",
+ "CCD",
+ "CCM",
+ "CCR",
+ "cct-km",
+ "CCTV",
+ "CCU",
+ "CD",
+ "Cd",
+ "CD-ROM",
+ "CD-RW",
+ "CDRC",
+ "Ce",
+ "CEO",
+ "CEPT",
+ "Cetak",
+ "Cf",
+ "CFO",
+ "CFTC",
+ "CGC",
+ "CGI",
+ "CH",
+ "CIA",
+ "CIAST",
+ "CID",
+ "CIDB",
+ "CIQ",
+ "CKD",
+ "CL",
+ "Cl",
+ "c.l.",
+ "CLI",
+ "CLOB",
+ "CM",
+ "Cm",
+ "cm.",
+ "CMAG",
+ "CMI",
+ "CMP",
+ "CNN",
+ "Co",
+ "COD",
+ "Col.",
+ "COLA",
+ "COMDEX",
+ "CP",
+ "CPI",
+ "CPO",
+ "CPR",
+ "CPU",
+ "Cr",
+ "CRDF",
+ "Cs",
+ "CST",
+ "CT",
+ "CTIP",
+ "CTRM",
+ "Cu",
+ "CUEPACS",
+ "D-8",
+ "d/a",
+ "DAGS",
+ "Dan.",
+ "DANCED",
+ "DAP",
+ "DARA",
+ "Db",
+ "DBKL",
+ "DBP",
+ "DBR",
+ "DC",
+ "DDA",
+ "DDT",
+ "DEB",
+ "Dec.",
+ "Deu.",
+ "DFIs",
+ "dgn.",
+ "DHL",
+ "DIBML",
+ "DIN",
+ "Dis.",
+ "DJ",
+ "d.l.l.",
+ "dlm.",
+ "dng.",
+ "DNS",
+ "DO",
+ "DOA",
+ "DOE",
+ "DOF",
+ "DOSH",
+ "doz.",
+ "DPPS",
+ "Dr.",
+ "dr.",
+ "drp.",
+ "drpd.",
+ "Ds",
+ "d.sb.",
+ "d.st.",
+ "DSTN2",
+ "Dt.",
+ "DTAs",
+ "DTMF",
+ "DTP",
+ "DTV",
+ "DUBES",
+ "DUNHILL",
+ "DV8",
+ "DVD",
+ "DVE",
+ "DVS",
+ "dw.t.",
+ "Dy",
+ "DYMM",
+ "E",
+ "E-Commerce",
+ "E-Dagang",
+ "E&E",
+ "E-Faraid",
+ "E-Government",
+ "E-Kerajaan",
+ "E-Mail",
+ "E-Services",
+ "E-Village",
+ "E-Zine",
+ "EALAF",
+ "EBI",
+ "EBP",
+ "EC",
+ "ECAFE",
+ "Ecc.",
+ "ECI",
+ "ECM",
+ "ECOSOC",
+ "ECP",
+ "ECR",
+ "EDI",
+ "EE",
+ "EEC",
+ "Ef.",
+ "EG",
+ "Eko.",
+ "EKS",
+ "ELWS",
+ "ELX",
+ "EMI",
+ "EMUs",
+ "En.",
+ "EP",
+ "EPF",
+ "Eph.",
+ "EPP",
+ "EPS",
+ "EPU",
+ "ER",
+ "Er",
+ "ERL",
+ "ERT",
+ "Es",
+ "ESCAP",
+ "ESOS",
+ "ESP",
+ "EST",
+ "Est.",
+ "ET",
+ "ETA",
+ "ETACS",
+ "ETC",
+ "ETD",
+ "EU",
+ "Eu",
+ "EVIAN",
+ "Exim Bank",
+ "Exo.",
+ "Eze.",
+ "Ezr.",
+ "F",
+ "FAM",
+ "FAMA",
+ "FAO",
+ "FAQ",
+ "FAX",
+ "FBI",
+ "FC",
+ "FCA",
+ "FCC",
+ "FDI",
+ "FE",
+ "Fe",
+ "f.e.",
+ "Feb.",
+ "FELCRA",
+ "FELDA",
+ "FI",
+ "FIA 1993",
+ "FIAT",
+ "FIC",
+ "FIDA",
+ "FIFA",
+ "FIMA",
+ "Fiz.",
+ "Flm.",
+ "Flp.",
+ "FM",
+ "Fm",
+ "FMUTM",
+ "FO",
+ "FOA",
+ "FOB",
+ "FOC",
+ "FOMCA",
+ "FORD",
+ "Fr",
+ "FRIM",
+ "FRTI",
+ "FSMP",
+ "FTA",
+ "FTE",
+ "FTP",
+ "G",
+ "g.",
+ "G15",
+ "G77",
+ "Ga",
+ "GAC",
+ "GACM",
+ "Gal.",
+ "GAPENA",
+ "GATS",
+ "GATT",
+ "GB",
+ "Gbps.",
+ "Gd",
+ "GDP",
+ "Ge",
+ "GEC",
+ "Gen.",
+ "Geo.",
+ "Geog.",
+ "Gerakan",
+ "GH",
+ "GIF",
+ "GII",
+ "GIS",
+ "GITIC",
+ "GITN",
+ "GJ",
+ "GLCs",
+ "GM",
+ "GMBH",
+ "GMI",
+ "GMT",
+ "GNP",
+ "GNS",
+ "GOLD",
+ "GP",
+ "GPC",
+ "GPIM",
+ "GPMS",
+ "GPO",
+ "GPP",
+ "GPS",
+ "GRO",
+ "GRS",
+ "GSMC",
+ "GST",
+ "GTZ",
+ "GUI",
+ "GWh.",
+ "H",
+ "Ha",
+ "Hab.",
+ "Hag.",
+ "Hak.",
+ "ham",
+ "hb.",
+ "HCI",
+ "HDTV",
+ "He",
+ "Heb.",
+ "Hf",
+ "Hg",
+ "HI-FI",
+ "HIS",
+ "HIV",
+ "Hj.",
+ "HMS",
+ "Ho",
+ "Hos.",
+ "HP",
+ "HRDC",
+ "HRDF",
+ "HRMIS",
+ "Hs",
+ "Hut.",
+ "I",
+ "I/O",
+ "IA",
+ "IAA",
+ "IADPs",
+ "IB",
+ "i.b.",
+ "IBA",
+ "IBFIM",
+ "IBG",
+ "Ibr.",
+ "IBRD",
+ "IBS",
+ "IC",
+ "ICA",
+ "ICBM",
+ "ICFM",
+ "ICI",
+ "ICM",
+ "ICOR",
+ "ICP",
+ "ICT",
+ "ICU",
+ "ID",
+ "Id.",
+ "IDB",
+ "IDFR",
+ "IE",
+ "i.e.",
+ "IFSB",
+ "IGAs",
+ "IGS",
+ "IHP",
+ "IHPG",
+ "IIM",
+ "IINA",
+ "IKKL",
+ "IKP",
+ "IKPH",
+ "IKS",
+ "Im.",
+ "IMD",
+ "IMF",
+ "IMP2",
+ "IMR",
+ "IMS-GT",
+ "IMT-GT",
+ "In",
+ "in.",
+ "INFRA",
+ "INSEP",
+ "INSPEN",
+ "INTAN",
+ "IOFC",
+ "IOU",
+ "IP",
+ "IPA",
+ "IPBA",
+ "IPCs",
+ "IPEBP",
+ "IPI",
+ "IPKIM",
+ "IPKPM",
+ "IPO",
+ "IPP",
+ "IPPM",
+ "IPPPM",
+ "i.pt.",
+ "IPTAR",
+ "IPTNM",
+ "IQR",
+ "Ir",
+ "IRA",
+ "IRPA",
+ "IRS",
+ "i.s.",
+ "ISA",
+ "Isa.",
+ "ISDN",
+ "ISMM",
+ "ISO",
+ "ISP",
+ "ist.",
+ "IT",
+ "i.t.",
+ "ITA",
+ "ITAF",
+ "ITEX",
+ "ITK",
+ "ITM",
+ "ITO",
+ "ITRCo",
+ "ITTA",
+ "ITU",
+ "JAK",
+ "JAKIM",
+ "Jam.",
+ "Jan.",
+ "Jb.",
+ "JBIC",
+ "JD",
+ "JDA",
+ "Jdg.",
+ "Jer.",
+ "Jh.",
+ "JICA",
+ "JJ",
+ "Jk.",
+ "JKKK",
+ "jkps.",
+ "JKR",
+ "JMTI",
+ "JOA",
+ "Joe.",
+ "Joh.",
+ "Jon.",
+ "Jos.",
+ "JP",
+ "JPA",
+ "JPEG",
+ "JPH",
+ "JPJ",
+ "JPSHK",
+ "JPS",
+ "JPT",
+ "JRDA",
+ "JSM",
+ "JT",
+ "Jud.",
+ "Jul.",
+ "Jun.",
+ "JVC",
+ "Jw.",
+ "K",
+ "K-Economy",
+ "KADA",
+ "KBE",
+ "KBIA",
+ "KBPA",
+ "KBSM",
+ "KD",
+ "Kd.",
+ "KDI",
+ "KDN",
+ "KDNK",
+ "KE",
+ "KEAP",
+ "Kej.",
+ "Kel.",
+ "KEM",
+ "KEMLU",
+ "kep.",
+ "Kg.",
+ "kg.",
+ "KGB",
+ "KGK",
+ "KH",
+ "ki.",
+ "Kid.",
+ "KIK",
+ "KIKMTT",
+ "KIM",
+ "Kim.",
+ "Kis.",
+ "KIX",
+ "KKGSK",
+ "KKK",
+ "KKPPA",
+ "KL",
+ "Kl.",
+ "KLCI",
+ "KLIA",
+ "KLIBOR",
+ "KLIM",
+ "KLM",
+ "KLSE",
+ "KM",
+ "KMM",
+ "KNK",
+ "KO",
+ "Kol.",
+ "Kom.",
+ "Komp.",
+ "KOMSAS",
+ "KPAI",
+ "KPB",
+ "KPBA",
+ "KPC",
+ "kpd.",
+ "KPE",
+ "KPIs",
+ "KPPL",
+ "KPPMS",
+ "KPWM",
+ "Kr",
+ "KRM",
+ "KSTI",
+ "KT",
+ "KTA",
+ "KTABKL",
+ "KTM",
+ "KTMB",
+ "kV",
+ "kW",
+ "kWh",
+ "kWj",
+ "KWSP",
+ "LA",
+ "La",
+ "LABOR",
+ "Lam.",
+ "LAN",
+ "LAPD",
+ "LASER",
+ "LAX",
+ "lb.",
+ "LC",
+ "LCD",
+ "LCHRF",
+ "LCLY",
+ "LED",
+ "Lev.",
+ "LFPR",
+ "LFS",
+ "LFX",
+ "LGM",
+ "Li",
+ "LID",
+ "Lin.",
+ "LKN",
+ "LKPM",
+ "LKPP",
+ "LKTP",
+ "LKWJ",
+ "LLB",
+ "LLC",
+ "LLN",
+ "LLS",
+ "LMSM",
+ "LNG",
+ "LOA",
+ "LOBATA",
+ "LOFSA",
+ "LPG",
+ "LPIP",
+ "LPKI",
+ "LPKLPL",
+ "LPKN",
+ "LPN",
+ "LPP",
+ "LPPK",
+ "LPPM",
+ "LPPP",
+ "LPPTP",
+ "Lr",
+ "LRs",
+ "LRT",
+ "LS",
+ "LTAKL",
+ "LTD",
+ "LTK",
+ "Lu",
+ "LUAS",
+ "Luk.",
+ "lw.",
+ "lwn.",
+ "M\n",
+ "m",
+ "M&A",
+ "MAB",
+ "MACRES",
+ "MAD",
+ "MADA",
+ "MAGERAN",
+ "MAHA",
+ "MAHSURI",
+ "Mal.",
+ "MALINDO",
+ "MAMPU",
+ "Mar.",
+ "MARA",
+ "MARC",
+ "MARDI",
+ "MARLBORO",
+ "MAS",
+ "MASSA",
+ "MASSCORP",
+ "Mat.",
+ "MATRADE",
+ "MAVCAP",
+ "MB",
+ "MBA",
+ "MBBS",
+ "MBM",
+ "MBO",
+ "MBS",
+ "MBTU",
+ "MC",
+ "MCA",
+ "MCB",
+ "MCSL",
+ "MCSv5",
+ "MD",
+ "Md",
+ "MDB",
+ "MDC",
+ "MDG",
+ "MDV",
+ "MEASAT",
+ "MEATJ",
+ "MECIB",
+ "MEMO",
+ "MENLU",
+ "MEPS",
+ "MES",
+ "MESDAQ",
+ "METEOR",
+ "MFI",
+ "MFIs",
+ "MG",
+ "Mg",
+ "MGM",
+ "MGR",
+ "MGS",
+ "MHA",
+ "Mi.",
+ "MIA",
+ "MIB",
+ "MIC",
+ "Mic.",
+ "MICE",
+ "MIDA",
+ "MIDF",
+ "MIDI",
+ "MIG",
+ "MIGHT",
+ "MII",
+ "MIMOS",
+ "MINDEF",
+ "MINT",
+ "mis.",
+ "MIT",
+ "MITC",
+ "MITI",
+ "Ml.",
+ "MLNG",
+ "mlpd.",
+ "MM",
+ "mm",
+ "MMN",
+ "mmscfd.",
+ "MMU",
+ "MMX",
+ "Mn",
+ "Mn.",
+ "MNA",
+ "MNCs",
+ "MO",
+ "Mo",
+ "MOA",
+ "MOD",
+ "MODEM",
+ "MOE",
+ "MOH",
+ "MOSTE",
+ "MOSTI",
+ "MOU",
+ "MP",
+ "MPB",
+ "MPEG",
+ "MPOB",
+ "MPP",
+ "mppa.",
+ "MPPJ",
+ "MPS",
+ "MPTM",
+ "MR",
+ "m.r.",
+ "MRB",
+ "MRELB",
+ "Mrk.",
+ "MRRDB",
+ "MS",
+ "MS-DOS",
+ "MSC",
+ "MSG",
+ "MSM",
+ "Mt",
+ "MTC",
+ "MTCP",
+ "MTD",
+ "MTDC",
+ "MTPB",
+ "MTV",
+ "Muz.",
+ "MV",
+ "MW",
+ "MY",
+ "MyKe",
+ "Mzm.",
+ "N",
+ "N/A",
+ "Na",
+ "NAB",
+ "NACIWID",
+ "Nah.",
+ "NAP",
+ "NASA",
+ "NATO",
+ "NAV",
+ "NB",
+ "Nb",
+ "NBA",
+ "NBC",
+ "NCR",
+ "Nd",
+ "NDP",
+ "Ne",
+ "NEAC",
+ "NEC",
+ "NEF",
+ "Neh.",
+ "NEP",
+ "NEqO",
+ "NERP",
+ "NF",
+ "NFPEs",
+ "NG",
+ "NGOs",
+ "NGV",
+ "NHEF",
+ "NHHES",
+ "NHK",
+ "Ni",
+ "NIDC",
+ "NIH",
+ "NIP",
+ "NIPA",
+ "NIS",
+ "NISIR",
+ "NITA",
+ "NITC",
+ "NITP",
+ "NIV",
+ "NLAC",
+ "NMPBSP",
+ "NMU",
+ "No",
+ "No.",
+ "no.",
+ "NOSS",
+ "Nov.",
+ "Np",
+ "NPC",
+ "NPCS",
+ "NPL",
+ "NRCC",
+ "NRW",
+ "NS",
+ "Ns",
+ "NSB",
+ "NTA",
+ "NTHRDC",
+ "NTMP",
+ "NTSC",
+ "Num.",
+ "NUTF",
+ "NVP",
+ "NVTC",
+ "NWRC",
+ "O",
+ "Ob.",
+ "Oba.",
+ "OC",
+ "OCPD",
+ "Oct.",
+ "OD",
+ "ODA",
+ "OECD",
+ "OEM",
+ "Ogo.",
+ "OHQs",
+ "OIC",
+ "Okt.",
+ "OPEC",
+ "OPP",
+ "OPP3",
+ "OPR",
+ "OS",
+ "Os",
+ "OSA",
+ "OT",
+ "OUG",
+ "oz.",
+ "P",
+ "P&P",
+ "PA",
+ "Pa",
+ "PABK",
+ "PABX",
+ "PAK",
+ "PAKSI",
+ "PAL",
+ "PALL MALL",
+ "PAS",
+ "PATA",
+ "PAWS",
+ "Pb",
+ "PBA",
+ "PBB",
+ "PBM",
+ "PBP",
+ "PBSM",
+ "PBT",
+ "PC",
+ "PC(s)",
+ "PCB",
+ "PCIRITA",
+ "PCM",
+ "PCMCIA",
+ "PCN",
+ "PD",
+ "Pd",
+ "pd.",
+ "PDS",
+ "PE",
+ "PEKEMAS",
+ "PEMADAM",
+ "PENA",
+ "PENIS",
+ "PERDANA",
+ "PERKESO",
+ "PERKIM",
+ "PERNAS",
+ "PERTAMA",
+ "PERTIWI",
+ "PESAKA",
+ "PETA",
+ "PETRONAS",
+ "PGU",
+ "Ph.",
+ "PHD",
+ "Phi.",
+ "Phm.",
+ "PIK",
+ "PIKOM",
+ "PIN",
+ "PINTAS",
+ "PIPM",
+ "PISK",
+ "PITA",
+ "PIXEL",
+ "PJ",
+ "PJK",
+ "PJKB",
+ "PJP",
+ "PKBM",
+ "PKBTA",
+ "PKEN",
+ "Pkh.",
+ "PKKM",
+ "PKLPA",
+ "PKM",
+ "PKNS",
+ "PKPIM",
+ "PKPM",
+ "PKR",
+ "PKS",
+ "Pl.",
+ "p.l.",
+ "PLA",
+ "PLC",
+ "PLCHP",
+ "PLCs",
+ "PLI",
+ "PLT",
+ "PLUS",
+ "PLWS",
+ "PM",
+ "Pm",
+ "PMM",
+ "PMP",
+ "PMR",
+ "PMS",
+ "Pn.",
+ "PNAT",
+ "PNS",
+ "PO",
+ "Po",
+ "POCPA",
+ "POKEMON",
+ "Pol.",
+ "POP",
+ "PORIM",
+ "PORLA",
+ "PORTAFOAM",
+ "PP",
+ "PPA",
+ "PPBE",
+ "PPBK",
+ "ppd.",
+ "PPGM",
+ "PPI",
+ "PPK",
+ "PPL",
+ "PPM",
+ "PPP",
+ "PPPB",
+ "PPPLM",
+ "PPPM",
+ "PPR",
+ "PPRT",
+ "PPS",
+ "PPTM",
+ "PPU",
+ "PR",
+ "Pr",
+ "Pr.",
+ "prb.",
+ "PRI",
+ "PRO",
+ "Pro.",
+ "Prof.",
+ "PROSPER",
+ "PROSTAR",
+ "PROTON",
+ "PS",
+ "PSA",
+ "Psa.",
+ "PSCs",
+ "PSDC",
+ "PSDH",
+ "Psi.",
+ "PSKE",
+ "PSRM",
+ "PST",
+ "PT",
+ "Pt",
+ "PTD",
+ "PTP",
+ "Pu",
+ "PUNB",
+ "QA",
+ "QC",
+ "QCC",
+ "R&D",
+ "RA",
+ "Ra",
+ "RAM",
+ "RAPP",
+ "Rat.",
+ "Rb",
+ "RCA",
+ "RDA",
+ "RDAs",
+ "RDCs",
+ "RE",
+ "Re",
+ "REHDA",
+ "Rev.",
+ "Rf",
+ "Rg",
+ "RGB",
+ "Rh",
+ "RI",
+ "RIDA",
+ "RIP",
+ "RISDA",
+ "r.l.",
+ "RM",
+ "Rm.",
+ "RMKe-8",
+ "Rn",
+ "ROC",
+ "ROM",
+ "Rom.",
+ "RPG",
+ "RPS",
+ "RRI",
+ "RRIM",
+ "RRJP",
+ "RRP",
+ "RSGC",
+ "RSS",
+ "RSVP",
+ "Rt.",
+ "RTA",
+ "RTM",
+ "Ru",
+ "Rut.",
+ "RWCR",
+ "RX",
+ "S",
+ "S/N",
+ "S&T",
+ "S-VHS",
+ "SA",
+ "SAC",
+ "SADCs",
+ "SAGA",
+ "SALCRA",
+ "SALM",
+ "SALT",
+ "SAM",
+ "SAP",
+ "SARS",
+ "Sas.",
+ "s.a.w.",
+ "SB",
+ "Sb",
+ "Sb.",
+ "SBA",
+ "SBB",
+ "sbg.",
+ "SBK",
+ "SC",
+ "Sc",
+ "SCA",
+ "SCADA",
+ "SCANS",
+ "SCSI",
+ "SCuM",
+ "SDCs",
+ "Sdn. Bhd.",
+ "sdr.",
+ "SDRC",
+ "Se",
+ "SEATO",
+ "SEB",
+ "SECAM",
+ "SEDCs",
+ "SEFF",
+ "Sej.",
+ "SEMS",
+ "Sep.",
+ "Sept.",
+ "SESB",
+ "SESCo",
+ "s.f.",
+ "Sg",
+ "SGPCA",
+ "SGPPI",
+ "SGPPKRM",
+ "SGX",
+ "Si",
+ "Si.",
+ "SIA 1983",
+ "SIC",
+ "SIM",
+ "SING",
+ "SIRIM",
+ "SITTDEC",
+ "sj.",
+ "SKDTP",
+ "SKM",
+ "SKSM",
+ "SL",
+ "Sl.",
+ "sl.",
+ "SLMCH",
+ "SLR",
+ "SM",
+ "Sm",
+ "SMART",
+ "SMEs",
+ "SMEt",
+ "SMIs",
+ "SMIDEC",
+ "SMIDP",
+ "SMJK",
+ "SMR",
+ "SMS",
+ "SMT",
+ "SMTP",
+ "SN",
+ "Sn",
+ "SOB",
+ "SOCSO",
+ "SOHO",
+ "Son.",
+ "SOS",
+ "Sos.",
+ "SP",
+ "SPA",
+ "SPAM",
+ "SPCA",
+ "SPKR",
+ "SPLAM",
+ "SPM",
+ "SPNB",
+ "SPSP",
+ "t.",
+ "Ta",
+ "Tadb.",
+ "TAF",
+ "TAF-W",
+ "Tani",
+ "TAP",
+ "TAR",
+ "TARBI",
+ "TB",
+ "Tb",
+ "TBA",
+ "TBTP",
+ "Tc",
+ "TCPD",
+ "TDCs",
+ "Te",
+ "TEKUN",
+ "TELCO",
+ "TELEX",
+ "TEUs",
+ "TFP",
+ "TGV",
+ "TH",
+ "Th",
+ "THIS",
+ "Ti",
+ "TICAD",
+ "Tit.",
+ "TKA",
+ "Tks.",
+ "Tl",
+ "TLDM",
+ "TM",
+ "Tm",
+ "TMB",
+ "TMK",
+ "TNB",
+ "TNSB",
+ "TNT",
+ "TOEFL",
+ "TP",
+ "TPIM",
+ "TPK",
+ "TPPP",
+ "TPPT",
+ "TPSM",
+ "TPUB",
+ "TQM",
+ "Tr.",
+ "TRIPs",
+ "tsb.",
+ "tscf.",
+ "t.sh.",
+ "t.s.t.",
+ "TT",
+ "t.t.",
+ "TUDM",
+ "TV",
+ "TVSMR",
+ "TWAIN",
+ "TX",
+ "TYPHIrapid",
+ "U",
+ "Ubat",
+ "UDA",
+ "Udg.",
+ "UFO",
+ "UH",
+ "UIA",
+ "UiTM",
+ "UK",
+ "UKM",
+ "UL",
+ "Ul.",
+ "ULC",
+ "UM",
+ "UMNO",
+ "UMS",
+ "UN",
+ "UN/OSCAL",
+ "UNCLE",
+ "UNCTAD",
+ "UNDP",
+ "UNESCO",
+ "UNFCCC",
+ "UNFPA",
+ "UNHCR",
+ "UNICEF",
+ "UNIMAS",
+ "UNTAET",
+ "UPE",
+ "UPM",
+ "UPS",
+ "UPSR",
+ "URL",
+ "US",
+ "USAINS",
+ "USD",
+ "USM",
+ "USNO",
+ "USS",
+ "USSR",
+ "UTC",
+ "UTF",
+ "utk.",
+ "UTM",
+ "V",
+ "VAT",
+ "VCC",
+ "VCD",
+ "VCR",
+ "VD",
+ "VDSC",
+ "VGA",
+ "VHF",
+ "VHS",
+ "VIP",
+ "VMS",
+ "VO",
+ "VOA",
+ "VoIP",
+ "VR",
+ "VSOP",
+ "VW",
+ "W",
+ "W/O",
+ "WAP",
+ "WAY",
+ "WC",
+ "WDDM",
+ "WDM",
+ "WHO",
+ "Why.",
+ "WIM",
+ "WPG",
+ "WTO",
+ "WWF",
+ "WWW",
+ "WYSIWYG",
+ "Xe",
+ "XO",
+ "XXL",
+ "Y",
+ "Y2K",
+ "YAB",
+ "Yak.",
+ "YAM",
+ "YAS",
+ "YB",
+ "Yb",
+ "Yeh.",
+ "Yer.",
+ "Yes.",
+ "yg.",
+ "Yl.",
+ "YM",
+ "YMCA",
+ "Yoh.",
+ "Yos.",
+ "Y.Th.",
+ "YTM",
+ "Yud.",
+ "Yun.",
+ "Za.",
+ "Zec.",
+ "Zef.",
+ "Zep.",
+ "ZIP",
+ "Zn",
+ "Zr",
+]:
+ _exc[orth] = [{ORTH: orth}]
+
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
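
Each entry maps an exact surface form to a list of token dicts, so strings like `"Dr."` and `"Jan."` survive tokenization as single tokens, and `NORM` supplies the expanded form where one is defined. A rough illustration of the resulting behavior (assuming the blank `ms` pipeline; the output is indicative, not taken from the test suite):

```python
import spacy

nlp = spacy.blank("ms")
doc = nlp("Dr. Ahmad tiba pada Jan. 2023.")

# "Dr." and "Jan." should remain single tokens via the exceptions above;
# "Jan." carries NORM "Januari" from the month abbreviation table.
print([(t.text, t.norm_) for t in doc])
```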
diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py
index e079236fd..ef4665ccc 100644
--- a/spacy/lang/nb/__init__.py
+++ b/spacy/lang/nb/__init__.py
@@ -1,12 +1,13 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from .punctuation import TOKENIZER_SUFFIXES
+
+from ...language import BaseDefaults, Language
+from ...pipeline import Lemmatizer
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
-from ...pipeline import Lemmatizer
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class NorwegianDefaults(BaseDefaults):
diff --git a/spacy/lang/nb/punctuation.py b/spacy/lang/nb/punctuation.py
index 8f2933670..a1fdb872a 100644
--- a/spacy/lang/nb/punctuation.py
+++ b/spacy/lang/nb/punctuation.py
@@ -1,7 +1,17 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
-from ..char_classes import CURRENCY, PUNCT, UNITS, LIST_CURRENCY
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ PUNCT,
+ UNITS,
+)
# Punctuation adapted from Danish
_quotes = CONCAT_QUOTES.replace("'", "")
diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py
index d86662693..89a8f5edf 100644
--- a/spacy/lang/nb/syntax_iterators.py
+++ b/spacy/lang/nb/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/nb/tokenizer_exceptions.py b/spacy/lang/nb/tokenizer_exceptions.py
index 0be436ae4..9b99a1d65 100644
--- a/spacy/lang/nb/tokenizer_exceptions.py
+++ b/spacy/lang/nb/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/ne/__init__.py b/spacy/lang/ne/__init__.py
index 0028d1b0b..5c9e6870e 100644
--- a/spacy/lang/ne/__init__.py
+++ b/spacy/lang/ne/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class NepaliDefaults(BaseDefaults):
diff --git a/spacy/lang/ne/lex_attrs.py b/spacy/lang/ne/lex_attrs.py
index 7cb01c515..91d5b0eb5 100644
--- a/spacy/lang/ne/lex_attrs.py
+++ b/spacy/lang/ne/lex_attrs.py
@@ -1,6 +1,5 @@
+from ...attrs import LIKE_NUM, NORM
from ..norm_exceptions import BASE_NORMS
-from ...attrs import NORM, LIKE_NUM
-
# fmt: off
_stem_suffixes = [
diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py
index ad2205a0b..213041a85 100644
--- a/spacy/lang/nl/__init__.py
+++ b/spacy/lang/nl/__init__.py
@@ -1,15 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
from thinc.api import Model
+from ...language import BaseDefaults, Language
from .lemmatizer import DutchLemmatizer
from .lex_attrs import LEX_ATTRS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from .punctuation import TOKENIZER_SUFFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults
class DutchDefaults(BaseDefaults):
diff --git a/spacy/lang/nl/lex_attrs.py b/spacy/lang/nl/lex_attrs.py
index f1acaefeb..488224c2f 100644
--- a/spacy/lang/nl/lex_attrs.py
+++ b/spacy/lang/nl/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = set(
"""
nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien
diff --git a/spacy/lang/nl/punctuation.py b/spacy/lang/nl/punctuation.py
index d9dd2a6e3..c9a4c9eeb 100644
--- a/spacy/lang/nl/punctuation.py
+++ b/spacy/lang/nl/punctuation.py
@@ -1,10 +1,19 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_UNITS, merge_chars
-from ..char_classes import LIST_PUNCT, LIST_QUOTES, CURRENCY, PUNCT
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ LIST_UNITS,
+ PUNCT,
+ merge_chars,
+)
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
-
_prefixes = [",,"] + BASE_TOKENIZER_PREFIXES
diff --git a/spacy/lang/nl/syntax_iterators.py b/spacy/lang/nl/syntax_iterators.py
index be9beabe6..d7388a333 100644
--- a/spacy/lang/nl/syntax_iterators.py
+++ b/spacy/lang/nl/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON
from ...tokens import Doc, Span
diff --git a/spacy/lang/nl/tokenizer_exceptions.py b/spacy/lang/nl/tokenizer_exceptions.py
index 489d10d71..85ad49f14 100644
--- a/spacy/lang/nl/tokenizer_exceptions.py
+++ b/spacy/lang/nl/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
# Extensive list of both common and uncommon dutch abbreviations copied from
# github.com/diasks2/pragmatic_segmenter, a Ruby library for rule-based
diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py
index 02c96799b..50a3a8e4c 100644
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@@ -1,15 +1,13 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
from thinc.api import Model
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from .punctuation import TOKENIZER_SUFFIXES
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .lemmatizer import PolishLemmatizer
+from ...language import BaseDefaults, Language
from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language, BaseDefaults
-
+from .lemmatizer import PolishLemmatizer
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
TOKENIZER_EXCEPTIONS = {
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py
index 059d0609a..d1d2a9c54 100644
--- a/spacy/lang/pl/lemmatizer.py
+++ b/spacy/lang/pl/lemmatizer.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Tuple
+from typing import Dict, List, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
diff --git a/spacy/lang/pl/lex_attrs.py b/spacy/lang/pl/lex_attrs.py
index ce56e28a8..398f52a3c 100644
--- a/spacy/lang/pl/lex_attrs.py
+++ b/spacy/lang/pl/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"zero",
"jeden",
diff --git a/spacy/lang/pl/punctuation.py b/spacy/lang/pl/punctuation.py
index 31e56b9ae..84ff239ed 100644
--- a/spacy/lang/pl/punctuation.py
+++ b/spacy/lang/pl/punctuation.py
@@ -1,6 +1,17 @@
-from ..char_classes import LIST_ELLIPSES, LIST_PUNCT, LIST_HYPHENS
-from ..char_classes import LIST_ICONS, LIST_QUOTES, CURRENCY, UNITS, PUNCT
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_HYPHENS,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ PUNCT,
+ UNITS,
+)
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
_quotes = CONCAT_QUOTES.replace("'", "")
diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py
index 454002491..be4041f8e 100644
--- a/spacy/lang/pt/__init__.py
+++ b/spacy/lang/pt/__init__.py
@@ -1,9 +1,9 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class PortugueseDefaults(BaseDefaults):
diff --git a/spacy/lang/pt/lex_attrs.py b/spacy/lang/pt/lex_attrs.py
index 3c6979ab4..de6a67f14 100644
--- a/spacy/lang/pt/lex_attrs.py
+++ b/spacy/lang/pt/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"zero",
"um",
diff --git a/spacy/lang/pt/punctuation.py b/spacy/lang/pt/punctuation.py
index 08e31f9d0..b2d63cb3d 100644
--- a/spacy/lang/pt/punctuation.py
+++ b/spacy/lang/pt/punctuation.py
@@ -1,6 +1,6 @@
+from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
from ..punctuation import TOKENIZER_SUFFIXES as BASE_TOKENIZER_SUFFIXES
-from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
_prefixes = [r"\w{1,3}\$"] + BASE_TOKENIZER_PREFIXES
diff --git a/spacy/lang/pt/syntax_iterators.py b/spacy/lang/pt/syntax_iterators.py
index 62661f5e4..11017aace 100644
--- a/spacy/lang/pt/syntax_iterators.py
+++ b/spacy/lang/pt/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/pt/tokenizer_exceptions.py b/spacy/lang/pt/tokenizer_exceptions.py
index 187fc65ea..e369eda80 100644
--- a/spacy/lang/pt/tokenizer_exceptions.py
+++ b/spacy/lang/pt/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py
index a1cfe6224..e4a6392c8 100644
--- a/spacy/lang/punctuation.py
+++ b/spacy/lang/punctuation.py
@@ -1,7 +1,19 @@
-from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS, COMBINING_DIACRITICS
-from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
-
+from .char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ COMBINING_DIACRITICS,
+ CONCAT_QUOTES,
+ CURRENCY,
+ HYPHENS,
+ LIST_CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ PUNCT,
+ UNITS,
+)
TOKENIZER_PREFIXES = (
["§", "%", "=", "—", "–", r"\+(?![0-9])"]
diff --git a/spacy/lang/ro/__init__.py b/spacy/lang/ro/__init__.py
index 50027ffd2..441fefbb6 100644
--- a/spacy/lang/ro/__init__.py
+++ b/spacy/lang/ro/__init__.py
@@ -1,9 +1,8 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from .punctuation import TOKENIZER_SUFFIXES
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
# Lemma data note:
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
diff --git a/spacy/lang/ro/lex_attrs.py b/spacy/lang/ro/lex_attrs.py
index 0f86f53cd..736aa911a 100644
--- a/spacy/lang/ro/lex_attrs.py
+++ b/spacy/lang/ro/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = set(
"""
zero unu doi două trei patru cinci șase șapte opt nouă zece
diff --git a/spacy/lang/ro/punctuation.py b/spacy/lang/ro/punctuation.py
index 529e1c977..7259f9ae7 100644
--- a/spacy/lang/ro/punctuation.py
+++ b/spacy/lang/ro/punctuation.py
@@ -1,9 +1,18 @@
import itertools
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from ..char_classes import LIST_ICONS, CURRENCY
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ PUNCT,
+)
_list_icons = [x for x in LIST_ICONS if x != "°"]
_list_icons = [x.replace("\\u00B0", "") for x in _list_icons]
diff --git a/spacy/lang/ro/tokenizer_exceptions.py b/spacy/lang/ro/tokenizer_exceptions.py
index b8af0b1d6..a397b2754 100644
--- a/spacy/lang/ro/tokenizer_exceptions.py
+++ b/spacy/lang/ro/tokenizer_exceptions.py
@@ -1,9 +1,8 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .punctuation import _make_ro_variants
-
_exc = {}
diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index 7d17628c4..880965b70 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -1,13 +1,16 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
+from ...language import BaseDefaults, Language
+from ..punctuation import (
+ COMBINING_DIACRITICS_TOKENIZER_INFIXES,
+ COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
+)
+from .lemmatizer import RussianLemmatizer
+from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .lex_attrs import LEX_ATTRS
-from .lemmatizer import RussianLemmatizer
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
class RussianDefaults(BaseDefaults):
diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py
index f4a35de38..1e41220f3 100644
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -1,4 +1,4 @@
-from typing import Optional, List, Dict, Tuple, Callable
+from typing import Callable, Dict, List, Optional, Tuple
from thinc.api import Model
@@ -8,7 +8,6 @@ from ...symbols import POS
from ...tokens import Token
from ...vocab import Vocab
-
PUNCT_RULES = {"«": '"', "»": '"'}
diff --git a/spacy/lang/ru/lex_attrs.py b/spacy/lang/ru/lex_attrs.py
index 2afe47623..e0b35bdc0 100644
--- a/spacy/lang/ru/lex_attrs.py
+++ b/spacy/lang/ru/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = list(
set(
"""
diff --git a/spacy/lang/ru/tokenizer_exceptions.py b/spacy/lang/ru/tokenizer_exceptions.py
index e1889f785..0a8c476b1 100644
--- a/spacy/lang/ru/tokenizer_exceptions.py
+++ b/spacy/lang/ru/tokenizer_exceptions.py
@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/sa/__init__.py b/spacy/lang/sa/__init__.py
index 61398af6c..c7c0e98e6 100644
--- a/spacy/lang/sa/__init__.py
+++ b/spacy/lang/sa/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class SanskritDefaults(BaseDefaults):
diff --git a/spacy/lang/si/__init__.py b/spacy/lang/si/__init__.py
index 971cee3c6..08d0937b1 100644
--- a/spacy/lang/si/__init__.py
+++ b/spacy/lang/si/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class SinhalaDefaults(BaseDefaults):
diff --git a/spacy/lang/sk/__init__.py b/spacy/lang/sk/__init__.py
index da6e3048e..2ed7448d2 100644
--- a/spacy/lang/sk/__init__.py
+++ b/spacy/lang/sk/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class SlovakDefaults(BaseDefaults):
diff --git a/spacy/lang/sl/__init__.py b/spacy/lang/sl/__init__.py
index 0070e9fa1..cd3d70fc9 100644
--- a/spacy/lang/sl/__init__.py
+++ b/spacy/lang/sl/__init__.py
@@ -1,8 +1,8 @@
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults
class SlovenianDefaults(BaseDefaults):
diff --git a/spacy/lang/sl/lex_attrs.py b/spacy/lang/sl/lex_attrs.py
index 958152e37..3c1493050 100644
--- a/spacy/lang/sl/lex_attrs.py
+++ b/spacy/lang/sl/lex_attrs.py
@@ -1,7 +1,6 @@
-from ...attrs import LIKE_NUM
-from ...attrs import IS_CURRENCY
import unicodedata
+from ...attrs import IS_CURRENCY, LIKE_NUM
_num_words = set(
"""
diff --git a/spacy/lang/sl/punctuation.py b/spacy/lang/sl/punctuation.py
index b6ca1830e..dadb54d31 100644
--- a/spacy/lang/sl/punctuation.py
+++ b/spacy/lang/sl/punctuation.py
@@ -1,20 +1,21 @@
from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ HYPHENS,
+ LIST_CURRENCY,
LIST_ELLIPSES,
LIST_ICONS,
- HYPHENS,
LIST_PUNCT,
LIST_QUOTES,
- CURRENCY,
- UNITS,
PUNCT,
- LIST_CURRENCY,
- CONCAT_QUOTES,
+ UNITS,
+ merge_chars,
)
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
-from ..char_classes import merge_chars
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
-
INCLUDE_SPECIAL = ["\\+", "\\/", "\\•", "\\¯", "\\=", "\\×"] + HYPHENS.split("|")
_prefixes = INCLUDE_SPECIAL + BASE_TOKENIZER_PREFIXES
diff --git a/spacy/lang/sl/tokenizer_exceptions.py b/spacy/lang/sl/tokenizer_exceptions.py
index 3d4109228..ec4ea9e41 100644
--- a/spacy/lang/sl/tokenizer_exceptions.py
+++ b/spacy/lang/sl/tokenizer_exceptions.py
@@ -1,7 +1,8 @@
from typing import Dict, List
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+
+from ...symbols import NORM, ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc: Dict[str, List[Dict]] = {}
diff --git a/spacy/lang/sq/__init__.py b/spacy/lang/sq/__init__.py
index 5e32a0cbe..1c8a5acf8 100644
--- a/spacy/lang/sq/__init__.py
+++ b/spacy/lang/sq/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class AlbanianDefaults(BaseDefaults):
diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py
index fd0c8c832..5f121d79e 100644
--- a/spacy/lang/sr/__init__.py
+++ b/spacy/lang/sr/__init__.py
@@ -1,11 +1,14 @@
+from ...language import BaseDefaults, Language
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
class SerbianDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+ infixes = TOKENIZER_INFIXES
+ suffixes = TOKENIZER_SUFFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
diff --git a/spacy/lang/sr/lex_attrs.py b/spacy/lang/sr/lex_attrs.py
index dc48909bc..696b9fd74 100644
--- a/spacy/lang/sr/lex_attrs.py
+++ b/spacy/lang/sr/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"нула",
"један",
diff --git a/spacy/lang/sr/punctuation.py b/spacy/lang/sr/punctuation.py
new file mode 100644
index 000000000..cafb0f68f
--- /dev/null
+++ b/spacy/lang/sr/punctuation.py
@@ -0,0 +1,45 @@
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ PUNCT,
+ UNITS,
+)
+
+_infixes = (
+ LIST_ELLIPSES
+ + LIST_ICONS
+ + [
+ r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+ r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+ al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+ ),
+ r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+ r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+ ]
+)
+
+_suffixes = (
+ LIST_PUNCT
+ + LIST_ELLIPSES
+ + LIST_QUOTES
+ + LIST_ICONS
+ + [
+ r"(?<=[0-9])\+",
+ r"(?<=°[FfCcKk])\.",
+ r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
+ r"(?<=[0-9])(?:{u})".format(u=UNITS),
+ r"(?<=[{a}{e}{p}(?:{q})])\.".format(
+ a=ALPHA, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
+ ),
+ ]
+)
+
+TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes
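
The new infix rules split arithmetic operators between digits and certain punctuation between letters, while the suffix rules peel units, currency symbols and trailing periods off token ends. A small sketch of the intended effect (illustrative inputs, not from the test suite):

```python
import spacy

nlp = spacy.blank("sr")

# r"(?<=[0-9])[+\-\*^](?=[0-9-])" should split "2+3" into three tokens,
# and the UNITS suffix rule should detach "km" from "10km".
print([t.text for t in nlp("2+3")])   # expected: ['2', '+', '3']
print([t.text for t in nlp("10km")])  # expected: ['10', 'km']
```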
diff --git a/spacy/lang/sr/tokenizer_exceptions.py b/spacy/lang/sr/tokenizer_exceptions.py
index dcaa3e239..b7db0aadc 100755
--- a/spacy/lang/sr/tokenizer_exceptions.py
+++ b/spacy/lang/sr/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index 28e5085a8..bb4ee1702 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -1,12 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+
+from ...language import BaseDefaults, Language
from ...pipeline import Lemmatizer
+from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class SwedishDefaults(BaseDefaults):
diff --git a/spacy/lang/sv/lex_attrs.py b/spacy/lang/sv/lex_attrs.py
index f8ada9e2e..8eeafede8 100644
--- a/spacy/lang/sv/lex_attrs.py
+++ b/spacy/lang/sv/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"noll",
"en",
diff --git a/spacy/lang/sv/punctuation.py b/spacy/lang/sv/punctuation.py
index 67f1bcdc4..64f1da989 100644
--- a/spacy/lang/sv/punctuation.py
+++ b/spacy/lang/sv/punctuation.py
@@ -1,8 +1,13 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
from ..punctuation import TOKENIZER_SUFFIXES
-
_quotes = CONCAT_QUOTES.replace("'", "")
_infixes = (
diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py
index 06ad016ac..09153a8ec 100644
--- a/spacy/lang/sv/syntax_iterators.py
+++ b/spacy/lang/sv/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py
index ce7db895a..8fd3afbe3 100644
--- a/spacy/lang/sv/tokenizer_exceptions.py
+++ b/spacy/lang/sv/tokenizer_exceptions.py
@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import NORM, ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/ta/__init__.py b/spacy/lang/ta/__init__.py
index 4929a4b97..7fd29371a 100644
--- a/spacy/lang/ta/__init__.py
+++ b/spacy/lang/ta/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class TamilDefaults(BaseDefaults):
diff --git a/spacy/lang/ta/lex_attrs.py b/spacy/lang/ta/lex_attrs.py
index f830f4ac9..d66125552 100644
--- a/spacy/lang/ta/lex_attrs.py
+++ b/spacy/lang/ta/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_numeral_suffixes = {"பத்து": "பது", "ற்று": "று", "ரத்து": "ரம்", "சத்து": "சம்"}
_num_words = [
"பூச்சியம்",
diff --git a/spacy/lang/te/__init__.py b/spacy/lang/te/__init__.py
index 77cc2fe9b..611e9746a 100644
--- a/spacy/lang/te/__init__.py
+++ b/spacy/lang/te/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class TeluguDefaults(BaseDefaults):
diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py
index 12b1527e0..bd29d32a4 100644
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@@ -1,10 +1,9 @@
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from ...language import BaseDefaults, Language
from ...tokens import Doc
-from ...util import DummyTokenizer, registry, load_config_from_str
+from ...util import DummyTokenizer, load_config_from_str, registry
from ...vocab import Vocab
-
+from .lex_attrs import LEX_ATTRS
+from .stop_words import STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
diff --git a/spacy/lang/th/lex_attrs.py b/spacy/lang/th/lex_attrs.py
index bc4e5293e..80f6ccbe8 100644
--- a/spacy/lang/th/lex_attrs.py
+++ b/spacy/lang/th/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"ศูนย์",
"หนึ่ง",
diff --git a/spacy/lang/th/tokenizer_exceptions.py b/spacy/lang/th/tokenizer_exceptions.py
index 92116d474..954766d28 100644
--- a/spacy/lang/th/tokenizer_exceptions.py
+++ b/spacy/lang/th/tokenizer_exceptions.py
@@ -1,6 +1,5 @@
from ...symbols import ORTH
-
_exc = {
# หน่วยงานรัฐ / government agency
"กกต.": [{ORTH: "กกต."}],
diff --git a/spacy/lang/ti/__init__.py b/spacy/lang/ti/__init__.py
index c74c081b5..510999f67 100644
--- a/spacy/lang/ti/__init__.py
+++ b/spacy/lang/ti/__init__.py
@@ -1,12 +1,11 @@
-from .stop_words import STOP_WORDS
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
-
+from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc
class TigrinyaDefaults(BaseDefaults):
diff --git a/spacy/lang/ti/punctuation.py b/spacy/lang/ti/punctuation.py
index aa884c2ba..f29f30e26 100644
--- a/spacy/lang/ti/punctuation.py
+++ b/spacy/lang/ti/punctuation.py
@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA_UPPER,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ UNITS,
+)
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
diff --git a/spacy/lang/ti/tokenizer_exceptions.py b/spacy/lang/ti/tokenizer_exceptions.py
index 3d79cd84b..711e4b406 100644
--- a/spacy/lang/ti/tokenizer_exceptions.py
+++ b/spacy/lang/ti/tokenizer_exceptions.py
@@ -1,5 +1,4 @@
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH
_exc = {}
diff --git a/spacy/lang/tl/__init__.py b/spacy/lang/tl/__init__.py
index 30838890a..6849810ef 100644
--- a/spacy/lang/tl/__init__.py
+++ b/spacy/lang/tl/__init__.py
@@ -1,7 +1,7 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class TagalogDefaults(BaseDefaults):
diff --git a/spacy/lang/tl/lex_attrs.py b/spacy/lang/tl/lex_attrs.py
index 60bdc923b..8866453a0 100644
--- a/spacy/lang/tl/lex_attrs.py
+++ b/spacy/lang/tl/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"sero",
"isa",
diff --git a/spacy/lang/tl/tokenizer_exceptions.py b/spacy/lang/tl/tokenizer_exceptions.py
index 51ad12d9f..b10c90437 100644
--- a/spacy/lang/tl/tokenizer_exceptions.py
+++ b/spacy/lang/tl/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {
"tayo'y": [{ORTH: "tayo"}, {ORTH: "'y", NORM: "ay"}],
diff --git a/spacy/lang/tn/__init__.py b/spacy/lang/tn/__init__.py
index 28e887eea..4cb8a1635 100644
--- a/spacy/lang/tn/__init__.py
+++ b/spacy/lang/tn/__init__.py
@@ -1,7 +1,7 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class SetswanaDefaults(BaseDefaults):
diff --git a/spacy/lang/tn/punctuation.py b/spacy/lang/tn/punctuation.py
index a52755564..54d76fbaf 100644
--- a/spacy/lang/tn/punctuation.py
+++ b/spacy/lang/tn/punctuation.py
@@ -1,5 +1,12 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
_infixes = (
LIST_ELLIPSES
diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py
index d76fe4262..dbf9aab49 100644
--- a/spacy/lang/tokenizer_exceptions.py
+++ b/spacy/lang/tokenizer_exceptions.py
@@ -1,8 +1,7 @@
import re
+from ..symbols import NORM, ORTH
from .char_classes import ALPHA_LOWER
-from ..symbols import ORTH, NORM
-
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
# and https://gist.github.com/dperini/729294 (Diego Perini, MIT License)
diff --git a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py
index 02b5c7bf4..9aa752168 100644
--- a/spacy/lang/tr/__init__.py
+++ b/spacy/lang/tr/__init__.py
@@ -1,8 +1,8 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
+from ...language import BaseDefaults, Language
+from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
-from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKEN_MATCH, TOKENIZER_EXCEPTIONS
class TurkishDefaults(BaseDefaults):
diff --git a/spacy/lang/tr/lex_attrs.py b/spacy/lang/tr/lex_attrs.py
index 6d9f4f388..2189932b6 100644
--- a/spacy/lang/tr/lex_attrs.py
+++ b/spacy/lang/tr/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
# Thirteen, fifteen etc. are written separate: on üç
_num_words = [
diff --git a/spacy/lang/tr/syntax_iterators.py b/spacy/lang/tr/syntax_iterators.py
index 769af1223..ed588424a 100644
--- a/spacy/lang/tr/syntax_iterators.py
+++ b/spacy/lang/tr/syntax_iterators.py
@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN, PRON
+from typing import Iterator, Tuple, Union
+
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
+from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
diff --git a/spacy/lang/tr/tokenizer_exceptions.py b/spacy/lang/tr/tokenizer_exceptions.py
index 22fa9f09e..d095a3d0e 100644
--- a/spacy/lang/tr/tokenizer_exceptions.py
+++ b/spacy/lang/tr/tokenizer_exceptions.py
@@ -1,8 +1,7 @@
import re
-from ..punctuation import ALPHA_LOWER, ALPHA
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH
+from ..punctuation import ALPHA, ALPHA_LOWER
_exc = {}
diff --git a/spacy/lang/tt/__init__.py b/spacy/lang/tt/__init__.py
index d5e1e87ef..ce04d09c2 100644
--- a/spacy/lang/tt/__init__.py
+++ b/spacy/lang/tt/__init__.py
@@ -1,8 +1,8 @@
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults
class TatarDefaults(BaseDefaults):
diff --git a/spacy/lang/tt/punctuation.py b/spacy/lang/tt/punctuation.py
index f644a8ccb..5c233df7c 100644
--- a/spacy/lang/tt/punctuation.py
+++ b/spacy/lang/tt/punctuation.py
@@ -1,5 +1,12 @@
-from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
_hyphens_no_dash = HYPHENS.replace("-", "").strip("|").replace("||", "")
_infixes = (
diff --git a/spacy/lang/tt/tokenizer_exceptions.py b/spacy/lang/tt/tokenizer_exceptions.py
index 3b8cc86b5..280b9f866 100644
--- a/spacy/lang/tt/tokenizer_exceptions.py
+++ b/spacy/lang/tt/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py
index bfea9ff69..5dd75a2a4 100644
--- a/spacy/lang/uk/__init__.py
+++ b/spacy/lang/uk/__init__.py
@@ -1,14 +1,16 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
+from ...language import BaseDefaults, Language
+from ..punctuation import (
+ COMBINING_DIACRITICS_TOKENIZER_INFIXES,
+ COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
+)
from .lemmatizer import UkrainianLemmatizer
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class UkrainianDefaults(BaseDefaults):
diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py
index 37015cc2a..9ec582b76 100644
--- a/spacy/lang/uk/lemmatizer.py
+++ b/spacy/lang/uk/lemmatizer.py
@@ -1,10 +1,10 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
from thinc.api import Model
-from ..ru.lemmatizer import RussianLemmatizer
from ...pipeline.lemmatizer import lemmatizer_score
from ...vocab import Vocab
+from ..ru.lemmatizer import RussianLemmatizer
class UkrainianLemmatizer(RussianLemmatizer):
diff --git a/spacy/lang/uk/tokenizer_exceptions.py b/spacy/lang/uk/tokenizer_exceptions.py
index 7e168a27c..07dd941af 100644
--- a/spacy/lang/uk/tokenizer_exceptions.py
+++ b/spacy/lang/uk/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/ur/__init__.py b/spacy/lang/ur/__init__.py
index 266c5a73d..4f20ac92f 100644
--- a/spacy/lang/ur/__init__.py
+++ b/spacy/lang/ur/__init__.py
@@ -1,7 +1,7 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class UrduDefaults(BaseDefaults):
diff --git a/spacy/lang/ur/punctuation.py b/spacy/lang/ur/punctuation.py
index 5d35d0a25..382bfc75c 100644
--- a/spacy/lang/ur/punctuation.py
+++ b/spacy/lang/ur/punctuation.py
@@ -1,4 +1,3 @@
from ..punctuation import TOKENIZER_SUFFIXES
-
_suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py
index 822dc348c..a621b8bfe 100644
--- a/spacy/lang/vi/__init__.py
+++ b/spacy/lang/vi/__init__.py
@@ -1,17 +1,17 @@
-from typing import Any, Dict, Union
-from pathlib import Path
import re
-import srsly
import string
+from pathlib import Path
+from typing import Any, Dict, Union
+
+import srsly
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
-from ...tokens import Doc
-from ...util import DummyTokenizer, registry, load_config_from_str
-from ...vocab import Vocab
from ... import util
-
+from ...language import BaseDefaults, Language
+from ...tokens import Doc
+from ...util import DummyTokenizer, load_config_from_str, registry
+from ...vocab import Vocab
+from .lex_attrs import LEX_ATTRS
+from .stop_words import STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
diff --git a/spacy/lang/vi/lex_attrs.py b/spacy/lang/vi/lex_attrs.py
index 0cbda4ffb..82997a133 100644
--- a/spacy/lang/vi/lex_attrs.py
+++ b/spacy/lang/vi/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"không", # Zero
"một", # One
diff --git a/spacy/lang/yo/__init__.py b/spacy/lang/yo/__init__.py
index 6c38ec8af..93c4ca493 100644
--- a/spacy/lang/yo/__init__.py
+++ b/spacy/lang/yo/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class YorubaDefaults(BaseDefaults):
diff --git a/spacy/lang/yo/lex_attrs.py b/spacy/lang/yo/lex_attrs.py
index ead68ced2..5f33e06a5 100644
--- a/spacy/lang/yo/lex_attrs.py
+++ b/spacy/lang/yo/lex_attrs.py
@@ -2,7 +2,6 @@ import unicodedata
from ...attrs import LIKE_NUM
-
_num_words = [
"ení",
"oókàn",
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index fdf6776e2..f7bb09277 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -1,21 +1,21 @@
-from typing import Optional, List, Dict, Any, Callable, Iterable
-from enum import Enum
import tempfile
-import srsly
import warnings
+from enum import Enum
from pathlib import Path
+from typing import Any, Callable, Dict, Iterable, List, Optional
-from ...errors import Warnings, Errors
-from ...language import Language, BaseDefaults
+import srsly
+
+from ... import util
+from ...errors import Errors, Warnings
+from ...language import BaseDefaults, Language
from ...scorer import Scorer
from ...tokens import Doc
-from ...training import validate_examples, Example
-from ...util import DummyTokenizer, registry, load_config_from_str
+from ...training import Example, validate_examples
+from ...util import DummyTokenizer, load_config_from_str, registry
from ...vocab import Vocab
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
-from ... import util
-
# fmt: off
_PKUSEG_INSTALL_MSG = "install spacy-pkuseg with `pip install \"spacy-pkuseg>=0.0.27,<0.1.0\"` or `conda install -c conda-forge \"spacy-pkuseg>=0.0.27,<0.1.0\"`"
diff --git a/spacy/lang/zh/lex_attrs.py b/spacy/lang/zh/lex_attrs.py
index 08c8e3160..36fa7310a 100644
--- a/spacy/lang/zh/lex_attrs.py
+++ b/spacy/lang/zh/lex_attrs.py
@@ -2,7 +2,6 @@ import re
from ...attrs import LIKE_NUM
-
_single_num_words = [
"〇",
"一",
diff --git a/spacy/language.py b/spacy/language.py
index 9fdcf6328..fd616483b 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1,47 +1,70 @@
-from typing import Iterator, Optional, Any, Dict, Callable, Iterable
-from typing import Union, Tuple, List, Set, Pattern, Sequence
-from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload
-
-from dataclasses import dataclass
-import random
-import itertools
import functools
+import itertools
+import multiprocessing as mp
+import random
+import traceback
+import warnings
from contextlib import contextmanager
from copy import deepcopy
-from pathlib import Path
-import warnings
-
-from thinc.api import get_current_ops, Config, CupyOps, Optimizer
-import srsly
-import multiprocessing as mp
+from dataclasses import dataclass
from itertools import chain, cycle
+from pathlib import Path
from timeit import default_timer as timer
-import traceback
+from typing import (
+ Any,
+ Callable,
+ Dict,
+ Iterable,
+ Iterator,
+ List,
+ NoReturn,
+ Optional,
+ Pattern,
+ Sequence,
+ Set,
+ Tuple,
+ TypeVar,
+ Union,
+ cast,
+ overload,
+)
-from . import ty
-from .tokens.underscore import Underscore
-from .vocab import Vocab, create_vocab
-from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
-from .training import Example, validate_examples
-from .training.initialize import init_vocab, init_tok2vec
-from .scorer import Scorer
-from .util import registry, SimpleFrozenList, _pipe, raise_error, _DEFAULT_EMPTY_PIPES
-from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
-from .util import warn_if_jupyter_cupy
-from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
-from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
-from .lang.punctuation import TOKENIZER_INFIXES
-from .tokens import Doc
-from .tokenizer import Tokenizer
-from .errors import Errors, Warnings
-from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
-from .schemas import ConfigSchemaPretrain, validate_init_settings
-from .git_info import GIT_VERSION
-from . import util
-from . import about
-from .lookups import load_lookups
+import srsly
+from thinc.api import Config, CupyOps, Optimizer, get_current_ops
+
+from . import about, ty, util
from .compat import Literal
-
+from .errors import Errors, Warnings
+from .git_info import GIT_VERSION
+from .lang.punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .lang.tokenizer_exceptions import BASE_EXCEPTIONS, URL_MATCH
+from .lookups import load_lookups
+from .pipe_analysis import analyze_pipes, print_pipe_analysis, validate_attrs
+from .schemas import (
+ ConfigSchema,
+ ConfigSchemaInit,
+ ConfigSchemaNlp,
+ ConfigSchemaPretrain,
+ validate_init_settings,
+)
+from .scorer import Scorer
+from .tokenizer import Tokenizer
+from .tokens import Doc
+from .tokens.underscore import Underscore
+from .training import Example, validate_examples
+from .training.initialize import init_tok2vec, init_vocab
+from .util import (
+ _DEFAULT_EMPTY_PIPES,
+ CONFIG_SECTION_ORDER,
+ SimpleFrozenDict,
+ SimpleFrozenList,
+ _pipe,
+ combine_score_weights,
+ raise_error,
+ registry,
+ warn_if_jupyter_cupy,
+)
+from .vocab import Vocab, create_vocab
PipeCallable = Callable[[Doc], Doc]
@@ -716,6 +739,11 @@ class Language:
)
)
pipe = source.get_pipe(source_name)
+ # There is no actual solution here. Either the component has the right
+ # name for the source pipeline or the component has the right name for
+ # the current pipeline. This prioritizes the current pipeline.
+ if hasattr(pipe, "name"):
+ pipe.name = name
# Make sure the source config is interpolated so we don't end up with
# orphaned variables in our final config
source_config = source.config.interpolate()
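The `pipe.name = name` assignment above matters when a component is sourced under a new name: previously the component object kept the source pipeline's name. A minimal sketch, assuming `en_core_web_sm` is installed:

    import spacy

    source_nlp = spacy.load("en_core_web_sm")
    nlp = spacy.blank("en")
    nlp.add_pipe("ner", source=source_nlp, name="ner_copy")
    # With this change, the component's internal name follows the current
    # pipeline rather than the source pipeline.
    assert nlp.get_pipe("ner_copy").name == "ner_copy"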
@@ -793,6 +821,7 @@ class Language:
pipe_index = self._get_pipe_index(before, after, first, last)
self._pipe_meta[name] = self.get_factory_meta(factory_name)
self._components.insert(pipe_index, (name, pipe_component))
+ self._link_components()
return pipe_component
def _get_pipe_index(
@@ -928,6 +957,7 @@ class Language:
if old_name in self._config["initialize"]["components"]:
init_cfg = self._config["initialize"]["components"].pop(old_name)
self._config["initialize"]["components"][new_name] = init_cfg
+ self._link_components()
def remove_pipe(self, name: str) -> Tuple[str, PipeCallable]:
"""Remove a component from the pipeline.
@@ -951,6 +981,7 @@ class Language:
# Make sure the name is also removed from the set of disabled components
if name in self.disabled:
self._disabled.remove(name)
+ self._link_components()
return removed
def disable_pipe(self, name: str) -> None:
@@ -1269,7 +1300,10 @@ class Language:
"No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
)
doc = Doc(self.vocab, words=["x", "y", "z"])
- get_examples = lambda: [Example.from_dict(doc, {})]
+
+ def get_examples():
+ return [Example.from_dict(doc, {})]
+
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(
method="Language.initialize", obj=type(get_examples)
@@ -1372,6 +1406,7 @@ class Language:
scorer: Optional[Scorer] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
scorer_cfg: Optional[Dict[str, Any]] = None,
+ per_component: bool = False,
) -> Dict[str, Any]:
"""Evaluate a model's pipeline components.
@@ -1383,6 +1418,8 @@ class Language:
arguments for specific components.
scorer_cfg (dict): An optional dictionary with extra keyword arguments
for the scorer.
+ per_component (bool): Whether to return the scores keyed by component
+ name. Defaults to False.
        RETURNS (Dict[str, Any]): A dictionary of evaluation scores.
@@ -1415,7 +1452,7 @@ class Language:
for eg, doc in zip(examples, docs):
eg.predicted = doc
end_time = timer()
- results = scorer.score(examples)
+ results = scorer.score(examples, per_component=per_component)
n_words = sum(len(eg.predicted) for eg in examples)
results["speed"] = n_words / (end_time - start_time)
return results
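Usage sketch for the new flag, assuming `en_core_web_sm` is installed; the dev example is illustrative:

    import spacy
    from spacy.training import Example

    nlp = spacy.load("en_core_web_sm")
    doc = nlp.make_doc("Apple is looking at buying a U.K. startup")
    example = Example.from_dict(doc, {"entities": [(0, 5, "ORG")]})
    # per_component=True keys the results by component name instead of
    # merging everything into one flat score dict.
    scores = nlp.evaluate([example], per_component=True)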
@@ -1673,8 +1710,16 @@ class Language:
# The problem is we need to do it during deserialization...And the
# components don't receive the pipeline then. So this does have to be
# here :(
+ # First, fix up all the internal component names in case they have
+ # gotten out of sync due to sourcing components from different
+ # pipelines, since find_listeners uses proc2.name for the listener
+ # map.
+ for name, proc in self.pipeline:
+ if hasattr(proc, "name"):
+ proc.name = name
for i, (name1, proc1) in enumerate(self.pipeline):
if isinstance(proc1, ty.ListenedToComponent):
+ proc1.listener_map = {}
for name2, proc2 in self.pipeline[i + 1 :]:
proc1.find_listeners(proc2)
@@ -1808,6 +1853,7 @@ class Language:
raw_config=raw_config,
)
else:
+ assert "source" in pipe_cfg
# We need the sourced components to reference the same
# vocab without modifying the current vocab state **AND**
# we still want to load the source model vectors to perform
@@ -1827,6 +1873,10 @@ class Language:
source_name = pipe_cfg.get("component", pipe_name)
listeners_replaced = False
if "replace_listeners" in pipe_cfg:
+ # Make sure that the listened-to component has the
+ # state of the source pipeline listener map so that the
+ # replace_listeners method below works as intended.
+ source_nlps[model]._link_components()
for name, proc in source_nlps[model].pipeline:
if source_name in getattr(proc, "listening_components", []):
source_nlps[model].replace_listeners(
@@ -1838,6 +1888,8 @@ class Language:
nlp.add_pipe(
source_name, source=source_nlps[model], name=pipe_name
)
+                    # After this add_pipe call, the listener map
+                    # corresponds to the new pipeline.
if model not in source_nlp_vectors_hashes:
source_nlp_vectors_hashes[model] = hash(
source_nlps[model].vocab.vectors.to_bytes(
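The `_link_components()` call above rebuilds the source pipeline's listener map so that `replace_listeners` declared in a config resolves correctly. A typical config stanza that exercises this path (the source model is an example):

    [components.tagger]
    source = "en_core_web_sm"
    replace_listeners = ["model.tok2vec"]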
@@ -1892,27 +1944,6 @@ class Language:
raise ValueError(
Errors.E942.format(name="pipeline_creation", value=type(nlp))
)
- # Detect components with listeners that are not frozen consistently
- for name, proc in nlp.pipeline:
- if isinstance(proc, ty.ListenedToComponent):
- # Remove listeners not in the pipeline
- listener_names = proc.listening_components
- unused_listener_names = [
- ll for ll in listener_names if ll not in nlp.pipe_names
- ]
- for listener_name in unused_listener_names:
- for listener in proc.listener_map.get(listener_name, []):
- proc.remove_listener(listener, listener_name)
-
- for listener_name in proc.listening_components:
- # e.g. tok2vec/transformer
- # If it's a component sourced from another pipeline, we check if
- # the tok2vec listeners should be replaced with standalone tok2vec
- # models (e.g. so component can be frozen without its performance
- # degrading when other components/tok2vec are updated)
- paths = sourced.get(listener_name, {}).get("replace_listeners", [])
- if paths:
- nlp.replace_listeners(name, listener_name, paths)
return nlp
def replace_listeners(
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 8dea0d6a2..ff2e4f92e 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,11 +1,20 @@
from numpy cimport ndarray
-from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
-from .attrs cimport attr_id_t
-from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG
-
-from .structs cimport LexemeC
+from .attrs cimport (
+ ID,
+ LANG,
+ LENGTH,
+ LOWER,
+ NORM,
+ ORTH,
+ PREFIX,
+ SHAPE,
+ SUFFIX,
+ attr_id_t,
+)
from .strings cimport StringStore
+from .structs cimport LexemeC
+from .typedefs cimport attr_t, flags_t, hash_t, len_t, tag_t
from .vocab cimport Vocab
diff --git a/spacy/lexeme.pyi b/spacy/lexeme.pyi
index 9b7a6156a..9980b9fce 100644
--- a/spacy/lexeme.pyi
+++ b/spacy/lexeme.pyi
@@ -1,8 +1,7 @@
-from typing import (
- Union,
- Any,
-)
+from typing import Any, Union
+
from thinc.types import Floats1d
+
from .tokens import Doc, Span, Token
from .vocab import Vocab
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index e70feaf9a..00e2c6258 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -1,24 +1,40 @@
# cython: embedsignature=True
# Compiler crashes on memory view coercion without this. Should report bug.
+cimport numpy as np
from cython.view cimport array as cvarray
from libc.string cimport memset
-cimport numpy as np
+
np.import_array()
+import warnings
+
import numpy
from thinc.api import get_array_module
-import warnings
+from .attrs cimport (
+ IS_ALPHA,
+ IS_ASCII,
+ IS_BRACKET,
+ IS_CURRENCY,
+ IS_DIGIT,
+ IS_LEFT_PUNCT,
+ IS_LOWER,
+ IS_PUNCT,
+ IS_QUOTE,
+ IS_RIGHT_PUNCT,
+ IS_SPACE,
+ IS_STOP,
+ IS_TITLE,
+ IS_UPPER,
+ LIKE_EMAIL,
+ LIKE_NUM,
+ LIKE_URL,
+)
from .typedefs cimport attr_t, flags_t
-from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
-from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
-from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
-from .attrs cimport IS_CURRENCY
from .attrs import intify_attrs
from .errors import Errors, Warnings
-
OOV_RANK = 0xffffffffffffffff # UINT64_MAX
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
EMPTY_LEXEME.id = OOV_RANK
diff --git a/spacy/lookups.py b/spacy/lookups.py
index d7cc44fb3..1a2c44bfa 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -1,13 +1,13 @@
-from typing import Any, List, Union, Optional, Dict
+from collections import OrderedDict
from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
import srsly
from preshed.bloom import BloomFilter
-from collections import OrderedDict
from .errors import Errors
-from .util import SimpleFrozenDict, ensure_path, registry, load_language_data
from .strings import get_string_id
-
+from .util import SimpleFrozenDict, ensure_path, load_language_data, registry
UNSET = object()
diff --git a/spacy/matcher/__init__.py b/spacy/matcher/__init__.py
index a4f164847..f671f2e35 100644
--- a/spacy/matcher/__init__.py
+++ b/spacy/matcher/__init__.py
@@ -1,6 +1,6 @@
-from .matcher import Matcher
-from .phrasematcher import PhraseMatcher
from .dependencymatcher import DependencyMatcher
from .levenshtein import levenshtein
+from .matcher import Matcher
+from .phrasematcher import PhraseMatcher
__all__ = ["Matcher", "PhraseMatcher", "DependencyMatcher", "levenshtein"]
diff --git a/spacy/matcher/dependencymatcher.pyi b/spacy/matcher/dependencymatcher.pyi
index c19d3a71c..b9fbabda7 100644
--- a/spacy/matcher/dependencymatcher.pyi
+++ b/spacy/matcher/dependencymatcher.pyi
@@ -1,8 +1,9 @@
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-from .matcher import Matcher
-from ..vocab import Vocab
+
from ..tokens.doc import Doc
from ..tokens.span import Span
+from ..vocab import Vocab
+from .matcher import Matcher
class DependencyMatcher:
"""Match dependency parse tree based on pattern rules."""
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index adf96702b..a214c0668 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -1,18 +1,16 @@
# cython: infer_types=True, profile=True
-from typing import List
+import warnings
from collections import defaultdict
from itertools import product
+from typing import List
-import warnings
-
-from .matcher cimport Matcher
-from ..vocab cimport Vocab
from ..tokens.doc cimport Doc
+from ..vocab cimport Vocab
+from .matcher cimport Matcher
from ..errors import Errors, Warnings
from ..tokens import Span
-
DELIMITER = "||"
INDEX_HEAD = 1
INDEX_RELOP = 0
@@ -432,22 +430,22 @@ cdef class DependencyMatcher:
return [doc[child.i] for child in doc[node].head.children if child.i < node]
def _imm_right_child(self, doc, node):
- for child in doc[node].children:
+ for child in doc[node].rights:
if child.i == node + 1:
return [doc[child.i]]
return []
def _imm_left_child(self, doc, node):
- for child in doc[node].children:
+ for child in doc[node].lefts:
if child.i == node - 1:
return [doc[child.i]]
return []
def _right_child(self, doc, node):
- return [doc[child.i] for child in doc[node].children if child.i > node]
+ return [child for child in doc[node].rights]
def _left_child(self, doc, node):
- return [doc[child.i] for child in doc[node].children if child.i < node]
+ return [child for child in doc[node].lefts]
def _imm_right_parent(self, doc, node):
if doc[node].head.i == node + 1:
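Using `Token.rights` and `Token.lefts` makes these directional helpers self-documenting instead of filtering `Token.children` by index. The helpers back relational operators such as `>` in patterns; a sketch, assuming `en_core_web_sm` is installed:

    import spacy
    from spacy.matcher import DependencyMatcher

    nlp = spacy.load("en_core_web_sm")
    matcher = DependencyMatcher(nlp.vocab)
    pattern = [
        {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
        # ">" matches an immediate syntactic child of the anchor token
        {"LEFT_ID": "verb", "REL_OP": ">", "RIGHT_ID": "object",
         "RIGHT_ATTRS": {"DEP": "dobj"}},
    ]
    matcher.add("VERB_OBJ", [pattern])
    matches = matcher(nlp("She bought a car"))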
diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd
index 51854d562..2c82cea1d 100644
--- a/spacy/matcher/matcher.pxd
+++ b/spacy/matcher/matcher.pxd
@@ -1,11 +1,11 @@
+from cymem.cymem cimport Pool
from libc.stdint cimport int32_t
from libcpp.vector cimport vector
-from cymem.cymem cimport Pool
-from ..vocab cimport Vocab
-from ..typedefs cimport attr_t, hash_t
-from ..structs cimport TokenC
from ..lexeme cimport attr_id_t
+from ..structs cimport TokenC
+from ..typedefs cimport attr_t, hash_t
+from ..vocab cimport Vocab
cdef enum action_t:
diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi
index 48922865b..c33b534cb 100644
--- a/spacy/matcher/matcher.pyi
+++ b/spacy/matcher/matcher.pyi
@@ -1,8 +1,19 @@
-from typing import Any, List, Dict, Tuple, Optional, Callable, Union
-from typing import Iterator, Iterable, overload
+from typing import (
+ Any,
+ Callable,
+ Dict,
+ Iterable,
+ Iterator,
+ List,
+ Optional,
+ Tuple,
+ Union,
+ overload,
+)
+
from ..compat import Literal
-from ..vocab import Vocab
from ..tokens import Doc, Span
+from ..vocab import Vocab
class Matcher:
def __init__(
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index b886bd2ec..3d03f37ae 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -1,32 +1,43 @@
# cython: binding=True, infer_types=True, profile=True
-from typing import List, Iterable
+from typing import Iterable, List
-from libcpp.vector cimport vector
-from libc.stdint cimport int32_t, int8_t
-from libc.string cimport memset, memcmp
from cymem.cymem cimport Pool
+from libc.stdint cimport int8_t, int32_t
+from libc.string cimport memcmp, memset
+from libcpp.vector cimport vector
from murmurhash.mrmr cimport hash64
import re
-import srsly
import warnings
-from ..typedefs cimport attr_t
+import srsly
+
+from ..attrs cimport (
+ DEP,
+ ENT_IOB,
+ ID,
+ LEMMA,
+ MORPH,
+ NULL_ATTR,
+ ORTH,
+ POS,
+ TAG,
+ attr_id_t,
+)
from ..structs cimport TokenC
-from ..vocab cimport Vocab
from ..tokens.doc cimport Doc, get_token_attr_for_matcher
+from ..tokens.morphanalysis cimport MorphAnalysis
from ..tokens.span cimport Span
from ..tokens.token cimport Token
-from ..tokens.morphanalysis cimport MorphAnalysis
-from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
+from ..typedefs cimport attr_t
+from ..vocab cimport Vocab
-from .levenshtein import levenshtein_compare
-from ..schemas import validate_token_pattern
-from ..errors import Errors, MatchPatternError, Warnings
-from ..strings import get_string_id
from ..attrs import IDS
+from ..errors import Errors, MatchPatternError, Warnings
+from ..schemas import validate_token_pattern
+from ..strings import get_string_id
from ..util import registry
-
+from .levenshtein import levenshtein_compare
DEF PADDING = 5
diff --git a/spacy/matcher/phrasematcher.pxd b/spacy/matcher/phrasematcher.pxd
index 1bdc19012..bffc1ac97 100644
--- a/spacy/matcher/phrasematcher.pxd
+++ b/spacy/matcher/phrasematcher.pxd
@@ -1,6 +1,6 @@
-from libcpp.vector cimport vector
from cymem.cymem cimport Pool
-from preshed.maps cimport key_t, MapStruct
+from libcpp.vector cimport vector
+from preshed.maps cimport MapStruct, key_t
from ..attrs cimport attr_id_t
from ..structs cimport SpanC
diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi
index 68e3386e4..27f6ba373 100644
--- a/spacy/matcher/phrasematcher.pyi
+++ b/spacy/matcher/phrasematcher.pyi
@@ -1,12 +1,13 @@
-from typing import List, Tuple, Union, Optional, Callable, Any, Dict, overload
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, overload
+
from ..compat import Literal
-from .matcher import Matcher
-from ..vocab import Vocab
from ..tokens import Doc, Span
+from ..vocab import Vocab
+from .matcher import Matcher
class PhraseMatcher:
def __init__(
- self, vocab: Vocab, attr: Optional[Union[int, str]], validate: bool = ...
+ self, vocab: Vocab, attr: Optional[Union[int, str]] = ..., validate: bool = ...
) -> None: ...
def __reduce__(self) -> Any: ...
def __len__(self) -> int: ...
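The stub now matches the runtime signature, where `attr` defaults to `"ORTH"`:

    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.blank("en")
    matcher = PhraseMatcher(nlp.vocab)  # attr may now be omitted per the stub
    matcher.add("PRODUCT", [nlp.make_doc("spaCy")])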
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 382029872..c407cf1cc 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -1,18 +1,20 @@
# cython: infer_types=True, profile=True
from libc.stdint cimport uintptr_t
-from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
+from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
import warnings
-from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, MORPH
+from ..attrs cimport DEP, LEMMA, MORPH, ORTH, POS, TAG
+
from ..attrs import IDS
+
from ..structs cimport TokenC
-from ..tokens.token cimport Token
from ..tokens.span cimport Span
+from ..tokens.token cimport Token
from ..typedefs cimport attr_t
-from ..schemas import TokenPattern
from ..errors import Errors, Warnings
+from ..schemas import TokenPattern
cdef class PhraseMatcher:
diff --git a/spacy/ml/_character_embed.py b/spacy/ml/_character_embed.py
index e46735102..89c836144 100644
--- a/spacy/ml/_character_embed.py
+++ b/spacy/ml/_character_embed.py
@@ -1,4 +1,5 @@
from typing import List
+
from thinc.api import Model
from thinc.types import Floats2d
diff --git a/spacy/ml/callbacks.py b/spacy/ml/callbacks.py
index 3b60ec2ab..e2378a7ba 100644
--- a/spacy/ml/callbacks.py
+++ b/spacy/ml/callbacks.py
@@ -1,8 +1,8 @@
-from typing import Type, Callable, Dict, TYPE_CHECKING, List, Optional, Set
import functools
import inspect
import types
import warnings
+from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Set, Type
from thinc.layers import with_nvtx_range
from thinc.model import Model, wrap_model_recursive
diff --git a/spacy/ml/extract_ngrams.py b/spacy/ml/extract_ngrams.py
index c9c82f369..ce7c585cc 100644
--- a/spacy/ml/extract_ngrams.py
+++ b/spacy/ml/extract_ngrams.py
@@ -1,7 +1,7 @@
from thinc.api import Model
-from ..util import registry
from ..attrs import LOWER
+from ..util import registry
@registry.layers("spacy.extract_ngrams.v1")
diff --git a/spacy/ml/extract_spans.py b/spacy/ml/extract_spans.py
index d5e9bc07c..ac0f5fa1b 100644
--- a/spacy/ml/extract_spans.py
+++ b/spacy/ml/extract_spans.py
@@ -1,6 +1,7 @@
-from typing import Tuple, Callable
+from typing import Callable, List, Tuple
+
from thinc.api import Model, to_numpy
-from thinc.types import Ragged, Ints1d
+from thinc.types import Ints1d, Ragged
from ..util import registry
@@ -52,14 +53,14 @@ def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:
indices will be [5, 6, 7, 8, 8, 9].
"""
spans, lengths = _ensure_cpu(spans, lengths)
- indices = []
+ indices: List[int] = []
offset = 0
for i, length in enumerate(lengths):
spans_i = spans[i].dataXd + offset
for j in range(spans_i.shape[0]):
- indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1])) # type: ignore[call-overload, index]
+ indices.extend(range(spans_i[j, 0], spans_i[j, 1])) # type: ignore[arg-type, call-overload]
offset += length
- return ops.flatten(indices, dtype="i", ndim_if_empty=1)
+ return ops.asarray1i(indices)
def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:
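A pure-Python sketch of the index computation, matching the docstring's example above (the real code additionally offsets indices per document):

    spans = [(5, 9), (8, 10)]
    indices = [i for start, end in spans for i in range(start, end)]
    assert indices == [5, 6, 7, 8, 8, 9]  # overlapping spans repeat indices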
diff --git a/spacy/ml/featureextractor.py b/spacy/ml/featureextractor.py
index ed2918f02..06f1ff51a 100644
--- a/spacy/ml/featureextractor.py
+++ b/spacy/ml/featureextractor.py
@@ -1,6 +1,7 @@
-from typing import List, Union, Callable, Tuple
-from thinc.types import Ints2d
+from typing import Callable, List, Tuple, Union
+
from thinc.api import Model, registry
+from thinc.types import Ints2d
from ..tokens import Doc
diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py
index 9b7628f0e..5125018e5 100644
--- a/spacy/ml/models/__init__.py
+++ b/spacy/ml/models/__init__.py
@@ -1,6 +1,7 @@
from .entity_linker import * # noqa
from .multi_task import * # noqa
from .parser import * # noqa
+from .span_finder import * # noqa
from .spancat import * # noqa
from .tagger import * # noqa
from .textcat import * # noqa
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index 7332ca199..b7100c00a 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -1,16 +1,31 @@
from pathlib import Path
-from typing import Optional, Callable, Iterable, List, Tuple
-from thinc.types import Floats2d
-from thinc.api import chain, list2ragged, reduce_mean, residual
-from thinc.api import Model, Maxout, Linear, tuplify, Ragged
+from typing import Callable, Iterable, List, Optional, Tuple
+
+from thinc.api import (
+ Linear,
+ Maxout,
+ Model,
+ Ragged,
+ chain,
+ list2ragged,
+ reduce_mean,
+ residual,
+ tuplify,
+)
+from thinc.types import Floats2d
-from ...util import registry
-from ...kb import KnowledgeBase, InMemoryLookupKB
-from ...kb import Candidate, get_candidates, get_candidates_batch
-from ...vocab import Vocab
-from ...tokens import Span, Doc
-from ..extract_spans import extract_spans
from ...errors import Errors
+from ...kb import (
+ Candidate,
+ InMemoryLookupKB,
+ KnowledgeBase,
+ get_candidates,
+ get_candidates_batch,
+)
+from ...tokens import Doc, Span
+from ...util import registry
+from ...vocab import Vocab
+from ..extract_spans import extract_spans
@registry.architectures("spacy.EntityLinker.v2")
diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py
index 826fddd4f..b7faf1cd7 100644
--- a/spacy/ml/models/multi_task.py
+++ b/spacy/ml/models/multi_task.py
@@ -1,22 +1,33 @@
-from typing import Any, Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING, cast
-from thinc.types import Floats2d
-from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
-from thinc.api import MultiSoftmax, list2array
-from thinc.api import to_categorical, CosineDistance, L2Distance
-from thinc.loss import Loss
-
-from ...util import registry, OOV_RANK
-from ...errors import Errors
-from ...attrs import ID
-from ...vectors import Mode as VectorsMode
+from functools import partial
+from typing import TYPE_CHECKING, Any, Callable, Iterable, List, Optional, Tuple, cast
import numpy
-from functools import partial
+from thinc.api import (
+ CosineDistance,
+ L2Distance,
+ LayerNorm,
+ Linear,
+ Maxout,
+ Model,
+ MultiSoftmax,
+ Softmax,
+ chain,
+ list2array,
+ to_categorical,
+ zero_init,
+)
+from thinc.loss import Loss
+from thinc.types import Floats2d, Ints1d
+
+from ...attrs import ID, ORTH
+from ...errors import Errors
+from ...util import OOV_RANK, registry
+from ...vectors import Mode as VectorsMode
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
- from ...vocab import Vocab # noqa: F401
from ...tokens.doc import Doc # noqa: F401
+ from ...vocab import Vocab # noqa: F401
@registry.architectures("spacy.PretrainVectors.v1")
@@ -24,8 +35,6 @@ def create_pretrain_vectors(
maxout_pieces: int, hidden_size: int, loss: str
) -> Callable[["Vocab", Model], Model]:
def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
- if vocab.vectors.mode != VectorsMode.default:
- raise ValueError(Errors.E850.format(mode=vocab.vectors.mode))
if vocab.vectors.shape[1] == 0:
raise ValueError(Errors.E875)
model = build_cloze_multi_task_model(
@@ -70,14 +79,23 @@ def get_vectors_loss(ops, docs, prediction, distance):
"""Compute a loss based on a distance between the documents' vectors and
the prediction.
"""
- # The simplest way to implement this would be to vstack the
- # token.vector values, but that's a bit inefficient, especially on GPU.
- # Instead we fetch the index into the vectors table for each of our tokens,
- # and look them up all at once. This prevents data copying.
- ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
- target = docs[0].vocab.vectors.data[ids]
- target[ids == OOV_RANK] = 0
- d_target, loss = distance(prediction, target)
+ vocab = docs[0].vocab
+ if vocab.vectors.mode == VectorsMode.default:
+ # The simplest way to implement this would be to vstack the
+ # token.vector values, but that's a bit inefficient, especially on GPU.
+ # Instead we fetch the index into the vectors table for each of our
+ # tokens, and look them up all at once. This prevents data copying.
+ ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+ target = docs[0].vocab.vectors.data[ids]
+ target[ids == OOV_RANK] = 0
+ d_target, loss = distance(prediction, target)
+ elif vocab.vectors.mode == VectorsMode.floret:
+ keys = ops.flatten([cast(Ints1d, doc.to_array(ORTH)) for doc in docs])
+ target = vocab.vectors.get_batch(keys)
+ target = ops.as_contig(target)
+ d_target, loss = distance(prediction, target)
+ else:
+ raise ValueError(Errors.E850.format(mode=vocab.vectors.mode))
return loss, d_target
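Moving the mode check into `get_vectors_loss` lets the vectors objective work with floret vectors as well as the default mode. A pretraining config sketch; the hyperparameter values are illustrative:

    [pretraining.objective]
    @architectures = "spacy.PretrainVectors.v1"
    maxout_pieces = 3
    hidden_size = 300
    loss = "cosine"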
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index a70d84dea..f6c0e565d 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,13 +1,14 @@
-from typing import Optional, List, cast
-from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
+from typing import List, Optional, cast
+
+from thinc.api import Linear, Model, chain, list2array, use_ops, zero_init
from thinc.types import Floats2d
-from ...errors import Errors
from ...compat import Literal
+from ...errors import Errors
+from ...tokens import Doc
from ...util import registry
from .._precomputable_affine import PrecomputableAffine
from ..tb_framework import TransitionModel
-from ...tokens import Doc
@registry.architectures("spacy.TransitionBasedParser.v2")
diff --git a/spacy/ml/models/span_finder.py b/spacy/ml/models/span_finder.py
new file mode 100644
index 000000000..d327fc761
--- /dev/null
+++ b/spacy/ml/models/span_finder.py
@@ -0,0 +1,41 @@
+from typing import Callable, List, Tuple
+
+from thinc.api import Model, chain, with_array
+from thinc.types import Floats2d
+
+from ...tokens import Doc
+from ...util import registry
+
+InT = List[Doc]
+OutT = Floats2d
+
+
+@registry.architectures("spacy.SpanFinder.v1")
+def build_finder_model(
+ tok2vec: Model[InT, List[Floats2d]], scorer: Model[OutT, OutT]
+) -> Model[InT, OutT]:
+
+ logistic_layer: Model[List[Floats2d], List[Floats2d]] = with_array(scorer)
+ model: Model[InT, OutT] = chain(tok2vec, logistic_layer, flattener())
+ model.set_ref("tok2vec", tok2vec)
+ model.set_ref("scorer", scorer)
+ model.set_ref("logistic_layer", logistic_layer)
+
+ return model
+
+
+def flattener() -> Model[List[Floats2d], Floats2d]:
+ """Flattens the input to a 1-dimensional list of scores"""
+
+ def forward(
+        model: Model[List[Floats2d], Floats2d], X: List[Floats2d], is_train: bool
+ ) -> Tuple[Floats2d, Callable[[Floats2d], List[Floats2d]]]:
+ lens = model.ops.asarray1i([len(doc) for doc in X])
+ Y = model.ops.flatten(X)
+
+ def backprop(dY: Floats2d) -> List[Floats2d]:
+ return model.ops.unflatten(dY, lens)
+
+ return Y, backprop
+
+ return Model("Flattener", forward=forward)
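A hedged sketch of how the new architecture might be referenced from a training config; the required tok2vec and scorer sub-blocks are elided and would need to be filled in:

    [components.span_finder.model]
    @architectures = "spacy.SpanFinder.v1"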
diff --git a/spacy/ml/models/spancat.py b/spacy/ml/models/spancat.py
index 893db2e6d..140ec553a 100644
--- a/spacy/ml/models/spancat.py
+++ b/spacy/ml/models/spancat.py
@@ -1,11 +1,24 @@
from typing import List, Tuple, cast
-from thinc.api import Model, with_getitem, chain, list2ragged, Logistic
-from thinc.api import Maxout, Linear, concatenate, glorot_uniform_init
-from thinc.api import reduce_mean, reduce_max, reduce_first, reduce_last
-from thinc.types import Ragged, Floats2d
-from ...util import registry
+from thinc.api import (
+ Linear,
+ Logistic,
+ Maxout,
+ Model,
+ chain,
+ concatenate,
+ glorot_uniform_init,
+ list2ragged,
+ reduce_first,
+ reduce_last,
+ reduce_max,
+ reduce_mean,
+ with_getitem,
+)
+from thinc.types import Floats2d, Ragged
+
from ...tokens import Doc
+from ...util import registry
from ..extract_spans import extract_spans
diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py
index 9f8ef7b2b..8f1554fab 100644
--- a/spacy/ml/models/tagger.py
+++ b/spacy/ml/models/tagger.py
@@ -1,9 +1,10 @@
-from typing import Optional, List
-from thinc.api import zero_init, with_array, Softmax_v2, chain, Model
+from typing import List, Optional
+
+from thinc.api import Model, Softmax_v2, chain, with_array, zero_init
from thinc.types import Floats2d
-from ...util import registry
from ...tokens import Doc
+from ...util import registry
@registry.architectures("spacy.Tagger.v2")
diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index 9c7e607fe..ab14110d2 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -1,22 +1,39 @@
-from typing import Optional, List, cast
from functools import partial
+from typing import List, Optional, cast
-from thinc.types import Floats2d
-from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
-from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
-from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
-from thinc.api import with_cpu, Relu, residual, LayerNorm, resizable
+from thinc.api import (
+ Dropout,
+ LayerNorm,
+ Linear,
+ Logistic,
+ Maxout,
+ Model,
+ ParametricAttention,
+ Relu,
+ Softmax,
+ SparseLinear,
+ chain,
+ clone,
+ concatenate,
+ list2ragged,
+ reduce_mean,
+ reduce_sum,
+ residual,
+ resizable,
+ softmax_activation,
+ with_cpu,
+)
from thinc.layers.chain import init as init_chain
-from thinc.layers.resizable import resize_model, resize_linear_weighted
+from thinc.layers.resizable import resize_linear_weighted, resize_model
+from thinc.types import Floats2d
from ...attrs import ORTH
+from ...tokens import Doc
from ...util import registry
from ..extract_ngrams import extract_ngrams
from ..staticvectors import StaticVectors
-from ...tokens import Doc
from .tok2vec import get_tok2vec_width
-
NEG_VALUE = -5000
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 30c7360ff..2e9d21ef4 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -1,17 +1,32 @@
-from typing import Optional, List, Union, cast
-from thinc.types import Floats2d, Ints2d, Ragged, Ints1d
-from thinc.api import chain, clone, concatenate, with_array, with_padded
-from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
-from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
+from typing import List, Optional, Union, cast
-from ...tokens import Doc
-from ...util import registry
+from thinc.api import (
+ HashEmbed,
+ Maxout,
+ Mish,
+ Model,
+ PyTorchLSTM,
+ chain,
+ clone,
+ concatenate,
+ expand_window,
+ list2ragged,
+ noop,
+ ragged2list,
+ residual,
+ with_array,
+ with_padded,
+)
+from thinc.types import Floats2d, Ints1d, Ints2d, Ragged
+
+from ...attrs import intify_attr
from ...errors import Errors
from ...ml import _character_embed
-from ..staticvectors import StaticVectors
-from ..featureextractor import FeatureExtractor
from ...pipeline.tok2vec import Tok2VecListener
-from ...attrs import intify_attr
+from ...tokens import Doc
+from ...util import registry
+from ..featureextractor import FeatureExtractor
+from ..staticvectors import StaticVectors
@registry.architectures("spacy.Tok2VecListener.v1")
diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd
index 8def6cea5..ca31c1699 100644
--- a/spacy/ml/parser_model.pxd
+++ b/spacy/ml/parser_model.pxd
@@ -1,7 +1,8 @@
-from libc.string cimport memset, memcpy
+from libc.string cimport memcpy, memset
from thinc.backends.cblas cimport CBlas
-from ..typedefs cimport weight_t, hash_t
+
from ..pipeline._parser_internals._state cimport StateC
+from ..typedefs cimport hash_t, weight_t
cdef struct SizesC:
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
index 961bf4d70..5cffc4c2d 100644
--- a/spacy/ml/parser_model.pyx
+++ b/spacy/ml/parser_model.pyx
@@ -1,19 +1,20 @@
# cython: infer_types=True, cdivision=True, boundscheck=False
cimport numpy as np
from libc.math cimport exp
-from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc
-from thinc.backends.linalg cimport Vec, VecVec
+from libc.string cimport memcpy, memset
from thinc.backends.cblas cimport saxpy, sgemm
+from thinc.backends.linalg cimport Vec, VecVec
import numpy
import numpy.random
-from thinc.api import Model, CupyOps, NumpyOps, get_ops
+from thinc.api import CupyOps, Model, NumpyOps, get_ops
from .. import util
from ..errors import Errors
-from ..typedefs cimport weight_t, class_t, hash_t
+
from ..pipeline._parser_internals.stateclass cimport StateClass
+from ..typedefs cimport class_t, hash_t, weight_t
cdef WeightsC get_c_weights(model) except *:
diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py
index 04cfe912d..b75240c5d 100644
--- a/spacy/ml/staticvectors.py
+++ b/spacy/ml/staticvectors.py
@@ -1,11 +1,14 @@
-from typing import List, Tuple, Callable, Optional, Sequence, cast
-from thinc.initializers import glorot_uniform_init
-from thinc.util import partial
-from thinc.types import Ragged, Floats2d, Floats1d, Ints1d
-from thinc.api import Model, Ops, registry
+import warnings
+from typing import Callable, List, Optional, Sequence, Tuple, cast
+from thinc.api import Model, Ops, registry
+from thinc.initializers import glorot_uniform_init
+from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
+from thinc.util import partial
+
+from ..attrs import ORTH
+from ..errors import Errors, Warnings
from ..tokens import Doc
-from ..errors import Errors
from ..vectors import Mode
from ..vocab import Vocab
@@ -23,6 +26,8 @@ def StaticVectors(
linear projection to control the dimensionality. If a dropout rate is
specified, the dropout is applied per dimension over the whole batch.
"""
+ if key_attr != "ORTH":
+ warnings.warn(Warnings.W125, DeprecationWarning)
return Model(
"static_vectors",
forward,
@@ -39,9 +44,9 @@ def forward(
token_count = sum(len(doc) for doc in docs)
if not token_count:
return _handle_empty(model.ops, model.get_dim("nO"))
- key_attr: int = model.attrs["key_attr"]
- keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs])
vocab: Vocab = docs[0].vocab
+ key_attr: int = getattr(vocab.vectors, "attr", ORTH)
+ keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs])
W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
if vocab.vectors.mode == Mode.default:
V = model.ops.asarray(vocab.vectors.data)
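With the key attribute now read from the vectors table, passing a custom `key_attr` still works but is deprecated. A sketch, assuming the factory's usual keyword arguments:

    import warnings
    from spacy.ml.staticvectors import StaticVectors

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        StaticVectors(nO=96, nM=300, key_attr="LOWER")  # now emits W125
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)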
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
index ab4a969e2..e351ad4e5 100644
--- a/spacy/ml/tb_framework.py
+++ b/spacy/ml/tb_framework.py
@@ -1,6 +1,7 @@
from thinc.api import Model, noop
-from .parser_model import ParserStepModel
+
from ..util import registry
+from .parser_model import ParserStepModel
@registry.layers("spacy.TransitionModel.v1")
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index 8d449d065..968764b82 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -1,10 +1,10 @@
-from cymem.cymem cimport Pool
-from preshed.maps cimport PreshMap
cimport numpy as np
+from cymem.cymem cimport Pool
from libc.stdint cimport uint64_t
+from preshed.maps cimport PreshMap
-from .structs cimport MorphAnalysisC
from .strings cimport StringStore
+from .structs cimport MorphAnalysisC
from .typedefs cimport attr_t, hash_t
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index c3ffc46a1..1062fff09 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -1,12 +1,13 @@
# cython: infer_types
-import numpy
import warnings
+import numpy
+
from .attrs cimport POS
-from .parts_of_speech import IDS as POS_IDS
-from .errors import Warnings
from . import symbols
+from .errors import Warnings
+from .parts_of_speech import IDS as POS_IDS
cdef class Morphology:
diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd
index 0bf5b4789..a0b2567f1 100644
--- a/spacy/parts_of_speech.pxd
+++ b/spacy/parts_of_speech.pxd
@@ -1,5 +1,6 @@
from . cimport symbols
+
cpdef enum univ_pos_t:
NO_TAG = 0
ADJ = symbols.ADJ
diff --git a/spacy/pipe_analysis.py b/spacy/pipe_analysis.py
index 245747061..d26884487 100644
--- a/spacy/pipe_analysis.py
+++ b/spacy/pipe_analysis.py
@@ -1,8 +1,9 @@
-from typing import List, Set, Dict, Iterable, ItemsView, Union, TYPE_CHECKING
+from typing import TYPE_CHECKING, Dict, ItemsView, Iterable, List, Set, Union
+
from wasabi import msg
-from .tokens import Doc, Token, Span
from .errors import Errors
+from .tokens import Doc, Span, Token
from .util import dot_to_dict
if TYPE_CHECKING:
diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 26931606b..40e3fd638 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -2,21 +2,22 @@ from .attributeruler import AttributeRuler
from .dep_parser import DependencyParser
from .edit_tree_lemmatizer import EditTreeLemmatizer
from .entity_linker import EntityLinker
-from .ner import EntityRecognizer
from .entityruler import EntityRuler
+from .functions import merge_entities, merge_noun_chunks, merge_subtokens
from .lemmatizer import Lemmatizer
from .morphologizer import Morphologizer
+from .ner import EntityRecognizer
from .pipe import Pipe
-from .trainable_pipe import TrainablePipe
-from .senter import SentenceRecognizer
from .sentencizer import Sentencizer
+from .senter import SentenceRecognizer
+from .span_finder import SpanFinder
+from .span_ruler import SpanRuler
+from .spancat import SpanCategorizer
from .tagger import Tagger
from .textcat import TextCategorizer
-from .spancat import SpanCategorizer
-from .span_ruler import SpanRuler
from .textcat_multilabel import MultiLabel_TextCategorizer
from .tok2vec import Tok2Vec
-from .functions import merge_entities, merge_noun_chunks, merge_subtokens
+from .trainable_pipe import TrainablePipe
__all__ = [
"AttributeRuler",
@@ -31,6 +32,7 @@ __all__ = [
"SentenceRecognizer",
"Sentencizer",
"SpanCategorizer",
+ "SpanFinder",
"SpanRuler",
"Tagger",
"TextCategorizer",
diff --git a/spacy/pipeline/_edit_tree_internals/edit_trees.pxd b/spacy/pipeline/_edit_tree_internals/edit_trees.pxd
index dc4289f37..3d63af921 100644
--- a/spacy/pipeline/_edit_tree_internals/edit_trees.pxd
+++ b/spacy/pipeline/_edit_tree_internals/edit_trees.pxd
@@ -2,8 +2,9 @@ from libc.stdint cimport uint32_t, uint64_t
from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector
-from ...typedefs cimport attr_t, hash_t, len_t
from ...strings cimport StringStore
+from ...typedefs cimport attr_t, hash_t, len_t
+
cdef extern from "
- {Children.toArray(children).flat().filter(isRelevant)}
+
+
+ {Children.toArray(children).flat().filter(isRelevant)}
+