| Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable, and maintainable. Start by completing our 5-minute questionnaire to tell us what you need, and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
|
| Bespoke advice for problem solving, strategy, and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design, and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need, and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |
@@ -57,13 +58,13 @@ open-source software, released under the [MIT license](https://github.com/explos
[api reference]: https://spacy.io/api/
[models]: https://spacy.io/models
[universe]: https://spacy.io/universe
+[spaCy VS Code Extension]: https://github.com/explosion/spacy-vscode
[videos]: https://www.youtube.com/c/ExplosionAI
[online course]: https://course.spacy.io
[project templates]: https://github.com/explosion/projects
[changelog]: https://spacy.io/usage#changelog
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
-
## 💬 Where to ask questions
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
diff --git a/pyproject.toml b/pyproject.toml
index 9cd96ac2d..dcb5cf10d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,3 +9,6 @@ requires = [
"numpy>=1.15.0",
]
build-backend = "setuptools.build_meta"
+
+[tool.isort]
+profile = "black"
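This `[tool.isort]` section adopts isort with the `black` profile so the two formatters agree on import layout; nearly all of the Python hunks below are the resulting mechanical reordering. A minimal sketch of the convention the profile enforces (the module names are illustrative, not part of this diff):

```python
# Standard library first, alphabetized, with plain `import` lines
# sorted ahead of `from ... import` lines.
import sys
from pathlib import Path
from typing import Any, Dict

# Third-party distributions second.
import srsly
from wasabi import msg

# First-party imports last (shown absolute here; spaCy itself uses
# relative imports such as `from .. import util`).
from spacy import util
from spacy.util import logger, registry
```

Running `python -m isort .` from the repository root applies this ordering, which is the churn visible throughout the rest of the diff.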
diff --git a/requirements.txt b/requirements.txt
index 63e03d558..a007f495e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
-typer>=0.3.0,<0.8.0
+typer>=0.3.0,<0.10.0
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
# Third party dependencies
@@ -38,3 +38,4 @@ types-setuptools>=57.0.0
types-requests
types-setuptools>=57.0.0
black==22.3.0
+isort>=5.0,<6.0
diff --git a/setup.cfg b/setup.cfg
index eea557337..45734888f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -52,7 +52,7 @@ install_requires =
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
# Third-party dependencies
- typer>=0.3.0,<0.8.0
+ typer>=0.3.0,<0.10.0
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
tqdm>=4.38.0,<5.0.0
diff --git a/spacy/__init__.py b/spacy/__init__.py
index c3568bc5c..1a18ad0d5 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -1,6 +1,6 @@
-from typing import Union, Iterable, Dict, Any
-from pathlib import Path
import sys
+from pathlib import Path
+from typing import Any, Dict, Iterable, Union
# set library-specific custom warning handling before doing anything else
from .errors import setup_default_warnings
@@ -8,20 +8,17 @@ from .errors import setup_default_warnings
setup_default_warnings() # noqa: E402
# These are imported as part of the API
-from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401
-from thinc.api import Config
+from thinc.api import Config, prefer_gpu, require_cpu, require_gpu # noqa: F401
from . import pipeline # noqa: F401
-from .cli.info import info # noqa: F401
-from .glossary import explain # noqa: F401
-from .about import __version__ # noqa: F401
-from .util import registry, logger # noqa: F401
-
-from .errors import Errors
-from .language import Language
-from .vocab import Vocab
from . import util
-
+from .about import __version__ # noqa: F401
+from .cli.info import info # noqa: F401
+from .errors import Errors
+from .glossary import explain # noqa: F401
+from .language import Language
+from .util import logger, registry # noqa: F401
+from .vocab import Vocab
if sys.maxunicode == 65535:
raise SystemError(Errors.E130)
diff --git a/spacy/about.py b/spacy/about.py
index c6b09039e..cad6158da 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
-__version__ = "3.6.0.dev0"
+__version__ = "3.6.0"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"
diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd
index 33d5372de..6dc9ecaee 100644
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@@ -1,6 +1,7 @@
# Reserve 64 values for flag features
from . cimport symbols
+
cdef enum attr_id_t:
NULL_ATTR
IS_ALPHA
diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 868526b42..549a27616 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -1,35 +1,35 @@
from wasabi import msg
from ._util import app, setup_cli # noqa: F401
+from .apply import apply # noqa: F401
+from .assemble import assemble_cli # noqa: F401
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
# are registered automatically and won't have to be imported here.
from .benchmark_speed import benchmark_speed_cli # noqa: F401
-from .download import download # noqa: F401
-from .info import info # noqa: F401
-from .package import package # noqa: F401
-from .profile import profile # noqa: F401
-from .train import train_cli # noqa: F401
-from .assemble import assemble_cli # noqa: F401
-from .pretrain import pretrain # noqa: F401
-from .debug_data import debug_data # noqa: F401
-from .debug_config import debug_config # noqa: F401
-from .debug_model import debug_model # noqa: F401
-from .debug_diff import debug_diff # noqa: F401
-from .evaluate import evaluate # noqa: F401
-from .apply import apply # noqa: F401
from .convert import convert # noqa: F401
-from .init_pipeline import init_pipeline_cli # noqa: F401
-from .init_config import init_config, fill_config # noqa: F401
-from .validate import validate # noqa: F401
-from .project.clone import project_clone # noqa: F401
-from .project.assets import project_assets # noqa: F401
-from .project.run import project_run # noqa: F401
-from .project.dvc import project_update_dvc # noqa: F401
-from .project.push import project_push # noqa: F401
-from .project.pull import project_pull # noqa: F401
-from .project.document import project_document # noqa: F401
+from .debug_config import debug_config # noqa: F401
+from .debug_data import debug_data # noqa: F401
+from .debug_diff import debug_diff # noqa: F401
+from .debug_model import debug_model # noqa: F401
+from .download import download # noqa: F401
+from .evaluate import evaluate # noqa: F401
from .find_threshold import find_threshold # noqa: F401
+from .info import info # noqa: F401
+from .init_config import fill_config, init_config # noqa: F401
+from .init_pipeline import init_pipeline_cli # noqa: F401
+from .package import package # noqa: F401
+from .pretrain import pretrain # noqa: F401
+from .profile import profile # noqa: F401
+from .project.assets import project_assets # noqa: F401
+from .project.clone import project_clone # noqa: F401
+from .project.document import project_document # noqa: F401
+from .project.dvc import project_update_dvc # noqa: F401
+from .project.pull import project_pull # noqa: F401
+from .project.push import project_push # noqa: F401
+from .project.run import project_run # noqa: F401
+from .train import train_cli # noqa: F401
+from .validate import validate # noqa: F401
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index f104feff9..eff897316 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -1,26 +1,44 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, Iterable
-from typing import TYPE_CHECKING, overload
-import sys
-import shutil
-from pathlib import Path
-from wasabi import msg, Printer
-import srsly
import hashlib
+import os
+import shutil
+import sys
+from configparser import InterpolationError
+from contextlib import contextmanager
+from pathlib import Path
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ Dict,
+ Iterable,
+ List,
+ Optional,
+ Tuple,
+ Union,
+ overload,
+)
+
+import srsly
import typer
from click import NoSuchOption
from click.parser import split_arg_string
-from typer.main import get_command
-from contextlib import contextmanager
from thinc.api import Config, ConfigValidationError, require_gpu
from thinc.util import gpu_is_available
-from configparser import InterpolationError
-import os
+from typer.main import get_command
+from wasabi import Printer, msg
+from .. import about
from ..compat import Literal
from ..schemas import ProjectConfigSchema, validate
-from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
-from .. import about
+from ..util import (
+ ENV_VARS,
+ SimpleFrozenDict,
+ import_file,
+ is_compatible_version,
+ logger,
+ make_tempdir,
+ registry,
+ run_command,
+)
if TYPE_CHECKING:
from pathy import FluidPath # noqa: F401
diff --git a/spacy/cli/apply.py b/spacy/cli/apply.py
index f0df4e757..8c4b4c8bf 100644
--- a/spacy/cli/apply.py
+++ b/spacy/cli/apply.py
@@ -1,18 +1,15 @@
-import tqdm
-import srsly
-
from itertools import chain
from pathlib import Path
-from typing import Optional, List, Iterable, cast, Union
+from typing import Iterable, List, Optional, Union, cast
+import srsly
+import tqdm
from wasabi import msg
-from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
-
from ..tokens import Doc, DocBin
-from ..vocab import Vocab
from ..util import ensure_path, load_model
-
+from ..vocab import Vocab
+from ._util import Arg, Opt, app, import_code, setup_gpu, walk_directory
path_help = """Location of the documents to predict on.
Can be a single file in .spacy format or a .jsonl file.
diff --git a/spacy/cli/assemble.py b/spacy/cli/assemble.py
index 1cfa290a3..ee2500b27 100644
--- a/spacy/cli/assemble.py
+++ b/spacy/cli/assemble.py
@@ -1,13 +1,20 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import typer
import logging
+from pathlib import Path
+from typing import Optional
+
+import typer
+from wasabi import msg
-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code
from .. import util
from ..util import get_sourced_components, load_model_from_config
+from ._util import (
+ Arg,
+ Opt,
+ app,
+ import_code,
+ parse_config_overrides,
+ show_validation_error,
+)
@app.command(
diff --git a/spacy/cli/benchmark_speed.py b/spacy/cli/benchmark_speed.py
index 4eb20a5fa..a683d1591 100644
--- a/spacy/cli/benchmark_speed.py
+++ b/spacy/cli/benchmark_speed.py
@@ -1,11 +1,12 @@
-from typing import Iterable, List, Optional
import random
-from itertools import islice
-import numpy
-from pathlib import Path
import time
-from tqdm import tqdm
+from itertools import islice
+from pathlib import Path
+from typing import Iterable, List, Optional
+
+import numpy
import typer
+from tqdm import tqdm
from wasabi import msg
from .. import util
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 68d454b3e..a66a68133 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -1,18 +1,22 @@
-from typing import Callable, Iterable, Mapping, Optional, Any, Union
-from enum import Enum
-from pathlib import Path
-from wasabi import Printer
-import srsly
+import itertools
import re
import sys
-import itertools
+from enum import Enum
+from pathlib import Path
+from typing import Any, Callable, Iterable, Mapping, Optional, Union
+
+import srsly
+from wasabi import Printer
-from ._util import app, Arg, Opt, walk_directory
-from ..training import docs_to_json
from ..tokens import Doc, DocBin
-from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
-from ..training.converters import conllu_to_docs
-
+from ..training import docs_to_json
+from ..training.converters import (
+ conll_ner_to_docs,
+ conllu_to_docs,
+ iob_to_docs,
+ json_to_docs,
+)
+from ._util import Arg, Opt, app, walk_directory
# Converters are matched by file extension except for ner/iob, which are
# matched by file extension and content. To add a converter, add a new
diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py
index 409fac4ed..0e5382cd9 100644
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@@ -1,15 +1,22 @@
-from typing import Optional, Dict, Any, Union, List
from pathlib import Path
-from wasabi import msg, table
+from typing import Any, Dict, List, Optional, Union
+
+import typer
from thinc.api import Config
from thinc.config import VARIABLE_RE
-import typer
+from wasabi import msg, table
-from ._util import Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli
+from .. import util
from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
from ..util import registry
-from .. import util
+from ._util import (
+ Arg,
+ Opt,
+ debug_cli,
+ import_code,
+ parse_config_overrides,
+ show_validation_error,
+)
@debug_cli.command(
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 2826cd084..af3c24f3b 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1,31 +1,49 @@
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
-from typing import cast, overload
-from pathlib import Path
-from collections import Counter
-import sys
-import srsly
-from wasabi import Printer, MESSAGES, msg
-import typer
import math
-import numpy
+import sys
+from collections import Counter
+from pathlib import Path
+from typing import (
+ Any,
+ Dict,
+ Iterable,
+ List,
+ Optional,
+ Sequence,
+ Set,
+ Tuple,
+ Union,
+ cast,
+ overload,
+)
-from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli, _format_number
-from ..training import Example, remove_bilu_prefix
-from ..training.initialize import get_sourced_components
-from ..schemas import ConfigSchemaTraining
-from ..pipeline import TrainablePipe
+import numpy
+import srsly
+import typer
+from wasabi import MESSAGES, Printer, msg
+
+from .. import util
+from ..compat import Literal
+from ..language import Language
+from ..morphology import Morphology
+from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe
+from ..pipeline._edit_tree_internals.edit_trees import EditTrees
from ..pipeline._parser_internals import nonproj
from ..pipeline._parser_internals.nonproj import DELIMITER
-from ..pipeline import Morphologizer, SpanCategorizer
-from ..pipeline._edit_tree_internals.edit_trees import EditTrees
-from ..morphology import Morphology
-from ..language import Language
+from ..schemas import ConfigSchemaTraining
+from ..training import Example, remove_bilu_prefix
+from ..training.initialize import get_sourced_components
from ..util import registry, resolve_dot_names
-from ..compat import Literal
from ..vectors import Mode as VectorsMode
-from .. import util
-
+from ._util import (
+ Arg,
+ Opt,
+ _format_number,
+ app,
+ debug_cli,
+ import_code,
+ parse_config_overrides,
+ show_validation_error,
+)
# Minimum number of expected occurrences of NER label in data to train new label
NEW_LABEL_THRESHOLD = 50
@@ -212,7 +230,7 @@ def debug_data(
else:
msg.info("No word vectors present in the package")
- if "spancat" in factory_names:
+ if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
model_labels_spancat = _get_labels_from_spancat(nlp)
has_low_data_warning = False
has_no_neg_warning = False
@@ -830,7 +848,7 @@ def _compile_gold(
data["boundary_cross_ents"] += 1
elif label == "-":
data["ner"]["-"] += 1
- if "spancat" in factory_names:
+ if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
for spans_key in list(eg.reference.spans.keys()):
# Obtain the span frequency
if spans_key not in data["spancat"]:
@@ -1028,7 +1046,7 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
pipe_names = [
pipe_name
for pipe_name in nlp.pipe_names
- if nlp.get_pipe_meta(pipe_name).factory == "spancat"
+ if nlp.get_pipe_meta(pipe_name).factory in ("spancat", "spancat_singlelabel")
]
labels: Dict[str, Set[str]] = {}
for pipe_name in pipe_names:
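With this change, `debug data` treats `spancat_singlelabel` the same as `spancat` when collecting span statistics and labels. A small sketch of the factory check that `_get_labels_from_spancat` now performs, mirroring the hunk above:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("spancat_singlelabel")

# Same membership test as the updated _get_labels_from_spancat: both
# factories store their labels under a spans key on the doc.
spancat_pipes = [
    name
    for name in nlp.pipe_names
    if nlp.get_pipe_meta(name).factory in ("spancat", "spancat_singlelabel")
]
print(spancat_pipes)  # ['spancat_singlelabel']
```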
diff --git a/spacy/cli/debug_diff.py b/spacy/cli/debug_diff.py
index 6697c38ae..c53b0acab 100644
--- a/spacy/cli/debug_diff.py
+++ b/spacy/cli/debug_diff.py
@@ -1,13 +1,13 @@
+from pathlib import Path
from typing import Optional
import typer
-from wasabi import Printer, diff_strings, MarkdownRenderer
-from pathlib import Path
from thinc.api import Config
+from wasabi import MarkdownRenderer, Printer, diff_strings
-from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
from ..util import load_config
-from .init_config import init_config, Optimizations
+from ._util import Arg, Opt, debug_cli, parse_config_overrides, show_validation_error
+from .init_config import Optimizations, init_config
@debug_cli.command(
diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index 190094d81..8a0fd4889 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -1,19 +1,32 @@
-from typing import Dict, Any, Optional
-from pathlib import Path
import itertools
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+import typer
+from thinc.api import (
+ Model,
+ data_validation,
+ fix_random_seed,
+ set_dropout_rate,
+ set_gpu_allocator,
+)
+from wasabi import msg
from spacy.training import Example
from spacy.util import resolve_dot_names
-from wasabi import msg
-from thinc.api import fix_random_seed, set_dropout_rate
-from thinc.api import Model, data_validation, set_gpu_allocator
-import typer
-from ._util import Arg, Opt, debug_cli, show_validation_error
-from ._util import parse_config_overrides, string_to_list, setup_gpu
+from .. import util
from ..schemas import ConfigSchemaTraining
from ..util import registry
-from .. import util
+from ._util import (
+ Arg,
+ Opt,
+ debug_cli,
+ parse_config_overrides,
+ setup_gpu,
+ show_validation_error,
+ string_to_list,
+)
@debug_cli.command(
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index df4bca53d..de731b0fd 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -1,14 +1,14 @@
-from typing import Optional, Sequence
-import requests
import sys
-from wasabi import msg
-import typer
+from typing import Optional, Sequence
+
+import requests
+import typer
+from wasabi import msg
-from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
from .. import about
-from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version
from ..errors import OLD_MODEL_SHORTCUTS
+from ..util import get_minor_version, is_package, is_prerelease_version, run_command
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
@app.command(
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 363c02cd3..6235b658d 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -1,16 +1,16 @@
-from typing import Optional, List, Dict, Any, Union
-from wasabi import Printer
-from pathlib import Path
import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
import srsly
from thinc.api import fix_random_seed
+from wasabi import Printer
-from ..training import Corpus
-from ..tokens import Doc
-from ._util import app, Arg, Opt, setup_gpu, import_code, benchmark_cli
+from .. import displacy, util
from ..scorer import Scorer
-from .. import util
-from .. import displacy
+from ..tokens import Doc
+from ..training import Corpus
+from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu
@benchmark_cli.command(
@@ -27,6 +27,7 @@ def evaluate_cli(
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
+    per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component; only applicable when an output JSON file is specified."),
# fmt: on
):
"""
@@ -50,6 +51,7 @@ def evaluate_cli(
gold_preproc=gold_preproc,
displacy_path=displacy_path,
displacy_limit=displacy_limit,
+ per_component=per_component,
silent=False,
)
@@ -64,6 +66,7 @@ def evaluate(
displacy_limit: int = 25,
silent: bool = True,
spans_key: str = "sc",
+ per_component: bool = False,
) -> Dict[str, Any]:
msg = Printer(no_print=silent, pretty=not silent)
fix_random_seed()
@@ -78,44 +81,53 @@ def evaluate(
corpus = Corpus(data_path, gold_preproc=gold_preproc)
nlp = util.load_model(model)
dev_dataset = list(corpus(nlp))
- scores = nlp.evaluate(dev_dataset)
- metrics = {
- "TOK": "token_acc",
- "TAG": "tag_acc",
- "POS": "pos_acc",
- "MORPH": "morph_acc",
- "LEMMA": "lemma_acc",
- "UAS": "dep_uas",
- "LAS": "dep_las",
- "NER P": "ents_p",
- "NER R": "ents_r",
- "NER F": "ents_f",
- "TEXTCAT": "cats_score",
- "SENT P": "sents_p",
- "SENT R": "sents_r",
- "SENT F": "sents_f",
- "SPAN P": f"spans_{spans_key}_p",
- "SPAN R": f"spans_{spans_key}_r",
- "SPAN F": f"spans_{spans_key}_f",
- "SPEED": "speed",
- }
- results = {}
- data = {}
- for metric, key in metrics.items():
- if key in scores:
- if key == "cats_score":
- metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
- if isinstance(scores[key], (int, float)):
- if key == "speed":
- results[metric] = f"{scores[key]:.0f}"
+ scores = nlp.evaluate(dev_dataset, per_component=per_component)
+ if per_component:
+ data = scores
+ if output is None:
+ msg.warn(
+                "The per-component option is enabled but no output JSON file was provided to save the scores to."
+ )
+ else:
+ msg.info("Per-component scores will be saved to output JSON file.")
+ else:
+ metrics = {
+ "TOK": "token_acc",
+ "TAG": "tag_acc",
+ "POS": "pos_acc",
+ "MORPH": "morph_acc",
+ "LEMMA": "lemma_acc",
+ "UAS": "dep_uas",
+ "LAS": "dep_las",
+ "NER P": "ents_p",
+ "NER R": "ents_r",
+ "NER F": "ents_f",
+ "TEXTCAT": "cats_score",
+ "SENT P": "sents_p",
+ "SENT R": "sents_r",
+ "SENT F": "sents_f",
+ "SPAN P": f"spans_{spans_key}_p",
+ "SPAN R": f"spans_{spans_key}_r",
+ "SPAN F": f"spans_{spans_key}_f",
+ "SPEED": "speed",
+ }
+ results = {}
+ data = {}
+ for metric, key in metrics.items():
+ if key in scores:
+ if key == "cats_score":
+ metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
+ if isinstance(scores[key], (int, float)):
+ if key == "speed":
+ results[metric] = f"{scores[key]:.0f}"
+ else:
+ results[metric] = f"{scores[key]*100:.2f}"
else:
- results[metric] = f"{scores[key]*100:.2f}"
- else:
- results[metric] = "-"
- data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
+ results[metric] = "-"
+ data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
- msg.table(results, title="Results")
- data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
+ msg.table(results, title="Results")
+ data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
if displacy_path:
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
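The evaluate CLI gains a `--per-component` flag, and `Language.evaluate` a matching `per_component` argument: instead of the flat metrics table, scores come back keyed by component name, which is mainly useful when saved to the output JSON file. A minimal sketch of the programmatic side (the pipeline name and the single toy example are placeholders):

```python
import spacy
from spacy.training import Example

nlp = spacy.load("en_core_web_sm")  # placeholder pipeline
doc = nlp.make_doc("Apple is looking at buying a U.K. startup.")
example = Example.from_dict(doc, {"entities": [(0, 5, "ORG")]})

# New here: per_component=True returns scores keyed by component
# name instead of one flat metrics mapping.
scores = nlp.evaluate([example], per_component=True)
for component, component_scores in scores.items():
    print(component, component_scores)
```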
diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py
index 6d591053d..7aa32c0c6 100644
--- a/spacy/cli/find_threshold.py
+++ b/spacy/cli/find_threshold.py
@@ -1,17 +1,17 @@
import functools
+import logging
import operator
from pathlib import Path
-import logging
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
import numpy
import wasabi.tables
-from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer
-from ..errors import Errors
-from ..training import Corpus
-from ._util import app, Arg, Opt, import_code, setup_gpu
from .. import util
+from ..errors import Errors
+from ..pipeline import MultiLabel_TextCategorizer, TextCategorizer
+from ..training import Corpus
+from ._util import Arg, Opt, app, import_code, setup_gpu
_DEFAULTS = {
"n_trials": 11,
diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index d82bf3fbc..8bfc6b54f 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -1,15 +1,15 @@
-from typing import Optional, Dict, Any, Union, List
-import platform
import json
+import platform
from pathlib import Path
-from wasabi import Printer, MarkdownRenderer
-import srsly
+from typing import Any, Dict, List, Optional, Union
-from ._util import app, Arg, Opt, string_to_list
-from .download import get_model_filename, get_latest_version
-from .. import util
-from .. import about
+import srsly
+from wasabi import MarkdownRenderer, Printer
+
+from .. import about, util
from ..compat import importlib_metadata
+from ._util import Arg, Opt, app, string_to_list
+from .download import get_latest_version, get_model_filename
@app.command("info")
diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index b634caa4c..a7c03d00f 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -1,19 +1,26 @@
-from typing import Optional, List, Tuple
+import re
from enum import Enum
from pathlib import Path
-from wasabi import Printer, diff_strings
-from thinc.api import Config
+from typing import List, Optional, Tuple
+
import srsly
-import re
from jinja2 import Template
+from thinc.api import Config
+from wasabi import Printer, diff_strings
from .. import util
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
from ..schemas import RecommendationSchema
from ..util import SimpleFrozenList
-from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
-from ._util import string_to_list, import_code
-
+from ._util import (
+ COMMAND,
+ Arg,
+ Opt,
+ import_code,
+ init_cli,
+ show_validation_error,
+ string_to_list,
+)
ROOT = Path(__file__).parent / "templates"
TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index d53a61b8e..e0d048c69 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -1,15 +1,23 @@
-from typing import Optional
import logging
from pathlib import Path
-from wasabi import msg
-import typer
+from typing import Optional
+
import srsly
+import typer
+from wasabi import msg
from .. import util
-from ..training.initialize import init_nlp, convert_vectors
from ..language import Language
-from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
+from ..training.initialize import convert_vectors, init_nlp
+from ._util import (
+ Arg,
+ Opt,
+ import_code,
+ init_cli,
+ parse_config_overrides,
+ setup_gpu,
+ show_validation_error,
+)
@init_cli.command("vectors")
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 6351f28eb..4545578e6 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -1,18 +1,18 @@
-from typing import Optional, Union, Any, Dict, List, Tuple, cast
-import shutil
-from pathlib import Path
-from wasabi import Printer, MarkdownRenderer, get_raw_input
-from thinc.api import Config
-from collections import defaultdict
-from catalogue import RegistryError
-import srsly
-import sys
import re
+import shutil
+import sys
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
-from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
-from ..schemas import validate, ModelMetaSchema
-from .. import util
-from .. import about
+import srsly
+from catalogue import RegistryError
+from thinc.api import Config
+from wasabi import MarkdownRenderer, Printer, get_raw_input
+
+from .. import about, util
+from ..schemas import ModelMetaSchema, validate
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
@app.command("package")
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 45042e605..446c40510 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -1,13 +1,21 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import typer
import re
+from pathlib import Path
+from typing import Optional
+
+import typer
+from wasabi import msg
-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
from ..training.pretrain import pretrain
from ..util import load_config
+from ._util import (
+ Arg,
+ Opt,
+ app,
+ import_code,
+ parse_config_overrides,
+ setup_gpu,
+ show_validation_error,
+)
@app.command(
diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py
index 3c282c73d..e1f720327 100644
--- a/spacy/cli/profile.py
+++ b/spacy/cli/profile.py
@@ -1,17 +1,18 @@
-from typing import Optional, Sequence, Union, Iterator
-import tqdm
-from pathlib import Path
-import srsly
import cProfile
+import itertools
import pstats
import sys
-import itertools
-from wasabi import msg, Printer
-import typer
+from pathlib import Path
+from typing import Iterator, Optional, Sequence, Union
+
+import srsly
+import tqdm
+import typer
+from wasabi import Printer, msg
-from ._util import app, debug_cli, Arg, Opt, NAME
from ..language import Language
from ..util import load_model
+from ._util import NAME, Arg, Opt, app, debug_cli
@debug_cli.command("profile")
diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py
index 8f35b2d23..aa2705986 100644
--- a/spacy/cli/project/assets.py
+++ b/spacy/cli/project/assets.py
@@ -1,16 +1,27 @@
-from typing import Any, Dict, Optional
-from pathlib import Path
-from wasabi import msg
import os
import re
import shutil
+from pathlib import Path
+from typing import Any, Dict, Optional
+
import requests
import typer
+from wasabi import msg
from ...util import ensure_path, working_dir
-from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
-from .._util import get_checksum, download_file, git_checkout, get_git_version
-from .._util import SimpleFrozenDict, parse_config_overrides
+from .._util import (
+ PROJECT_FILE,
+ Arg,
+ Opt,
+ SimpleFrozenDict,
+ download_file,
+ get_checksum,
+ get_git_version,
+ git_checkout,
+ load_project_config,
+ parse_config_overrides,
+ project_cli,
+)
# Whether assets are extra if `extra` is not set.
EXTRA_DEFAULT = False
diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py
index 14b4ed9b5..2ee27c92a 100644
--- a/spacy/cli/project/clone.py
+++ b/spacy/cli/project/clone.py
@@ -1,13 +1,22 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import subprocess
import re
+import subprocess
+from pathlib import Path
+from typing import Optional
+
+from wasabi import msg
from ... import about
from ...util import ensure_path
-from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
-from .._util import git_checkout, get_git_version, git_repo_branch_exists
+from .._util import (
+ COMMAND,
+ PROJECT_FILE,
+ Arg,
+ Opt,
+ get_git_version,
+ git_checkout,
+ git_repo_branch_exists,
+ project_cli,
+)
DEFAULT_REPO = about.__projects__
DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
diff --git a/spacy/cli/project/document.py b/spacy/cli/project/document.py
index 1ba43a958..80107d27a 100644
--- a/spacy/cli/project/document.py
+++ b/spacy/cli/project/document.py
@@ -1,9 +1,9 @@
from pathlib import Path
-from wasabi import msg, MarkdownRenderer
+
+from wasabi import MarkdownRenderer, msg
from ...util import working_dir
-from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
-
+from .._util import PROJECT_FILE, Arg, Opt, load_project_config, project_cli
DOCS_URL = "https://spacy.io"
INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py
index a15353855..9ad55c433 100644
--- a/spacy/cli/project/dvc.py
+++ b/spacy/cli/project/dvc.py
@@ -1,15 +1,28 @@
"""This module contains helpers and subcommands for integrating spaCy projects
with Data Version Control (DVC). https://dvc.org"""
-from typing import Dict, Any, List, Optional, Iterable
import subprocess
from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional
+
from wasabi import msg
-from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli
-from .._util import Arg, Opt, NAME, COMMAND
-from ...util import working_dir, split_command, join_command, run_command
-from ...util import SimpleFrozenList
-
+from ...util import (
+ SimpleFrozenList,
+ join_command,
+ run_command,
+ split_command,
+ working_dir,
+)
+from .._util import (
+ COMMAND,
+ NAME,
+ PROJECT_FILE,
+ Arg,
+ Opt,
+ get_hash,
+ load_project_config,
+ project_cli,
+)
DVC_CONFIG = "dvc.yaml"
DVC_DIR = ".dvc"
diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py
index 8894baa50..e9be74df7 100644
--- a/spacy/cli/project/pull.py
+++ b/spacy/cli/project/pull.py
@@ -1,9 +1,9 @@
from pathlib import Path
+
from wasabi import msg
-from .remote_storage import RemoteStorage
-from .remote_storage import get_command_hash
-from .._util import project_cli, Arg, logger
-from .._util import load_project_config
+
+from .._util import Arg, load_project_config, logger, project_cli
+from .remote_storage import RemoteStorage, get_command_hash
from .run import update_lockfile
diff --git a/spacy/cli/project/push.py b/spacy/cli/project/push.py
index a8178de21..a7915e547 100644
--- a/spacy/cli/project/push.py
+++ b/spacy/cli/project/push.py
@@ -1,9 +1,9 @@
from pathlib import Path
+
from wasabi import msg
-from .remote_storage import RemoteStorage
-from .remote_storage import get_content_hash, get_command_hash
-from .._util import load_project_config
-from .._util import project_cli, Arg, logger
+
+from .._util import Arg, load_project_config, logger, project_cli
+from .remote_storage import RemoteStorage, get_command_hash, get_content_hash
@project_cli.command("push")
diff --git a/spacy/cli/project/remote_storage.py b/spacy/cli/project/remote_storage.py
index 076541580..84235a90d 100644
--- a/spacy/cli/project/remote_storage.py
+++ b/spacy/cli/project/remote_storage.py
@@ -1,18 +1,25 @@
-from typing import Optional, List, Dict, TYPE_CHECKING
+import hashlib
import os
import site
-import hashlib
-import urllib.parse
import tarfile
+import urllib.parse
from pathlib import Path
+from typing import TYPE_CHECKING, Dict, List, Optional
+
from wasabi import msg
-from .._util import get_hash, get_checksum, upload_file, download_file
-from .._util import ensure_pathy, make_tempdir
-from ...util import get_minor_version, ENV_VARS, check_bool_env_var
-from ...git_info import GIT_VERSION
from ... import about
from ...errors import Errors
+from ...git_info import GIT_VERSION
+from ...util import ENV_VARS, check_bool_env_var, get_minor_version
+from .._util import (
+ download_file,
+ ensure_pathy,
+ get_checksum,
+ get_hash,
+ make_tempdir,
+ upload_file,
+)
if TYPE_CHECKING:
from pathy import FluidPath # noqa: F401
diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py
index 0f4858a99..43972a202 100644
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@@ -1,20 +1,39 @@
-from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
import os.path
-from pathlib import Path
-
-from wasabi import msg
-from wasabi.util import locale_escape
import sys
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
+
import srsly
import typer
+from wasabi import msg
+from wasabi.util import locale_escape
from ... import about
from ...git_info import GIT_VERSION
-from ...util import working_dir, run_command, split_command, is_cwd, join_command
-from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
-from ...util import check_bool_env_var, SimpleFrozenDict
-from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
-from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides
+from ...util import (
+ ENV_VARS,
+ SimpleFrozenDict,
+ SimpleFrozenList,
+ check_bool_env_var,
+ is_cwd,
+ is_minor_version_match,
+ join_command,
+ run_command,
+ split_command,
+ working_dir,
+)
+from .._util import (
+ COMMAND,
+ PROJECT_FILE,
+ PROJECT_LOCK,
+ Arg,
+ Opt,
+ get_checksum,
+ get_hash,
+ load_project_config,
+ parse_config_overrides,
+ project_cli,
+)
@project_cli.command(
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 9481e53be..e3ca73cfb 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #}
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
-{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
+{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
[paths]
train = null
dev = null
@@ -28,7 +28,7 @@ lang = "{{ lang }}"
tok2vec/transformer. #}
{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
-{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
+{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "span_finder" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
{%- else -%}
{%- set full_pipeline = components -%}
@@ -127,6 +127,30 @@ grad_factor = 1.0
@layers = "reduce_mean.v1"
{% endif -%}
+{% if "span_finder" in components -%}
+[components.span_finder]
+factory = "span_finder"
+max_length = null
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.span_finder.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{% endif -%}
+
{% if "spancat" in components -%}
[components.spancat]
factory = "spancat"
@@ -392,6 +416,27 @@ nO = null
width = ${components.tok2vec.model.encode.width}
{% endif %}
+{% if "span_finder" in components %}
+[components.span_finder]
+factory = "span_finder"
+max_length = null
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+{% endif %}
+
{% if "spancat" in components %}
[components.spancat]
factory = "spancat"
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index cc22cbba6..8bdabd39c 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -1,15 +1,23 @@
-from typing import Optional, Dict, Any, Union
-from pathlib import Path
-from wasabi import msg
-import typer
import logging
import sys
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+import typer
+from wasabi import msg
-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
-from ..training.loop import train as train_nlp
-from ..training.initialize import init_nlp
from .. import util
+from ..training.initialize import init_nlp
+from ..training.loop import train as train_nlp
+from ._util import (
+ Arg,
+ Opt,
+ app,
+ import_code,
+ parse_config_overrides,
+ setup_gpu,
+ show_validation_error,
+)
@app.command(
diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py
index a918e9a39..0426f05fd 100644
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@@ -1,14 +1,21 @@
-from typing import Tuple
-from pathlib import Path
import sys
-import requests
-from wasabi import msg, Printer
import warnings
+from pathlib import Path
+from typing import Tuple
+
+import requests
+from wasabi import Printer, msg
-from ._util import app
from .. import about
-from ..util import get_package_version, get_installed_models, get_minor_version
-from ..util import get_package_path, get_model_meta, is_compatible_version
+from ..util import (
+ get_installed_models,
+ get_minor_version,
+ get_model_meta,
+ get_package_path,
+ get_package_version,
+ is_compatible_version,
+)
+from ._util import app
@app.command("validate")
diff --git a/spacy/compat.py b/spacy/compat.py
index 89132735d..522fa30dd 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -1,5 +1,6 @@
"""Helpers for Python and platform compatibility."""
import sys
+
from thinc.util import copy_array
try:
diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py
index f42dad0c9..bde2d04fe 100644
--- a/spacy/displacy/__init__.py
+++ b/spacy/displacy/__init__.py
@@ -4,15 +4,13 @@ spaCy's built in visualization suite for dependencies and named entities.
DOCS: https://spacy.io/api/top-level#displacy
USAGE: https://spacy.io/usage/visualizers
"""
-from typing import Union, Iterable, Optional, Dict, Any, Callable
import warnings
+from typing import Any, Callable, Dict, Iterable, Optional, Union
-from .render import DependencyRenderer, EntityRenderer, SpanRenderer
-from ..tokens import Doc, Span
from ..errors import Errors, Warnings
-from ..util import is_in_jupyter
-from ..util import find_available_port
-
+from ..tokens import Doc, Span
+from ..util import find_available_port, is_in_jupyter
+from .render import DependencyRenderer, EntityRenderer, SpanRenderer
_html = {}
RENDER_WRAPPER = None
@@ -68,7 +66,7 @@ def render(
if jupyter or (jupyter is None and is_in_jupyter()):
# return HTML rendered by IPython display()
# See #4840 for details on span wrapper to disable mathjax
- from IPython.core.display import display, HTML
+ from IPython.core.display import HTML, display
return display(HTML('{}'.format(html)))
return html
diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py
index f74222dc2..86869e3b8 100644
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@@ -1,15 +1,29 @@
-from typing import Any, Dict, List, Optional, Tuple, Union
-import uuid
import itertools
+import uuid
+from typing import Any, Dict, List, Optional, Tuple, Union
from ..errors import Errors
from ..util import escape_html, minify_html, registry
-from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS
-from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS
-from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN
-from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL
-from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS
-from .templates import TPL_TITLE
+from .templates import (
+ TPL_DEP_ARCS,
+ TPL_DEP_SVG,
+ TPL_DEP_WORDS,
+ TPL_DEP_WORDS_LEMMA,
+ TPL_ENT,
+ TPL_ENT_RTL,
+ TPL_ENTS,
+ TPL_FIGURE,
+ TPL_KB_LINK,
+ TPL_PAGE,
+ TPL_SPAN,
+ TPL_SPAN_RTL,
+ TPL_SPAN_SLICE,
+ TPL_SPAN_SLICE_RTL,
+ TPL_SPAN_START,
+ TPL_SPAN_START_RTL,
+ TPL_SPANS,
+ TPL_TITLE,
+)
DEFAULT_LANG = "en"
DEFAULT_DIR = "ltr"
diff --git a/spacy/errors.py b/spacy/errors.py
index 40cfa8d92..a95f0c8a2 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -1,4 +1,5 @@
import warnings
+
from .compat import Literal
@@ -738,8 +739,8 @@ class Errors(metaclass=ErrorsWithCodes):
"model from a shortcut, which is obsolete as of spaCy v3.0. To "
"load the model, use its full name instead:\n\n"
"nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
- "models, see the models directory: https://spacy.io/models. If you "
- "want to create a blank model, use spacy.blank: "
+ "models, see the models directory: https://spacy.io/models and if "
+ "you want to create a blank model, use spacy.blank: "
"nlp = spacy.blank(\"{name}\")")
E942 = ("Executing `after_{name}` callback failed. Expected the function to "
"return an initialized nlp object but got: {value}. Maybe "
@@ -970,6 +971,13 @@ class Errors(metaclass=ErrorsWithCodes):
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
"or use `auto_select_port=True` to pick an available port automatically.")
E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
+ E1052 = ("Unable to copy spans: the character offsets for the span at "
+ "index {i} in the span group do not align with the tokenization "
+ "in the target doc.")
+ E1053 = ("Both 'min_length' and 'max_length' should be larger than 0, but found"
+ " 'min_length': {min_length}, 'max_length': {max_length}")
+ E1054 = ("The text, including whitespace, must match between reference and "
+ "predicted docs when training {component}.")
# Deprecated model shortcuts, only used in errors and warnings
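Of the new codes, E1052 covers copying a span group into a doc whose tokenization does not line up, E1053 guards the `span_finder` length bounds introduced above, and E1054 enforces matching text between reference and predicted docs during training. A hypothetical trigger for E1053, assuming the bounds are validated when the component is created and surface as a `ValueError`:

```python
import spacy

nlp = spacy.blank("en")
try:
    # Both bounds must be larger than 0 when set; min_length=0 is
    # expected to raise E1053 (hypothetical trigger, assumed exception type).
    nlp.add_pipe("span_finder", config={"min_length": 0, "max_length": 10})
except ValueError as err:
    print(err)  # [E1053] Both 'min_length' and 'max_length' should be ...
```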
diff --git a/spacy/glossary.py b/spacy/glossary.py
index d2240fbba..1f628698b 100644
--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@@ -1,4 +1,5 @@
import warnings
+
from .errors import Warnings
diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py
index 1d70a9b34..3ce3e4c33 100644
--- a/spacy/kb/__init__.py
+++ b/spacy/kb/__init__.py
@@ -1,3 +1,3 @@
+from .candidate import Candidate, get_candidates, get_candidates_batch
from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB
-from .candidate import Candidate, get_candidates, get_candidates_batch
diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd
index 942ce9dd0..9fc4c4e9d 100644
--- a/spacy/kb/candidate.pxd
+++ b/spacy/kb/candidate.pxd
@@ -1,6 +1,8 @@
-from .kb cimport KnowledgeBase
from libcpp.vector cimport vector
+
from ..typedefs cimport hash_t
+from .kb cimport KnowledgeBase
+
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
cdef class Candidate:
diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx
index c89efeb03..4cd734f43 100644
--- a/spacy/kb/candidate.pyx
+++ b/spacy/kb/candidate.pyx
@@ -1,9 +1,12 @@
# cython: infer_types=True, profile=True
from typing import Iterable
+
from .kb cimport KnowledgeBase
+
from ..tokens import Span
+
cdef class Candidate:
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
diff --git a/spacy/kb/kb.pxd b/spacy/kb/kb.pxd
index 1adeef8ae..263469546 100644
--- a/spacy/kb/kb.pxd
+++ b/spacy/kb/kb.pxd
@@ -2,8 +2,10 @@
from cymem.cymem cimport Pool
from libc.stdint cimport int64_t
+
from ..vocab cimport Vocab
+
cdef class KnowledgeBase:
cdef Pool mem
cdef readonly Vocab vocab
diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx
index ce4bc0138..a88e18e1f 100644
--- a/spacy/kb/kb.pyx
+++ b/spacy/kb/kb.pyx
@@ -2,12 +2,13 @@
from pathlib import Path
from typing import Iterable, Tuple, Union
+
from cymem.cymem cimport Pool
-from .candidate import Candidate
+from ..errors import Errors
from ..tokens import Span
from ..util import SimpleFrozenList
-from ..errors import Errors
+from .candidate import Candidate
cdef class KnowledgeBase:
diff --git a/spacy/kb/kb_in_memory.pxd b/spacy/kb/kb_in_memory.pxd
index 825a6bde9..08ec6b2a3 100644
--- a/spacy/kb/kb_in_memory.pxd
+++ b/spacy/kb/kb_in_memory.pxd
@@ -1,11 +1,11 @@
"""Knowledge-base for entity or concept linking."""
-from preshed.maps cimport PreshMap
-from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE
+from libcpp.vector cimport vector
+from preshed.maps cimport PreshMap
+from ..structs cimport AliasC, KBEntryC
from ..typedefs cimport hash_t
-from ..structs cimport KBEntryC, AliasC
from .kb cimport KnowledgeBase
ctypedef vector[KBEntryC] entry_vec
diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx
index 2a74d047b..e991f7720 100644
--- a/spacy/kb/kb_in_memory.pyx
+++ b/spacy/kb/kb_in_memory.pyx
@@ -1,23 +1,28 @@
# cython: infer_types=True, profile=True
-from typing import Iterable, Callable, Dict, Any, Union
+from typing import Any, Callable, Dict, Iterable, Union
import srsly
-from preshed.maps cimport PreshMap
-from cpython.exc cimport PyErr_SetFromErrno
-from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
-from libc.stdint cimport int32_t, int64_t
-from libcpp.vector cimport vector
-from pathlib import Path
+from cpython.exc cimport PyErr_SetFromErrno
+from libc.stdint cimport int32_t, int64_t
+from libc.stdio cimport fclose, feof, fopen, fread, fseek, fwrite
+from libcpp.vector cimport vector
+from preshed.maps cimport PreshMap
+
import warnings
+from pathlib import Path
from ..tokens import Span
+
from ..typedefs cimport hash_t
-from ..errors import Errors, Warnings
+
from .. import util
+from ..errors import Errors, Warnings
from ..util import SimpleFrozenList, ensure_path
+
from ..vocab cimport Vocab
from .kb cimport KnowledgeBase
+
from .candidate import Candidate as Candidate
diff --git a/spacy/lang/af/__init__.py b/spacy/lang/af/__init__.py
index 553fcbf4c..8bd73c7ad 100644
--- a/spacy/lang/af/__init__.py
+++ b/spacy/lang/af/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class AfrikaansDefaults(BaseDefaults):
diff --git a/spacy/lang/am/__init__.py b/spacy/lang/am/__init__.py
index ddae556d6..284823eaa 100644
--- a/spacy/lang/am/__init__.py
+++ b/spacy/lang/am/__init__.py
@@ -1,12 +1,11 @@
-from .stop_words import STOP_WORDS
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
-
+from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc
class AmharicDefaults(BaseDefaults):
diff --git a/spacy/lang/am/punctuation.py b/spacy/lang/am/punctuation.py
index 555a179fa..87447b054 100644
--- a/spacy/lang/am/punctuation.py
+++ b/spacy/lang/am/punctuation.py
@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA_UPPER,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ UNITS,
+)
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
diff --git a/spacy/lang/am/tokenizer_exceptions.py b/spacy/lang/am/tokenizer_exceptions.py
index 9472fe918..1ccf996ca 100644
--- a/spacy/lang/am/tokenizer_exceptions.py
+++ b/spacy/lang/am/tokenizer_exceptions.py
@@ -1,5 +1,4 @@
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH
_exc = {}
diff --git a/spacy/lang/ar/__init__.py b/spacy/lang/ar/__init__.py
index 18c1f90ed..d50b0722c 100644
--- a/spacy/lang/ar/__init__.py
+++ b/spacy/lang/ar/__init__.py
@@ -1,8 +1,8 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults
class ArabicDefaults(BaseDefaults):
diff --git a/spacy/lang/ar/punctuation.py b/spacy/lang/ar/punctuation.py
index f30204c02..cf03fc68e 100644
--- a/spacy/lang/ar/punctuation.py
+++ b/spacy/lang/ar/punctuation.py
@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA_UPPER,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ UNITS,
+)
_suffixes = (
LIST_PUNCT
diff --git a/spacy/lang/ar/tokenizer_exceptions.py b/spacy/lang/ar/tokenizer_exceptions.py
index 7c385bef8..eb16876f5 100644
--- a/spacy/lang/ar/tokenizer_exceptions.py
+++ b/spacy/lang/ar/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/az/__init__.py b/spacy/lang/az/__init__.py
index 476898364..32949aa3e 100644
--- a/spacy/lang/az/__init__.py
+++ b/spacy/lang/az/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class AzerbaijaniDefaults(BaseDefaults):
diff --git a/spacy/lang/az/lex_attrs.py b/spacy/lang/az/lex_attrs.py
index 73a5e2762..96fb7f020 100644
--- a/spacy/lang/az/lex_attrs.py
+++ b/spacy/lang/az/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
# Eleven, twelve etc. are written separate: on bir, on iki
_num_words = [
diff --git a/spacy/lang/bg/__init__.py b/spacy/lang/bg/__init__.py
index c9176b946..acca63ba1 100644
--- a/spacy/lang/bg/__init__.py
+++ b/spacy/lang/bg/__init__.py
@@ -1,12 +1,14 @@
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...util import update_exc
+from ..punctuation import (
+ COMBINING_DIACRITICS_TOKENIZER_INFIXES,
+ COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
+)
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc
class BulgarianDefaults(BaseDefaults):
diff --git a/spacy/lang/bg/lex_attrs.py b/spacy/lang/bg/lex_attrs.py
index bba3c74cd..0b7942aec 100644
--- a/spacy/lang/bg/lex_attrs.py
+++ b/spacy/lang/bg/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"нула",
"едно",
diff --git a/spacy/lang/bg/tokenizer_exceptions.py b/spacy/lang/bg/tokenizer_exceptions.py
index 0f484b778..89d466daf 100644
--- a/spacy/lang/bg/tokenizer_exceptions.py
+++ b/spacy/lang/bg/tokenizer_exceptions.py
@@ -4,8 +4,7 @@ References:
(countries, occupations, fields of studies and more).
"""
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH
_exc = {}
diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py
index 6d0331e00..6a5d37f5b 100644
--- a/spacy/lang/bn/__init__.py
+++ b/spacy/lang/bn/__init__.py
@@ -1,10 +1,12 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
-from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
+
+from ...language import BaseDefaults, Language
from ...pipeline import Lemmatizer
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class BengaliDefaults(BaseDefaults):
diff --git a/spacy/lang/bn/punctuation.py b/spacy/lang/bn/punctuation.py
index becfe8d2a..ddb91cef1 100644
--- a/spacy/lang/bn/punctuation.py
+++ b/spacy/lang/bn/punctuation.py
@@ -1,6 +1,14 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ UNITS,
+)
_currency = r"\$¢£€¥฿৳"
_quotes = CONCAT_QUOTES.replace("'", "")
diff --git a/spacy/lang/bn/tokenizer_exceptions.py b/spacy/lang/bn/tokenizer_exceptions.py
index e666522b8..016bf0fc5 100644
--- a/spacy/lang/bn/tokenizer_exceptions.py
+++ b/spacy/lang/bn/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py
index a3def660d..8b2f3e85a 100755
--- a/spacy/lang/ca/__init__.py
+++ b/spacy/lang/ca/__init__.py
@@ -1,14 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from ...language import BaseDefaults, Language
from .lemmatizer import CatalanLemmatizer
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class CatalanDefaults(BaseDefaults):
diff --git a/spacy/lang/ca/lex_attrs.py b/spacy/lang/ca/lex_attrs.py
index be8b7a6ea..3e99da0e0 100644
--- a/spacy/lang/ca/lex_attrs.py
+++ b/spacy/lang/ca/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"zero",
"un",
diff --git a/spacy/lang/ca/punctuation.py b/spacy/lang/ca/punctuation.py
index 8e2f09828..6914f67a7 100755
--- a/spacy/lang/ca/punctuation.py
+++ b/spacy/lang/ca/punctuation.py
@@ -1,9 +1,18 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import LIST_CURRENCY
-from ..char_classes import CURRENCY
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
-from ..char_classes import merge_chars, _units
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ PUNCT,
+ _units,
+ merge_chars,
+)
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
diff --git a/spacy/lang/ca/syntax_iterators.py b/spacy/lang/ca/syntax_iterators.py
index 917e07c93..16a4c6a81 100644
--- a/spacy/lang/ca/syntax_iterators.py
+++ b/spacy/lang/ca/syntax_iterators.py
@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN
+from typing import Iterator, Tuple, Union
+
from ...errors import Errors
+from ...symbols import NOUN, PROPN
+from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
diff --git a/spacy/lang/ca/tokenizer_exceptions.py b/spacy/lang/ca/tokenizer_exceptions.py
index b261b3498..67165780e 100755
--- a/spacy/lang/ca/tokenizer_exceptions.py
+++ b/spacy/lang/ca/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/cs/__init__.py b/spacy/lang/cs/__init__.py
index 3e70e4078..9ea60afdf 100644
--- a/spacy/lang/cs/__init__.py
+++ b/spacy/lang/cs/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class CzechDefaults(BaseDefaults):
diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py
index e148a7b4f..372f372dd 100644
--- a/spacy/lang/da/__init__.py
+++ b/spacy/lang/da/__init__.py
@@ -1,9 +1,9 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from ...language import BaseDefaults, Language
+from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class DanishDefaults(BaseDefaults):
diff --git a/spacy/lang/da/lex_attrs.py b/spacy/lang/da/lex_attrs.py
index 403af686c..8e0420912 100644
--- a/spacy/lang/da/lex_attrs.py
+++ b/spacy/lang/da/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
# Source http://fjern-uv.dk/tal.php
_num_words = """nul
en et to tre fire fem seks syv otte ni ti
diff --git a/spacy/lang/da/punctuation.py b/spacy/lang/da/punctuation.py
index e050ab7aa..f70fe3d64 100644
--- a/spacy/lang/da/punctuation.py
+++ b/spacy/lang/da/punctuation.py
@@ -1,8 +1,13 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
from ..punctuation import TOKENIZER_SUFFIXES
-
_quotes = CONCAT_QUOTES.replace("'", "")
_infixes = (
diff --git a/spacy/lang/da/syntax_iterators.py b/spacy/lang/da/syntax_iterators.py
index a0b70f004..60224f0b1 100644
--- a/spacy/lang/da/syntax_iterators.py
+++ b/spacy/lang/da/syntax_iterators.py
@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN, PRON, VERB, AUX
+from typing import Iterator, Tuple, Union
+
from ...errors import Errors
+from ...symbols import AUX, NOUN, PRON, PROPN, VERB
+from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py
index ce25c546b..649d12022 100644
--- a/spacy/lang/da/tokenizer_exceptions.py
+++ b/spacy/lang/da/tokenizer_exceptions.py
@@ -2,10 +2,9 @@
Tokenizer Exceptions.
Source: https://forkortelse.dk/ and various others.
"""
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index 65863c098..4f45b2357 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -1,8 +1,8 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class GermanDefaults(BaseDefaults):
diff --git a/spacy/lang/de/punctuation.py b/spacy/lang/de/punctuation.py
index 69d402237..862207649 100644
--- a/spacy/lang/de/punctuation.py
+++ b/spacy/lang/de/punctuation.py
@@ -1,9 +1,18 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
-from ..char_classes import CURRENCY, UNITS, PUNCT
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ PUNCT,
+ UNITS,
+)
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
-
_prefixes = ["``"] + BASE_TOKENIZER_PREFIXES
_suffixes = (
diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py
index e80504998..544fe299c 100644
--- a/spacy/lang/de/syntax_iterators.py
+++ b/spacy/lang/de/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py
index 21d99cffe..3f1aeeccd 100644
--- a/spacy/lang/de/tokenizer_exceptions.py
+++ b/spacy/lang/de/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {
"auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}],
diff --git a/spacy/lang/dsb/__init__.py b/spacy/lang/dsb/__init__.py
index c66092a0c..096eced19 100644
--- a/spacy/lang/dsb/__init__.py
+++ b/spacy/lang/dsb/__init__.py
@@ -1,6 +1,6 @@
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class LowerSorbianDefaults(BaseDefaults):
diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py
index 53dd9be8e..00e52bd97 100644
--- a/spacy/lang/el/__init__.py
+++ b/spacy/lang/el/__init__.py
@@ -1,13 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
from .lemmatizer import GreekLemmatizer
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class GreekDefaults(BaseDefaults):
diff --git a/spacy/lang/el/get_pos_from_wiktionary.py b/spacy/lang/el/get_pos_from_wiktionary.py
index 369973cc0..10b54d112 100644
--- a/spacy/lang/el/get_pos_from_wiktionary.py
+++ b/spacy/lang/el/get_pos_from_wiktionary.py
@@ -1,5 +1,6 @@
def get_pos_from_wiktionary():
import re
+
from gensim.corpora.wikicorpus import extract_pages
regex = re.compile(r"==={{(\w+)\|el}}===")
diff --git a/spacy/lang/el/punctuation.py b/spacy/lang/el/punctuation.py
index 2d5690407..b8b717bac 100644
--- a/spacy/lang/el/punctuation.py
+++ b/spacy/lang/el/punctuation.py
@@ -1,6 +1,16 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
-from ..char_classes import CONCAT_QUOTES, CURRENCY
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ HYPHENS,
+ LIST_CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+)
_units = (
"km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py
index 18fa46695..31c7dccf7 100644
--- a/spacy/lang/el/syntax_iterators.py
+++ b/spacy/lang/el/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/el/tokenizer_exceptions.py b/spacy/lang/el/tokenizer_exceptions.py
index 0a36d5d2b..41317ba97 100644
--- a/spacy/lang/el/tokenizer_exceptions.py
+++ b/spacy/lang/el/tokenizer_exceptions.py
@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index 876186979..c4bcfb938 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -1,13 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from .punctuation import TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
from .lemmatizer import EnglishLemmatizer
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class EnglishDefaults(BaseDefaults):
diff --git a/spacy/lang/en/punctuation.py b/spacy/lang/en/punctuation.py
index 5d3eb792e..775c6b001 100644
--- a/spacy/lang/en/punctuation.py
+++ b/spacy/lang/en/punctuation.py
@@ -1,5 +1,12 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
_infixes = (
LIST_ELLIPSES
diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py
index 7904e5621..140ae0a5c 100644
--- a/spacy/lang/en/syntax_iterators.py
+++ b/spacy/lang/en/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py
index 7886e28cb..dd3650c18 100644
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@@ -1,8 +1,8 @@
from typing import Dict, List
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
-from ...util import update_exc
+from ...symbols import NORM, ORTH
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc: Dict[str, List[Dict]] = {}
_exclude = [
diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py
index e75955202..bcaed8672 100644
--- a/spacy/lang/es/__init__.py
+++ b/spacy/lang/es/__init__.py
@@ -1,12 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
+
+from ...language import BaseDefaults, Language
from .lemmatizer import SpanishLemmatizer
-from .syntax_iterators import SYNTAX_ITERATORS
+from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class SpanishDefaults(BaseDefaults):
diff --git a/spacy/lang/es/lemmatizer.py b/spacy/lang/es/lemmatizer.py
index ca5fc08c8..44f968347 100644
--- a/spacy/lang/es/lemmatizer.py
+++ b/spacy/lang/es/lemmatizer.py
@@ -1,5 +1,5 @@
-from typing import List, Optional, Tuple
import re
+from typing import List, Optional, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
diff --git a/spacy/lang/es/lex_attrs.py b/spacy/lang/es/lex_attrs.py
index 9d1fa93b8..4c477eaee 100644
--- a/spacy/lang/es/lex_attrs.py
+++ b/spacy/lang/es/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"cero",
"uno",
diff --git a/spacy/lang/es/punctuation.py b/spacy/lang/es/punctuation.py
index e9552371e..3d20518cd 100644
--- a/spacy/lang/es/punctuation.py
+++ b/spacy/lang/es/punctuation.py
@@ -1,8 +1,17 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
-from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
-from ..char_classes import merge_chars
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ LIST_UNITS,
+ PUNCT,
+ merge_chars,
+)
_list_units = [u for u in LIST_UNITS if u != "%"]
_units = merge_chars(" ".join(_list_units))
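
The `merge_chars` helper used here is char_classes' small utility for turning a space-separated list of alternatives into a regex alternation. A quick illustration, assuming its usual definition (strip, then replace spaces with `|`):

```python
from spacy.lang.char_classes import merge_chars

units = "km cm mm"
print(merge_chars(units))  # -> "km|cm|mm", ready to embed in a suffix regex
```
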
diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py
index f2ca2a678..96df444a3 100644
--- a/spacy/lang/es/syntax_iterators.py
+++ b/spacy/lang/es/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py
index 74cdc143d..2ea0ed8b7 100644
--- a/spacy/lang/es/tokenizer_exceptions.py
+++ b/spacy/lang/es/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {
"pal": [{ORTH: "pa"}, {ORTH: "l", NORM: "el"}],
diff --git a/spacy/lang/et/__init__.py b/spacy/lang/et/__init__.py
index 274bc1309..9ec7e6006 100644
--- a/spacy/lang/et/__init__.py
+++ b/spacy/lang/et/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class EstonianDefaults(BaseDefaults):
diff --git a/spacy/lang/eu/__init__.py b/spacy/lang/eu/__init__.py
index 3346468bd..81f9c4a18 100644
--- a/spacy/lang/eu/__init__.py
+++ b/spacy/lang/eu/__init__.py
@@ -1,7 +1,7 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class BasqueDefaults(BaseDefaults):
diff --git a/spacy/lang/eu/punctuation.py b/spacy/lang/eu/punctuation.py
index 5d35d0a25..382bfc75c 100644
--- a/spacy/lang/eu/punctuation.py
+++ b/spacy/lang/eu/punctuation.py
@@ -1,4 +1,3 @@
from ..punctuation import TOKENIZER_SUFFIXES
-
_suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py
index 914e4c27d..e5baa8b4a 100644
--- a/spacy/lang/fa/__init__.py
+++ b/spacy/lang/fa/__init__.py
@@ -1,12 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_SUFFIXES
-from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+
+from ...language import BaseDefaults, Language
from ...pipeline import Lemmatizer
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class PersianDefaults(BaseDefaults):
diff --git a/spacy/lang/fa/lex_attrs.py b/spacy/lang/fa/lex_attrs.py
index 99b8e2787..065e81bd6 100644
--- a/spacy/lang/fa/lex_attrs.py
+++ b/spacy/lang/fa/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
MIM = "م"
ZWNJ_O_MIM = "ام"
YE_NUN = "ین"
diff --git a/spacy/lang/fa/punctuation.py b/spacy/lang/fa/punctuation.py
index 4b258c13d..c1ee570ce 100644
--- a/spacy/lang/fa/punctuation.py
+++ b/spacy/lang/fa/punctuation.py
@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA_UPPER,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ UNITS,
+)
_suffixes = (
LIST_PUNCT
diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py
index 8207884b0..3052369a7 100644
--- a/spacy/lang/fa/syntax_iterators.py
+++ b/spacy/lang/fa/syntax_iterators.py
@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN, PRON
+from typing import Iterator, Tuple, Union
+
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
+from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
diff --git a/spacy/lang/fa/tokenizer_exceptions.py b/spacy/lang/fa/tokenizer_exceptions.py
index 30df798ab..3b31b7f67 100644
--- a/spacy/lang/fa/tokenizer_exceptions.py
+++ b/spacy/lang/fa/tokenizer_exceptions.py
@@ -1,5 +1,4 @@
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH
TOKENIZER_EXCEPTIONS = {
".ق ": [{ORTH: ".ق "}],
diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py
index c3a0cf451..3e371b9b5 100644
--- a/spacy/lang/fi/__init__.py
+++ b/spacy/lang/fi/__init__.py
@@ -1,9 +1,9 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class FinnishDefaults(BaseDefaults):
diff --git a/spacy/lang/fi/lex_attrs.py b/spacy/lang/fi/lex_attrs.py
index 4d500cead..9eec41b3d 100644
--- a/spacy/lang/fi/lex_attrs.py
+++ b/spacy/lang/fi/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"nolla",
"yksi",
diff --git a/spacy/lang/fi/punctuation.py b/spacy/lang/fi/punctuation.py
index 6e14dde38..29ddc3111 100644
--- a/spacy/lang/fi/punctuation.py
+++ b/spacy/lang/fi/punctuation.py
@@ -1,8 +1,14 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_HYPHENS
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ LIST_ELLIPSES,
+ LIST_HYPHENS,
+ LIST_ICONS,
+)
from ..punctuation import TOKENIZER_SUFFIXES
-
_quotes = CONCAT_QUOTES.replace("'", "")
DASHES = "|".join(x for x in LIST_HYPHENS if x != "-")
diff --git a/spacy/lang/fi/syntax_iterators.py b/spacy/lang/fi/syntax_iterators.py
index 6b481e51f..6e2216713 100644
--- a/spacy/lang/fi/syntax_iterators.py
+++ b/spacy/lang/fi/syntax_iterators.py
@@ -1,7 +1,8 @@
from typing import Iterator, Tuple, Union
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN, PRON
+
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
+from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
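
The syntax_iterators hunks are likewise import-only; every module keeps exporting a `noun_chunks` generator under `SYNTAX_ITERATORS`. A toy stub of the expected contract (the real implementations walk the dependency parse; this one just yields a single-token chunk per noun to show the (start, end, label) shape):

```python
from typing import Iterator, Tuple, Union

from spacy.errors import Errors
from spacy.symbols import NOUN
from spacy.tokens import Doc, Span


def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """Yield (start, end, label) token offsets for each base noun phrase."""
    doc = doclike.doc
    if not doc.has_annotation("DEP"):  # noun chunks need a parsed Doc
        raise ValueError(Errors.E029)
    for word in doclike:
        if word.pos == NOUN:  # toy rule standing in for real dep-tree logic
            yield word.i, word.i + 1, NOUN


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
```
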
diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py
index 465333b0a..881d5b91d 100644
--- a/spacy/lang/fi/tokenizer_exceptions.py
+++ b/spacy/lang/fi/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py
index 27d2a915e..a8bc7f53e 100644
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -1,15 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from .punctuation import TOKENIZER_SUFFIXES
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
+from ...language import BaseDefaults, Language
from .lemmatizer import FrenchLemmatizer
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKEN_MATCH, TOKENIZER_EXCEPTIONS
class FrenchDefaults(BaseDefaults):
diff --git a/spacy/lang/fr/lex_attrs.py b/spacy/lang/fr/lex_attrs.py
index 811312ad7..9cf508a07 100644
--- a/spacy/lang/fr/lex_attrs.py
+++ b/spacy/lang/fr/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = set(
"""
zero un une deux trois quatre cinq six sept huit neuf dix
diff --git a/spacy/lang/fr/punctuation.py b/spacy/lang/fr/punctuation.py
index 873d01d87..a3b178a2f 100644
--- a/spacy/lang/fr/punctuation.py
+++ b/spacy/lang/fr/punctuation.py
@@ -1,8 +1,16 @@
-from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import CONCAT_QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
-from ..char_classes import merge_chars
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ UNITS,
+ merge_chars,
+)
+from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
ELISION = "' ’".replace(" ", "")
HYPHENS = r"- – — ‐ ‑".replace(" ", "")
diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py
index 5849c40b3..a6bf3d3ca 100644
--- a/spacy/lang/fr/syntax_iterators.py
+++ b/spacy/lang/fr/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py
index 2e88b58cf..fa2062ef9 100644
--- a/spacy/lang/fr/tokenizer_exceptions.py
+++ b/spacy/lang/fr/tokenizer_exceptions.py
@@ -1,11 +1,10 @@
import re
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from .punctuation import ELISION, HYPHENS
-from ..char_classes import ALPHA_LOWER, ALPHA
from ...symbols import ORTH
from ...util import update_exc
-
+from ..char_classes import ALPHA, ALPHA_LOWER
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .punctuation import ELISION, HYPHENS
# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py
index 3be53bc7a..6f9a27a14 100644
--- a/spacy/lang/ga/__init__.py
+++ b/spacy/lang/ga/__init__.py
@@ -2,10 +2,10 @@ from typing import Optional
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
+from ...language import BaseDefaults, Language
from .lemmatizer import IrishLemmatizer
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class IrishDefaults(BaseDefaults):
diff --git a/spacy/lang/ga/lemmatizer.py b/spacy/lang/ga/lemmatizer.py
index 47aec8fd4..c9fbfbc19 100644
--- a/spacy/lang/ga/lemmatizer.py
+++ b/spacy/lang/ga/lemmatizer.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Tuple
+from typing import Dict, List, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py
index 63af65fe9..eb4b413fb 100644
--- a/spacy/lang/ga/tokenizer_exceptions.py
+++ b/spacy/lang/ga/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {
"'acha'n": [{ORTH: "'ach", NORM: "gach"}, {ORTH: "a'n", NORM: "aon"}],
diff --git a/spacy/lang/grc/__init__.py b/spacy/lang/grc/__init__.py
index 019b3802e..ed742f4c5 100644
--- a/spacy/lang/grc/__init__.py
+++ b/spacy/lang/grc/__init__.py
@@ -1,8 +1,8 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
-from ...language import Language, BaseDefaults
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class AncientGreekDefaults(BaseDefaults):
diff --git a/spacy/lang/grc/lex_attrs.py b/spacy/lang/grc/lex_attrs.py
index 0ab15e6fd..33cfca05b 100644
--- a/spacy/lang/grc/lex_attrs.py
+++ b/spacy/lang/grc/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
# CARDINALS
"εἷς",
diff --git a/spacy/lang/grc/punctuation.py b/spacy/lang/grc/punctuation.py
index 8f3589e9a..8e9fc8bf2 100644
--- a/spacy/lang/grc/punctuation.py
+++ b/spacy/lang/grc/punctuation.py
@@ -1,6 +1,15 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
-from ..char_classes import CONCAT_QUOTES
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+)
_prefixes = (
[
diff --git a/spacy/lang/grc/tokenizer_exceptions.py b/spacy/lang/grc/tokenizer_exceptions.py
index bcee70f32..86527ff61 100644
--- a/spacy/lang/grc/tokenizer_exceptions.py
+++ b/spacy/lang/grc/tokenizer_exceptions.py
@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/gu/__init__.py b/spacy/lang/gu/__init__.py
index e6fbc9d18..2f22034c1 100644
--- a/spacy/lang/gu/__init__.py
+++ b/spacy/lang/gu/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class GujaratiDefaults(BaseDefaults):
diff --git a/spacy/lang/he/__init__.py b/spacy/lang/he/__init__.py
index dd2ee478d..07084acf1 100644
--- a/spacy/lang/he/__init__.py
+++ b/spacy/lang/he/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class HebrewDefaults(BaseDefaults):
diff --git a/spacy/lang/hi/__init__.py b/spacy/lang/hi/__init__.py
index 4c8ae446d..980dc31c1 100644
--- a/spacy/lang/hi/__init__.py
+++ b/spacy/lang/hi/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class HindiDefaults(BaseDefaults):
diff --git a/spacy/lang/hi/lex_attrs.py b/spacy/lang/hi/lex_attrs.py
index ee845e8b1..4ecd1db66 100644
--- a/spacy/lang/hi/lex_attrs.py
+++ b/spacy/lang/hi/lex_attrs.py
@@ -1,6 +1,5 @@
+from ...attrs import LIKE_NUM, NORM
from ..norm_exceptions import BASE_NORMS
-from ...attrs import NORM, LIKE_NUM
-
# fmt: off
_stem_suffixes = [
diff --git a/spacy/lang/hr/__init__.py b/spacy/lang/hr/__init__.py
index 30870b522..fd7622a3d 100644
--- a/spacy/lang/hr/__init__.py
+++ b/spacy/lang/hr/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class CroatianDefaults(BaseDefaults):
diff --git a/spacy/lang/hsb/__init__.py b/spacy/lang/hsb/__init__.py
index 034d82319..e8b2ffc9f 100644
--- a/spacy/lang/hsb/__init__.py
+++ b/spacy/lang/hsb/__init__.py
@@ -1,7 +1,7 @@
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults
class UpperSorbianDefaults(BaseDefaults):
diff --git a/spacy/lang/hsb/tokenizer_exceptions.py b/spacy/lang/hsb/tokenizer_exceptions.py
index 4b9a4f98a..cd3bac913 100644
--- a/spacy/lang/hsb/tokenizer_exceptions.py
+++ b/spacy/lang/hsb/tokenizer_exceptions.py
@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = dict()
for exc_data in [
diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py
index 9426bacea..799e6d230 100644
--- a/spacy/lang/hu/__init__.py
+++ b/spacy/lang/hu/__init__.py
@@ -1,7 +1,7 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKEN_MATCH, TOKENIZER_EXCEPTIONS
class HungarianDefaults(BaseDefaults):
diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py
index f827cd677..dbf93c622 100644
--- a/spacy/lang/hu/punctuation.py
+++ b/spacy/lang/hu/punctuation.py
@@ -1,6 +1,14 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CONCAT_QUOTES
-from ..char_classes import CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_ICONS,
+ CONCAT_QUOTES,
+ LIST_ELLIPSES,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ UNITS,
+)
# removing ° from the special icons to keep e.g. 99° as one token
_concat_icons = CONCAT_ICONS.replace("\u00B0", "")
diff --git a/spacy/lang/hu/tokenizer_exceptions.py b/spacy/lang/hu/tokenizer_exceptions.py
index ffaa74f50..3f79b02d2 100644
--- a/spacy/lang/hu/tokenizer_exceptions.py
+++ b/spacy/lang/hu/tokenizer_exceptions.py
@@ -1,10 +1,9 @@
import re
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..punctuation import ALPHA_LOWER, CURRENCY
from ...symbols import ORTH
from ...util import update_exc
-
+from ..punctuation import ALPHA_LOWER, CURRENCY
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/hy/__init__.py b/spacy/lang/hy/__init__.py
index 481eaae0a..e00d4fd11 100644
--- a/spacy/lang/hy/__init__.py
+++ b/spacy/lang/hy/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class ArmenianDefaults(BaseDefaults):
diff --git a/spacy/lang/hy/lex_attrs.py b/spacy/lang/hy/lex_attrs.py
index 9c9c0380c..4c96b8ab5 100644
--- a/spacy/lang/hy/lex_attrs.py
+++ b/spacy/lang/hy/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"զրո",
"մեկ",
diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py
index 0d72cfa9d..93eb3214a 100644
--- a/spacy/lang/id/__init__.py
+++ b/spacy/lang/id/__init__.py
@@ -1,9 +1,9 @@
-from .stop_words import STOP_WORDS
-from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class IndonesianDefaults(BaseDefaults):
diff --git a/spacy/lang/id/lex_attrs.py b/spacy/lang/id/lex_attrs.py
index 3167f4659..5952c4d06 100644
--- a/spacy/lang/id/lex_attrs.py
+++ b/spacy/lang/id/lex_attrs.py
@@ -1,8 +1,7 @@
import unicodedata
-from .punctuation import LIST_CURRENCY
from ...attrs import IS_CURRENCY, LIKE_NUM
-
+from .punctuation import LIST_CURRENCY
_num_words = [
"nol",
diff --git a/spacy/lang/id/punctuation.py b/spacy/lang/id/punctuation.py
index f6c2387d8..8303b8eaa 100644
--- a/spacy/lang/id/punctuation.py
+++ b/spacy/lang/id/punctuation.py
@@ -1,6 +1,5 @@
-from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
-from ..char_classes import ALPHA, merge_chars, split_chars, _currency, _units
-
+from ..char_classes import ALPHA, _currency, _units, merge_chars, split_chars
+from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
_units = (
_units + "s bit Gbps Mbps mbps Kbps kbps ƒ ppi px "
diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py
index fa984d411..027798687 100644
--- a/spacy/lang/id/syntax_iterators.py
+++ b/spacy/lang/id/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/id/tokenizer_exceptions.py b/spacy/lang/id/tokenizer_exceptions.py
index ff77ede9f..8dea4e97f 100644
--- a/spacy/lang/id/tokenizer_exceptions.py
+++ b/spacy/lang/id/tokenizer_exceptions.py
@@ -1,8 +1,7 @@
+from ...symbols import NORM, ORTH
+from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
-from ...util import update_exc
-
# List of abbreviations and acronyms from:
# https://id.wiktionary.org/wiki/Wiktionary:Daftar_singkatan_dan_akronim_bahasa_Indonesia#A
diff --git a/spacy/lang/is/__init__.py b/spacy/lang/is/__init__.py
index 318363beb..af1260045 100644
--- a/spacy/lang/is/__init__.py
+++ b/spacy/lang/is/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class IcelandicDefaults(BaseDefaults):
diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py
index ecf322bd7..14458d811 100644
--- a/spacy/lang/it/__init__.py
+++ b/spacy/lang/it/__init__.py
@@ -1,12 +1,13 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from ...language import Language, BaseDefaults
+from ...language import BaseDefaults, Language
from .lemmatizer import ItalianLemmatizer
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
+from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class ItalianDefaults(BaseDefaults):
diff --git a/spacy/lang/it/lemmatizer.py b/spacy/lang/it/lemmatizer.py
index e44e64e3a..bf869166d 100644
--- a/spacy/lang/it/lemmatizer.py
+++ b/spacy/lang/it/lemmatizer.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Tuple
+from typing import Dict, List, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
diff --git a/spacy/lang/it/punctuation.py b/spacy/lang/it/punctuation.py
index f01ab4f0d..51318b22d 100644
--- a/spacy/lang/it/punctuation.py
+++ b/spacy/lang/it/punctuation.py
@@ -1,8 +1,13 @@
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import ALPHA, HYPHENS, CONCAT_QUOTES
-from ..char_classes import ALPHA_LOWER, ALPHA_UPPER
-
ELISION = "'’"
diff --git a/spacy/lang/it/syntax_iterators.py b/spacy/lang/it/syntax_iterators.py
index f63df3fad..924627648 100644
--- a/spacy/lang/it/syntax_iterators.py
+++ b/spacy/lang/it/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/it/tokenizer_exceptions.py b/spacy/lang/it/tokenizer_exceptions.py
index 42883863b..2e7a5a1a3 100644
--- a/spacy/lang/it/tokenizer_exceptions.py
+++ b/spacy/lang/it/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {
"all'art.": [{ORTH: "all'"}, {ORTH: "art."}],
diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index bf86305fb..0d5f97ac8 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -1,27 +1,27 @@
-from typing import Optional, Union, Dict, Any, Callable
-from pathlib import Path
-import srsly
-from collections import namedtuple
-from thinc.api import Model
import re
+from collections import namedtuple
+from pathlib import Path
+from typing import Any, Callable, Dict, Optional, Union
-from .stop_words import STOP_WORDS
-from .syntax_iterators import SYNTAX_ITERATORS
-from .tag_map import TAG_MAP
-from .tag_orth_map import TAG_ORTH_MAP
-from .tag_bigram_map import TAG_BIGRAM_MAP
+import srsly
+from thinc.api import Model
+
+from ... import util
from ...errors import Errors
-from ...language import Language, BaseDefaults
+from ...language import BaseDefaults, Language
from ...pipeline import Morphologizer
from ...pipeline.morphologizer import DEFAULT_MORPH_MODEL
from ...scorer import Scorer
from ...symbols import POS
from ...tokens import Doc, MorphAnalysis
from ...training import validate_examples
-from ...util import DummyTokenizer, registry, load_config_from_str
+from ...util import DummyTokenizer, load_config_from_str, registry
from ...vocab import Vocab
-from ... import util
-
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tag_bigram_map import TAG_BIGRAM_MAP
+from .tag_map import TAG_MAP
+from .tag_orth_map import TAG_ORTH_MAP
DEFAULT_CONFIG = """
[nlp]
diff --git a/spacy/lang/ja/syntax_iterators.py b/spacy/lang/ja/syntax_iterators.py
index 588a9ba03..34670083e 100644
--- a/spacy/lang/ja/syntax_iterators.py
+++ b/spacy/lang/ja/syntax_iterators.py
@@ -1,9 +1,8 @@
-from typing import Union, Iterator, Tuple, Set
+from typing import Iterator, Set, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON, VERB
+from ...symbols import NOUN, PRON, PROPN, VERB
from ...tokens import Doc, Span
-
# TODO: this can probably be pruned a bit
# fmt: off
labels = ["nsubj", "nmod", "ddoclike", "nsubjpass", "pcomp", "pdoclike", "doclike", "obl", "dative", "appos", "attr", "ROOT"]
diff --git a/spacy/lang/ja/tag_map.py b/spacy/lang/ja/tag_map.py
index c6de3831a..5c14f41bf 100644
--- a/spacy/lang/ja/tag_map.py
+++ b/spacy/lang/ja/tag_map.py
@@ -1,6 +1,23 @@
-from ...symbols import POS, PUNCT, INTJ, ADJ, AUX, ADP, PART, SCONJ, NOUN
-from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE, CCONJ
-
+from ...symbols import (
+ ADJ,
+ ADP,
+ ADV,
+ AUX,
+ CCONJ,
+ DET,
+ INTJ,
+ NOUN,
+ NUM,
+ PART,
+ POS,
+ PRON,
+ PROPN,
+ PUNCT,
+ SCONJ,
+ SPACE,
+ SYM,
+ VERB,
+)
TAG_MAP = {
# Explanation of Unidic tags:
diff --git a/spacy/lang/kn/__init__.py b/spacy/lang/kn/__init__.py
index ccd46a394..44d53f6b7 100644
--- a/spacy/lang/kn/__init__.py
+++ b/spacy/lang/kn/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class KannadaDefaults(BaseDefaults):
diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
index 0e02e4a2d..e2c860f7d 100644
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -1,17 +1,16 @@
-from typing import Iterator, Any, Dict
+from typing import Any, Dict, Iterator
+from ...language import BaseDefaults, Language
+from ...scorer import Scorer
+from ...symbols import POS, X
+from ...tokens import Doc
+from ...training import validate_examples
+from ...util import DummyTokenizer, load_config_from_str, registry
+from ...vocab import Vocab
+from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tag_map import TAG_MAP
-from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
-from ...tokens import Doc
-from ...scorer import Scorer
-from ...symbols import POS, X
-from ...training import validate_examples
-from ...util import DummyTokenizer, registry, load_config_from_str
-from ...vocab import Vocab
-
DEFAULT_CONFIG = """
[nlp]
diff --git a/spacy/lang/ko/lex_attrs.py b/spacy/lang/ko/lex_attrs.py
index ac5bc7e48..2c49aa389 100644
--- a/spacy/lang/ko/lex_attrs.py
+++ b/spacy/lang/ko/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"영",
"공",
diff --git a/spacy/lang/ko/punctuation.py b/spacy/lang/ko/punctuation.py
index f5f1c51da..c3c32ea1f 100644
--- a/spacy/lang/ko/punctuation.py
+++ b/spacy/lang/ko/punctuation.py
@@ -1,7 +1,6 @@
from ..char_classes import LIST_QUOTES
from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
-
_infixes = (
["·", "ㆍ", r"\(", r"\)"]
+ [r"(?<=[0-9])~(?=[0-9-])"]
diff --git a/spacy/lang/ko/tag_map.py b/spacy/lang/ko/tag_map.py
index 26a8c56b9..85598c3ef 100644
--- a/spacy/lang/ko/tag_map.py
+++ b/spacy/lang/ko/tag_map.py
@@ -1,5 +1,21 @@
-from ...symbols import POS, PUNCT, INTJ, X, SYM, ADJ, AUX, ADP, CONJ, NOUN, PRON
-from ...symbols import VERB, ADV, PROPN, NUM, DET
+from ...symbols import (
+ ADJ,
+ ADP,
+ ADV,
+ AUX,
+ CONJ,
+ DET,
+ INTJ,
+ NOUN,
+ NUM,
+ POS,
+ PRON,
+ PROPN,
+ PUNCT,
+ SYM,
+ VERB,
+ X,
+)
# Map 은전한닢 (mecab-ko-dic) part-of-speech tags to Universal POS tags
# https://docs.google.com/spreadsheets/d/1-9blXKjtjeKZqsf4NzHeYJCrr49-nXeRF6D80udfcwY/edit#gid=589544265
diff --git a/spacy/lang/ky/__init__.py b/spacy/lang/ky/__init__.py
index ccca384bd..fafc0f020 100644
--- a/spacy/lang/ky/__init__.py
+++ b/spacy/lang/ky/__init__.py
@@ -1,8 +1,8 @@
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults
class KyrgyzDefaults(BaseDefaults):
diff --git a/spacy/lang/ky/punctuation.py b/spacy/lang/ky/punctuation.py
index fa9819f80..6d89da2f7 100644
--- a/spacy/lang/ky/punctuation.py
+++ b/spacy/lang/ky/punctuation.py
@@ -1,5 +1,12 @@
-from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
_hyphens_no_dash = HYPHENS.replace("-", "").strip("|").replace("||", "")
_infixes = (
diff --git a/spacy/lang/ky/tokenizer_exceptions.py b/spacy/lang/ky/tokenizer_exceptions.py
index 8ec727ac1..c93e3dac3 100644
--- a/spacy/lang/ky/tokenizer_exceptions.py
+++ b/spacy/lang/ky/tokenizer_exceptions.py
@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/la/__init__.py b/spacy/lang/la/__init__.py
index 37164c3f3..d77ae267e 100644
--- a/spacy/lang/la/__init__.py
+++ b/spacy/lang/la/__init__.py
@@ -1,8 +1,8 @@
-from ...language import Language, BaseDefaults
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
+from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class LatinDefaults(BaseDefaults):
diff --git a/spacy/lang/la/lex_attrs.py b/spacy/lang/la/lex_attrs.py
index 9db1218a4..fcb35defc 100644
--- a/spacy/lang/la/lex_attrs.py
+++ b/spacy/lang/la/lex_attrs.py
@@ -1,6 +1,7 @@
-from ...attrs import LIKE_NUM
import re
+from ...attrs import LIKE_NUM
+
# cf. Goyvaerts/Levithan 2009; case-insensitive, allow 4
roman_numerals_compile = re.compile(
r"(?i)^(?=[MDCLXVI])M*(C[MD]|D?C{0,4})(X[CL]|L?X{0,4})(I[XV]|V?I{0,4})$"
diff --git a/spacy/lang/la/syntax_iterators.py b/spacy/lang/la/syntax_iterators.py
index 7093bacf9..39b4fb39d 100644
--- a/spacy/lang/la/syntax_iterators.py
+++ b/spacy/lang/la/syntax_iterators.py
@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN, PRON, VERB, AUX
+from typing import Iterator, Tuple, Union
+
from ...errors import Errors
+from ...symbols import AUX, NOUN, PRON, PROPN, VERB
+from ...tokens import Doc, Span
# NB: Modified from da on suggestion from https://github.com/explosion/spaCy/issues/7457#issuecomment-800349751 [PJB]
diff --git a/spacy/lang/la/tokenizer_exceptions.py b/spacy/lang/la/tokenizer_exceptions.py
index 6d14b92c5..c0b98116f 100644
--- a/spacy/lang/la/tokenizer_exceptions.py
+++ b/spacy/lang/la/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
## TODO: Look into systematically handling u/v
_exc = {
diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py
index 7827e7762..2386b4356 100644
--- a/spacy/lang/lb/__init__.py
+++ b/spacy/lang/lb/__init__.py
@@ -1,8 +1,8 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class LuxembourgishDefaults(BaseDefaults):
diff --git a/spacy/lang/lb/lex_attrs.py b/spacy/lang/lb/lex_attrs.py
index d2d50d9dc..119231374 100644
--- a/spacy/lang/lb/lex_attrs.py
+++ b/spacy/lang/lb/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = set(
"""
null eent zwee dräi véier fënnef sechs ziwen aacht néng zéng eelef zwielef dräizéng
diff --git a/spacy/lang/lb/punctuation.py b/spacy/lang/lb/punctuation.py
index e382c56c5..8bdbf9713 100644
--- a/spacy/lang/lb/punctuation.py
+++ b/spacy/lang/lb/punctuation.py
@@ -1,4 +1,4 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, LIST_ELLIPSES, LIST_ICONS
ELISION = " ' ’ ".strip().replace(" ", "")
diff --git a/spacy/lang/lb/tokenizer_exceptions.py b/spacy/lang/lb/tokenizer_exceptions.py
index d00dc9610..844826e27 100644
--- a/spacy/lang/lb/tokenizer_exceptions.py
+++ b/spacy/lang/lb/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
# TODO
# treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)
diff --git a/spacy/lang/lex_attrs.py b/spacy/lang/lex_attrs.py
index 6ed981a06..3ac20420d 100644
--- a/spacy/lang/lex_attrs.py
+++ b/spacy/lang/lex_attrs.py
@@ -1,11 +1,10 @@
-from typing import Set
-import unicodedata
import re
+import unicodedata
+from typing import Set
from .. import attrs
from .tokenizer_exceptions import URL_MATCH
-
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match
_tlds = set(
"com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|"
diff --git a/spacy/lang/lg/__init__.py b/spacy/lang/lg/__init__.py
index 6f7153fce..a87685375 100644
--- a/spacy/lang/lg/__init__.py
+++ b/spacy/lang/lg/__init__.py
@@ -1,7 +1,7 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class LugandaDefaults(BaseDefaults):
diff --git a/spacy/lang/lg/punctuation.py b/spacy/lang/lg/punctuation.py
index 5d3eb792e..775c6b001 100644
--- a/spacy/lang/lg/punctuation.py
+++ b/spacy/lang/lg/punctuation.py
@@ -1,5 +1,12 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
_infixes = (
LIST_ELLIPSES
diff --git a/spacy/lang/lij/__init__.py b/spacy/lang/lij/__init__.py
index b7e11f77e..3b8e972c6 100644
--- a/spacy/lang/lij/__init__.py
+++ b/spacy/lang/lij/__init__.py
@@ -1,7 +1,7 @@
+from ...language import BaseDefaults, Language
+from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
-from ...language import Language, BaseDefaults
class LigurianDefaults(BaseDefaults):
diff --git a/spacy/lang/lij/punctuation.py b/spacy/lang/lij/punctuation.py
index d50b75589..c5c150d0a 100644
--- a/spacy/lang/lij/punctuation.py
+++ b/spacy/lang/lij/punctuation.py
@@ -1,6 +1,5 @@
-from ..punctuation import TOKENIZER_INFIXES
from ..char_classes import ALPHA
-
+from ..punctuation import TOKENIZER_INFIXES
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
diff --git a/spacy/lang/lij/tokenizer_exceptions.py b/spacy/lang/lij/tokenizer_exceptions.py
index 52eae2c89..cf5a1af66 100644
--- a/spacy/lang/lij/tokenizer_exceptions.py
+++ b/spacy/lang/lij/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/lt/__init__.py b/spacy/lang/lt/__init__.py
index 3ae000e5f..f3ea257b1 100644
--- a/spacy/lang/lt/__init__.py
+++ b/spacy/lang/lt/__init__.py
@@ -1,8 +1,8 @@
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class LithuanianDefaults(BaseDefaults):
diff --git a/spacy/lang/lt/punctuation.py b/spacy/lang/lt/punctuation.py
index 22aee0941..deef24854 100644
--- a/spacy/lang/lt/punctuation.py
+++ b/spacy/lang/lt/punctuation.py
@@ -1,9 +1,14 @@
-from ..char_classes import LIST_ICONS, LIST_ELLIPSES
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
-from ..char_classes import HYPHENS
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
from ..punctuation import TOKENIZER_SUFFIXES
-
_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
diff --git a/spacy/lang/lt/tokenizer_exceptions.py b/spacy/lang/lt/tokenizer_exceptions.py
index 118fb2190..d39b86dfc 100644
--- a/spacy/lang/lt/tokenizer_exceptions.py
+++ b/spacy/lang/lt/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/lv/__init__.py b/spacy/lang/lv/__init__.py
index a05e5b939..fdfca5e97 100644
--- a/spacy/lang/lv/__init__.py
+++ b/spacy/lang/lv/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class LatvianDefaults(BaseDefaults):
diff --git a/spacy/lang/mk/__init__.py b/spacy/lang/mk/__init__.py
index fa07cfef9..413f0038d 100644
--- a/spacy/lang/mk/__init__.py
+++ b/spacy/lang/mk/__init__.py
@@ -1,15 +1,16 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
+
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...lookups import Lookups
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .lemmatizer import MacedonianLemmatizer
+from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc
-from ...lookups import Lookups
class MacedonianDefaults(BaseDefaults):
diff --git a/spacy/lang/mk/lemmatizer.py b/spacy/lang/mk/lemmatizer.py
index a792095e7..f5a5eca85 100644
--- a/spacy/lang/mk/lemmatizer.py
+++ b/spacy/lang/mk/lemmatizer.py
@@ -1,5 +1,5 @@
-from typing import List
from collections import OrderedDict
+from typing import List
from ...pipeline import Lemmatizer
from ...tokens import Token
diff --git a/spacy/lang/mk/tokenizer_exceptions.py b/spacy/lang/mk/tokenizer_exceptions.py
index 3b589b2a9..40f2c1d80 100644
--- a/spacy/lang/mk/tokenizer_exceptions.py
+++ b/spacy/lang/mk/tokenizer_exceptions.py
@@ -1,5 +1,4 @@
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH
_exc = {}
diff --git a/spacy/lang/ml/__init__.py b/spacy/lang/ml/__init__.py
index 9f90605f0..0b17b8a7a 100644
--- a/spacy/lang/ml/__init__.py
+++ b/spacy/lang/ml/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class MalayalamDefaults(BaseDefaults):
diff --git a/spacy/lang/ml/lex_attrs.py b/spacy/lang/ml/lex_attrs.py
index 9ac19b6a7..33a144f6b 100644
--- a/spacy/lang/ml/lex_attrs.py
+++ b/spacy/lang/ml/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
# reference 2: https://www.omniglot.com/language/numbers/malayalam.htm
_num_words = [
diff --git a/spacy/lang/mr/__init__.py b/spacy/lang/mr/__init__.py
index 3e172fa60..f980efbd0 100644
--- a/spacy/lang/mr/__init__.py
+++ b/spacy/lang/mr/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class MarathiDefaults(BaseDefaults):
diff --git a/spacy/lang/ms/__init__.py b/spacy/lang/ms/__init__.py
new file mode 100644
index 000000000..f53ebfcf2
--- /dev/null
+++ b/spacy/lang/ms/__init__.py
@@ -0,0 +1,24 @@
+from ...language import BaseDefaults, Language
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+
+
+class MalayDefaults(BaseDefaults):
+ tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+ prefixes = TOKENIZER_PREFIXES
+ suffixes = TOKENIZER_SUFFIXES
+ infixes = TOKENIZER_INFIXES
+ syntax_iterators = SYNTAX_ITERATORS
+ lex_attr_getters = LEX_ATTRS
+ stop_words = STOP_WORDS
+
+
+class Malay(Language):
+ lang = "ms"
+ Defaults = MalayDefaults
+
+
+__all__ = ["Malay"]
diff --git a/spacy/lang/ms/_tokenizer_exceptions_list.py b/spacy/lang/ms/_tokenizer_exceptions_list.py
new file mode 100644
index 000000000..fba1dd70f
--- /dev/null
+++ b/spacy/lang/ms/_tokenizer_exceptions_list.py
@@ -0,0 +1,1943 @@
+# from https://prpm.dbp.gov.my/cari1?keyword=
+# dbp https://en.wikipedia.org/wiki/Dewan_Bahasa_dan_Pustaka
+MS_BASE_EXCEPTIONS = set(
+ """
+aba-aba
+abah-abah
+abar-abar
+abrit-abritan
+abu-abu
+abuk-abuk
+abun-abun
+acak-acak
+acak-acakan
+acang-acang
+aci-aci
+aci-acian
+aci-acinya
+adang-adang
+adap-adapan
+adik-beradik
+aduk-adukan
+agak-agak
+agar-agar
+agut-agut
+air-cooled
+ajar-ajar
+aji-aji
+akal-akal
+akhir-akhir
+aki-aki
+alah-mengalahi
+alan-alan
+alang-alang
+alang-alangan
+alap-alap
+ali-ali
+alih-alih
+aling-aling
+aling-alingan
+alip-alipan
+alon-alon
+alu-alu
+alu-aluan
+alun-alun
+alur-alur
+ambah-ambah
+ambai-ambai
+ambil-mengambil
+ambring-ambringan
+ambu-ambu
+ambung-ambung
+amin-amin
+ampai-ampai
+amung-amung
+anai-anai
+anak-anak
+anak-anakan
+anak-beranak
+ancak-ancak
+ancang-ancang
+andang-andang
+angan-angan
+anggar-anggar
+angin-angin
+angin-anginan
+angkul-angkul
+angkup-angkup
+angkut-angkut
+ani-ani
+aning-aning
+anjang-anjang
+anjing-anjing
+anjung-anjung
+anjung-anjungan
+antar-antar
+ante-mortem
+anting-anting
+antung-antung
+anyam-menganyam
+apa-apa
+api-api
+apit-apit
+aprit-apritan
+arah-arah
+arak-arakan
+aram-aram
+ari-ari
+aru-aru
+asa-asaan
+asam-asaman
+asuh-asuh
+atas-mengatasi
+ati-ati
+audio-visual
+avant-garde
+awang-awang
+awang-gemawang
+ayak-ayak
+ayam-ayam
+ayam-ayaman
+ayang-ayang
+ayeng-ayengan
+ayun-temayun
+back-up
+bahu-membahu
+baik-baik
+bajang-bajang
+baji-baji
+balai-balai
+balam-balam
+balas-membalas
+baling-baling
+balut-balut
+bangun-bangun
+bantal-bantal
+barat-barat
+barau-barau
+bari-bari
+barung-barung
+basa-basi
+bata-bata
+batir-batir
+bau-bauan
+bayang-bayang
+bedil-bedal
+begana-begini
+bekal-bekalan
+belat-belit
+belu-belai
+benggal-benggil
+bengkal-bengkil
+bengkang-bengkok
+bengkang-bengkong
+berabad-abad
+berabun-rabun
+berada-ada
+beragah-agah
+beragak-agak
+beragam-ragam
+beraja-raja
+berakit-rakit
+beraku-akuan
+beralun-alun
+beramah-ramahan
+beramah-tamah
+beramai-ramai
+berambai-ambai
+berambal-ambalan
+beramuk-amukan
+berandai-andai
+berandai-randai
+berang-berang
+berangan-angan
+beranggap-anggapan
+berangguk-angguk
+berangin-angin
+berangka-angka
+berangka-angkaan
+berangkai-rangkai
+beranja-anja
+berantai-rantai
+berapi-api
+berapung-apung
+berarak-arakan
+beras-beras
+berasing-asingan
+beratus-ratus
+berawas-awas
+berayal-ayalan
+berayun-ayun
+berbagai-bagai
+berbahas-bahasan
+berbalas-balasan
+berbalik-balik
+berbanjar-banjar
+berbantah-bantah
+berbanyak-banyak
+berbarik-barik
+berbasah-basah
+berbatu-batu
+berbayang-bayang
+berbecak-becak
+berbedil-bedilan
+berbeka-beka
+berbelakang-belakangan
+berbelang-belang
+berbeli-belian
+berbelit-belit
+berbelok-belok
+berbenar-benar
+berbencah-bencah
+berbesar-besar
+berbidai-bidai
+berbiku-biku
+berbilik-bilik
+berbinar-binar
+berbincang-bincang
+berbingkah-bingkah
+berbintang-bintang
+berbintik-bintik
+berbintil-bintil
+berbisik-bisik
+berbolak-balik
+berbolong-bolong
+berbondong-bondong
+berbongkah-bongkah
+berbuai-buai
+berbual-bual
+berbukit-bukit
+berbulan-bulan
+berbunga-bunga
+berbuntut-buntut
+berbunuh-bunuhan
+berburu-buru
+berburuk-buruk
+berbutir-butir
+bercabang-cabang
+bercaci-cacian
+bercakap-cakap
+bercakar-cakaran
+bercantik-cantik
+bercari-cari
+bercari-carian
+bercarik-carik
+bercepat-cepat
+bercerai-berai
+bercerai-cerai
+bercetai-cetai
+bercikun-cikun
+bercinta-cintaan
+bercita-cita
+berciut-ciut
+berconteng-conteng
+bercoreng-coreng
+bercoreng-moreng
+bercuit-cuit
+bercumbu-cumbu
+bercumbu-cumbuan
+bercura-bura
+bercura-cura
+berdada-dadaan
+berdahulu-dahuluan
+berdalam-dalam
+berdebar-debar
+berdecap-decap
+berdedai-dedai
+berdegap-degap
+berdegar-degar
+berdeham-deham
+berdekah-dekah
+berdekat-dekat
+berdelat-delat
+berdembun-dembun
+berdempang-dempang
+berdendam-dendaman
+berdengkang-dengkang
+berdentang-dentang
+berdentum-dentum
+berdentung-dentung
+berdepak-depak
+berdepan-depan
+berderai-derai
+berderak-derak
+berderau-derau
+berdering-dering
+berderung-derung
+berdesak-desakan
+berdesing-desing
+berdesus-desus
+berdikit-dikit
+berdingkit-dingkit
+berdua-dua
+berduri-duri
+berduru-duru
+berduyun-duyun
+berebut-rebut
+berebut-rebutan
+beregang-regang
+berek-berek
+berembut-rembut
+berempat-empat
+berenak-enak
+berenteng-renteng
+beresah-resah
+berfoya-foya
+bergagah-gagahan
+bergagap-gagap
+bergalur-galur
+berganda-ganda
+berganti-ganti
+bergarah-garah
+bergaruk-garuk
+bergegas-gegas
+bergelang-gelang
+bergelap-gelap
+bergelas-gelasan
+bergeleng-geleng
+bergemal-gemal
+bergembut-gembut
+bergerek-gerek
+bergesa-gesa
+bergilir-gilir
+bergolek-golek
+bergores-gores
+bergotong-royong
+bergugus-gugus
+bergulung-gulung
+bergulut-gulut
+bergumpal-gumpal
+bergunung-gunung
+berhadap-hadapan
+berhamun-hamun
+berhandai-handai
+berhanyut-hanyut
+berhari-hari
+berhati-hati
+berhilau-hilau
+berhujan-hujan
+beria-ia
+beria-ria
+beriak-riak
+beribu-ribu
+berigi-rigi
+bering-bering
+beringat-ingat
+beringgit-ringgit
+berintik-rintik
+beriring-iring
+beriring-iringan
+berjabir-jabir
+berjaga-jaga
+berjagung-jagung
+berjalan-jalan
+berjalar-jalar
+berjalin-jalin
+berjalur-jalur
+berjam-jam
+berjauh-jauhan
+berjejal-jejal
+berjela-jela
+berjenis-jenis
+berjenjang-jenjang
+berjilid-jilid
+berjinak-jinak
+berjingkat-jingkat
+berjingkrak-jingkrak
+berjongkok-jongkok
+berjubel-jubel
+berjujut-jujutan
+berjulai-julai
+berjumbai-jumbai
+berjurai-jurai
+berjurus-jurus
+berjuta-juta
+berkaca-kaca
+berkait-kaitan
+berkala-kala
+berkali-kali
+berkanjar-kanjar
+berkaok-kaok
+berkarung-karung
+berkasih-kasihan
+berkata-kata
+berkatak-katak
+berkecai-kecai
+berkecek-kecek
+berkecil-kecil
+berkecil-kecilan
+berkedip-kedip
+berkejang-kejang
+berkejap-kejap
+berkejar-kejaran
+berkelar-kelar
+berkelip-kelip
+berkelit-kelit
+berkelok-kelok
+berkelompok-kelompok
+berkelun-kelun
+berkembur-kembur
+berkempul-kempul
+berkena-kenaan
+berkenal-kenalan
+berkendur-kendur
+berkeok-keok
+berkepak-kepak
+berkepal-kepal
+berkeping-keping
+berkepul-kepul
+berkeras-kerasan
+berkeritik-keritik
+berkeruit-keruit
+berkerut-kerut
+berketak-ketak
+berketak-ketik
+berketi-keti
+berketil-ketil
+berketuk-ketak
+berketul-ketul
+berkial-kial
+berkian-kian
+berkias-kiasan
+berkibar-kibar
+berkilah-kilah
+berkilat-kilat
+berkilau-kilauan
+berkilo-kilo
+berkinja-kinja
+berkipas-kipas
+berkira-kira
+berkirim-kiriman
+berkobar-kobar
+berkobok-kobok
+berkocak-kocak
+berkodi-kodi
+berkolek-kolek
+berkopah-kopah
+berkotak-kotak
+berkuat-kuatan
+berkunang-kunang
+berkurun-kurun
+berkusau-kusau
+berkusu-kusu
+berkusut-kusut
+berkuting-kuting
+berkutu-kutuan
+berlabun-labun
+berlain-lainan
+berlalai-lalai
+berlama-lama
+berlambai-lambai
+berlambak-lambak
+berlampang-lampang
+berlapang-lapang
+berlapis-lapis
+berlapuk-lapuk
+berlarah-larah
+berlarat-larat
+berlari-larian
+berlarik-larik
+berlarut-larut
+berlawak-lawak
+berlayap-layapan
+berlebih-lebih
+berlebih-lebihan
+berlekas-lekas
+berlena-lena
+berlengah-lengah
+berlenggek-lenggek
+berlenggok-lenggok
+berleret-leret
+berliang-liuk
+berliku-liku
+berlimpah-limpah
+berlimpap-limpap
+berlimpit-limpit
+berlinang-linang
+berlindak-lindak
+berlipat-lipat
+berlompok-lompok
+berloncat-loncatan
+berlopak-lopak
+berlubang-lubang
+bermaaf-maafan
+bermacam-macam
+bermain-main
+bermalas-malas
+bermanik-manik
+bermanis-manis
+bermanja-manja
+bermasak-masak
+bermati-mati
+bermegah-megah
+bermemek-memek
+bermesra-mesraan
+bermewah-mewah
+berminggu-minggu
+berminta-minta
+bermuda-muda
+bermudah-mudah
+bermuka-muka
+bermula-mula
+bermulut-mulut
+bernafsi-nafsi
+bernaka-naka
+berniat-niat
+berogak-ogak
+beroleng-oleng
+berolok-olok
+beromong-omong
+beronggok-onggok
+berorang-orang
+beroyal-royal
+berpada-pada
+berpahit-pahit
+berpair-pair
+berpal-pal
+berpalu-palu
+berpalu-paluan
+berpalun-palun
+berpandai-pandai
+berpandang-pandangan
+berpangkat-pangkat
+berpanjang-panjang
+berpasang-pasang
+berpasang-pasangan
+berpayah-payah
+berpeluh-peluh
+berpeluk-pelukan
+berpenat-penat
+berpencar-pencar
+berpendar-pendar
+berpenggal-penggal
+berperai-perai
+berpesai-pesai
+berpesta-pesta
+berpesuk-pesuk
+berpetak-petak
+berpeti-peti
+berpihak-pihak
+berpijar-pijar
+berpikul-pikul
+berpilih-pilih
+berpilin-pilin
+berpindah-pindah
+berpintal-pintal
+berpirau-pirau
+berpisah-pisah
+berpolah-polah
+berpongah-pongah
+berpontang-panting
+berporah-porah
+berpotong-potong
+berpuak-puak
+berpual-pual
+berpugak-pugak
+berpuluh-puluh
+berpulun-pulun
+berpuntal-puntal
+berpura-pura
+berpusar-pusar
+berpusing-pusing
+berpusu-pusu
+berputar-putar
+bersaf-saf
+bersahut-sahutan
+bersakit-sakit
+bersalah-salahan
+bersalam-salaman
+bersalin-salin
+bersama-sama
+bersambut-sambutan
+bersampan-sampan
+bersantai-santai
+bersapa-sapaan
+bersarang-sarang
+bersedan-sedan
+bersedia-sedia
+bersedu-sedu
+bersekat-sekat
+berselang-selang
+berselang-seli
+bersembur-semburan
+bersempit-sempit
+bersenang-senang
+bersenang-senangkan
+bersenda-senda
+bersendi-sendi
+bersepah-sepah
+bersepi-sepi
+berserak-serak
+berseri-seri
+bersesak-sesak
+bersetai-setai
+bersia-sia
+bersiap-siap
+bersiar-siar
+bersilir-silir
+bersimbur-simburan
+bersinau-sinau
+bersorak-sorai
+bersuap-suapan
+bersudah-sudah
+bersuka-suka
+bersuka-sukaan
+bersuku-suku
+bersumpah-sumpahan
+bersungguh-sungguh
+bersungut-sungut
+bersunyi-sunyi
+bersusah-susah
+bersusuk-susuk
+bersusuk-susukan
+bersutan-sutan
+bertabur-tabur
+bertahu-tahu
+bertahun-tahun
+bertajuk-tajuk
+bertakik-takik
+bertala-tala
+bertali-tali
+bertalu-talu
+bertambah-tambah
+bertanda-tandaan
+bertangis-tangisan
+bertangkil-tangkil
+bertanya-tanya
+bertarik-tarikan
+bertatai-tatai
+bertatih-tatih
+bertawan-tawan
+bertawar-tawaran
+bertebu-tebu
+bertebu-tebukan
+berteguh-teguh
+berteguh-teguhan
+berteka-teki
+bertelau-telau
+bertele-tele
+bertempat-tempat
+bertempuh-tempuh
+bertenang-tenang
+bertenggang-tenggangan
+bertentu-tentu
+bertepek-tepek
+berterang-terang
+berterang-terangan
+bertikam-tikaman
+bertimbal-timbalan
+bertimbun-timbun
+bertimpa-timpa
+bertimpas-timpas
+bertingkah-tingkah
+bertingkat-tingkat
+bertinjau-tinjauan
+bertiras-tiras
+bertitar-titar
+bertoboh-toboh
+bertolak-tolak
+bertolak-tolakan
+bertolong-tolongan
+bertonjol-tonjol
+bertua-tua
+bertua-tuaan
+bertual-tual
+bertubi-tubi
+bertukar-tukar
+bertukar-tukaran
+bertukas-tukas
+bertumpak-tumpak
+bertunda-tunda
+bertunjuk-tunjukan
+bertura-tura
+berturut-turut
+bertutur-tutur
+beruas-ruas
+berubah-ubah
+berulang-alik
+berulang-ulang
+berumbai-rumbai
+berundung-undung
+berunggas-runggas
+berungkur-ungkuran
+beruntai-untai
+beruntun-runtun
+berunyai-unyai
+berupa-rupa
+berura-ura
+beruris-uris
+berurut-urutan
+berwarna-warna
+berwarna-warni
+berwindu-windu
+berwiru-wiru
+beryang-yang
+besar-besaran
+betak-betak
+beti-beti
+betul-betul
+biang-biang
+biar-biar
+biji-bijian
+bila-bila
+bilang-bilang
+bincang-bincut
+bini-binian
+biri-biri
+biru-biru
+bisik-bisik
+biti-biti
+bolak-balik
+bolang-baling
+bongkar-bangkir
+buah-buahan
+buat-buatan
+buaya-buaya
+bubun-bubun
+bugi-bugi
+built-in
+bukan-bukan
+bulan-bulan
+bulan-bulanan
+bulang-bulang
+bulat-bulat
+buli-buli
+bulu-bulu
+buluh-buluh
+bulus-bulus
+bunga-bungaan
+bunuh-membunuh
+bunyi-bunyian
+buru-buru
+burung-burungan
+bye-bye
+cabik-cabik
+caing-caing
+calar-balar
+cara-cara
+carut-marut
+cawi-cawi
+cebar-cebur
+celam-celum
+celangak-celinguk
+celas-celus
+celedang-celedok
+celengkak-celengkok
+cemas-cemas
+centang-perenang
+cepat-cepat
+cerai-berai
+ceruk-menceruk
+ceruk-meruk
+check-up
+chit-chat
+cirit-birit
+cita-cita
+close-up
+closed-circuit
+cobak-cabik
+cobar-cabir
+cola-cala
+compang-camping
+congak-cangit
+congkah-cangkih
+congkah-mangkih
+copak-capik
+corak-carik
+corat-coret
+coreng-moreng
+cuang-caing
+cubung-cubung
+culik-culik
+cuma-cuma
+cumi-cumi
+cungap-cangip
+cupu-cupu
+dahulu-mendahului
+dali-dali
+dapur-dapur
+dari-dari
+daru-daru
+datang-datang
+datang-mendatangi
+daun-daunan
+dawai-dawai
+dayang-dayang
+degap-degap
+dekak-dekak
+dekat-dekat
+dengar-dengaran
+desas-desus
+diam-diam
+do-it-yourself
+dokok-dokok
+dolak-dalik
+dorong-mendorong
+drive-in
+dua-dua
+dua-duanya
+duduk-duduk
+dulang-dulang
+ecek-ecek
+embuh-embuhan
+empek-empek
+empok-empok
+encal-encal
+endap-endap
+endut-endutan
+engah-engah
+enggan-enggan
+engkah-engkah
+entah-berentah
+erang-erot
+erong-erong
+fast-food
+fifty-fifty
+flip-flop
+follow-up
+foya-foya
+gaba-gaba
+gabai-gabai
+gada-gada
+gading-gading
+gado-gado
+gajah-gajahan
+gala-gala
+gali-galian
+galing-galing
+galu-galu
+gamit-gamitan
+gampang-gampangan
+ganal-ganal
+ganda-berganda
+gapah-gopoh
+gara-gara
+garah-garah
+gatal-gatal
+gawar-gawar
+gaya-gayanya
+gedebak-gedebuk
+gelang-gelang
+gelembung-gelembungan
+geli-geli
+geliang-geliut
+geliat-geliut
+gempul-gempul
+gendang-gendang
+genjang-genjot
+gerabak-gerubuk
+gerak-gerik
+gerbas-gerbus
+gerit-gerit
+geruh-gerah
+getak-getuk
+geti-geti
+gila-gila
+gila-gilaan
+gilang-gemilang
+gilap-gemilap
+gili-gili
+giling-giling
+ginang-ginang
+girik-girik
+giring-giring
+go-kart
+golak-galik
+gonta-ganti
+gotong-royong
+gual-gail
+gudu-gudu
+gula-gula
+gulang-gulang
+guna-guna
+guntang-guntang
+gunung-ganang
+gunung-gemunung
+gunung-gunungan
+habis-habis
+habis-habisan
+halai-balai
+half-time
+hampir-hampir
+harap-harapan
+harum-haruman
+hati-hati
+heavy-duty
+hebat-hebatan
+hidup-hidup
+hiru-biru
+hiruk-pikuk
+hubaya-hubaya
+hula-hula
+huru-hara
+ibar-ibar
+icak-icak
+igau-igauan
+ikut-ikut
+ikut-ikutan
+ilam-ilam
+imbang-imbangan
+inang-inang
+inca-binca
+incang-incut
+ingat-ingat
+ingat-ingatan
+ingau-ingauan
+inggang-inggung
+injak-injak
+iras-iras
+iring-iringan
+iseng-iseng
+jadi-jadian
+jala-jala
+jamah-jamahan
+jambu-jambu
+jangan-jangan
+jarang-jarang
+jari-jari
+jaring-jaring
+jarum-jarum
+jauh-jauh
+jawi-jawi
+jebat-jebatan
+jelur-jelir
+jendal-jendul
+jenggar-jenggur
+jentik-jentik
+jerah-jerih
+jolong-jolong
+jongkar-jangkir
+juak-juak
+juang-juang
+julung-julung
+jurai-jurai
+kabu-kabu
+kacang-kacang
+kacang-kacangan
+kacau-balau
+kadang-kadang
+kail-kail
+kait-kait
+kakek-kakek
+kalau-kalau
+kaleng-kalengan
+kalut-malut
+kambing-kambing
+kanak-kanak
+kapa-kapa
+kapan-kapan
+kapu-kapu
+karang-karangan
+karang-mengarang
+kareseh-peseh
+karut-marut
+katang-katang
+kawa-kawa
+kayu-kayuan
+keabu-abuan
+keasyik-asyikan
+kebarat-baratan
+kebasah-basahan
+kebat-kebit
+kebata-bataan
+kebelanda-belandaan
+kebiru-biruan
+kebudak-budakan
+kecil-kecilan
+kecil-mengecil
+kecuh-kecah
+kedek-kedek
+kegadis-gadisan
+kegelap-gelapan
+kegila-gilaan
+kegirang-girangan
+kehijau-hijauan
+kehitam-hitaman
+kejaga-jagaan
+kejingga-jinggaan
+kekabur-kaburan
+kekanak-kanakan
+kekoboi-koboian
+kekuning-kuningan
+kelak-kelik
+kelak-keluk
+kelaki-lakian
+kelang-kelok
+kelap-kelip
+kelek-kelek
+kelek-kelekan
+kelik-kelik
+kelip-kelip
+kelusuh-kelasah
+kelut-melut
+kemak-kemik
+kemalu-maluan
+kemanja-manjaan
+kemarah-marahan
+kemasam-masaman
+kemati-matian
+kemerah-merahan
+kempang-kempis
+kempas-kempis
+kemuda-mudaan
+kena-mengena
+kenal-mengenal
+kenang-kenangan
+kencang-kencung
+kendang-kendang
+kendang-kendangan
+kentung-kentung
+kenyat-kenyit
+kepandir-pandiran
+kepang-kepot
+keperak-perakan
+kepilu-piluan
+kepura-puraan
+keputih-putihan
+kerah-kerahan
+kerancak-rancakan
+kerang-kerangan
+kerang-keroh
+kerang-kerung
+kerap-kerap
+keras-mengerasi
+kercap-kercip
+kercap-kercup
+keriang-keriut
+kernyat-kernyut
+kerong-kerong
+keropas-kerapis
+kertak-kertuk
+keruntang-pungkang
+kesap-kesip
+kesenak-senakan
+kesewenang-wenangan
+kesia-siaan
+kesik-kesik
+kesipu-sipuan
+kesu-kesi
+kesuh-kesih
+kesuk-kesik
+ketergesa-gesaan
+keti-keti
+ketidur-tiduran
+ketiga-tiganya
+ketua-tuaan
+ketuan-tuanan
+keungu-unguan
+kia-kia
+kiak-kiak
+kial-kial
+kiang-kiut
+kibang-kibut
+kicang-kecoh
+kicang-kicu
+kida-kida
+kilau-mengilau
+kili-kili
+kira-kira
+kira-kiraan
+kisi-kisi
+kocah-kacih
+kodok-kodok
+kolang-kaling
+koleh-koleh
+kolong-kolong
+koma-koma
+komat-kamit
+kontal-kantil
+kontang-kanting
+kosak-kasik
+kotak-katik
+kotak-kotak
+kuat-kuat
+kucar-kacir
+kucing-kucing
+kucing-kucingan
+kuda-kuda
+kuda-kudaan
+kudap-kudap
+kulah-kulah
+kulak-kulak
+kulik-kulik
+kulum-kulum
+kumat-kamit
+kunang-kunang
+kupat-kapit
+kupu-kupu
+kura-kura
+kurang-kurang
+kusat-mesat
+kutat-kutet
+kuti-kuti
+labi-labi
+labu-labu
+lagi-lagi
+laguh-lagah
+laki-laki
+lalu-lalang
+lama-kelamaan
+lama-lama
+lamat-lamat
+lambat-lambat
+lancar-lancar
+langak-longok
+langit-langit
+lanja-lanjaan
+lapat-lapat
+large-scale
+lari-lari
+lauk-pauk
+lawah-lawah
+lawak-lawak
+lawi-lawi
+layang-layang
+layu-layuan
+lebih-lebih
+legak-legok
+lekak-lekuk
+lekap-lekup
+lekas-lekas
+lekuh-lekih
+lekup-lekap
+lenggak-lenggok
+lenggok-lenggok
+lengket-lengket
+lentam-lentum
+lentang-lentok
+lentang-lentung
+lepa-lepa
+lerang-lerang
+lereng-lereng
+letah-letai
+letup-letup
+liang-liuk
+lidah-lidah
+line-up
+liuk-liuk
+liung-liung
+lobi-lobi
+lock-up
+lopak-lapik
+lopak-lopak
+lumba-lumba
+lumi-lumi
+luntang-lantung
+lupa-lupa
+lupa-lupaan
+main-mainan
+makan-makanan
+make-up
+malai-malai
+malam-malam
+malar-malar
+mali-mali
+malu-malu
+mana-mana
+manik-manik
+manis-manisan
+mark-up
+masing-masing
+mata-mata
+mati-matian
+maya-maya
+megap-megap
+megrek-megrek
+melak-melak
+melambai-lambai
+melambai-lambaikan
+melambat-lambatkan
+melaun-laun
+melawak-lawak
+melayap-layap
+melayap-layapkan
+melebih-lebihi
+melebih-lebihkan
+melejang-lejangkan
+melengah-lengah
+melihat-lihat
+melimpah-limpah
+melincah-lincah
+meloncat-loncat
+melonco-lonco
+melonjak-lonjak
+memacak-macak
+memaki-maki
+memaksa-maksa
+memandai-mandai
+memanggil-manggil
+memanis-manis
+memanjut-manjut
+memasak-masak
+memata-matai
+mematah-matah
+mematut-matut
+memayah-mayahkan
+membagi-bagikan
+membalik-balik
+membangkit-bangkit
+membayang-bayangi
+membayang-bayangkan
+membelai-belai
+membenar-benar
+membenar-benari
+memberai-beraikan
+membesar-besarkan
+membolak-balikkan
+membuang-buang
+membuat-buat
+membunga-bungai
+memburu-buru
+memburu-burukan
+memburuk-burukkan
+memencak-mencak
+memencar-mencar
+memetak-metak
+memetang-metangkan
+memetir-metir
+memikir-mikirkan
+memilih-milih
+meminang-minang
+meminta-minta
+memisah-misahkan
+memontang-mantingkan
+memperamat-amat
+memperamat-amatkan
+memperbagai-bagaikan
+memperganda-gandakan
+memperganduh-ganduhkan
+mempermacam-macamkan
+memperolok-olokkan
+mempersama-samakan
+mempertubi-tubi
+mempertubi-tubikan
+memperturut-turutkan
+memuja-muja
+memukang-mukang
+memulun-mulun
+memundi-mundi
+memundi-mundikan
+memuyu-muyu
+menagak-nagak
+menakut-nakuti
+menanjur-nanjur
+menanti-nanti
+menari-nari
+mencabik-cabik
+mencabik-cabikkan
+mencaing-caing
+mencak-mencak
+mencakup-cakup
+mencapak-capak
+mencari-cari
+mencarik-carik
+mencarut-carut
+mencengis-cengis
+mencepak-cepak
+mencepuk-cepuk
+mencerai-beraikan
+mencetai-cetai
+menciap-ciap
+menciar-ciar
+mencita-citakan
+menciut-ciut
+mencoang-coang
+mencubit-cubit
+mencuri-curi
+mendecap-decap
+mendengking-dengking
+menderak-derakkan
+menderau-derau
+menderu-deru
+mendesas-desuskan
+mendesus-desus
+mendewa-dewakan
+mendudu-dudu
+menebu-nebu
+menegur-neguri
+mengabung-ngabung
+mengaci-acikan
+mengada-ada
+mengaduk-aduk
+mengagak-agak
+mengagak-agihkan
+mengagut-agut
+mengais-ngais
+mengali-ali
+mengalur-alur
+mengamang-amang
+mengamat-amati
+mengambai-ambaikan
+mengambang-ambang
+mengancak-ancak
+mengangan-angankan
+mengangguk-angguk
+mengangin-anginkan
+mengangkat-angkat
+mengap-mengap
+mengapa-apai
+mengapi-apikan
+mengarah-arahi
+mengata-ngatai
+mengaum-aumkan
+mengejan-ejan
+mengelai-ngelai
+mengelepik-ngelepik
+mengelus-elus
+mengembut-embut
+mengenap-enapkan
+mengenjak-enjak
+mengepak-ngepak
+mengepak-ngepakkan
+menggaba-gabai
+menggalur-galur
+menggamak-gamak
+menggapai-gapai
+menggapai-gapaikan
+menggelepar-gelepar
+menggelepar-geleparkan
+menggemak-gemak
+menggerecak-gerecak
+menggesa-gesakan
+menggili-gili
+menggorek-gorek
+menggosok-gosok
+mengguit-guit
+menghalai-balaikan
+menghinap-hinap
+mengiang-ngiang
+mengibas-ngibas
+mengidam-idamkan
+mengilah-ngilahkan
+mengilai-ilai
+mengilat-ngilatkan
+mengilik-ngilik
+mengimak-imak
+mengiming-iming
+menginjak-injak
+mengipas-ngipas
+mengira-ngira
+mengira-ngirakan
+mengiras-iras
+mengiras-irasi
+mengitar-ngitar
+mengitik-ngitik
+mengogok-ogok
+mengolak-alikkan
+mengoleng-oleng
+mengongkang-ongkang
+mengongkok-ongkok
+mengonyah-anyih
+mengotak-ngatikkan
+mengoyak-ngoyakkan
+mengoyak-oyak
+menguar-nguarkan
+menguar-uarkan
+menguber-uber
+mengubit-ubit
+mengubrak-abrik
+mengucar-ngacirkan
+mengucek-ngucek
+menguik-uik
+menguis-uis
+mengulit-ulit
+menguman-uman
+mengumbang-ambingkan
+mengumpak-umpak
+mengungkat-ungkat
+mengungkit-ungkit
+mengurik-urik
+mengutak-ngatikkan
+mengutik-ngutik
+menimang-nimang
+meningkat-ningkat
+meniru-niru
+meniup-niup
+menjadi-jadi
+menjengek-jengek
+menjengit-jengit
+menjilat-jilat
+mentah-mentah
+mentang-mentang
+menunda-nunda
+menusuk-nusuk
+menyama-nyama
+menyambar-nyambar
+menyanjung-nyanjung
+menyapu-nyapu
+menyarat-nyarat
+menyendi-nyendi
+menyeret-nyeret
+menyeru-nyerukan
+menyia-nyiakan
+menyungguh-nyungguhi
+meraba-raba
+merangkak-rangkak
+merasa-rasai
+meraung-raung
+meraung-raungkan
+merayau-rayau
+merayu-rayu
+mereka-reka
+merelap-relap
+meremah-remah
+meremeh-temehkan
+merempah-rempahi
+merengek-rengek
+merenik-renik
+merenta-renta
+merenyai-renyai
+merintang-rintang
+merintik-rintik
+merobek-robek
+meronta-ronta
+merungus-rungus
+merungut-rungut
+mewarna-warnikan
+meyakin-yakini
+miju-miju
+minta-minta
+moga-moga
+morat-marit
+muda-mudi
+mudah-mudahan
+muka-muka
+mula-mula
+muluk-muluk
+naga-naga
+nanti-nantian
+nasi-nasi
+nasib-nasiban
+nenek-nenek
+nyolong-nyolong
+ogah-ogahan
+ogak-ogak
+olak-alik
+olak-olak
+olang-aling
+olang-alingan
+oleh-oleh
+olok-olok
+olok-olokan
+olong-olong
+on-screen
+onde-onde
+one-to-one
+oneng-oneng
+ongkang-ongkang
+ongol-ongol
+onyah-anyih
+orak-arik
+orang-aring
+orang-orangan
+orok-orok
+orong-orong
+otak-otak
+otak-otakan
+padi-padian
+pagi-pagi
+palas-palas
+paling-paling
+palu-memalu
+panas-panas
+pandang-memandang
+panji-panji
+para-para
+paru-paru
+pasang-memasang
+pasu-pasu
+paya-paya
+pecah-pecah
+pelan-pelan
+pengundang-undang
+perang-perangan
+perintang-rintang
+perlahan-lahan
+perlip-perlipan
+pertama-tama
+perundang-undangan
+pesan-pesan
+piat-piut
+pick-up
+pijak-pijak
+pijar-pijar
+pijat-pijat
+pina-pina
+pisang-pisang
+play-off
+pohon-pohonan
+pokrol-pokrolan
+polang-paling
+poma-poma
+pontang-panting
+porak-parik
+porak-peranda
+potong-memotong
+puji-pujian
+pukang-pukang
+pukul-memukul
+pulang-pergi
+pulut-pulut
+pundi-pundi
+punggung-memunggung
+pura-pura
+pusar-pusar
+push-up
+pusing-pusing
+putus-putus
+rada-rada
+radio-frequency
+ragu-ragu
+rama-rama
+rambu-rambu
+rango-rango
+rasa-rasanya
+rata-rata
+real-time
+rebah-rebah
+rebah-rebahan
+redam-redam
+reka-reka
+reka-rekaan
+remah-remah
+remang-remang
+rembah-rembih
+remeh-temeh
+rempah-rempah
+repuh-repuh
+riang-riang
+ribu-ribu
+rigi-rigi
+robak-rabik
+robat-rabit
+role-play
+roll-on
+rombang-rambing
+ruak-ruak
+ruku-ruku
+rumah-rumah
+rumah-rumahan
+rumput-rumputan
+runding-merunding
+runggu-rangga
+runner-up
+rupa-rupa
+rupa-rupanya
+saban-saban
+sabung-menyabung
+saing-menyaing
+salah-salah
+sama-sama
+samar-samar
+sambar-menyambar
+sambung-bersambung
+sambung-menyambung
+sambut-menyambut
+sampai-sampai
+sandar-menyandar
+sangat-sangat
+sangkut-menyangkut
+sapa-menyapa
+sapu-sapu
+sarit-sarit
+satu-satu
+satu-satunya
+sayup-menyayup
+sayup-sayup
+sayur-mayur
+sayur-sayuran
+sci-fi
+seakal-akal
+seakan-akan
+sealak-alak
+sebaik-baiknya
+sebelah-menyebelah
+sebentar-sebentar
+seberang-menyeberang
+seboleh-bolehnya
+sedalam-dalamnya
+sedang-menyedang
+sedap-sedapan
+sedapat-dapatnya
+sedikit-dikitnya
+sedikit-sedikit
+sedikit-sedikitnya
+seelok-eloknya
+segala-galanya
+segan-menyegan
+segan-menyegani
+segan-segan
+sehari-hari
+sehari-harian
+sejadi-jadinya
+sekali-kali
+sekali-sekali
+sekira-kira
+sekonyong-konyong
+sekuasa-kuasanya
+sekurang-kurangnya
+sela-menyela
+sela-sela
+selama-lamanya
+selambat-lambatnya
+selang-seli
+selang-seling
+selar-belar
+selat-latnya
+selekas-lekasnya
+selepas-lepas
+self-esteem
+self-help
+sema-sema
+semah-semah
+semak-semak
+semalam-malaman
+semasa-masa
+semata-mata
+sembunyi-sembunyi
+sembunyi-sembunyian
+semena-mena
+semenda-menyemenda
+semengga-mengga
+sementang-mentang
+semu-semu
+semut-semutan
+sengal-sengal
+sengau-sengauan
+seolah-olah
+sepala-pala
+sepandai-pandai
+sepetang-petangan
+sepoi-sepoi
+sepuas-puasnya
+serang-menyerang
+seraya-menyeraya
+serba-serbi
+serbah-serbih
+serembah-serembih
+sering-sering
+serta-menyertai
+serta-serta
+sesal-menyesali
+sesudah-sudah
+sesudah-sudahnya
+sesuka-suka
+setempat-setempat
+setengah-setengah
+setidak-tidaknya
+seupaya-upaya
+seupaya-upayanya
+sewaktu-waktu
+sewenang-wenang
+short-term
+sia-sia
+siang-siang
+siapa-siapa
+sibar-sibar
+sibur-sibur
+sida-sida
+siku-siku
+silah-silah
+silang-menyilang
+silir-semilir
+sinar-seminar
+sindir-menyindir
+singgah-menyinggah
+sorak-sorai
+stand-by
+stand-up
+sudu-sudu
+sudung-sudung
+suka-suka
+sulang-menyulang
+sulur-suluran
+sumpah-sumpah
+sumpit-sumpit
+sungguh-sungguh
+sungut-sungut
+suram-suram
+surat-menyurat
+suruh-suruhan
+tabar-tabar
+tabir-mabir
+tabrak-tubruk
+tabuh-tabuhan
+tahu-menahu
+tahu-tahu
+takang-takik
+take-off
+takut-takut
+takut-takutan
+tali-bertali
+tali-tali
+tampak-tampak
+tanam-menanam
+tanam-tanaman
+tanda-tanda
+tangan-menangan
+tangan-tangan
+tanggung-menanggung
+tapa-tapa
+tapak-tapak
+tari-menari
+tari-tarian
+tarik-menarik
+tatah-tatah
+tawak-tawak
+tawang-tawang
+tawar-menawar
+tawar-tawar
+tayum-temayum
+tebu-tebu
+tegak-tegak
+teka-teki
+temas-temas
+tembak-menembak
+temut-temut
+tenggang-menenggang
+teraba-raba
+terambang-ambang
+terang-terang
+terang-terangan
+teranggar-anggar
+terangguk-angguk
+teranggul-anggul
+terangin-angin
+terangkup-angkup
+teranja-anja
+terapung-apung
+terayan-rayan
+terayap-rayap
+terbada-bada
+terbahak-bahak
+terbata-bata
+terbatuk-batuk
+terbayang-bayang
+terbengkil-bengkil
+terbirit-birit
+terbuai-buai
+terbuang-buang
+terburu-buru
+tercangak-cangak
+tercengang-cengang
+tercilap-cilap
+tercongget-congget
+tercungap-cungap
+terdangka-dangka
+terdengih-dengih
+terekeh-ekeh
+terembut-embut
+terembut-rembut
+terengah-engah
+teresak-esak
+tergagap-gagap
+tergagau-gagau
+tergaguk-gaguk
+tergapai-gapai
+tergegap-gegap
+tergegas-gegas
+tergelung-gelung
+tergerenyeng-gerenyeng
+tergesa-gesa
+tergila-gila
+tergontai-gontai
+tergudik-gudik
+terguling-guling
+tergulut-gulut
+terharak-harak
+terharap-harap
+terhengit-hengit
+terhinggut-hinggut
+terigau-igau
+terincut-incut
+teringa-inga
+teringat-ingat
+terinjak-injak
+terjembak-jembak
+terjerit-jerit
+terkadang-kadang
+terkakah-kakah
+terkakak-kakak
+terkanjar-kanjar
+terkapah-kapah
+terkapai-kapai
+terkapung-kapung
+terkatah-katah
+terkatung-katung
+terkecap-kecap
+terkedek-kedek
+terkedip-kedip
+terkejar-kejar
+terkekau-kekau
+terkekeh-kekeh
+terkekek-kekek
+terkelinjat-kelinjat
+terkelip-kelip
+terkempul-kempul
+terkemut-kemut
+terkencar-kencar
+terkepak-kepak
+terkesot-kesot
+terkesut-kesut
+terkial-kial
+terkincak-kincak
+terkindap-kindap
+terkinja-kinja
+terkirai-kirai
+terkitar-kitar
+terkocoh-kocoh
+terkokol-kokol
+terkosel-kosel
+terkoteng-koteng
+terkumpal-kumpal
+terlara-lara
+terlayang-layang
+terlebih-lebih
+terlincah-lincah
+terliuk-liuk
+terlolong-lolong
+terlongong-longong
+termangu-mangu
+termanja-manja
+termata-mata
+termengah-mengah
+termimpi-mimpi
+ternanti-nanti
+terngiang-ngiang
+teroleng-oleng
+terpandang-pandang
+terpecah-pecah
+terpekik-pekik
+terpereh-pereh
+terpikau-pikau
+terpinga-pinga
+terpingkal-pingkal
+terpontang-panting
+terpusing-pusing
+terputus-putus
+tersanga-sanga
+tersaruk-saruk
+tersedan-sedan
+tersedih-sedih
+tersedu-sedu
+tersendat-sendat
+tersendeng-sendeng
+tersengal-sengal
+tersengguk-sengguk
+tersengut-sengut
+tersera-sera
+terserak-serak
+tersetai-setai
+tersia-sia
+tersipu-sipu
+tersoja-soja
+tersungkuk-sungkuk
+tertagak-tagak
+tertahan-tahan
+tertatih-tatih
+tertegun-tegun
+tertekan-tekan
+terteleng-teleng
+terumbang-ambing
+terumbang-umbang
+terungkap-ungkap
+terus-menerus
+terus-terusan
+think-tank
+tiap-tiap
+tiba-tiba
+tidak-tidak
+tidur-tidur
+tie-dye
+tiga-tiganya
+tikam-menikam
+tilik-menilik
+timah-timah
+timang-timangan
+timbang-menimbang
+timu-timu
+tindih-bertindih
+tinjau-meninjau
+tip-off
+tiru-tiruan
+tiup-tiup
+tokak-takik
+tokok-menokok
+tolak-menolak
+tolong-menolong
+top-level
+trade-in
+tua-tua
+tuan-tuan
+tuang-tuang
+tuban-tuban
+tukang-menukang
+tukar-menukar
+tulang-tulangan
+tuli-tuli
+tulis-menulis
+tumbuh-tumbuhan
+tune-up
+tunggang-tunggit
+tupai-tupai
+turun-temurun
+turut-menurut
+turut-turutan
+two-tone
+uar-uar
+ubel-ubel
+ubun-ubun
+ubur-ubur
+uci-uci
+udap-udapan
+ugal-ugalan
+uir-uir
+ujar-ujar
+ukir-mengukir
+ula-ula
+ulak-ulak
+ulang-alik
+ulang-aling
+ulang-ulang
+ulap-ulap
+ular-ular
+ular-ularan
+ulung-ulung
+umang-umang
+umbang-ambing
+umbi-umbian
+umbul-umbul
+umbut-umbut
+uncang-uncit
+undak-undakan
+undang-undang
+unduk-unduk
+undung-undung
+undur-undur
+unggat-unggit
+ungkit-ungkit
+unting-unting
+untung-untung
+untung-untungan
+upside-down
+ura-ura
+uran-uran
+urat-urat
+uring-uringan
+urup-urup
+urup-urupan
+urus-urus
+user-user
+user-useran
+utar-utar
+voice-over
+walk-out
+wangi-wangian
+wanti-wanti
+wara-wara
+warna-warni
+water-cooled
+world-class
+yang-yang
+""".split()
+)
diff --git a/spacy/lang/ms/examples.py b/spacy/lang/ms/examples.py
new file mode 100644
index 000000000..97ab19b6e
--- /dev/null
+++ b/spacy/lang/ms/examples.py
@@ -0,0 +1,18 @@
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.ms.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+ "Malaysia ialah sebuah negara yang terletak di Asia Tenggara.",
+ "Berapa banyak pelajar yang akan menghadiri majlis perpisahan sekolah?",
+ "Pengeluaran makanan berasal dari beberapa lokasi termasuk Cameron Highlands, Johor Bahru, dan Kuching.",
+ "Syarikat XYZ telah menghasilkan 20,000 unit produk baharu dalam setahun terakhir",
+ "Kuala Lumpur merupakan ibu negara Malaysia." "Kau berada di mana semalam?",
+ "Siapa yang akan memimpin projek itu?",
+ "Siapa perdana menteri Malaysia sekarang?",
+]
diff --git a/spacy/lang/ms/lex_attrs.py b/spacy/lang/ms/lex_attrs.py
new file mode 100644
index 000000000..2088c9955
--- /dev/null
+++ b/spacy/lang/ms/lex_attrs.py
@@ -0,0 +1,65 @@
+import unicodedata
+
+from ...attrs import IS_CURRENCY, LIKE_NUM
+from .punctuation import LIST_CURRENCY
+
+_num_words = [
+ "kosong",
+ "satu",
+ "dua",
+ "tiga",
+ "empat",
+ "lima",
+ "enam",
+ "tujuh",
+ "lapan",
+ "sembilan",
+ "sepuluh",
+ "sebelas",
+ "belas",
+ "puluh",
+ "ratus",
+ "ribu",
+ "juta",
+ "billion",
+ "trillion",
+ "kuadrilion",
+ "kuintilion",
+ "sekstilion",
+ "septilion",
+ "oktilion",
+ "nonilion",
+ "desilion",
+]
+
+
+def like_num(text):
+ if text.startswith(("+", "-", "±", "~")):
+ text = text[1:]
+ text = text.replace(",", "").replace(".", "")
+ if text.isdigit():
+ return True
+ if text.count("/") == 1:
+ num, denom = text.split("/")
+ if num.isdigit() and denom.isdigit():
+ return True
+ if text.lower() in _num_words:
+ return True
+ if text.count("-") == 1:
+ _, num = text.split("-")
+ if num.isdigit() or num in _num_words:
+ return True
+ return False
+
+
+def is_currency(text):
+ if text in LIST_CURRENCY:
+ return True
+
+ for char in text:
+ if unicodedata.category(char) != "Sc":
+ return False
+ return True
+
+
+LEX_ATTRS = {IS_CURRENCY: is_currency, LIKE_NUM: like_num}
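Both getters are plain functions, so they can be sanity-checked without building a pipeline. A short sketch, assuming this diff is applied:

```python
# Behavioural sketch of the lexical attribute getters defined above.
from spacy.lang.ms.lex_attrs import is_currency, like_num

assert like_num("20,000")    # "," and "." are stripped before isdigit()
assert like_num("3/4")       # a single "/" with digit numerator and denominator
assert like_num("tiga")      # listed in _num_words
assert like_num("ke-3")      # a single "-" followed by a digit or number word
assert not like_num("buku")
assert is_currency("RM")     # listed in LIST_CURRENCY
assert is_currency("$")      # every character in Unicode category "Sc"
```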
diff --git a/spacy/lang/ms/punctuation.py b/spacy/lang/ms/punctuation.py
new file mode 100644
index 000000000..a8d6c2e8e
--- /dev/null
+++ b/spacy/lang/ms/punctuation.py
@@ -0,0 +1,60 @@
+from ..char_classes import ALPHA, _currency, _units, merge_chars, split_chars
+from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+
+_units = (
+ _units + "s bit Gbps Mbps mbps Kbps kbps ƒ ppi px "
+ "Hz kHz MHz GHz mAh "
+ "ratus rb ribu ribuan "
+ "juta jt jutaan mill?iar million bil[l]?iun bilyun billion "
+)
+_currency = _currency + r" USD RM MYR Rp IDR RMB SGD S\$"
+_months = (
+ "Januari Februari Mac April Mei Jun Julai Ogos September "
+ "Oktober November Disember Januari Februari Mac Mei Jun "
+ "Julai Ogos Oktober Disember Jan Feb Mac Jun Julai Ogos Sept "
+ "Okt Nov Dis"
+)
+
+
+UNITS = merge_chars(_units)
+CURRENCY = merge_chars(_currency)
+HTML_PREFIX = r"<(b|strong|i|em|p|span|div|br)\s?/>|]+)>"
+HTML_SUFFIX = r"(b|strong|i|em|p|span|div|a)>"
+MONTHS = merge_chars(_months)
+LIST_CURRENCY = split_chars(_currency)
+
+_prefixes = list(TOKENIZER_PREFIXES)
+_prefixes.remove("#") # hashtag
+_prefixes = _prefixes + LIST_CURRENCY + [HTML_PREFIX] + ["/", "—"]
+
+_suffixes = (
+ TOKENIZER_SUFFIXES
+ + [r"\-[Nn]ya", "-[KkMm]u", "[—-]"]
+ + [
+ # disabled: variable width currency variable
+ # r"(?<={c})(?:[0-9]+)".format(c=CURRENCY),
+ r"(?<=[0-9])(?:{u})".format(u=UNITS),
+ r"(?<=[0-9])%",
+ # disabled: variable width HTML_SUFFIX variable
+ # r"(?<=[0-9{a}]{h})(?:[\.,:-])".format(a=ALPHA, h=HTML_SUFFIX),
+ r"(?<=[0-9{a}])(?:{h})".format(a=ALPHA, h=HTML_SUFFIX),
+ ]
+)
+
+_infixes = TOKENIZER_INFIXES + [
+ r"(?<=[0-9])[\\/](?=[0-9%-])",
+ r"(?<=[0-9])%(?=[{a}0-9/])".format(a=ALPHA),
+ # disabled: variable width units variable
+ # r"(?<={u})[\/-](?=[0-9])".format(u=UNITS),
+ # disabled: variable width months variable
+ # r"(?<={m})[\/-](?=[0-9])".format(m=MONTHS),
+ r'(?<=[0-9)][.,])"(?=[0-9])',
+ r'(?<=[{a})][.,\'])["—](?=[{a}])'.format(a=ALPHA),
+ r"(?<=[{a}])-(?=[0-9])".format(a=ALPHA),
+ r"(?<=[0-9])-(?=[{a}])".format(a=ALPHA),
+ r"(?<=[{a}])[\/-](?={c}|[{a}])".format(a=ALPHA, c=CURRENCY),
+]
+
+TOKENIZER_PREFIXES = _prefixes
+TOKENIZER_SUFFIXES = _suffixes
+TOKENIZER_INFIXES = _infixes
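When the tokenizer is constructed, spaCy compiles these exported lists into single regexes via `spacy.util.compile_prefix_regex`, `compile_suffix_regex` and `compile_infix_regex`. A small sketch of the expected effect, assuming this diff is applied (the expected splits are illustrative, not test output):

```python
# Sketch of how the rule lists above behave once compiled into a tokenizer.
import spacy
from spacy.lang.ms.punctuation import TOKENIZER_SUFFIXES
from spacy.util import compile_suffix_regex

suffix_re = compile_suffix_regex(TOKENIZER_SUFFIXES)
print(suffix_re.search("100%"))  # "%" after a digit matches a suffix rule

nlp = spacy.blank("ms")
print([t.text for t in nlp("RM3,000 99% buku-nya")])
# expected: "RM" split off as a currency prefix, "%" and "-nya" as suffixes
```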
diff --git a/spacy/lang/ms/stop_words.py b/spacy/lang/ms/stop_words.py
new file mode 100644
index 000000000..b1bfaea79
--- /dev/null
+++ b/spacy/lang/ms/stop_words.py
@@ -0,0 +1,118 @@
+STOP_WORDS = set(
+ """
+ada adalah adanya adapun agak agaknya agar akan akankah akhir akhiri akhirnya
+aku akulah amat amatlah anda andalah antar antara antaranya apa apaan apabila
+apakah apalagi apatah artinya asal asalkan atas atau ataukah ataupun awal
+awalnya
+
+bagai bagaikan bagaimana bagaimanakah bagaimanapun bagi bagian bahkan bahwa
+bahwasanya baik bakal bakalan balik banyak bapak baru bawah beberapa begini
+beginian beginikah beginilah begitu begitukah begitulah begitupun bekerja
+belakang belakangan belum belumlah benar benarkah benarlah berada berakhir
+berakhirlah berakhirnya berapa berapakah berapalah berapapun berarti berawal
+berbagai berdatangan beri berikan berikut berikutnya berjumlah berkali-kali
+berkata berkehendak berkeinginan berkenaan berlainan berlalu berlangsung
+berlebihan bermacam bermacam-macam bermaksud bermula bersama bersama-sama
+bersiap bersiap-siap bertanya bertanya-tanya berturut berturut-turut bertutur
+berujar berupa besar betul betulkah biasa biasanya bila bilakah bisa bisakah
+boleh bolehkah bolehlah buat bukan bukankah bukanlah bukannya bulan bung
+
+cara caranya cukup cukupkah cukuplah cuma
+
+dahulu dalam dan dapat dari daripada datang dekat demi demikian demikianlah
+dengan depan di dia diakhiri diakhirinya dialah diantara diantaranya diberi
+diberikan diberikannya dibuat dibuatnya didapat didatangkan digunakan
+diibaratkan diibaratkannya diingat diingatkan diinginkan dijawab dijelaskan
+dijelaskannya dikarenakan dikatakan dikatakannya dikerjakan diketahui
+diketahuinya dikira dilakukan dilalui dilihat dimaksud dimaksudkan
+dimaksudkannya dimaksudnya diminta dimintai dimisalkan dimulai dimulailah
+dimulainya dimungkinkan dini dipastikan diperbuat diperbuatnya dipergunakan
+diperkirakan diperlihatkan diperlukan diperlukannya dipersoalkan dipertanyakan
+dipunyai diri dirinya disampaikan disebut disebutkan disebutkannya disini
+disinilah ditambahkan ditandaskan ditanya ditanyai ditanyakan ditegaskan
+ditujukan ditunjuk ditunjuki ditunjukkan ditunjukkannya ditunjuknya dituturkan
+dituturkannya diucapkan diucapkannya diungkapkan dong dua dulu
+
+empat enggak enggaknya entah entahlah
+
+guna gunakan
+
+hal hampir hanya hanyalah hari harus haruslah harusnya hendak hendaklah
+hendaknya hingga
+
+ia ialah ibarat ibaratkan ibaratnya ibu ikut ingat ingat-ingat ingin inginkah
+inginkan ini inikah inilah itu itukah itulah
+
+jadi jadilah jadinya jangan jangankan janganlah jauh jawab jawaban jawabnya
+jelas jelaskan jelaslah jelasnya jika jikalau juga jumlah jumlahnya justru
+
+kala kalau kalaulah kalaupun kalian kami kamilah kamu kamulah kan kapan
+kapankah kapanpun karena karenanya kasus kata katakan katakanlah katanya ke
+keadaan kebetulan kecil kedua keduanya keinginan kelamaan kelihatan
+kelihatannya kelima keluar kembali kemudian kemungkinan kemungkinannya kenapa
+kepada kepadanya kesampaian keseluruhan keseluruhannya keterlaluan ketika
+khususnya kini kinilah kira kira-kira kiranya kita kitalah kok kurang
+
+lagi lagian lah lain lainnya lalu lama lamanya lanjut lanjutnya lebih lewat
+lima luar
+
+macam maka makanya makin malah malahan mampu mampukah mana manakala manalagi
+masa masalah masalahnya masih masihkah masing masing-masing mau maupun
+melainkan melakukan melalui melihat melihatnya memang memastikan memberi
+memberikan membuat memerlukan memihak meminta memintakan memisalkan memperbuat
+mempergunakan memperkirakan memperlihatkan mempersiapkan mempersoalkan
+mempertanyakan mempunyai memulai memungkinkan menaiki menambahkan menandaskan
+menanti menanti-nanti menantikan menanya menanyai menanyakan mendapat
+mendapatkan mendatang mendatangi mendatangkan menegaskan mengakhiri mengapa
+mengatakan mengatakannya mengenai mengerjakan mengetahui menggunakan
+menghendaki mengibaratkan mengibaratkannya mengingat mengingatkan menginginkan
+mengira mengucapkan mengucapkannya mengungkapkan menjadi menjawab menjelaskan
+menuju menunjuk menunjuki menunjukkan menunjuknya menurut menuturkan
+menyampaikan menyangkut menyatakan menyebutkan menyeluruh menyiapkan merasa
+mereka merekalah merupakan meski meskipun meyakini meyakinkan minta mirip
+misal misalkan misalnya mula mulai mulailah mulanya mungkin mungkinkah
+
+nah naik namun nanti nantinya nyaris nyatanya
+
+oleh olehnya
+
+pada padahal padanya pak paling panjang pantas para pasti pastilah penting
+pentingnya per percuma perlu perlukah perlunya pernah persoalan pertama
+pertama-tama pertanyaan pertanyakan pihak pihaknya pukul pula pun punya
+
+rasa rasanya rata rupanya
+
+saat saatnya saja sajalah saling sama sama-sama sambil sampai sampai-sampai
+sampaikan sana sangat sangatlah satu saya sayalah se sebab sebabnya sebagai
+sebagaimana sebagainya sebagian sebaik sebaik-baiknya sebaiknya sebaliknya
+sebanyak sebegini sebegitu sebelum sebelumnya sebenarnya seberapa sebesar
+sebetulnya sebisanya sebuah sebut sebutlah sebutnya secara secukupnya sedang
+sedangkan sedemikian sedikit sedikitnya seenaknya segala segalanya segera
+seharusnya sehingga seingat sejak sejauh sejenak sejumlah sekadar sekadarnya
+sekali sekali-kali sekalian sekaligus sekalipun sekarang sekarang sekecil
+seketika sekiranya sekitar sekitarnya sekurang-kurangnya sekurangnya sela
+selain selaku selalu selama selama-lamanya selamanya selanjutnya seluruh
+seluruhnya semacam semakin semampu semampunya semasa semasih semata semata-mata
+semaunya sementara semisal semisalnya sempat semua semuanya semula sendiri
+sendirian sendirinya seolah seolah-olah seorang sepanjang sepantasnya
+sepantasnyalah seperlunya seperti sepertinya sepihak sering seringnya serta
+serupa sesaat sesama sesampai sesegera sesekali seseorang sesuatu sesuatunya
+sesudah sesudahnya setelah setempat setengah seterusnya setiap setiba setibanya
+setidak-tidaknya setidaknya setinggi seusai sewaktu siap siapa siapakah
+siapapun sini sinilah soal soalnya suatu sudah sudahkah sudahlah supaya
+
+tadi tadinya tahu tahun tak tambah tambahnya tampak tampaknya tandas tandasnya
+tanpa tanya tanyakan tanyanya tapi tegas tegasnya telah tempat tengah tentang
+tentu tentulah tentunya tepat terakhir terasa terbanyak terdahulu terdapat
+terdiri terhadap terhadapnya teringat teringat-ingat terjadi terjadilah
+terjadinya terkira terlalu terlebih terlihat termasuk ternyata tersampaikan
+tersebut tersebutlah tertentu tertuju terus terutama tetap tetapi tiap tiba
+tiba-tiba tidak tidakkah tidaklah tiga tinggi toh tunjuk turut tutur tuturnya
+
+ucap ucapnya ujar ujarnya umum umumnya ungkap ungkapnya untuk usah usai
+
+waduh wah wahai waktu waktunya walau walaupun wong
+
+yaitu yakin yakni yang
+""".split()
+)
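`STOP_WORDS` is wired into the vocab's `IS_STOP` lexical attribute when the `Language` is created, so the flag is available even on a blank pipeline. A quick check, assuming this diff is applied:

```python
# Sketch: stop words surface as token.is_stop on a blank "ms" pipeline.
import spacy

nlp = spacy.blank("ms")
doc = nlp("buku yang saya baca")
print([(t.text, t.is_stop) for t in doc])
# expected: "yang" and "saya" flagged as stop words, the rest not
```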
diff --git a/spacy/lang/ms/syntax_iterators.py b/spacy/lang/ms/syntax_iterators.py
new file mode 100644
index 000000000..027798687
--- /dev/null
+++ b/spacy/lang/ms/syntax_iterators.py
@@ -0,0 +1,41 @@
+from typing import Iterator, Tuple, Union
+
+from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
+from ...tokens import Doc, Span
+
+
+def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
+ """
+ Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+ """
+ # fmt: off
+ labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
+ # fmt: on
+ doc = doclike.doc # Ensure works on both Doc and Span.
+ if not doc.has_annotation("DEP"):
+ raise ValueError(Errors.E029)
+ np_deps = [doc.vocab.strings[label] for label in labels]
+ conj = doc.vocab.strings.add("conj")
+ np_label = doc.vocab.strings.add("NP")
+ prev_end = -1
+ for i, word in enumerate(doclike):
+ if word.pos not in (NOUN, PROPN, PRON):
+ continue
+ # Prevent nested chunks from being produced
+ if word.left_edge.i <= prev_end:
+ continue
+ if word.dep in np_deps:
+ prev_end = word.right_edge.i
+ yield word.left_edge.i, word.right_edge.i + 1, np_label
+ elif word.dep == conj:
+ head = word.head
+ while head.dep == conj and head.head.i < head.i:
+ head = head.head
+ # If the head is an NP, and we're coordinated to it, we're an NP
+ if head.dep in np_deps:
+ prev_end = word.right_edge.i
+ yield word.left_edge.i, word.right_edge.i + 1, np_label
+
+
+SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
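No trained Malay pipeline ships yet, so `doc.noun_chunks` would raise `E029` on a blank pipeline; the iterator can still be exercised with a hand-annotated `Doc`. A sketch, assuming this diff is applied (the parse annotations below are made up for illustration):

```python
# Illustrative sketch: the POS/head/dep values are hand-written, not model output.
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("ms")  # registers the noun_chunks iterator on the vocab
doc = Doc(
    nlp.vocab,
    words=["Saya", "membaca", "buku", "itu"],
    pos=["PRON", "VERB", "NOUN", "DET"],
    heads=[1, 1, 1, 2],
    deps=["nsubj", "ROOT", "obj", "det"],
)
print([chunk.text for chunk in doc.noun_chunks])
# expected: ["Saya", "buku itu"]
```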
diff --git a/spacy/lang/ms/tokenizer_exceptions.py b/spacy/lang/ms/tokenizer_exceptions.py
new file mode 100644
index 000000000..e8b53fed8
--- /dev/null
+++ b/spacy/lang/ms/tokenizer_exceptions.py
@@ -0,0 +1,1535 @@
+from ...symbols import NORM, ORTH
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ._tokenizer_exceptions_list import MS_BASE_EXCEPTIONS
+
+# List of abbreviations and acronyms from:
+# https://ms.wiktionary.org/wiki/Wiktionary:Senarai_akronim_dan_singkatan
+
+_exc = {}
+
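+# Register every base exception in additional casings as well: title-case,
+# upper-case, lower-case and first-letter-capitalised, plus per-part title
+# and upper variants for hyphenated words, so casing never breaks a match.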
+for orth in MS_BASE_EXCEPTIONS:
+ _exc[orth] = [{ORTH: orth}]
+ orth_title = orth.title()
+ _exc[orth_title] = [{ORTH: orth_title}]
+ orth_caps = orth.upper()
+ _exc[orth_caps] = [{ORTH: orth_caps}]
+ orth_lower = orth.lower()
+ _exc[orth_lower] = [{ORTH: orth_lower}]
+ orth_first_upper = orth[0].upper() + orth[1:]
+ _exc[orth_first_upper] = [{ORTH: orth_first_upper}]
+ if "-" in orth:
+ orth_title = "-".join([part.title() for part in orth.split("-")])
+ _exc[orth_title] = [{ORTH: orth_title}]
+ orth_caps = "-".join([part.upper() for part in orth.split("-")])
+ _exc[orth_caps] = [{ORTH: orth_caps}]
+
+for exc_data in [
+ {ORTH: "Jan.", NORM: "Januari"},
+ {ORTH: "Feb.", NORM: "Februari"},
+ {ORTH: "Mac.", NORM: "Mac"},
+ {ORTH: "Apr.", NORM: "April"},
+ {ORTH: "Jun.", NORM: "Jun"},
+ {ORTH: "Jul.", NORM: "Julai"},
+ {ORTH: "Ogos.", NORM: "Ogos"},
+ {ORTH: "Sep.", NORM: "September"},
+ {ORTH: "Okt.", NORM: "Oktober"},
+ {ORTH: "Nov.", NORM: "November"},
+ {ORTH: "Dis.", NORM: "Disember"},
+]:
+ _exc[exc_data[ORTH]] = [exc_data]
+
+_other_exc = {
+ "do'a": [{ORTH: "do'a", NORM: "doa"}],
+ "jum'at": [{ORTH: "jum'at", NORM: "Jumat"}],
+ "Jum'at": [{ORTH: "Jum'at", NORM: "Jumat"}],
+ "la'nat": [{ORTH: "la'nat", NORM: "laknat"}],
+ "ma'af": [{ORTH: "ma'af", NORM: "maaf"}],
+ "mu'jizat": [{ORTH: "mu'jizat", NORM: "mukjizat"}],
+ "Mu'jizat": [{ORTH: "Mu'jizat", NORM: "mukjizat"}],
+ "ni'mat": [{ORTH: "ni'mat", NORM: "nikmat"}],
+ "raka'at": [{ORTH: "raka'at", NORM: "rakaat"}],
+ "ta'at": [{ORTH: "ta'at", NORM: "taat"}],
+}
+
+_exc.update(_other_exc)
+
+for orth in [
+ "1 Kor.",
+ "1 Ptr.",
+ "1 Raj.",
+ "1 Sam.",
+ "1 Taw.",
+ "1 Tes.",
+ "1 Tim.",
+ "1 Yoh.",
+ "1Ch.",
+ "1Co.",
+ "1Jo.",
+ "1Ki.",
+ "1Pe.",
+ "1Sa.",
+ "1Th.",
+ "1Ti.",
+ "2 Kor.",
+ "2 Ptr.",
+ "2 Raj.",
+ "2 Sam.",
+ "2 Taw.",
+ "2 Tes.",
+ "2 Tim.",
+ "2 Yoh.",
+ "2Ch.",
+ "2Co.",
+ "2Jo.",
+ "2Ki.",
+ "2Pe.",
+ "2Sa.",
+ "2Th.",
+ "2Ti.",
+ "3 Yoh.",
+ "3D",
+ "3F",
+ "3Jo.",
+ "3M",
+ "8MP",
+ "AA",
+ "AAAAAA",
+ "AB",
+ "Abd.",
+ "ABC",
+ "ABIM",
+ "ABM",
+ "ABMI",
+ "ABS",
+ "AC",
+ "Ac",
+ "ACAPLPL",
+ "Act.",
+ "AD",
+ "AD LIB",
+ "ADAM",
+ "ADB",
+ "ADD",
+ "ADIL",
+ "ADN",
+ "ADR",
+ "ADRI",
+ "ADSL",
+ "ADUN",
+ "AFAS",
+ "AFTA",
+ "Ag",
+ "AGMARIS",
+ "AH",
+ "AI",
+ "AIA",
+ "AIDS",
+ "AIJV",
+ "AIM",
+ "a/k",
+ "ak",
+ "AKN",
+ "Al",
+ "a/l",
+ "AM",
+ "Am",
+ "Am.",
+ "AMN",
+ "Amo.",
+ "AMPS",
+ "Ams.",
+ "AMWA",
+ "AN",
+ "a.n.",
+ "ANGKASA",
+ "ANM",
+ "ANSI",
+ "Ant.",
+ "AOL",
+ "AP",
+ "a/p",
+ "APD",
+ "APEC",
+ "API",
+ "APIK",
+ "APM",
+ "APN",
+ "APP",
+ "Apr.",
+ "APRI",
+ "Ar",
+ "Ar.",
+ "ark.",
+ "A.S.",
+ "As",
+ "a.s.",
+ "ASA",
+ "ASAS 50",
+ "ASB",
+ "ASCII",
+ "ASEAN",
+ "ASEAN+3",
+ "ASEM",
+ "a.s.f.",
+ "ASN",
+ "a.s.o.",
+ "ASP",
+ "Ast.",
+ "A.T.",
+ "At",
+ "ATM",
+ "a.t.r.",
+ "ATUR",
+ "Au",
+ "AURI",
+ "Aug.",
+ "AWOL",
+ "Ayb.",
+ "B",
+ "BA",
+ "Ba",
+ "BAC",
+ "BAFIA",
+ "BAM",
+ "BANANA",
+ "BAPP",
+ "BASF",
+ "BATA",
+ "BB",
+ "BBC",
+ "BBE",
+ "BBS",
+ "BC",
+ "BCG",
+ "BCIC",
+ "b.d.",
+ "BDSSHAM",
+ "Be",
+ "BEER",
+ "BERNAMA",
+ "Bh",
+ "b.h.",
+ "Bhd.",
+ "Bi",
+ "BIDS",
+ "Bil.",
+ "bil.",
+ "BIMP-EAGA",
+ "Bio.",
+ "BIOS",
+ "BITMB",
+ "BJ",
+ "Bk",
+ "b.k.",
+ "BKAL",
+ "bkn.",
+ "BKP",
+ "BL",
+ "BLR",
+ "BM",
+ "BMI",
+ "BMW",
+ "BN",
+ "BNM",
+ "BO",
+ "BOJ",
+ "BOO",
+ "BOP",
+ "BOT",
+ "BP",
+ "b.p.",
+ "BPA",
+ "BPAs",
+ "bpd.",
+ "BPIMB",
+ "BPM",
+ "BPO",
+ "BPPH",
+ "Br",
+ "Br.",
+ "BSA",
+ "B.Sc.",
+ "B.Sh.",
+ "b.s.j.",
+ "BSN",
+ "Bt.",
+ "bt.",
+ "BWT",
+ "BYOB",
+ "C",
+ "C.",
+ "C/E",
+ "Ca",
+ "CAAM",
+ "CAD",
+ "CAM",
+ "CATV",
+ "CBS",
+ "CBT",
+ "CC",
+ "CCD",
+ "CCM",
+ "CCR",
+ "cct-km",
+ "CCTV",
+ "CCU",
+ "CD",
+ "Cd",
+ "CD-ROM",
+ "CD-RW",
+ "CDRC",
+ "Ce",
+ "CEO",
+ "CEPT",
+ "Cetak",
+ "Cf",
+ "CFO",
+ "CFTC",
+ "CGC",
+ "CGI",
+ "CH",
+ "CIA",
+ "CIAST",
+ "CID",
+ "CIDB",
+ "CIQ",
+ "CKD",
+ "CL",
+ "Cl",
+ "c.l.",
+ "CLI",
+ "CLOB",
+ "CM",
+ "Cm",
+ "cm.",
+ "CMAG",
+ "CMI",
+ "CMP",
+ "CNN",
+ "Co",
+ "COD",
+ "Col.",
+ "COLA",
+ "COMDEX",
+ "CP",
+ "CPI",
+ "CPO",
+ "CPR",
+ "CPU",
+ "Cr",
+ "CRDF",
+ "Cs",
+ "CST",
+ "CT",
+ "CTIP",
+ "CTRM",
+ "Cu",
+ "CUEPACS",
+ "D-8",
+ "d/a",
+ "DAGS",
+ "Dan.",
+ "DANCED",
+ "DAP",
+ "DARA",
+ "Db",
+ "DBKL",
+ "DBP",
+ "DBR",
+ "DC",
+ "DDA",
+ "DDT",
+ "DEB",
+ "Dec.",
+ "Deu.",
+ "DFIs",
+ "dgn.",
+ "DHL",
+ "DIBML",
+ "DIN",
+ "Dis.",
+ "DJ",
+ "d.l.l.",
+ "dlm.",
+ "dng.",
+ "DNS",
+ "DO",
+ "DOA",
+ "DOE",
+ "DOF",
+ "DOSH",
+ "doz.",
+ "DPPS",
+ "Dr.",
+ "dr.",
+ "drp.",
+ "drpd.",
+ "Ds",
+ "d.sb.",
+ "d.st.",
+ "DSTN2",
+ "Dt.",
+ "DTAs",
+ "DTMF",
+ "DTP",
+ "DTV",
+ "DUBES",
+ "DUNHILL",
+ "DV8",
+ "DVD",
+ "DVE",
+ "DVS",
+ "dw.t.",
+ "Dy",
+ "DYMM",
+ "E",
+ "E-Commerce",
+ "E-Dagang",
+ "E&E",
+ "E-Faraid",
+ "E-Government",
+ "E-Kerajaan",
+ "E-Mail",
+ "E-Services",
+ "E-Village",
+ "E-Zine",
+ "EALAF",
+ "EBI",
+ "EBP",
+ "EC",
+ "ECAFE",
+ "Ecc.",
+ "ECI",
+ "ECM",
+ "ECOSOC",
+ "ECP",
+ "ECR",
+ "EDI",
+ "EE",
+ "EEC",
+ "Ef.",
+ "EG",
+ "Eko.",
+ "EKS",
+ "ELWS",
+ "ELX",
+ "EMI",
+ "EMUs",
+ "En.",
+ "EP",
+ "EPF",
+ "Eph.",
+ "EPP",
+ "EPS",
+ "EPU",
+ "ER",
+ "Er",
+ "ERL",
+ "ERT",
+ "Es",
+ "ESCAP",
+ "ESOS",
+ "ESP",
+ "EST",
+ "Est.",
+ "ET",
+ "ETA",
+ "ETACS",
+ "ETC",
+ "ETD",
+ "EU",
+ "Eu",
+ "EVIAN",
+ "Exim Bank",
+ "Exo.",
+ "Eze.",
+ "Ezr.",
+ "F",
+ "FAM",
+ "FAMA",
+ "FAO",
+ "FAQ",
+ "FAX",
+ "FBI",
+ "FC",
+ "FCA",
+ "FCC",
+ "FDI",
+ "FE",
+ "Fe",
+ "f.e.",
+ "Feb.",
+ "FELCRA",
+ "FELDA",
+ "FI",
+ "FIA 1993",
+ "FIAT",
+ "FIC",
+ "FIDA",
+ "FIFA",
+ "FIMA",
+ "Fiz.",
+ "Flm.",
+ "Flp.",
+ "FM",
+ "Fm",
+ "FMUTM",
+ "FO",
+ "FOA",
+ "FOB",
+ "FOC",
+ "FOMCA",
+ "FORD",
+ "Fr",
+ "FRIM",
+ "FRTI",
+ "FSMP",
+ "FTA",
+ "FTE",
+ "FTP",
+ "G",
+ "g.",
+ "G15",
+ "G77",
+ "Ga",
+ "GAC",
+ "GACM",
+ "Gal.",
+ "GAPENA",
+ "GATS",
+ "GATT",
+ "GB",
+ "Gbps.",
+ "Gd",
+ "GDP",
+ "Ge",
+ "GEC",
+ "Gen.",
+ "Geo.",
+ "Geog.",
+ "Gerakan",
+ "GH",
+ "GIF",
+ "GII",
+ "GIS",
+ "GITIC",
+ "GITN",
+ "GJ",
+ "GLCs",
+ "GM",
+ "GMBH",
+ "GMI",
+ "GMT",
+ "GNP",
+ "GNS",
+ "GOLD",
+ "GP",
+ "GPC",
+ "GPIM",
+ "GPMS",
+ "GPO",
+ "GPP",
+ "GPS",
+ "GRO",
+ "GRS",
+ "GSMC",
+ "GST",
+ "GTZ",
+ "GUI",
+ "GWh.",
+ "H",
+ "Ha",
+ "Hab.",
+ "Hag.",
+ "Hak.",
+ "ham",
+ "hb.",
+ "HCI",
+ "HDTV",
+ "He",
+ "Heb.",
+ "Hf",
+ "Hg",
+ "HI-FI",
+ "HIS",
+ "HIV",
+ "Hj.",
+ "HMS",
+ "Ho",
+ "Hos.",
+ "HP",
+ "HRDC",
+ "HRDF",
+ "HRMIS",
+ "Hs",
+ "Hut.",
+ "I",
+ "I/O",
+ "IA",
+ "IAA",
+ "IADPs",
+ "IB",
+ "i.b.",
+ "IBA",
+ "IBFIM",
+ "IBG",
+ "Ibr.",
+ "IBRD",
+ "IBS",
+ "IC",
+ "ICA",
+ "ICBM",
+ "ICFM",
+ "ICI",
+ "ICM",
+ "ICOR",
+ "ICP",
+ "ICT",
+ "ICU",
+ "ID",
+ "Id.",
+ "IDB",
+ "IDFR",
+ "IE",
+ "i.e.",
+ "IFSB",
+ "IGAs",
+ "IGS",
+ "IHP",
+ "IHPG",
+ "IIM",
+ "IINA",
+ "IKKL",
+ "IKP",
+ "IKPH",
+ "IKS",
+ "Im.",
+ "IMD",
+ "IMF",
+ "IMP2",
+ "IMR",
+ "IMS-GT",
+ "IMT-GT",
+ "In",
+ "in.",
+ "INFRA",
+ "INSEP",
+ "INSPEN",
+ "INTAN",
+ "IOFC",
+ "IOU",
+ "IP",
+ "IPA",
+ "IPBA",
+ "IPCs",
+ "IPEBP",
+ "IPI",
+ "IPKIM",
+ "IPKPM",
+ "IPO",
+ "IPP",
+ "IPPM",
+ "IPPPM",
+ "i.pt.",
+ "IPTAR",
+ "IPTNM",
+ "IQR",
+ "Ir",
+ "IRA",
+ "IRPA",
+ "IRS",
+ "i.s.",
+ "ISA",
+ "Isa.",
+ "ISDN",
+ "ISMM",
+ "ISO",
+ "ISP",
+ "ist.",
+ "IT",
+ "i.t.",
+ "ITA",
+ "ITAF",
+ "ITEX",
+ "ITK",
+ "ITM",
+ "ITO",
+ "ITRCo",
+ "ITTA",
+ "ITU",
+ "JAK",
+ "JAKIM",
+ "Jam.",
+ "Jan.",
+ "Jb.",
+ "JBIC",
+ "JD",
+ "JDA",
+ "Jdg.",
+ "Jer.",
+ "Jh.",
+ "JICA",
+ "JJ",
+ "Jk.",
+ "JKKK",
+ "jkps.",
+ "JKR",
+ "JMTI",
+ "JOA",
+ "Joe.",
+ "Joh.",
+ "Jon.",
+ "Jos.",
+ "JP",
+ "JPA",
+ "JPEG",
+ "JPH",
+ "JPJ",
+ "JPSHK",
+ "JPS",
+ "JPT",
+ "JRDA",
+ "JSM",
+ "JT",
+ "Jud.",
+ "Jul.",
+ "Jun.",
+ "JVC",
+ "Jw.",
+ "K",
+ "K-Economy",
+ "KADA",
+ "KBE",
+ "KBIA",
+ "KBPA",
+ "KBSM",
+ "KD",
+ "Kd.",
+ "KDI",
+ "KDN",
+ "KDNK",
+ "KE",
+ "KEAP",
+ "Kej.",
+ "Kel.",
+ "KEM",
+ "KEMLU",
+ "kep.",
+ "Kg.",
+ "kg.",
+ "KGB",
+ "KGK",
+ "KH",
+ "ki.",
+ "Kid.",
+ "KIK",
+ "KIKMTT",
+ "KIM",
+ "Kim.",
+ "Kis.",
+ "KIX",
+ "KKGSK",
+ "KKK",
+ "KKPPA",
+ "KL",
+ "Kl.",
+ "KLCI",
+ "KLIA",
+ "KLIBOR",
+ "KLIM",
+ "KLM",
+ "KLSE",
+ "KM",
+ "KMM",
+ "KNK",
+ "KO",
+ "Kol.",
+ "Kom.",
+ "Komp.",
+ "KOMSAS",
+ "KPAI",
+ "KPB",
+ "KPBA",
+ "KPC",
+ "kpd.",
+ "KPE",
+ "KPIs",
+ "KPPL",
+ "KPPMS",
+ "KPWM",
+ "Kr",
+ "KRM",
+ "KSTI",
+ "KT",
+ "KTA",
+ "KTABKL",
+ "KTM",
+ "KTMB",
+ "kV",
+ "kW",
+ "kWh",
+ "kWj",
+ "KWSP",
+ "LA",
+ "La",
+ "LABOR",
+ "Lam.",
+ "LAN",
+ "LAPD",
+ "LASER",
+ "LAX",
+ "lb.",
+ "LC",
+ "LCD",
+ "LCHRF",
+ "LCLY",
+ "LED",
+ "Lev.",
+ "LFPR",
+ "LFS",
+ "LFX",
+ "LGM",
+ "Li",
+ "LID",
+ "Lin.",
+ "LKN",
+ "LKPM",
+ "LKPP",
+ "LKTP",
+ "LKWJ",
+ "LLB",
+ "LLC",
+ "LLN",
+ "LLS",
+ "LMSM",
+ "LNG",
+ "LOA",
+ "LOBATA",
+ "LOFSA",
+ "LPG",
+ "LPIP",
+ "LPKI",
+ "LPKLPL",
+ "LPKN",
+ "LPN",
+ "LPP",
+ "LPPK",
+ "LPPM",
+ "LPPP",
+ "LPPTP",
+ "Lr",
+ "LRs",
+ "LRT",
+ "LS",
+ "LTAKL",
+ "LTD",
+ "LTK",
+ "Lu",
+ "LUAS",
+ "Luk.",
+ "lw.",
+ "lwn.",
+ "M\n",
+ "m",
+ "M&A",
+ "MAB",
+ "MACRES",
+ "MAD",
+ "MADA",
+ "MAGERAN",
+ "MAHA",
+ "MAHSURI",
+ "Mal.",
+ "MALINDO",
+ "MAMPU",
+ "Mar.",
+ "MARA",
+ "MARC",
+ "MARDI",
+ "MARLBORO",
+ "MAS",
+ "MASSA",
+ "MASSCORP",
+ "Mat.",
+ "MATRADE",
+ "MAVCAP",
+ "MB",
+ "MBA",
+ "MBBS",
+ "MBM",
+ "MBO",
+ "MBS",
+ "MBTU",
+ "MC",
+ "MCA",
+ "MCB",
+ "MCSL",
+ "MCSv5",
+ "MD",
+ "Md",
+ "MDB",
+ "MDC",
+ "MDG",
+ "MDV",
+ "MEASAT",
+ "MEATJ",
+ "MECIB",
+ "MEMO",
+ "MENLU",
+ "MEPS",
+ "MES",
+ "MESDAQ",
+ "METEOR",
+ "MFI",
+ "MFIs",
+ "MG",
+ "Mg",
+ "MGM",
+ "MGR",
+ "MGS",
+ "MHA",
+ "Mi.",
+ "MIA",
+ "MIB",
+ "MIC",
+ "Mic.",
+ "MICE",
+ "MIDA",
+ "MIDF",
+ "MIDI",
+ "MIG",
+ "MIGHT",
+ "MII",
+ "MIMOS",
+ "MINDEF",
+ "MINT",
+ "mis.",
+ "MIT",
+ "MITC",
+ "MITI",
+ "Ml.",
+ "MLNG",
+ "mlpd.",
+ "MM",
+ "mm",
+ "MMN",
+ "mmscfd.",
+ "MMU",
+ "MMX",
+ "Mn",
+ "Mn.",
+ "MNA",
+ "MNCs",
+ "MO",
+ "Mo",
+ "MOA",
+ "MOD",
+ "MODEM",
+ "MOE",
+ "MOH",
+ "MOSTE",
+ "MOSTI",
+ "MOU",
+ "MP",
+ "MPB",
+ "MPEG",
+ "MPOB",
+ "MPP",
+ "mppa.",
+ "MPPJ",
+ "MPS",
+ "MPTM",
+ "MR",
+ "m.r.",
+ "MRB",
+ "MRELB",
+ "Mrk.",
+ "MRRDB",
+ "MS",
+ "MS-DOS",
+ "MSC",
+ "MSG",
+ "MSM",
+ "Mt",
+ "MTC",
+ "MTCP",
+ "MTD",
+ "MTDC",
+ "MTPB",
+ "MTV",
+ "Muz.",
+ "MV",
+ "MW",
+ "MY",
+ "MyKe",
+ "Mzm.",
+ "N",
+ "N/A",
+ "Na",
+ "NAB",
+ "NACIWID",
+ "Nah.",
+ "NAP",
+ "NASA",
+ "NATO",
+ "NAV",
+ "NB",
+ "Nb",
+ "NBA",
+ "NBC",
+ "NCR",
+ "Nd",
+ "NDP",
+ "Ne",
+ "NEAC",
+ "NEC",
+ "NEF",
+ "Neh.",
+ "NEP",
+ "NEqO",
+ "NERP",
+ "NF",
+ "NFPEs",
+ "NG",
+ "NGOs",
+ "NGV",
+ "NHEF",
+ "NHHES",
+ "NHK",
+ "Ni",
+ "NIDC",
+ "NIH",
+ "NIP",
+ "NIPA",
+ "NIS",
+ "NISIR",
+ "NITA",
+ "NITC",
+ "NITP",
+ "NIV",
+ "NLAC",
+ "NMPBSP",
+ "NMU",
+ "No",
+ "No.",
+ "no.",
+ "NOSS",
+ "Nov.",
+ "Np",
+ "NPC",
+ "NPCS",
+ "NPL",
+ "NRCC",
+ "NRW",
+ "NS",
+ "Ns",
+ "NSB",
+ "NTA",
+ "NTHRDC",
+ "NTMP",
+ "NTSC",
+ "Num.",
+ "NUTF",
+ "NVP",
+ "NVTC",
+ "NWRC",
+ "O",
+ "Ob.",
+ "Oba.",
+ "OC",
+ "OCPD",
+ "Oct.",
+ "OD",
+ "ODA",
+ "OECD",
+ "OEM",
+ "Ogo.",
+ "OHQs",
+ "OIC",
+ "Okt.",
+ "OPEC",
+ "OPP",
+ "OPP3",
+ "OPR",
+ "OS",
+ "Os",
+ "OSA",
+ "OT",
+ "OUG",
+ "oz.",
+ "P",
+ "P&P",
+ "PA",
+ "Pa",
+ "PABK",
+ "PABX",
+ "PAK",
+ "PAKSI",
+ "PAL",
+ "PALL MALL",
+ "PAS",
+ "PATA",
+ "PAWS",
+ "Pb",
+ "PBA",
+ "PBB",
+ "PBM",
+ "PBP",
+ "PBSM",
+ "PBT",
+ "PC",
+ "PC(s)",
+ "PCB",
+ "PCIRITA",
+ "PCM",
+ "PCMCIA",
+ "PCN",
+ "PD",
+ "Pd",
+ "pd.",
+ "PDS",
+ "PE",
+ "PEKEMAS",
+ "PEMADAM",
+ "PENA",
+ "PENIS",
+ "PERDANA",
+ "PERKESO",
+ "PERKIM",
+ "PERNAS",
+ "PERTAMA",
+ "PERTIWI",
+ "PESAKA",
+ "PETA",
+ "PETRONAS",
+ "PGU",
+ "Ph.",
+ "PHD",
+ "Phi.",
+ "Phm.",
+ "PIK",
+ "PIKOM",
+ "PIN",
+ "PINTAS",
+ "PIPM",
+ "PISK",
+ "PITA",
+ "PIXEL",
+ "PJ",
+ "PJK",
+ "PJKB",
+ "PJP",
+ "PKBM",
+ "PKBTA",
+ "PKEN",
+ "Pkh.",
+ "PKKM",
+ "PKLPA",
+ "PKM",
+ "PKNS",
+ "PKPIM",
+ "PKPM",
+ "PKR",
+ "PKS",
+ "Pl.",
+ "p.l.",
+ "PLA",
+ "PLC",
+ "PLCHP",
+ "PLCs",
+ "PLI",
+ "PLT",
+ "PLUS",
+ "PLWS",
+ "PM",
+ "Pm",
+ "PMM",
+ "PMP",
+ "PMR",
+ "PMS",
+ "Pn.",
+ "PNAT",
+ "PNS",
+ "PO",
+ "Po",
+ "POCPA",
+ "POKEMON",
+ "Pol.",
+ "POP",
+ "PORIM",
+ "PORLA",
+ "PORTAFOAM",
+ "PP",
+ "PPA",
+ "PPBE",
+ "PPBK",
+ "ppd.",
+ "PPGM",
+ "PPI",
+ "PPK",
+ "PPL",
+ "PPM",
+ "PPP",
+ "PPPB",
+ "PPPLM",
+ "PPPM",
+ "PPR",
+ "PPRT",
+ "PPS",
+ "PPTM",
+ "PPU",
+ "PR",
+ "Pr",
+ "Pr.",
+ "prb.",
+ "PRI",
+ "PRO",
+ "Pro.",
+ "Prof.",
+ "PROSPER",
+ "PROSTAR",
+ "PROTON",
+ "PS",
+ "PSA",
+ "Psa.",
+ "PSCs",
+ "PSDC",
+ "PSDH",
+ "Psi.",
+ "PSKE",
+ "PSRM",
+ "PST",
+ "PT",
+ "Pt",
+ "PTD",
+ "PTP",
+ "Pu",
+ "PUNB",
+ "QA",
+ "QC",
+ "QCC",
+ "R&D",
+ "RA",
+ "Ra",
+ "RAM",
+ "RAPP",
+ "Rat.",
+ "Rb",
+ "RCA",
+ "RDA",
+ "RDAs",
+ "RDCs",
+ "RE",
+ "Re",
+ "REHDA",
+ "Rev.",
+ "Rf",
+ "Rg",
+ "RGB",
+ "Rh",
+ "RI",
+ "RIDA",
+ "RIP",
+ "RISDA",
+ "r.l.",
+ "RM",
+ "Rm.",
+ "RMKe-8",
+ "Rn",
+ "ROC",
+ "ROM",
+ "Rom.",
+ "RPG",
+ "RPS",
+ "RRI",
+ "RRIM",
+ "RRJP",
+ "RRP",
+ "RSGC",
+ "RSS",
+ "RSVP",
+ "Rt.",
+ "RTA",
+ "RTM",
+ "Ru",
+ "Rut.",
+ "RWCR",
+ "RX",
+ "S",
+ "S/N",
+ "S&T",
+ "S-VHS",
+ "SA",
+ "SAC",
+ "SADCs",
+ "SAGA",
+ "SALCRA",
+ "SALM",
+ "SALT",
+ "SAM",
+ "SAP",
+ "SARS",
+ "Sas.",
+ "s.a.w.",
+ "SB",
+ "Sb",
+ "Sb.",
+ "SBA",
+ "SBB",
+ "sbg.",
+ "SBK",
+ "SC",
+ "Sc",
+ "SCA",
+ "SCADA",
+ "SCANS",
+ "SCSI",
+ "SCuM",
+ "SDCs",
+ "Sdn. Bhd.",
+ "sdr.",
+ "SDRC",
+ "Se",
+ "SEATO",
+ "SEB",
+ "SECAM",
+ "SEDCs",
+ "SEFF",
+ "Sej.",
+ "SEMS",
+ "Sep.",
+ "Sept.",
+ "SESB",
+ "SESCo",
+ "s.f.",
+ "Sg",
+ "SGPCA",
+ "SGPPI",
+ "SGPPKRM",
+ "SGX",
+ "Si",
+ "Si.",
+ "SIA 1983",
+ "SIC",
+ "SIM",
+ "SING",
+ "SIRIM",
+ "SITTDEC",
+ "sj.",
+ "SKDTP",
+ "SKM",
+ "SKSM",
+ "SL",
+ "Sl.",
+ "sl.",
+ "SLMCH",
+ "SLR",
+ "SM",
+ "Sm",
+ "SMART",
+ "SMEs",
+ "SMEt",
+ "SMIs",
+ "SMIDEC",
+ "SMIDP",
+ "SMJK",
+ "SMR",
+ "SMS",
+ "SMT",
+ "SMTP",
+ "SN",
+ "Sn",
+ "SOB",
+ "SOCSO",
+ "SOHO",
+ "Son.",
+ "SOS",
+ "Sos.",
+ "SP",
+ "SPA",
+ "SPAM",
+ "SPCA",
+ "SPKR",
+ "SPLAM",
+ "SPM",
+ "SPNB",
+ "SPSP",
+ "t.",
+ "Ta",
+ "Tadb.",
+ "TAF",
+ "TAF-W",
+ "Tani",
+ "TAP",
+ "TAR",
+ "TARBI",
+ "TB",
+ "Tb",
+ "TBA",
+ "TBTP",
+ "Tc",
+ "TCPD",
+ "TDCs",
+ "Te",
+ "TEKUN",
+ "TELCO",
+ "TELEX",
+ "TEUs",
+ "TFP",
+ "TGV",
+ "TH",
+ "Th",
+ "THIS",
+ "Ti",
+ "TICAD",
+ "Tit.",
+ "TKA",
+ "Tks.",
+ "Tl",
+ "TLDM",
+ "TM",
+ "Tm",
+ "TMB",
+ "TMK",
+ "TNB",
+ "TNSB",
+ "TNT",
+ "TOEFL",
+ "TP",
+ "TPIM",
+ "TPK",
+ "TPPP",
+ "TPPT",
+ "TPSM",
+ "TPUB",
+ "TQM",
+ "Tr.",
+ "TRIPs",
+ "tsb.",
+ "tscf.",
+ "t.sh.",
+ "t.s.t.",
+ "TT",
+ "t.t.",
+ "TUDM",
+ "TV",
+ "TVSMR",
+ "TWAIN",
+ "TX",
+ "TYPHIrapid",
+ "U",
+ "Ubat",
+ "UDA",
+ "Udg.",
+ "UFO",
+ "UH",
+ "UIA",
+ "UiTM",
+ "UK",
+ "UKM",
+ "UL",
+ "Ul.",
+ "ULC",
+ "UM",
+ "UMNO",
+ "UMS",
+ "UN",
+ "UN/OSCAL",
+ "UNCLE",
+ "UNCTAD",
+ "UNDP",
+ "UNESCO",
+ "UNFCCC",
+ "UNFPA",
+ "UNHCR",
+ "UNICEF",
+ "UNIMAS",
+ "UNTAET",
+ "UPE",
+ "UPM",
+ "UPS",
+ "UPSR",
+ "URL",
+ "US",
+ "USAINS",
+ "USD",
+ "USM",
+ "USNO",
+ "USS",
+ "USSR",
+ "UTC",
+ "UTF",
+ "utk.",
+ "UTM",
+ "V",
+ "VAT",
+ "VCC",
+ "VCD",
+ "VCR",
+ "VD",
+ "VDSC",
+ "VGA",
+ "VHF",
+ "VHS",
+ "VIP",
+ "VMS",
+ "VO",
+ "VOA",
+ "VoIP",
+ "VR",
+ "VSOP",
+ "VW",
+ "W",
+ "W/O",
+ "WAP",
+ "WAY",
+ "WC",
+ "WDDM",
+ "WDM",
+ "WHO",
+ "Why.",
+ "WIM",
+ "WPG",
+ "WTO",
+ "WWF",
+ "WWW",
+ "WYSIWYG",
+ "Xe",
+ "XO",
+ "XXL",
+ "Y",
+ "Y2K",
+ "YAB",
+ "Yak.",
+ "YAM",
+ "YAS",
+ "YB",
+ "Yb",
+ "Yeh.",
+ "Yer.",
+ "Yes.",
+ "yg.",
+ "Yl.",
+ "YM",
+ "YMCA",
+ "Yoh.",
+ "Yos.",
+ "Y.Th.",
+ "YTM",
+ "Yud.",
+ "Yun.",
+ "Za.",
+ "Zec.",
+ "Zef.",
+ "Zep.",
+ "ZIP",
+ "Zn",
+ "Zr",
+]:
+ _exc[orth] = [{ORTH: orth}]
+
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
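
The hunk above registers each abbreviation as a single-token special case (`_exc[orth] = [{ORTH: orth}]`), so the tokenizer keeps entries such as "Jan." intact instead of splitting off the trailing period. Judging by the acronyms (JAKIM, PETRONAS, KLIA), this appears to be the Malay exceptions file; the `ms` language code in the sketch below is an inference from that, not something stated in this hunk.

```python
# Minimal sketch of the special-case mechanism used above, assuming a blank
# Malay ("ms") pipeline; add_special_case is spaCy's documented tokenizer API.
import spacy
from spacy.symbols import ORTH

nlp = spacy.blank("ms")
# Without a special case, "Jan." would be split into "Jan" + "."
nlp.tokenizer.add_special_case("Jan.", [{ORTH: "Jan."}])
doc = nlp("Mesyuarat itu diadakan pada 5 Jan. 2020.")
print([t.text for t in doc])  # "Jan." survives as one token
```
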
diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py
index e079236fd..ef4665ccc 100644
--- a/spacy/lang/nb/__init__.py
+++ b/spacy/lang/nb/__init__.py
@@ -1,12 +1,13 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from .punctuation import TOKENIZER_SUFFIXES
+
+from ...language import BaseDefaults, Language
+from ...pipeline import Lemmatizer
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
-from ...pipeline import Lemmatizer
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class NorwegianDefaults(BaseDefaults):
diff --git a/spacy/lang/nb/punctuation.py b/spacy/lang/nb/punctuation.py
index 8f2933670..a1fdb872a 100644
--- a/spacy/lang/nb/punctuation.py
+++ b/spacy/lang/nb/punctuation.py
@@ -1,7 +1,17 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
-from ..char_classes import CURRENCY, PUNCT, UNITS, LIST_CURRENCY
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ PUNCT,
+ UNITS,
+)
# Punctuation adapted from Danish
_quotes = CONCAT_QUOTES.replace("'", "")
diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py
index d86662693..89a8f5edf 100644
--- a/spacy/lang/nb/syntax_iterators.py
+++ b/spacy/lang/nb/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/nb/tokenizer_exceptions.py b/spacy/lang/nb/tokenizer_exceptions.py
index 0be436ae4..9b99a1d65 100644
--- a/spacy/lang/nb/tokenizer_exceptions.py
+++ b/spacy/lang/nb/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
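
A note on the many import hunks in this section: they consistently regroup imports into stdlib / third-party / local blocks, sort the names, and wrap long `from … import` lists in parentheses with a trailing comma. That matches the output of isort's black-compatible profile (an inference from the formatting itself). A small sketch reproducing the pattern:

```python
# Sketch: reproducing the import style seen in these hunks with isort's
# Python API (isort >= 5). The input lines are copied from a hunk above.
import isort

messy = (
    "from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES\n"
    "from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER\n"
)
# Merges the duplicate from-imports, sorts the names, and wraps the long
# result in parentheses ("vertical hanging indent"), as in the diff.
print(isort.code(messy, profile="black"))
```
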
diff --git a/spacy/lang/ne/__init__.py b/spacy/lang/ne/__init__.py
index 0028d1b0b..5c9e6870e 100644
--- a/spacy/lang/ne/__init__.py
+++ b/spacy/lang/ne/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class NepaliDefaults(BaseDefaults):
diff --git a/spacy/lang/ne/lex_attrs.py b/spacy/lang/ne/lex_attrs.py
index 7cb01c515..91d5b0eb5 100644
--- a/spacy/lang/ne/lex_attrs.py
+++ b/spacy/lang/ne/lex_attrs.py
@@ -1,6 +1,5 @@
+from ...attrs import LIKE_NUM, NORM
from ..norm_exceptions import BASE_NORMS
-from ...attrs import NORM, LIKE_NUM
-
# fmt: off
_stem_suffixes = [
diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py
index ad2205a0b..213041a85 100644
--- a/spacy/lang/nl/__init__.py
+++ b/spacy/lang/nl/__init__.py
@@ -1,15 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
from thinc.api import Model
+from ...language import BaseDefaults, Language
from .lemmatizer import DutchLemmatizer
from .lex_attrs import LEX_ATTRS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from .punctuation import TOKENIZER_SUFFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults
class DutchDefaults(BaseDefaults):
diff --git a/spacy/lang/nl/lex_attrs.py b/spacy/lang/nl/lex_attrs.py
index f1acaefeb..488224c2f 100644
--- a/spacy/lang/nl/lex_attrs.py
+++ b/spacy/lang/nl/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = set(
"""
nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien
diff --git a/spacy/lang/nl/punctuation.py b/spacy/lang/nl/punctuation.py
index d9dd2a6e3..c9a4c9eeb 100644
--- a/spacy/lang/nl/punctuation.py
+++ b/spacy/lang/nl/punctuation.py
@@ -1,10 +1,19 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_UNITS, merge_chars
-from ..char_classes import LIST_PUNCT, LIST_QUOTES, CURRENCY, PUNCT
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ LIST_UNITS,
+ PUNCT,
+ merge_chars,
+)
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
-
_prefixes = [",,"] + BASE_TOKENIZER_PREFIXES
diff --git a/spacy/lang/nl/syntax_iterators.py b/spacy/lang/nl/syntax_iterators.py
index be9beabe6..d7388a333 100644
--- a/spacy/lang/nl/syntax_iterators.py
+++ b/spacy/lang/nl/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON
from ...tokens import Doc, Span
diff --git a/spacy/lang/nl/tokenizer_exceptions.py b/spacy/lang/nl/tokenizer_exceptions.py
index 489d10d71..85ad49f14 100644
--- a/spacy/lang/nl/tokenizer_exceptions.py
+++ b/spacy/lang/nl/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
# Extensive list of both common and uncommon dutch abbreviations copied from
# github.com/diasks2/pragmatic_segmenter, a Ruby library for rule-based
diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py
index 02c96799b..50a3a8e4c 100644
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@@ -1,15 +1,13 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
from thinc.api import Model
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from .punctuation import TOKENIZER_SUFFIXES
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .lemmatizer import PolishLemmatizer
+from ...language import BaseDefaults, Language
from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language, BaseDefaults
-
+from .lemmatizer import PolishLemmatizer
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
TOKENIZER_EXCEPTIONS = {
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py
index 059d0609a..d1d2a9c54 100644
--- a/spacy/lang/pl/lemmatizer.py
+++ b/spacy/lang/pl/lemmatizer.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Tuple
+from typing import Dict, List, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
diff --git a/spacy/lang/pl/lex_attrs.py b/spacy/lang/pl/lex_attrs.py
index ce56e28a8..398f52a3c 100644
--- a/spacy/lang/pl/lex_attrs.py
+++ b/spacy/lang/pl/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"zero",
"jeden",
diff --git a/spacy/lang/pl/punctuation.py b/spacy/lang/pl/punctuation.py
index 31e56b9ae..84ff239ed 100644
--- a/spacy/lang/pl/punctuation.py
+++ b/spacy/lang/pl/punctuation.py
@@ -1,6 +1,17 @@
-from ..char_classes import LIST_ELLIPSES, LIST_PUNCT, LIST_HYPHENS
-from ..char_classes import LIST_ICONS, LIST_QUOTES, CURRENCY, UNITS, PUNCT
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_HYPHENS,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ PUNCT,
+ UNITS,
+)
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
_quotes = CONCAT_QUOTES.replace("'", "")
diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py
index 454002491..be4041f8e 100644
--- a/spacy/lang/pt/__init__.py
+++ b/spacy/lang/pt/__init__.py
@@ -1,9 +1,9 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class PortugueseDefaults(BaseDefaults):
diff --git a/spacy/lang/pt/lex_attrs.py b/spacy/lang/pt/lex_attrs.py
index 3c6979ab4..de6a67f14 100644
--- a/spacy/lang/pt/lex_attrs.py
+++ b/spacy/lang/pt/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"zero",
"um",
diff --git a/spacy/lang/pt/punctuation.py b/spacy/lang/pt/punctuation.py
index 08e31f9d0..b2d63cb3d 100644
--- a/spacy/lang/pt/punctuation.py
+++ b/spacy/lang/pt/punctuation.py
@@ -1,6 +1,6 @@
+from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
from ..punctuation import TOKENIZER_SUFFIXES as BASE_TOKENIZER_SUFFIXES
-from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
_prefixes = [r"\w{1,3}\$"] + BASE_TOKENIZER_PREFIXES
diff --git a/spacy/lang/pt/syntax_iterators.py b/spacy/lang/pt/syntax_iterators.py
index 62661f5e4..11017aace 100644
--- a/spacy/lang/pt/syntax_iterators.py
+++ b/spacy/lang/pt/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/pt/tokenizer_exceptions.py b/spacy/lang/pt/tokenizer_exceptions.py
index 187fc65ea..e369eda80 100644
--- a/spacy/lang/pt/tokenizer_exceptions.py
+++ b/spacy/lang/pt/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py
index a1cfe6224..e4a6392c8 100644
--- a/spacy/lang/punctuation.py
+++ b/spacy/lang/punctuation.py
@@ -1,7 +1,19 @@
-from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS, COMBINING_DIACRITICS
-from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
-
+from .char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ COMBINING_DIACRITICS,
+ CONCAT_QUOTES,
+ CURRENCY,
+ HYPHENS,
+ LIST_CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ PUNCT,
+ UNITS,
+)
TOKENIZER_PREFIXES = (
["§", "%", "=", "—", "–", r"\+(?![0-9])"]
diff --git a/spacy/lang/ro/__init__.py b/spacy/lang/ro/__init__.py
index 50027ffd2..441fefbb6 100644
--- a/spacy/lang/ro/__init__.py
+++ b/spacy/lang/ro/__init__.py
@@ -1,9 +1,8 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
-from .punctuation import TOKENIZER_SUFFIXES
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
# Lemma data note:
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
diff --git a/spacy/lang/ro/lex_attrs.py b/spacy/lang/ro/lex_attrs.py
index 0f86f53cd..736aa911a 100644
--- a/spacy/lang/ro/lex_attrs.py
+++ b/spacy/lang/ro/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = set(
"""
zero unu doi două trei patru cinci șase șapte opt nouă zece
diff --git a/spacy/lang/ro/punctuation.py b/spacy/lang/ro/punctuation.py
index 529e1c977..7259f9ae7 100644
--- a/spacy/lang/ro/punctuation.py
+++ b/spacy/lang/ro/punctuation.py
@@ -1,9 +1,18 @@
import itertools
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from ..char_classes import LIST_ICONS, CURRENCY
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ PUNCT,
+)
_list_icons = [x for x in LIST_ICONS if x != "°"]
_list_icons = [x.replace("\\u00B0", "") for x in _list_icons]
diff --git a/spacy/lang/ro/tokenizer_exceptions.py b/spacy/lang/ro/tokenizer_exceptions.py
index b8af0b1d6..a397b2754 100644
--- a/spacy/lang/ro/tokenizer_exceptions.py
+++ b/spacy/lang/ro/tokenizer_exceptions.py
@@ -1,9 +1,8 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .punctuation import _make_ro_variants
-
_exc = {}
diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index 7d17628c4..880965b70 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -1,13 +1,16 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
+from ...language import BaseDefaults, Language
+from ..punctuation import (
+ COMBINING_DIACRITICS_TOKENIZER_INFIXES,
+ COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
+)
+from .lemmatizer import RussianLemmatizer
+from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .lex_attrs import LEX_ATTRS
-from .lemmatizer import RussianLemmatizer
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
class RussianDefaults(BaseDefaults):
diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py
index f4a35de38..1e41220f3 100644
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -1,4 +1,4 @@
-from typing import Optional, List, Dict, Tuple, Callable
+from typing import Callable, Dict, List, Optional, Tuple
from thinc.api import Model
@@ -8,7 +8,6 @@ from ...symbols import POS
from ...tokens import Token
from ...vocab import Vocab
-
PUNCT_RULES = {"«": '"', "»": '"'}
diff --git a/spacy/lang/ru/lex_attrs.py b/spacy/lang/ru/lex_attrs.py
index 2afe47623..e0b35bdc0 100644
--- a/spacy/lang/ru/lex_attrs.py
+++ b/spacy/lang/ru/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = list(
set(
"""
diff --git a/spacy/lang/ru/tokenizer_exceptions.py b/spacy/lang/ru/tokenizer_exceptions.py
index e1889f785..0a8c476b1 100644
--- a/spacy/lang/ru/tokenizer_exceptions.py
+++ b/spacy/lang/ru/tokenizer_exceptions.py
@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/sa/__init__.py b/spacy/lang/sa/__init__.py
index 61398af6c..c7c0e98e6 100644
--- a/spacy/lang/sa/__init__.py
+++ b/spacy/lang/sa/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class SanskritDefaults(BaseDefaults):
diff --git a/spacy/lang/si/__init__.py b/spacy/lang/si/__init__.py
index 971cee3c6..08d0937b1 100644
--- a/spacy/lang/si/__init__.py
+++ b/spacy/lang/si/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class SinhalaDefaults(BaseDefaults):
diff --git a/spacy/lang/sk/__init__.py b/spacy/lang/sk/__init__.py
index da6e3048e..2ed7448d2 100644
--- a/spacy/lang/sk/__init__.py
+++ b/spacy/lang/sk/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class SlovakDefaults(BaseDefaults):
diff --git a/spacy/lang/sl/__init__.py b/spacy/lang/sl/__init__.py
index 0070e9fa1..cd3d70fc9 100644
--- a/spacy/lang/sl/__init__.py
+++ b/spacy/lang/sl/__init__.py
@@ -1,8 +1,8 @@
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults
class SlovenianDefaults(BaseDefaults):
diff --git a/spacy/lang/sl/lex_attrs.py b/spacy/lang/sl/lex_attrs.py
index 958152e37..3c1493050 100644
--- a/spacy/lang/sl/lex_attrs.py
+++ b/spacy/lang/sl/lex_attrs.py
@@ -1,7 +1,6 @@
-from ...attrs import LIKE_NUM
-from ...attrs import IS_CURRENCY
import unicodedata
+from ...attrs import IS_CURRENCY, LIKE_NUM
_num_words = set(
"""
diff --git a/spacy/lang/sl/punctuation.py b/spacy/lang/sl/punctuation.py
index b6ca1830e..dadb54d31 100644
--- a/spacy/lang/sl/punctuation.py
+++ b/spacy/lang/sl/punctuation.py
@@ -1,20 +1,21 @@
from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ HYPHENS,
+ LIST_CURRENCY,
LIST_ELLIPSES,
LIST_ICONS,
- HYPHENS,
LIST_PUNCT,
LIST_QUOTES,
- CURRENCY,
- UNITS,
PUNCT,
- LIST_CURRENCY,
- CONCAT_QUOTES,
+ UNITS,
+ merge_chars,
)
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
-from ..char_classes import merge_chars
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
-
INCLUDE_SPECIAL = ["\\+", "\\/", "\\•", "\\¯", "\\=", "\\×"] + HYPHENS.split("|")
_prefixes = INCLUDE_SPECIAL + BASE_TOKENIZER_PREFIXES
diff --git a/spacy/lang/sl/tokenizer_exceptions.py b/spacy/lang/sl/tokenizer_exceptions.py
index 3d4109228..ec4ea9e41 100644
--- a/spacy/lang/sl/tokenizer_exceptions.py
+++ b/spacy/lang/sl/tokenizer_exceptions.py
@@ -1,7 +1,8 @@
from typing import Dict, List
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+
+from ...symbols import NORM, ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc: Dict[str, List[Dict]] = {}
diff --git a/spacy/lang/sq/__init__.py b/spacy/lang/sq/__init__.py
index 5e32a0cbe..1c8a5acf8 100644
--- a/spacy/lang/sq/__init__.py
+++ b/spacy/lang/sq/__init__.py
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
class AlbanianDefaults(BaseDefaults):
diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py
index b99ce96ec..5f121d79e 100644
--- a/spacy/lang/sr/__init__.py
+++ b/spacy/lang/sr/__init__.py
@@ -1,8 +1,8 @@
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class SerbianDefaults(BaseDefaults):
diff --git a/spacy/lang/sr/lex_attrs.py b/spacy/lang/sr/lex_attrs.py
index dc48909bc..696b9fd74 100644
--- a/spacy/lang/sr/lex_attrs.py
+++ b/spacy/lang/sr/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"нула",
"један",
diff --git a/spacy/lang/sr/punctuation.py b/spacy/lang/sr/punctuation.py
index 793a20ec2..cafb0f68f 100644
--- a/spacy/lang/sr/punctuation.py
+++ b/spacy/lang/sr/punctuation.py
@@ -1,7 +1,16 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
-from ..char_classes import CURRENCY, UNITS, PUNCT
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
-
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ PUNCT,
+ UNITS,
+)
_infixes = (
LIST_ELLIPSES
diff --git a/spacy/lang/sr/tokenizer_exceptions.py b/spacy/lang/sr/tokenizer_exceptions.py
index dcaa3e239..b7db0aadc 100755
--- a/spacy/lang/sr/tokenizer_exceptions.py
+++ b/spacy/lang/sr/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index 28e5085a8..bb4ee1702 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -1,12 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+
+from ...language import BaseDefaults, Language
from ...pipeline import Lemmatizer
+from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class SwedishDefaults(BaseDefaults):
diff --git a/spacy/lang/sv/lex_attrs.py b/spacy/lang/sv/lex_attrs.py
index f8ada9e2e..8eeafede8 100644
--- a/spacy/lang/sv/lex_attrs.py
+++ b/spacy/lang/sv/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"noll",
"en",
diff --git a/spacy/lang/sv/punctuation.py b/spacy/lang/sv/punctuation.py
index 67f1bcdc4..64f1da989 100644
--- a/spacy/lang/sv/punctuation.py
+++ b/spacy/lang/sv/punctuation.py
@@ -1,8 +1,13 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
from ..punctuation import TOKENIZER_SUFFIXES
-
_quotes = CONCAT_QUOTES.replace("'", "")
_infixes = (
diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py
index 06ad016ac..09153a8ec 100644
--- a/spacy/lang/sv/syntax_iterators.py
+++ b/spacy/lang/sv/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
-from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py
index ce7db895a..8fd3afbe3 100644
--- a/spacy/lang/sv/tokenizer_exceptions.py
+++ b/spacy/lang/sv/tokenizer_exceptions.py
@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import NORM, ORTH
from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/ta/__init__.py b/spacy/lang/ta/__init__.py
index 4929a4b97..7fd29371a 100644
--- a/spacy/lang/ta/__init__.py
+++ b/spacy/lang/ta/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class TamilDefaults(BaseDefaults):
diff --git a/spacy/lang/ta/lex_attrs.py b/spacy/lang/ta/lex_attrs.py
index f830f4ac9..d66125552 100644
--- a/spacy/lang/ta/lex_attrs.py
+++ b/spacy/lang/ta/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_numeral_suffixes = {"பத்து": "பது", "ற்று": "று", "ரத்து": "ரம்", "சத்து": "சம்"}
_num_words = [
"பூச்சியம்",
diff --git a/spacy/lang/te/__init__.py b/spacy/lang/te/__init__.py
index 77cc2fe9b..611e9746a 100644
--- a/spacy/lang/te/__init__.py
+++ b/spacy/lang/te/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class TeluguDefaults(BaseDefaults):
diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py
index 12b1527e0..bd29d32a4 100644
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@@ -1,10 +1,9 @@
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from ...language import BaseDefaults, Language
from ...tokens import Doc
-from ...util import DummyTokenizer, registry, load_config_from_str
+from ...util import DummyTokenizer, load_config_from_str, registry
from ...vocab import Vocab
-
+from .lex_attrs import LEX_ATTRS
+from .stop_words import STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
diff --git a/spacy/lang/th/lex_attrs.py b/spacy/lang/th/lex_attrs.py
index bc4e5293e..80f6ccbe8 100644
--- a/spacy/lang/th/lex_attrs.py
+++ b/spacy/lang/th/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"ศูนย์",
"หนึ่ง",
diff --git a/spacy/lang/th/tokenizer_exceptions.py b/spacy/lang/th/tokenizer_exceptions.py
index 92116d474..954766d28 100644
--- a/spacy/lang/th/tokenizer_exceptions.py
+++ b/spacy/lang/th/tokenizer_exceptions.py
@@ -1,6 +1,5 @@
from ...symbols import ORTH
-
_exc = {
# หน่วยงานรัฐ / government agency
"กกต.": [{ORTH: "กกต."}],
diff --git a/spacy/lang/ti/__init__.py b/spacy/lang/ti/__init__.py
index c74c081b5..510999f67 100644
--- a/spacy/lang/ti/__init__.py
+++ b/spacy/lang/ti/__init__.py
@@ -1,12 +1,11 @@
-from .stop_words import STOP_WORDS
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
-
+from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc
class TigrinyaDefaults(BaseDefaults):
diff --git a/spacy/lang/ti/punctuation.py b/spacy/lang/ti/punctuation.py
index aa884c2ba..f29f30e26 100644
--- a/spacy/lang/ti/punctuation.py
+++ b/spacy/lang/ti/punctuation.py
@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+ ALPHA_UPPER,
+ CURRENCY,
+ LIST_ELLIPSES,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ UNITS,
+)
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
diff --git a/spacy/lang/ti/tokenizer_exceptions.py b/spacy/lang/ti/tokenizer_exceptions.py
index 3d79cd84b..711e4b406 100644
--- a/spacy/lang/ti/tokenizer_exceptions.py
+++ b/spacy/lang/ti/tokenizer_exceptions.py
@@ -1,5 +1,4 @@
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH
_exc = {}
diff --git a/spacy/lang/tl/__init__.py b/spacy/lang/tl/__init__.py
index 30838890a..6849810ef 100644
--- a/spacy/lang/tl/__init__.py
+++ b/spacy/lang/tl/__init__.py
@@ -1,7 +1,7 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class TagalogDefaults(BaseDefaults):
diff --git a/spacy/lang/tl/lex_attrs.py b/spacy/lang/tl/lex_attrs.py
index 60bdc923b..8866453a0 100644
--- a/spacy/lang/tl/lex_attrs.py
+++ b/spacy/lang/tl/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"sero",
"isa",
diff --git a/spacy/lang/tl/tokenizer_exceptions.py b/spacy/lang/tl/tokenizer_exceptions.py
index 51ad12d9f..b10c90437 100644
--- a/spacy/lang/tl/tokenizer_exceptions.py
+++ b/spacy/lang/tl/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {
"tayo'y": [{ORTH: "tayo"}, {ORTH: "'y", NORM: "ay"}],
diff --git a/spacy/lang/tn/__init__.py b/spacy/lang/tn/__init__.py
index 28e887eea..4cb8a1635 100644
--- a/spacy/lang/tn/__init__.py
+++ b/spacy/lang/tn/__init__.py
@@ -1,7 +1,7 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class SetswanaDefaults(BaseDefaults):
diff --git a/spacy/lang/tn/punctuation.py b/spacy/lang/tn/punctuation.py
index a52755564..54d76fbaf 100644
--- a/spacy/lang/tn/punctuation.py
+++ b/spacy/lang/tn/punctuation.py
@@ -1,5 +1,12 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
_infixes = (
LIST_ELLIPSES
diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py
index d76fe4262..dbf9aab49 100644
--- a/spacy/lang/tokenizer_exceptions.py
+++ b/spacy/lang/tokenizer_exceptions.py
@@ -1,8 +1,7 @@
import re
+from ..symbols import NORM, ORTH
from .char_classes import ALPHA_LOWER
-from ..symbols import ORTH, NORM
-
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
# and https://gist.github.com/dperini/729294 (Diego Perini, MIT License)
diff --git a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py
index 02b5c7bf4..9aa752168 100644
--- a/spacy/lang/tr/__init__.py
+++ b/spacy/lang/tr/__init__.py
@@ -1,8 +1,8 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
+from ...language import BaseDefaults, Language
+from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
-from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKEN_MATCH, TOKENIZER_EXCEPTIONS
class TurkishDefaults(BaseDefaults):
diff --git a/spacy/lang/tr/lex_attrs.py b/spacy/lang/tr/lex_attrs.py
index 6d9f4f388..2189932b6 100644
--- a/spacy/lang/tr/lex_attrs.py
+++ b/spacy/lang/tr/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
# Thirteen, fifteen etc. are written separate: on üç
_num_words = [
diff --git a/spacy/lang/tr/syntax_iterators.py b/spacy/lang/tr/syntax_iterators.py
index 769af1223..ed588424a 100644
--- a/spacy/lang/tr/syntax_iterators.py
+++ b/spacy/lang/tr/syntax_iterators.py
@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN, PRON
+from typing import Iterator, Tuple, Union
+
from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
+from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
diff --git a/spacy/lang/tr/tokenizer_exceptions.py b/spacy/lang/tr/tokenizer_exceptions.py
index 22fa9f09e..d095a3d0e 100644
--- a/spacy/lang/tr/tokenizer_exceptions.py
+++ b/spacy/lang/tr/tokenizer_exceptions.py
@@ -1,8 +1,7 @@
import re
-from ..punctuation import ALPHA_LOWER, ALPHA
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH
+from ..punctuation import ALPHA, ALPHA_LOWER
_exc = {}
diff --git a/spacy/lang/tt/__init__.py b/spacy/lang/tt/__init__.py
index d5e1e87ef..ce04d09c2 100644
--- a/spacy/lang/tt/__init__.py
+++ b/spacy/lang/tt/__init__.py
@@ -1,8 +1,8 @@
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults
class TatarDefaults(BaseDefaults):
diff --git a/spacy/lang/tt/punctuation.py b/spacy/lang/tt/punctuation.py
index f644a8ccb..5c233df7c 100644
--- a/spacy/lang/tt/punctuation.py
+++ b/spacy/lang/tt/punctuation.py
@@ -1,5 +1,12 @@
-from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import (
+ ALPHA,
+ ALPHA_LOWER,
+ ALPHA_UPPER,
+ CONCAT_QUOTES,
+ HYPHENS,
+ LIST_ELLIPSES,
+ LIST_ICONS,
+)
_hyphens_no_dash = HYPHENS.replace("-", "").strip("|").replace("||", "")
_infixes = (
diff --git a/spacy/lang/tt/tokenizer_exceptions.py b/spacy/lang/tt/tokenizer_exceptions.py
index 3b8cc86b5..280b9f866 100644
--- a/spacy/lang/tt/tokenizer_exceptions.py
+++ b/spacy/lang/tt/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py
index bfea9ff69..5dd75a2a4 100644
--- a/spacy/lang/uk/__init__.py
+++ b/spacy/lang/uk/__init__.py
@@ -1,14 +1,16 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
+from ...language import BaseDefaults, Language
+from ..punctuation import (
+ COMBINING_DIACRITICS_TOKENIZER_INFIXES,
+ COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
+)
from .lemmatizer import UkrainianLemmatizer
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class UkrainianDefaults(BaseDefaults):
diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py
index 37015cc2a..9ec582b76 100644
--- a/spacy/lang/uk/lemmatizer.py
+++ b/spacy/lang/uk/lemmatizer.py
@@ -1,10 +1,10 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
from thinc.api import Model
-from ..ru.lemmatizer import RussianLemmatizer
from ...pipeline.lemmatizer import lemmatizer_score
from ...vocab import Vocab
+from ..ru.lemmatizer import RussianLemmatizer
class UkrainianLemmatizer(RussianLemmatizer):
diff --git a/spacy/lang/uk/tokenizer_exceptions.py b/spacy/lang/uk/tokenizer_exceptions.py
index 7e168a27c..07dd941af 100644
--- a/spacy/lang/uk/tokenizer_exceptions.py
+++ b/spacy/lang/uk/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
diff --git a/spacy/lang/ur/__init__.py b/spacy/lang/ur/__init__.py
index 266c5a73d..4f20ac92f 100644
--- a/spacy/lang/ur/__init__.py
+++ b/spacy/lang/ur/__init__.py
@@ -1,7 +1,7 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class UrduDefaults(BaseDefaults):
diff --git a/spacy/lang/ur/punctuation.py b/spacy/lang/ur/punctuation.py
index 5d35d0a25..382bfc75c 100644
--- a/spacy/lang/ur/punctuation.py
+++ b/spacy/lang/ur/punctuation.py
@@ -1,4 +1,3 @@
from ..punctuation import TOKENIZER_SUFFIXES
-
_suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py
index 822dc348c..a621b8bfe 100644
--- a/spacy/lang/vi/__init__.py
+++ b/spacy/lang/vi/__init__.py
@@ -1,17 +1,17 @@
-from typing import Any, Dict, Union
-from pathlib import Path
import re
-import srsly
import string
+from pathlib import Path
+from typing import Any, Dict, Union
+
+import srsly
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
-from ...tokens import Doc
-from ...util import DummyTokenizer, registry, load_config_from_str
-from ...vocab import Vocab
from ... import util
-
+from ...language import BaseDefaults, Language
+from ...tokens import Doc
+from ...util import DummyTokenizer, load_config_from_str, registry
+from ...vocab import Vocab
+from .lex_attrs import LEX_ATTRS
+from .stop_words import STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
diff --git a/spacy/lang/vi/lex_attrs.py b/spacy/lang/vi/lex_attrs.py
index 0cbda4ffb..82997a133 100644
--- a/spacy/lang/vi/lex_attrs.py
+++ b/spacy/lang/vi/lex_attrs.py
@@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM
-
_num_words = [
"không", # Zero
"một", # One
diff --git a/spacy/lang/yo/__init__.py b/spacy/lang/yo/__init__.py
index 6c38ec8af..93c4ca493 100644
--- a/spacy/lang/yo/__init__.py
+++ b/spacy/lang/yo/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
class YorubaDefaults(BaseDefaults):
diff --git a/spacy/lang/yo/lex_attrs.py b/spacy/lang/yo/lex_attrs.py
index ead68ced2..5f33e06a5 100644
--- a/spacy/lang/yo/lex_attrs.py
+++ b/spacy/lang/yo/lex_attrs.py
@@ -2,7 +2,6 @@ import unicodedata
from ...attrs import LIKE_NUM
-
_num_words = [
"ení",
"oókàn",
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index fdf6776e2..f7bb09277 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -1,21 +1,21 @@
-from typing import Optional, List, Dict, Any, Callable, Iterable
-from enum import Enum
import tempfile
-import srsly
import warnings
+from enum import Enum
from pathlib import Path
+from typing import Any, Callable, Dict, Iterable, List, Optional
-from ...errors import Warnings, Errors
-from ...language import Language, BaseDefaults
+import srsly
+
+from ... import util
+from ...errors import Errors, Warnings
+from ...language import BaseDefaults, Language
from ...scorer import Scorer
from ...tokens import Doc
-from ...training import validate_examples, Example
-from ...util import DummyTokenizer, registry, load_config_from_str
+from ...training import Example, validate_examples
+from ...util import DummyTokenizer, load_config_from_str, registry
from ...vocab import Vocab
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
-from ... import util
-
# fmt: off
_PKUSEG_INSTALL_MSG = "install spacy-pkuseg with `pip install \"spacy-pkuseg>=0.0.27,<0.1.0\"` or `conda install -c conda-forge \"spacy-pkuseg>=0.0.27,<0.1.0\"`"
diff --git a/spacy/lang/zh/lex_attrs.py b/spacy/lang/zh/lex_attrs.py
index 08c8e3160..36fa7310a 100644
--- a/spacy/lang/zh/lex_attrs.py
+++ b/spacy/lang/zh/lex_attrs.py
@@ -2,7 +2,6 @@ import re
from ...attrs import LIKE_NUM
-
_single_num_words = [
"〇",
"一",
diff --git a/spacy/language.py b/spacy/language.py
index 9fdcf6328..fd616483b 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1,47 +1,70 @@
-from typing import Iterator, Optional, Any, Dict, Callable, Iterable
-from typing import Union, Tuple, List, Set, Pattern, Sequence
-from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload
-
-from dataclasses import dataclass
-import random
-import itertools
import functools
+import itertools
+import multiprocessing as mp
+import random
+import traceback
+import warnings
from contextlib import contextmanager
from copy import deepcopy
-from pathlib import Path
-import warnings
-
-from thinc.api import get_current_ops, Config, CupyOps, Optimizer
-import srsly
-import multiprocessing as mp
+from dataclasses import dataclass
from itertools import chain, cycle
+from pathlib import Path
from timeit import default_timer as timer
-import traceback
+from typing import (
+ Any,
+ Callable,
+ Dict,
+ Iterable,
+ Iterator,
+ List,
+ NoReturn,
+ Optional,
+ Pattern,
+ Sequence,
+ Set,
+ Tuple,
+ TypeVar,
+ Union,
+ cast,
+ overload,
+)
-from . import ty
-from .tokens.underscore import Underscore
-from .vocab import Vocab, create_vocab
-from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
-from .training import Example, validate_examples
-from .training.initialize import init_vocab, init_tok2vec
-from .scorer import Scorer
-from .util import registry, SimpleFrozenList, _pipe, raise_error, _DEFAULT_EMPTY_PIPES
-from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
-from .util import warn_if_jupyter_cupy
-from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
-from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
-from .lang.punctuation import TOKENIZER_INFIXES
-from .tokens import Doc
-from .tokenizer import Tokenizer
-from .errors import Errors, Warnings
-from .schemas import ConfigSchema, ConfigSchemaNlp, ConfigSchemaInit
-from .schemas import ConfigSchemaPretrain, validate_init_settings
-from .git_info import GIT_VERSION
-from . import util
-from . import about
-from .lookups import load_lookups
+import srsly
+from thinc.api import Config, CupyOps, Optimizer, get_current_ops
+
+from . import about, ty, util
from .compat import Literal
-
+from .errors import Errors, Warnings
+from .git_info import GIT_VERSION
+from .lang.punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .lang.tokenizer_exceptions import BASE_EXCEPTIONS, URL_MATCH
+from .lookups import load_lookups
+from .pipe_analysis import analyze_pipes, print_pipe_analysis, validate_attrs
+from .schemas import (
+ ConfigSchema,
+ ConfigSchemaInit,
+ ConfigSchemaNlp,
+ ConfigSchemaPretrain,
+ validate_init_settings,
+)
+from .scorer import Scorer
+from .tokenizer import Tokenizer
+from .tokens import Doc
+from .tokens.underscore import Underscore
+from .training import Example, validate_examples
+from .training.initialize import init_tok2vec, init_vocab
+from .util import (
+ _DEFAULT_EMPTY_PIPES,
+ CONFIG_SECTION_ORDER,
+ SimpleFrozenDict,
+ SimpleFrozenList,
+ _pipe,
+ combine_score_weights,
+ raise_error,
+ registry,
+ warn_if_jupyter_cupy,
+)
+from .vocab import Vocab, create_vocab
PipeCallable = Callable[[Doc], Doc]
@@ -716,6 +739,11 @@ class Language:
)
)
pipe = source.get_pipe(source_name)
+ # There is no actual solution here. Either the component has the right
+ # name for the source pipeline or the component has the right name for
+ # the current pipeline. This prioritizes the current pipeline.
+ if hasattr(pipe, "name"):
+ pipe.name = name
# Make sure the source config is interpolated so we don't end up with
# orphaned variables in our final config
source_config = source.config.interpolate()
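
The hunk above resolves a naming ambiguity for sourced components: a component object carries a single `name` attribute, and it is now overwritten with the name used in the *current* pipeline. A hedged sketch of the API this affects (the trained pipeline name is illustrative):

```python
# Sourcing a component under a new name; after the change above, the
# component's .name is set to "ner2" (the current pipeline's name for it).
import spacy

source_nlp = spacy.load("en_core_web_sm")
nlp = spacy.blank("en")
nlp.add_pipe("ner", source=source_nlp, name="ner2")
print(nlp.pipe_names)  # ['ner2']
```
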
@@ -793,6 +821,7 @@ class Language:
pipe_index = self._get_pipe_index(before, after, first, last)
self._pipe_meta[name] = self.get_factory_meta(factory_name)
self._components.insert(pipe_index, (name, pipe_component))
+ self._link_components()
return pipe_component
def _get_pipe_index(
@@ -928,6 +957,7 @@ class Language:
if old_name in self._config["initialize"]["components"]:
init_cfg = self._config["initialize"]["components"].pop(old_name)
self._config["initialize"]["components"][new_name] = init_cfg
+ self._link_components()
def remove_pipe(self, name: str) -> Tuple[str, PipeCallable]:
"""Remove a component from the pipeline.
@@ -951,6 +981,7 @@ class Language:
# Make sure the name is also removed from the set of disabled components
if name in self.disabled:
self._disabled.remove(name)
+ self._link_components()
return removed
def disable_pipe(self, name: str) -> None:
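
These three hunks make `add_pipe` (via `_components.insert`), `rename_pipe`, and `remove_pipe` re-run `_link_components()`, so the listener maps between shared embedding components and their downstream listeners are refreshed whenever the component set or its names change. A sketch of the operations that now trigger re-linking (the component choice is illustrative; whether a blank pipeline's tagger actually uses a listener depends on its config):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("tok2vec")
nlp.add_pipe("tagger")            # may listen to the shared tok2vec
nlp.rename_pipe("tagger", "pos")  # listener map now keyed by "pos"
nlp.remove_pipe("pos")            # links refreshed without the removed pipe
```
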
@@ -1269,7 +1300,10 @@ class Language:
"No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
)
doc = Doc(self.vocab, words=["x", "y", "z"])
- get_examples = lambda: [Example.from_dict(doc, {})]
+
+ def get_examples():
+ return [Example.from_dict(doc, {})]
+
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(
method="Language.initialize", obj=type(get_examples)
@@ -1372,6 +1406,7 @@ class Language:
scorer: Optional[Scorer] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
scorer_cfg: Optional[Dict[str, Any]] = None,
+ per_component: bool = False,
) -> Dict[str, Any]:
"""Evaluate a model's pipeline components.
@@ -1383,6 +1418,8 @@ class Language:
arguments for specific components.
scorer_cfg (dict): An optional dictionary with extra keyword arguments
for the scorer.
+ per_component (bool): Whether to return the scores keyed by component
+ name. Defaults to False.
RETURNS (Scorer): The scorer containing the evaluation results.
@@ -1415,7 +1452,7 @@ class Language:
for eg, doc in zip(examples, docs):
eg.predicted = doc
end_time = timer()
- results = scorer.score(examples)
+ results = scorer.score(examples, per_component=per_component)
n_words = sum(len(eg.predicted) for eg in examples)
results["speed"] = n_words / (end_time - start_time)
return results
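
The new `per_component` flag is threaded through to `Scorer.score`, so callers can get results keyed by component name instead of one flat score dict. A hedged usage sketch (the pipeline and example data are placeholders):

```python
import spacy
from spacy.training import Example

nlp = spacy.load("en_core_web_sm")
doc = nlp.make_doc("Apple is looking at buying a U.K. startup.")
examples = [Example.from_dict(doc, {"entities": [(0, 5, "ORG")]})]

flat = nlp.evaluate(examples)                        # e.g. {"ents_f": ...}
nested = nlp.evaluate(examples, per_component=True)  # e.g. {"ner": {...}}
```
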
@@ -1673,8 +1710,16 @@ class Language:
# The problem is we need to do it during deserialization...And the
# components don't receive the pipeline then. So this does have to be
# here :(
+ # First, fix up all the internal component names in case they have
+ # gotten out of sync due to sourcing components from different
+ # pipelines, since find_listeners uses proc2.name for the listener
+ # map.
+ for name, proc in self.pipeline:
+ if hasattr(proc, "name"):
+ proc.name = name
for i, (name1, proc1) in enumerate(self.pipeline):
if isinstance(proc1, ty.ListenedToComponent):
+ proc1.listener_map = {}
for name2, proc2 in self.pipeline[i + 1 :]:
proc1.find_listeners(proc2)
@@ -1808,6 +1853,7 @@ class Language:
raw_config=raw_config,
)
else:
+ assert "source" in pipe_cfg
# We need the sourced components to reference the same
# vocab without modifying the current vocab state **AND**
# we still want to load the source model vectors to perform
@@ -1827,6 +1873,10 @@ class Language:
source_name = pipe_cfg.get("component", pipe_name)
listeners_replaced = False
if "replace_listeners" in pipe_cfg:
+ # Make sure that the listened-to component has the
+ # state of the source pipeline listener map so that the
+ # replace_listeners method below works as intended.
+ source_nlps[model]._link_components()
for name, proc in source_nlps[model].pipeline:
if source_name in getattr(proc, "listening_components", []):
source_nlps[model].replace_listeners(
@@ -1838,6 +1888,8 @@ class Language:
nlp.add_pipe(
source_name, source=source_nlps[model], name=pipe_name
)
+ # At this point after nlp.add_pipe, the listener map
+ # corresponds to the new pipeline.
if model not in source_nlp_vectors_hashes:
source_nlp_vectors_hashes[model] = hash(
source_nlps[model].vocab.vectors.to_bytes(
@@ -1892,27 +1944,6 @@ class Language:
raise ValueError(
Errors.E942.format(name="pipeline_creation", value=type(nlp))
)
- # Detect components with listeners that are not frozen consistently
- for name, proc in nlp.pipeline:
- if isinstance(proc, ty.ListenedToComponent):
- # Remove listeners not in the pipeline
- listener_names = proc.listening_components
- unused_listener_names = [
- ll for ll in listener_names if ll not in nlp.pipe_names
- ]
- for listener_name in unused_listener_names:
- for listener in proc.listener_map.get(listener_name, []):
- proc.remove_listener(listener, listener_name)
-
- for listener_name in proc.listening_components:
- # e.g. tok2vec/transformer
- # If it's a component sourced from another pipeline, we check if
- # the tok2vec listeners should be replaced with standalone tok2vec
- # models (e.g. so component can be frozen without its performance
- # degrading when other components/tok2vec are updated)
- paths = sourced.get(listener_name, {}).get("replace_listeners", [])
- if paths:
- nlp.replace_listeners(name, listener_name, paths)
return nlp
def replace_listeners(
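
Taken together, the `language.py` hunks move listener bookkeeping out of pipeline creation (the deleted frozen-listener cleanup block) and into `_link_components()`, which now also resets each listener map and re-syncs component names first. `replace_listeners` itself keeps its documented signature; for reference, a usage sketch (the component names follow the documented convention, though this exact pipeline is an assumption):

```python
# Give "tagger" a private copy of the shared tok2vec so it can be frozen
# without degrading when the shared weights are later updated.
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])
```
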
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 8dea0d6a2..ff2e4f92e 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,11 +1,20 @@
from numpy cimport ndarray
-from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
-from .attrs cimport attr_id_t
-from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG
-
-from .structs cimport LexemeC
+from .attrs cimport (
+ ID,
+ LANG,
+ LENGTH,
+ LOWER,
+ NORM,
+ ORTH,
+ PREFIX,
+ SHAPE,
+ SUFFIX,
+ attr_id_t,
+)
from .strings cimport StringStore
+from .structs cimport LexemeC
+from .typedefs cimport attr_t, flags_t, hash_t, len_t, tag_t
from .vocab cimport Vocab
diff --git a/spacy/lexeme.pyi b/spacy/lexeme.pyi
index 9b7a6156a..9980b9fce 100644
--- a/spacy/lexeme.pyi
+++ b/spacy/lexeme.pyi
@@ -1,8 +1,7 @@
-from typing import (
- Union,
- Any,
-)
+from typing import Any, Union
+
from thinc.types import Floats1d
+
from .tokens import Doc, Span, Token
from .vocab import Vocab
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index e70feaf9a..00e2c6258 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -1,24 +1,40 @@
# cython: embedsignature=True
# Compiler crashes on memory view coercion without this. Should report bug.
+cimport numpy as np
from cython.view cimport array as cvarray
from libc.string cimport memset
-cimport numpy as np
+
np.import_array()
+import warnings
+
import numpy
from thinc.api import get_array_module
-import warnings
+from .attrs cimport (
+ IS_ALPHA,
+ IS_ASCII,
+ IS_BRACKET,
+ IS_CURRENCY,
+ IS_DIGIT,
+ IS_LEFT_PUNCT,
+ IS_LOWER,
+ IS_PUNCT,
+ IS_QUOTE,
+ IS_RIGHT_PUNCT,
+ IS_SPACE,
+ IS_STOP,
+ IS_TITLE,
+ IS_UPPER,
+ LIKE_EMAIL,
+ LIKE_NUM,
+ LIKE_URL,
+)
from .typedefs cimport attr_t, flags_t
-from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
-from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
-from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
-from .attrs cimport IS_CURRENCY
from .attrs import intify_attrs
from .errors import Errors, Warnings
-
OOV_RANK = 0xffffffffffffffff # UINT64_MAX
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
EMPTY_LEXEME.id = OOV_RANK
diff --git a/spacy/lookups.py b/spacy/lookups.py
index d7cc44fb3..1a2c44bfa 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -1,13 +1,13 @@
-from typing import Any, List, Union, Optional, Dict
+from collections import OrderedDict
from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
import srsly
from preshed.bloom import BloomFilter
-from collections import OrderedDict
from .errors import Errors
-from .util import SimpleFrozenDict, ensure_path, registry, load_language_data
from .strings import get_string_id
-
+from .util import SimpleFrozenDict, ensure_path, load_language_data, registry
UNSET = object()
diff --git a/spacy/matcher/__init__.py b/spacy/matcher/__init__.py
index a4f164847..f671f2e35 100644
--- a/spacy/matcher/__init__.py
+++ b/spacy/matcher/__init__.py
@@ -1,6 +1,6 @@
-from .matcher import Matcher
-from .phrasematcher import PhraseMatcher
from .dependencymatcher import DependencyMatcher
from .levenshtein import levenshtein
+from .matcher import Matcher
+from .phrasematcher import PhraseMatcher
__all__ = ["Matcher", "PhraseMatcher", "DependencyMatcher", "levenshtein"]
diff --git a/spacy/matcher/dependencymatcher.pyi b/spacy/matcher/dependencymatcher.pyi
index c19d3a71c..b9fbabda7 100644
--- a/spacy/matcher/dependencymatcher.pyi
+++ b/spacy/matcher/dependencymatcher.pyi
@@ -1,8 +1,9 @@
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-from .matcher import Matcher
-from ..vocab import Vocab
+
from ..tokens.doc import Doc
from ..tokens.span import Span
+from ..vocab import Vocab
+from .matcher import Matcher
class DependencyMatcher:
"""Match dependency parse tree based on pattern rules."""
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index 48fb3eb2a..a214c0668 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -1,18 +1,16 @@
# cython: infer_types=True, profile=True
-from typing import List
+import warnings
from collections import defaultdict
from itertools import product
+from typing import List
-import warnings
-
-from .matcher cimport Matcher
-from ..vocab cimport Vocab
from ..tokens.doc cimport Doc
+from ..vocab cimport Vocab
+from .matcher cimport Matcher
from ..errors import Errors, Warnings
from ..tokens import Span
-
DELIMITER = "||"
INDEX_HEAD = 1
INDEX_RELOP = 0
diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd
index 51854d562..2c82cea1d 100644
--- a/spacy/matcher/matcher.pxd
+++ b/spacy/matcher/matcher.pxd
@@ -1,11 +1,11 @@
+from cymem.cymem cimport Pool
from libc.stdint cimport int32_t
from libcpp.vector cimport vector
-from cymem.cymem cimport Pool
-from ..vocab cimport Vocab
-from ..typedefs cimport attr_t, hash_t
-from ..structs cimport TokenC
from ..lexeme cimport attr_id_t
+from ..structs cimport TokenC
+from ..typedefs cimport attr_t, hash_t
+from ..vocab cimport Vocab
cdef enum action_t:
diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi
index 48922865b..c33b534cb 100644
--- a/spacy/matcher/matcher.pyi
+++ b/spacy/matcher/matcher.pyi
@@ -1,8 +1,19 @@
-from typing import Any, List, Dict, Tuple, Optional, Callable, Union
-from typing import Iterator, Iterable, overload
+from typing import (
+ Any,
+ Callable,
+ Dict,
+ Iterable,
+ Iterator,
+ List,
+ Optional,
+ Tuple,
+ Union,
+ overload,
+)
+
from ..compat import Literal
-from ..vocab import Vocab
from ..tokens import Doc, Span
+from ..vocab import Vocab
class Matcher:
def __init__(
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index b886bd2ec..3d03f37ae 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -1,32 +1,43 @@
# cython: binding=True, infer_types=True, profile=True
-from typing import List, Iterable
+from typing import Iterable, List
-from libcpp.vector cimport vector
-from libc.stdint cimport int32_t, int8_t
-from libc.string cimport memset, memcmp
from cymem.cymem cimport Pool
+from libc.stdint cimport int8_t, int32_t
+from libc.string cimport memcmp, memset
+from libcpp.vector cimport vector
from murmurhash.mrmr cimport hash64
import re
-import srsly
import warnings
-from ..typedefs cimport attr_t
+import srsly
+
+from ..attrs cimport (
+ DEP,
+ ENT_IOB,
+ ID,
+ LEMMA,
+ MORPH,
+ NULL_ATTR,
+ ORTH,
+ POS,
+ TAG,
+ attr_id_t,
+)
from ..structs cimport TokenC
-from ..vocab cimport Vocab
from ..tokens.doc cimport Doc, get_token_attr_for_matcher
+from ..tokens.morphanalysis cimport MorphAnalysis
from ..tokens.span cimport Span
from ..tokens.token cimport Token
-from ..tokens.morphanalysis cimport MorphAnalysis
-from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
+from ..typedefs cimport attr_t
+from ..vocab cimport Vocab
-from .levenshtein import levenshtein_compare
-from ..schemas import validate_token_pattern
-from ..errors import Errors, MatchPatternError, Warnings
-from ..strings import get_string_id
from ..attrs import IDS
+from ..errors import Errors, MatchPatternError, Warnings
+from ..schemas import validate_token_pattern
+from ..strings import get_string_id
from ..util import registry
-
+from .levenshtein import levenshtein_compare
DEF PADDING = 5
diff --git a/spacy/matcher/phrasematcher.pxd b/spacy/matcher/phrasematcher.pxd
index 1bdc19012..bffc1ac97 100644
--- a/spacy/matcher/phrasematcher.pxd
+++ b/spacy/matcher/phrasematcher.pxd
@@ -1,6 +1,6 @@
-from libcpp.vector cimport vector
from cymem.cymem cimport Pool
-from preshed.maps cimport key_t, MapStruct
+from libcpp.vector cimport vector
+from preshed.maps cimport MapStruct, key_t
from ..attrs cimport attr_id_t
from ..structs cimport SpanC
diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi
index 68e3386e4..27f6ba373 100644
--- a/spacy/matcher/phrasematcher.pyi
+++ b/spacy/matcher/phrasematcher.pyi
@@ -1,12 +1,13 @@
-from typing import List, Tuple, Union, Optional, Callable, Any, Dict, overload
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, overload
+
from ..compat import Literal
-from .matcher import Matcher
-from ..vocab import Vocab
from ..tokens import Doc, Span
+from ..vocab import Vocab
+from .matcher import Matcher
class PhraseMatcher:
def __init__(
- self, vocab: Vocab, attr: Optional[Union[int, str]], validate: bool = ...
+ self, vocab: Vocab, attr: Optional[Union[int, str]] = ..., validate: bool = ...
) -> None: ...
def __reduce__(self) -> Any: ...
def __len__(self) -> int: ...
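
Besides the import reordering, this stub hunk also fixes the `PhraseMatcher.__init__` signature: `attr` gains an `= ...` default, matching the runtime behavior where the attribute defaults to `ORTH`. A minimal sketch of the call the old stub would have rejected (blank pipeline and pattern are illustrative):

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
# With the corrected stub, omitting `attr` now type-checks;
# at runtime it has always defaulted to "ORTH".
matcher = PhraseMatcher(nlp.vocab)
matcher.add("OBAMA", [nlp("Barack Obama")])
matches = matcher(nlp("Barack Obama visited Berlin."))  # [(match_id, start, end)]
```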
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 382029872..c407cf1cc 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -1,18 +1,20 @@
# cython: infer_types=True, profile=True
from libc.stdint cimport uintptr_t
-from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
+from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
import warnings
-from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, MORPH
+from ..attrs cimport DEP, LEMMA, MORPH, ORTH, POS, TAG
+
from ..attrs import IDS
+
from ..structs cimport TokenC
-from ..tokens.token cimport Token
from ..tokens.span cimport Span
+from ..tokens.token cimport Token
from ..typedefs cimport attr_t
-from ..schemas import TokenPattern
from ..errors import Errors, Warnings
+from ..schemas import TokenPattern
cdef class PhraseMatcher:
diff --git a/spacy/ml/_character_embed.py b/spacy/ml/_character_embed.py
index e46735102..89c836144 100644
--- a/spacy/ml/_character_embed.py
+++ b/spacy/ml/_character_embed.py
@@ -1,4 +1,5 @@
from typing import List
+
from thinc.api import Model
from thinc.types import Floats2d
diff --git a/spacy/ml/callbacks.py b/spacy/ml/callbacks.py
index 3b60ec2ab..e2378a7ba 100644
--- a/spacy/ml/callbacks.py
+++ b/spacy/ml/callbacks.py
@@ -1,8 +1,8 @@
-from typing import Type, Callable, Dict, TYPE_CHECKING, List, Optional, Set
import functools
import inspect
import types
import warnings
+from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Set, Type
from thinc.layers import with_nvtx_range
from thinc.model import Model, wrap_model_recursive
diff --git a/spacy/ml/extract_ngrams.py b/spacy/ml/extract_ngrams.py
index c9c82f369..ce7c585cc 100644
--- a/spacy/ml/extract_ngrams.py
+++ b/spacy/ml/extract_ngrams.py
@@ -1,7 +1,7 @@
from thinc.api import Model
-from ..util import registry
from ..attrs import LOWER
+from ..util import registry
@registry.layers("spacy.extract_ngrams.v1")
diff --git a/spacy/ml/extract_spans.py b/spacy/ml/extract_spans.py
index af6be78db..ac0f5fa1b 100644
--- a/spacy/ml/extract_spans.py
+++ b/spacy/ml/extract_spans.py
@@ -1,6 +1,7 @@
-from typing import List, Tuple, Callable
+from typing import Callable, List, Tuple
+
from thinc.api import Model, to_numpy
-from thinc.types import Ragged, Ints1d
+from thinc.types import Ints1d, Ragged
from ..util import registry
diff --git a/spacy/ml/featureextractor.py b/spacy/ml/featureextractor.py
index ed2918f02..06f1ff51a 100644
--- a/spacy/ml/featureextractor.py
+++ b/spacy/ml/featureextractor.py
@@ -1,6 +1,7 @@
-from typing import List, Union, Callable, Tuple
-from thinc.types import Ints2d
+from typing import Callable, List, Tuple, Union
+
from thinc.api import Model, registry
+from thinc.types import Ints2d
from ..tokens import Doc
diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py
index 9b7628f0e..5125018e5 100644
--- a/spacy/ml/models/__init__.py
+++ b/spacy/ml/models/__init__.py
@@ -1,6 +1,7 @@
from .entity_linker import * # noqa
from .multi_task import * # noqa
from .parser import * # noqa
+from .span_finder import * # noqa
from .spancat import * # noqa
from .tagger import * # noqa
from .textcat import * # noqa
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index 7332ca199..b7100c00a 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -1,16 +1,31 @@
from pathlib import Path
-from typing import Optional, Callable, Iterable, List, Tuple
-from thinc.types import Floats2d
-from thinc.api import chain, list2ragged, reduce_mean, residual
-from thinc.api import Model, Maxout, Linear, tuplify, Ragged
+from typing import Callable, Iterable, List, Optional, Tuple
+
+from thinc.api import (
+ Linear,
+ Maxout,
+ Model,
+ Ragged,
+ chain,
+ list2ragged,
+ reduce_mean,
+ residual,
+ tuplify,
+)
+from thinc.types import Floats2d
-from ...util import registry
-from ...kb import KnowledgeBase, InMemoryLookupKB
-from ...kb import Candidate, get_candidates, get_candidates_batch
-from ...vocab import Vocab
-from ...tokens import Span, Doc
-from ..extract_spans import extract_spans
from ...errors import Errors
+from ...kb import (
+ Candidate,
+ InMemoryLookupKB,
+ KnowledgeBase,
+ get_candidates,
+ get_candidates_batch,
+)
+from ...tokens import Doc, Span
+from ...util import registry
+from ...vocab import Vocab
+from ..extract_spans import extract_spans
@registry.architectures("spacy.EntityLinker.v2")
diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py
index 7eb13b608..b7faf1cd7 100644
--- a/spacy/ml/models/multi_task.py
+++ b/spacy/ml/models/multi_task.py
@@ -1,22 +1,33 @@
-from typing import Any, Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING, cast
-from thinc.types import Floats2d, Ints1d
-from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
-from thinc.api import MultiSoftmax, list2array
-from thinc.api import to_categorical, CosineDistance, L2Distance
-from thinc.loss import Loss
-
-from ...util import registry, OOV_RANK
-from ...errors import Errors
-from ...attrs import ID, ORTH
-from ...vectors import Mode as VectorsMode
+from functools import partial
+from typing import TYPE_CHECKING, Any, Callable, Iterable, List, Optional, Tuple, cast
import numpy
-from functools import partial
+from thinc.api import (
+ CosineDistance,
+ L2Distance,
+ LayerNorm,
+ Linear,
+ Maxout,
+ Model,
+ MultiSoftmax,
+ Softmax,
+ chain,
+ list2array,
+ to_categorical,
+ zero_init,
+)
+from thinc.loss import Loss
+from thinc.types import Floats2d, Ints1d
+
+from ...attrs import ID, ORTH
+from ...errors import Errors
+from ...util import OOV_RANK, registry
+from ...vectors import Mode as VectorsMode
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
- from ...vocab import Vocab # noqa: F401
from ...tokens.doc import Doc # noqa: F401
+ from ...vocab import Vocab # noqa: F401
@registry.architectures("spacy.PretrainVectors.v1")
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index a70d84dea..f6c0e565d 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,13 +1,14 @@
-from typing import Optional, List, cast
-from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
+from typing import List, Optional, cast
+
+from thinc.api import Linear, Model, chain, list2array, use_ops, zero_init
from thinc.types import Floats2d
-from ...errors import Errors
from ...compat import Literal
+from ...errors import Errors
+from ...tokens import Doc
from ...util import registry
from .._precomputable_affine import PrecomputableAffine
from ..tb_framework import TransitionModel
-from ...tokens import Doc
@registry.architectures("spacy.TransitionBasedParser.v2")
diff --git a/spacy/ml/models/span_finder.py b/spacy/ml/models/span_finder.py
new file mode 100644
index 000000000..d327fc761
--- /dev/null
+++ b/spacy/ml/models/span_finder.py
@@ -0,0 +1,41 @@
+from typing import Callable, List, Tuple
+
+from thinc.api import Model, chain, with_array
+from thinc.types import Floats1d, Floats2d
+
+from ...tokens import Doc
+from ...util import registry
+
+InT = List[Doc]
+OutT = Floats2d
+
+
+@registry.architectures("spacy.SpanFinder.v1")
+def build_finder_model(
+ tok2vec: Model[InT, List[Floats2d]], scorer: Model[OutT, OutT]
+) -> Model[InT, OutT]:
+
+ logistic_layer: Model[List[Floats2d], List[Floats2d]] = with_array(scorer)
+ model: Model[InT, OutT] = chain(tok2vec, logistic_layer, flattener())
+ model.set_ref("tok2vec", tok2vec)
+ model.set_ref("scorer", scorer)
+ model.set_ref("logistic_layer", logistic_layer)
+
+ return model
+
+
+def flattener() -> Model[List[Floats2d], Floats2d]:
+ """Flattens the input to a 1-dimensional list of scores"""
+
+ def forward(
+ model: Model[Floats1d, Floats1d], X: List[Floats2d], is_train: bool
+ ) -> Tuple[Floats2d, Callable[[Floats2d], List[Floats2d]]]:
+ lens = model.ops.asarray1i([len(doc) for doc in X])
+ Y = model.ops.flatten(X)
+
+ def backprop(dY: Floats2d) -> List[Floats2d]:
+ return model.ops.unflatten(dY, lens)
+
+ return Y, backprop
+
+ return Model("Flattener", forward=forward)
diff --git a/spacy/ml/models/spancat.py b/spacy/ml/models/spancat.py
index 893db2e6d..140ec553a 100644
--- a/spacy/ml/models/spancat.py
+++ b/spacy/ml/models/spancat.py
@@ -1,11 +1,24 @@
from typing import List, Tuple, cast
-from thinc.api import Model, with_getitem, chain, list2ragged, Logistic
-from thinc.api import Maxout, Linear, concatenate, glorot_uniform_init
-from thinc.api import reduce_mean, reduce_max, reduce_first, reduce_last
-from thinc.types import Ragged, Floats2d
-from ...util import registry
+from thinc.api import (
+ Linear,
+ Logistic,
+ Maxout,
+ Model,
+ chain,
+ concatenate,
+ glorot_uniform_init,
+ list2ragged,
+ reduce_first,
+ reduce_last,
+ reduce_max,
+ reduce_mean,
+ with_getitem,
+)
+from thinc.types import Floats2d, Ragged
+
from ...tokens import Doc
+from ...util import registry
from ..extract_spans import extract_spans
diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py
index 9f8ef7b2b..8f1554fab 100644
--- a/spacy/ml/models/tagger.py
+++ b/spacy/ml/models/tagger.py
@@ -1,9 +1,10 @@
-from typing import Optional, List
-from thinc.api import zero_init, with_array, Softmax_v2, chain, Model
+from typing import List, Optional
+
+from thinc.api import Model, Softmax_v2, chain, with_array, zero_init
from thinc.types import Floats2d
-from ...util import registry
from ...tokens import Doc
+from ...util import registry
@registry.architectures("spacy.Tagger.v2")
diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index 9c7e607fe..ab14110d2 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -1,22 +1,39 @@
-from typing import Optional, List, cast
from functools import partial
+from typing import List, Optional, cast
-from thinc.types import Floats2d
-from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
-from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
-from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
-from thinc.api import with_cpu, Relu, residual, LayerNorm, resizable
+from thinc.api import (
+ Dropout,
+ LayerNorm,
+ Linear,
+ Logistic,
+ Maxout,
+ Model,
+ ParametricAttention,
+ Relu,
+ Softmax,
+ SparseLinear,
+ chain,
+ clone,
+ concatenate,
+ list2ragged,
+ reduce_mean,
+ reduce_sum,
+ residual,
+ resizable,
+ softmax_activation,
+ with_cpu,
+)
from thinc.layers.chain import init as init_chain
-from thinc.layers.resizable import resize_model, resize_linear_weighted
+from thinc.layers.resizable import resize_linear_weighted, resize_model
+from thinc.types import Floats2d
from ...attrs import ORTH
+from ...tokens import Doc
from ...util import registry
from ..extract_ngrams import extract_ngrams
from ..staticvectors import StaticVectors
-from ...tokens import Doc
from .tok2vec import get_tok2vec_width
-
NEG_VALUE = -5000
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 30c7360ff..2e9d21ef4 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -1,17 +1,32 @@
-from typing import Optional, List, Union, cast
-from thinc.types import Floats2d, Ints2d, Ragged, Ints1d
-from thinc.api import chain, clone, concatenate, with_array, with_padded
-from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
-from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
+from typing import List, Optional, Union, cast
-from ...tokens import Doc
-from ...util import registry
+from thinc.api import (
+ HashEmbed,
+ Maxout,
+ Mish,
+ Model,
+ PyTorchLSTM,
+ chain,
+ clone,
+ concatenate,
+ expand_window,
+ list2ragged,
+ noop,
+ ragged2list,
+ residual,
+ with_array,
+ with_padded,
+)
+from thinc.types import Floats2d, Ints1d, Ints2d, Ragged
+
+from ...attrs import intify_attr
from ...errors import Errors
from ...ml import _character_embed
-from ..staticvectors import StaticVectors
-from ..featureextractor import FeatureExtractor
from ...pipeline.tok2vec import Tok2VecListener
-from ...attrs import intify_attr
+from ...tokens import Doc
+from ...util import registry
+from ..featureextractor import FeatureExtractor
+from ..staticvectors import StaticVectors
@registry.architectures("spacy.Tok2VecListener.v1")
diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd
index 8def6cea5..ca31c1699 100644
--- a/spacy/ml/parser_model.pxd
+++ b/spacy/ml/parser_model.pxd
@@ -1,7 +1,8 @@
-from libc.string cimport memset, memcpy
+from libc.string cimport memcpy, memset
from thinc.backends.cblas cimport CBlas
-from ..typedefs cimport weight_t, hash_t
+
from ..pipeline._parser_internals._state cimport StateC
+from ..typedefs cimport hash_t, weight_t
cdef struct SizesC:
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
index 961bf4d70..5cffc4c2d 100644
--- a/spacy/ml/parser_model.pyx
+++ b/spacy/ml/parser_model.pyx
@@ -1,19 +1,20 @@
# cython: infer_types=True, cdivision=True, boundscheck=False
cimport numpy as np
from libc.math cimport exp
-from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc
-from thinc.backends.linalg cimport Vec, VecVec
+from libc.string cimport memcpy, memset
from thinc.backends.cblas cimport saxpy, sgemm
+from thinc.backends.linalg cimport Vec, VecVec
import numpy
import numpy.random
-from thinc.api import Model, CupyOps, NumpyOps, get_ops
+from thinc.api import CupyOps, Model, NumpyOps, get_ops
from .. import util
from ..errors import Errors
-from ..typedefs cimport weight_t, class_t, hash_t
+
from ..pipeline._parser_internals.stateclass cimport StateClass
+from ..typedefs cimport class_t, hash_t, weight_t
cdef WeightsC get_c_weights(model) except *:
diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py
index 04cfe912d..6fcb13ad0 100644
--- a/spacy/ml/staticvectors.py
+++ b/spacy/ml/staticvectors.py
@@ -1,11 +1,12 @@
-from typing import List, Tuple, Callable, Optional, Sequence, cast
-from thinc.initializers import glorot_uniform_init
-from thinc.util import partial
-from thinc.types import Ragged, Floats2d, Floats1d, Ints1d
-from thinc.api import Model, Ops, registry
+from typing import Callable, List, Optional, Sequence, Tuple, cast
+
+from thinc.api import Model, Ops, registry
+from thinc.initializers import glorot_uniform_init
+from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
+from thinc.util import partial
-from ..tokens import Doc
from ..errors import Errors
+from ..tokens import Doc
from ..vectors import Mode
from ..vocab import Vocab
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
index ab4a969e2..e351ad4e5 100644
--- a/spacy/ml/tb_framework.py
+++ b/spacy/ml/tb_framework.py
@@ -1,6 +1,7 @@
from thinc.api import Model, noop
-from .parser_model import ParserStepModel
+
from ..util import registry
+from .parser_model import ParserStepModel
@registry.layers("spacy.TransitionModel.v1")
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index 8d449d065..968764b82 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -1,10 +1,10 @@
-from cymem.cymem cimport Pool
-from preshed.maps cimport PreshMap
cimport numpy as np
+from cymem.cymem cimport Pool
from libc.stdint cimport uint64_t
+from preshed.maps cimport PreshMap
-from .structs cimport MorphAnalysisC
from .strings cimport StringStore
+from .structs cimport MorphAnalysisC
from .typedefs cimport attr_t, hash_t
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index c3ffc46a1..1062fff09 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -1,12 +1,13 @@
# cython: infer_types
-import numpy
import warnings
+import numpy
+
from .attrs cimport POS
-from .parts_of_speech import IDS as POS_IDS
-from .errors import Warnings
from . import symbols
+from .errors import Warnings
+from .parts_of_speech import IDS as POS_IDS
cdef class Morphology:
diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd
index 0bf5b4789..a0b2567f1 100644
--- a/spacy/parts_of_speech.pxd
+++ b/spacy/parts_of_speech.pxd
@@ -1,5 +1,6 @@
from . cimport symbols
+
cpdef enum univ_pos_t:
NO_TAG = 0
ADJ = symbols.ADJ
diff --git a/spacy/pipe_analysis.py b/spacy/pipe_analysis.py
index 245747061..d26884487 100644
--- a/spacy/pipe_analysis.py
+++ b/spacy/pipe_analysis.py
@@ -1,8 +1,9 @@
-from typing import List, Set, Dict, Iterable, ItemsView, Union, TYPE_CHECKING
+from typing import TYPE_CHECKING, Dict, ItemsView, Iterable, List, Set, Union
+
from wasabi import msg
-from .tokens import Doc, Token, Span
from .errors import Errors
+from .tokens import Doc, Span, Token
from .util import dot_to_dict
if TYPE_CHECKING:
diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 26931606b..40e3fd638 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -2,21 +2,22 @@ from .attributeruler import AttributeRuler
from .dep_parser import DependencyParser
from .edit_tree_lemmatizer import EditTreeLemmatizer
from .entity_linker import EntityLinker
-from .ner import EntityRecognizer
from .entityruler import EntityRuler
+from .functions import merge_entities, merge_noun_chunks, merge_subtokens
from .lemmatizer import Lemmatizer
from .morphologizer import Morphologizer
+from .ner import EntityRecognizer
from .pipe import Pipe
-from .trainable_pipe import TrainablePipe
-from .senter import SentenceRecognizer
from .sentencizer import Sentencizer
+from .senter import SentenceRecognizer
+from .span_finder import SpanFinder
+from .span_ruler import SpanRuler
+from .spancat import SpanCategorizer
from .tagger import Tagger
from .textcat import TextCategorizer
-from .spancat import SpanCategorizer
-from .span_ruler import SpanRuler
from .textcat_multilabel import MultiLabel_TextCategorizer
from .tok2vec import Tok2Vec
-from .functions import merge_entities, merge_noun_chunks, merge_subtokens
+from .trainable_pipe import TrainablePipe
__all__ = [
"AttributeRuler",
@@ -31,6 +32,7 @@ __all__ = [
"SentenceRecognizer",
"Sentencizer",
"SpanCategorizer",
+ "SpanFinder",
"SpanRuler",
"Tagger",
"TextCategorizer",
diff --git a/spacy/pipeline/_edit_tree_internals/edit_trees.pxd b/spacy/pipeline/_edit_tree_internals/edit_trees.pxd
index dc4289f37..3d63af921 100644
--- a/spacy/pipeline/_edit_tree_internals/edit_trees.pxd
+++ b/spacy/pipeline/_edit_tree_internals/edit_trees.pxd
@@ -2,8 +2,9 @@ from libc.stdint cimport uint32_t, uint64_t
from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector
-from ...typedefs cimport attr_t, hash_t, len_t
from ...strings cimport StringStore
+from ...typedefs cimport attr_t, hash_t, len_t
+
cdef extern from "