Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, reordering blocks in the config would also change the component order, which is undesirable. It also allows the initial config to define more components than are actually used in the pipeline (see the config sketch below, after the commit metadata).

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
Ines Montani 2020-07-22 13:42:59 +02:00 committed by GitHub
parent 311d0bde29
commit 43b960c01b
179 changed files with 6946 additions and 4619 deletions
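
The commit message above describes splitting component definitions from component order. A minimal sketch of that layout (section contents are illustrative, not the shipped defaults), parsed with Thinc's Config as elsewhere in this commit:

from thinc.api import Config

# [components] holds one block per component definition; [nlp] only lists
# which of them to use and in what order. Reordering the [components] blocks
# therefore no longer changes the pipeline order, and unused blocks are allowed.
cfg = Config().from_str("""
[nlp]
lang = "en"
pipeline = ["tagger"]

[components]

[components.tagger]
@factories = "tagger"

[components.parser]
@factories = "parser"
""")
# Only "tagger" ends up in the pipeline; the "parser" block is defined but unused.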

View File

@ -17,7 +17,6 @@ import plac
import random
from pathlib import Path
import spacy
from spacy.kb import KnowledgeBase
from spacy.gold import Example
from spacy.pipeline import EntityRuler
@ -82,12 +81,16 @@ def main(kb_path, vocab_path, output_dir=None, n_iter=50):
# Create the Entity Linker component and add it to the pipeline.
if "entity_linker" not in nlp.pipe_names:
kb = KnowledgeBase(vocab=nlp.vocab)
kb.load_bulk(kb_path)
print("Loaded Knowledge Base from '%s'" % kb_path)
# use only the predicted EL score and not the prior probability (for demo purposes)
cfg = {"kb": kb, "incl_prior": False}
print("Loading Knowledge Base from '%s'" % kb_path)
cfg = {
"kb": {
"@assets": "spacy.KBFromFile.v1",
"vocab_path": vocab_path,
"kb_path": kb_path,
},
# use only the predicted EL score and not the prior probability (for demo purposes)
"incl_prior": False,
}
entity_linker = nlp.create_pipe("entity_linker", cfg)
nlp.add_pipe(entity_linker, last=True)
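
The "@assets": "spacy.KBFromFile.v1" reference above points at a registered function that builds the knowledge base instead of passing a KnowledgeBase object through the config. A rough sketch of such a loader, assuming the registry exposes an assets table matching the "@assets" key (the decorator name and signature are inferred from this config, not the exact implementation):

from spacy.kb import KnowledgeBase
from spacy.util import registry
from spacy.vocab import Vocab

@registry.assets("spacy.KBFromFile.v1")  # table name assumed from "@assets"
def load_kb(vocab_path: str, kb_path: str) -> KnowledgeBase:
    vocab = Vocab().from_disk(vocab_path)
    kb = KnowledgeBase(vocab=vocab)
    kb.load_bulk(kb_path)
    return kb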

View File

@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a18,<8.0.0a20",
"thinc>=8.0.0a19,<8.0.0a30",
"blis>=0.4.0,<0.5.0",
"pytokenizations"
]

View File

@ -1,11 +1,11 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a18,<8.0.0a20
thinc>=8.0.0a19,<8.0.0a30
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0
wasabi>=0.7.0,<1.1.0
wasabi>=0.7.1,<1.1.0
srsly>=2.1.0,<3.0.0
catalogue>=0.0.7,<1.1.0
typer>=0.3.0,<0.4.0

View File

@ -34,15 +34,15 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=8.0.0a18,<8.0.0a20
thinc>=8.0.0a19,<8.0.0a30
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a18,<8.0.0a20
thinc>=8.0.0a19,<8.0.0a30
blis>=0.4.0,<0.5.0
wasabi>=0.7.0,<1.1.0
wasabi>=0.7.1,<1.1.0
srsly>=2.1.0,<3.0.0
catalogue>=0.0.7,<1.1.0
typer>=0.3.0,<0.4.0

View File

@ -32,8 +32,14 @@ MOD_NAMES = [
"spacy.attrs",
"spacy.kb",
"spacy.morphology",
"spacy.pipeline.pipes",
"spacy.pipeline.dep_parser",
"spacy.pipeline.morphologizer",
"spacy.pipeline.multitask",
"spacy.pipeline.ner",
"spacy.pipeline.pipe",
"spacy.pipeline.sentencizer",
"spacy.pipeline.senter",
"spacy.pipeline.tagger",
"spacy.syntax.stateclass",
"spacy.syntax._state",
"spacy.tokenizer",

View File

@ -14,7 +14,6 @@ from .about import __version__
from .errors import Errors, Warnings
from . import util
from .util import registry
from .language import component
if sys.maxunicode == 65535:

View File

@ -63,7 +63,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
result = {}
while args:
opt = args.pop(0)
err = f"Invalid config override '{opt}'"
err = f"Invalid CLI argument '{opt}'"
if opt.startswith("--"): # new argument
opt = opt.replace("--", "").replace("-", "_")
if "." not in opt:
@ -73,7 +73,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
else:
value = args.pop(0)
# Just like we do in the config, we're calling json.loads on the
# values. But since they come from the CLI, it'd b unintuitive to
# values. But since they come from the CLI, it'd be unintuitive to
# explicitly mark strings with escaped quotes. So we're working
# around that here by falling back to a string if parsing fails.
# TODO: improve logic to handle simple types like list of strings?
@ -82,7 +82,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
except ValueError:
result[opt] = str(value)
else:
msg.fail(f"{err}: options need to start with --", exits=1)
msg.fail(f"{err}: override option should start with --", exits=1)
return result
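
For context, a simplified sketch of what this parsing produces; it mirrors the json.loads fallback above but drops the error handling:

import json

def parse_overrides(args):
    # "--training.batch_size 128" -> {"training.batch_size": 128}
    result = {}
    while args:
        opt = args.pop(0).replace("--", "").replace("-", "_")
        value = "true" if not args or args[0].startswith("--") else args.pop(0)
        try:
            result[opt] = json.loads(value)
        except ValueError:
            result[opt] = str(value)  # fall back to a plain string
    return result

assert parse_overrides(["--training.batch_size", "128", "--nlp.lang", "en"]) == {
    "training.batch_size": 128,
    "nlp.lang": "en",
}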

View File

@ -3,12 +3,12 @@ from pathlib import Path
from collections import Counter
import sys
import srsly
from wasabi import Printer, MESSAGES, msg
from wasabi import Printer, MESSAGES, msg, diff_strings
import typer
from thinc.api import Config
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
from ._util import import_code, debug_cli
from ..schemas import ConfigSchema
from ..gold import Corpus, Example
from ..syntax import nonproj
from ..language import Language
@ -33,6 +33,9 @@ def debug_config_cli(
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
output_path: Optional[Path] = Opt(None, "--output", "-o", help="Output path for filled config or '-' for standard output", allow_dash=True),
auto_fill: bool = Opt(False, "--auto-fill", "-F", help="Whether or not to auto-fill the config with built-in defaults if possible"),
diff: bool = Opt(False, "--diff", "-D", help="Show a visual diff if config was auto-filled")
# fmt: on
):
"""Debug a config.cfg file and show validation errors. The command will
@ -40,14 +43,37 @@ def debug_config_cli(
validation errors are blocking and will prevent the rest of the config from
being resolved. This means that you may not see all validation errors at
once and some issues are only shown once previous errors have been fixed.
As with the 'train' command, you can override settings from the config
as command line options. For instance, --training.batch_size 128 overrides
the value of "batch_size" in the block "[training]".
"""
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
with show_validation_error():
util.load_config(
config_path, create_objects=False, schema=ConfigSchema, overrides=overrides,
)
msg.good("Config is valid")
config = Config().from_disk(config_path)
try:
nlp, _ = util.load_model_from_config(
config, overrides=overrides, auto_fill=auto_fill
)
except ValueError as e:
msg.fail(str(e), exits=1)
is_stdout = output_path is not None and str(output_path) == "-"
if auto_fill:
orig_config = config.to_str()
filled_config = nlp.config.to_str()
if orig_config == filled_config:
msg.good("Original config is valid, no values were auto-filled")
else:
msg.good("Auto-filled config is valid")
if diff:
print(diff_strings(config.to_str(), nlp.config.to_str()))
else:
msg.good("Original config is valid", show=not is_stdout)
if is_stdout:
print(nlp.config.to_str())
elif output_path is not None:
nlp.config.to_disk(output_path)
msg.good(f"Saved updated config to {output_path}")
@debug_cli.command(
@ -117,16 +143,13 @@ def debug_data(
if not config_path.exists():
msg.fail("Config file not found", config_path, exits=1)
with show_validation_error():
config = util.load_config(
config_path,
create_objects=False,
schema=ConfigSchema,
overrides=config_overrides,
)
nlp = util.load_model_from_config(config["nlp"])
cfg = Config().from_disk(config_path)
nlp, config = util.load_model_from_config(cfg, overrides=config_overrides)
# TODO: handle base model
lang = config["nlp"]["lang"]
base_model = config["nlp"]["base_model"]
pipeline = list(config["nlp"]["pipeline"].keys())
base_model = config["training"]["base_model"]
pipeline = nlp.pipe_names
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
tag_map_path = util.ensure_path(config["training"]["tag_map"])
tag_map = {}
if tag_map_path is not None:
@ -164,19 +187,17 @@ def debug_data(
msg.good("Corpus is loadable")
# Create all gold data here to avoid iterating over the train_dataset constantly
gold_train_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=True)
gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
gold_train_unpreprocessed_data = _compile_gold(
train_dataset, pipeline, nlp, make_proj=False
train_dataset, factory_names, nlp, make_proj=False
)
gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp, make_proj=True)
gold_dev_data = _compile_gold(dev_dataset, factory_names, nlp, make_proj=True)
train_texts = gold_train_data["texts"]
dev_texts = gold_dev_data["texts"]
msg.divider("Training stats")
msg.text(f"Training pipeline: {', '.join(pipeline)}")
for pipe in [p for p in pipeline if p not in nlp.factories]:
msg.fail(f"Pipeline component '{pipe}' not available in factories")
if base_model:
msg.text(f"Starting with base model '{base_model}'")
else:
@ -244,7 +265,7 @@ def debug_data(
else:
msg.info("No word vectors present in the model")
if "ner" in pipeline:
if "ner" in factory_names:
# Get all unique NER labels present in the data
labels = set(
label for label in gold_train_data["ner"] if label not in ("O", "-", None)
@ -332,7 +353,7 @@ def debug_data(
"with punctuation can not be trained with a noise level > 0."
)
if "textcat" in pipeline:
if "textcat" in factory_names:
msg.divider("Text Classification")
labels = [label for label in gold_train_data["cats"]]
model_labels = _get_labels_from_model(nlp, "textcat")
@ -379,7 +400,7 @@ def debug_data(
"contains only instances with mutually-exclusive classes."
)
if "tagger" in pipeline:
if "tagger" in factory_names:
msg.divider("Part-of-speech Tagging")
labels = [label for label in gold_train_data["tags"]]
tag_map = nlp.vocab.morphology.tag_map
@ -394,7 +415,7 @@ def debug_data(
for label in non_tagmap:
msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")
if "parser" in pipeline:
if "parser" in factory_names:
has_low_data_warning = False
msg.divider("Dependency Parsing")
@ -541,7 +562,10 @@ def _load_file(file_path: Path, msg: Printer) -> None:
def _compile_gold(
examples: Sequence[Example], pipeline: List[str], nlp: Language, make_proj: bool
examples: Sequence[Example],
factory_names: List[str],
nlp: Language,
make_proj: bool,
) -> Dict[str, Any]:
data = {
"ner": Counter(),
@ -573,7 +597,7 @@ def _compile_gold(
for word in valid_words:
if nlp.vocab.strings[word] not in nlp.vocab.vectors:
data["words_missing_vectors"].update([word])
if "ner" in pipeline:
if "ner" in factory_names:
for i, label in enumerate(eg.get_aligned_ner()):
if label is None:
continue
@ -595,14 +619,14 @@ def _compile_gold(
data["ner"][combined_label] += 1
elif label == "-":
data["ner"]["-"] += 1
if "textcat" in pipeline:
if "textcat" in factory_names:
data["cats"].update(gold.cats)
if list(gold.cats.values()).count(1.0) != 1:
data["n_cats_multilabel"] += 1
if "tagger" in pipeline:
if "tagger" in factory_names:
tags = eg.get_aligned("TAG", as_string=True)
data["tags"].update([x for x in tags if x is not None])
if "parser" in pipeline:
if "parser" in factory_names:
aligned_heads, aligned_deps = eg.get_aligned_parse(projectivize=make_proj)
data["deps"].update([x for x in aligned_deps if x is not None])
for i, (dep, head) in enumerate(zip(aligned_deps, aligned_heads)):

View File

@ -1,8 +1,11 @@
from typing import Dict, Any, Optional
from pathlib import Path
from wasabi import msg
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam, Config
from thinc.api import Model
import typer
from ._util import Arg, Opt, debug_cli
from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides
from .. import util
from ..lang.en import English
@ -10,8 +13,10 @@ from ..lang.en import English
@debug_cli.command("model")
def debug_model_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
layers: str = Opt("", "--layers", "-l", help="Comma-separated names of pipeline components to train"),
section: str = Arg(..., help="Section that defines the model to be analysed"),
layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"),
dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),
@ -20,14 +25,18 @@ def debug_model_cli(
P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
P3: bool = Opt(True, "--print-step3", "-P3", help="Print final predictions"),
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
seed: int = Opt(None, "--seed", "-s", help="Use GPU"),
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU")
# fmt: on
):
"""
Analyze a Thinc model implementation. Includes checks for internal structure
and activations during training.
"""
if use_gpu >= 0:
msg.info("Using GPU")
require_gpu(use_gpu)
else:
msg.info("Using CPU")
print_settings = {
"dimensions": dimensions,
"parameters": parameters,
@ -39,27 +48,47 @@ def debug_model_cli(
"print_after_training": P2,
"print_prediction": P3,
}
config_overrides = parse_config_overrides(ctx.args)
cfg = Config().from_disk(config_path)
with show_validation_error():
try:
_, config = util.load_model_from_config(cfg, overrides=config_overrides)
except ValueError as e:
msg.fail(str(e), exits=1)
seed = config["pretraining"]["seed"]
if seed is not None:
msg.info(f"Fixing random seed: {seed}")
fix_random_seed(seed)
if use_gpu >= 0:
msg.info(f"Using GPU: {use_gpu}")
require_gpu(use_gpu)
component = config
parts = section.split(".")
for item in parts:
try:
component = component[item]
except KeyError:
msg.fail(
f"The section '{section}' is not a valid section in the provided config.",
exits=1,
)
if hasattr(component, "model"):
model = component.model
else:
msg.info(f"Using CPU")
debug_model(
config_path, print_settings=print_settings,
)
msg.fail(
f"The section '{section}' does not specify an object that holds a Model.",
exits=1,
)
debug_model(model, print_settings=print_settings)
def debug_model(config_path: Path, *, print_settings=None):
def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None):
if not isinstance(model, Model):
msg.fail(
f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
exits=1,
)
if print_settings is None:
print_settings = {}
model = util.load_config(config_path, create_objects=True)["model"]
# STEP 0: Printing before training
msg.info(f"Analysing model with ID {model.id}")
if print_settings.get("print_before_training"):
@ -67,7 +96,9 @@ def debug_model(config_path: Path, *, print_settings=None):
_print_model(model, print_settings)
# STEP 1: Initializing the model and printing again
model.initialize(X=_get_docs(), Y=_get_output(model.ops.xp))
Y = _get_output(model.ops.xp)
_set_output_dim(nO=Y.shape[-1], model=model)
model.initialize(X=_get_docs(), Y=Y)
if print_settings.get("print_after_init"):
msg.info(f"After initialization:")
_print_model(model, print_settings)
@ -110,12 +141,16 @@ def _get_docs():
def _get_output(xp):
return xp.asarray(
[
xp.asarray([i + 10, i + 20, i + 30], dtype="float32")
for i, _ in enumerate(_get_docs())
]
)
return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32")
def _set_output_dim(model, nO):
# Dimension inference doesn't always work 100%, so we need this hack, as in pipe.pyx
if model.has_dim("nO") is None:
model.set_dim("nO", nO)
if model.has_ref("output_layer"):
if model.get_ref("output_layer").has_dim("nO") is None:
model.get_ref("output_layer").set_dim("nO", nO)
def _print_model(model, print_settings):

View File

@ -105,9 +105,10 @@ def evaluate(
print_textcats_auc_per_cat(msg, scorer.textcats_auc_per_cat)
if displacy_path:
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
docs = [ex.predicted for ex in dev_dataset]
render_deps = "parser" in nlp.meta.get("pipeline", [])
render_ents = "ner" in nlp.meta.get("pipeline", [])
render_deps = "parser" in factory_names
render_ents = "ner" in factory_names
render_parses(
docs,
displacy_path,

View File

@ -84,7 +84,6 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
msg.fail("Can't find model meta.json", meta_path, exits=1)
meta = srsly.read_json(meta_path)
if model_path.resolve() != model_path:
meta["link"] = str(model_path)
meta["source"] = str(model_path.resolve())
else:
meta["source"] = str(model_path)

View File

@ -125,7 +125,6 @@ def get_meta(
meta.update(existing_meta)
nlp = util.load_model_from_path(Path(model_path))
meta["spacy_version"] = util.get_model_version_range(about.__version__)
meta["pipeline"] = nlp.pipe_names
meta["vectors"] = {
"width": nlp.vocab.vectors_length,
"vectors": len(nlp.vocab.vectors),

View File

@ -5,7 +5,7 @@ import time
import re
from collections import Counter
from pathlib import Path
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
from thinc.api import use_pytorch_for_gpu_memory, require_gpu, Config
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
from thinc.api import CosineDistance, L2Distance
from wasabi import msg
@ -15,7 +15,6 @@ import typer
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code
from ..schemas import ConfigSchema
from ..errors import Errors
from ..ml.models.multi_task import build_cloze_multi_task_model
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
@ -37,6 +36,7 @@ def pretrain_cli(
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU"),
# fmt: on
):
"""
@ -67,6 +67,7 @@ def pretrain_cli(
config_overrides=overrides,
resume_path=resume_path,
epoch_resume=epoch_resume,
use_gpu=use_gpu,
)
@ -77,40 +78,29 @@ def pretrain(
config_overrides: Dict[str, Any] = {},
resume_path: Optional[Path] = None,
epoch_resume: Optional[int] = None,
use_gpu: int = -1,
):
verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume)
msg.info(f"Loading config from: {config_path}")
with show_validation_error():
config = util.load_config(
config_path,
create_objects=False,
validate=True,
schema=ConfigSchema,
overrides=config_overrides,
)
if not output_dir.exists():
output_dir.mkdir()
msg.good(f"Created output directory: {output_dir}")
use_gpu = config["training"]["use_gpu"]
if use_gpu >= 0:
msg.info("Using GPU")
require_gpu(use_gpu)
else:
msg.info("Using CPU")
msg.info(f"Loading config from: {config_path}")
config = Config().from_disk(config_path)
with show_validation_error():
nlp, config = util.load_model_from_config(config, overrides=config_overrides)
# TODO: validate that [pretraining] block exists
if not output_dir.exists():
output_dir.mkdir()
msg.good(f"Created output directory: {output_dir}")
seed = config["pretraining"]["seed"]
if seed is not None:
fix_random_seed(seed)
if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]:
use_pytorch_for_gpu_memory()
nlp_config = config["nlp"]
srsly.write_json(output_dir / "config.json", config)
config.to_disk(output_dir / "config.cfg")
msg.good("Saved config file in the output directory")
config = util.load_config(config_path, create_objects=True)
nlp = util.load_model_from_config(nlp_config)
pretrain_config = config["pretraining"]
if texts_loc != "-": # reading from a file

View File

@ -25,7 +25,7 @@ def profile_cli(
# fmt: on
):
"""
Profile a spaCy pipeline, to find out which functions take the most time.
Profile which functions take the most time in a spaCy pipeline.
Input should be formatted as one JSON object per line with a key "text".
It can either be provided as a JSONL file, or be read from sys.stdin.
If no input file is specified, the IMDB dataset is loaded via Thinc.

View File

@ -1,4 +1,4 @@
from typing import Optional, Dict, Any
from typing import Optional, Dict, Any, Tuple, Union, Callable, List
from timeit import default_timer as timer
import srsly
import tqdm
@ -7,6 +7,7 @@ from wasabi import msg
import thinc
import thinc.schedules
from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
from thinc.api import Config, Optimizer
import random
import typer
@ -14,18 +15,15 @@ from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code
from ..gold import Corpus, Example
from ..lookups import Lookups
from ..language import Language
from .. import util
from ..errors import Errors
from ..schemas import ConfigSchema
# Don't remove - required to load the built-in architectures
from ..ml import models # noqa: F401
registry = util.registry
@app.command(
"train", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
)
@ -38,6 +36,8 @@ def train_cli(
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU"),
resume: bool = Opt(False, "--resume", "-R", help="Resume training"),
# fmt: on
):
"""
@ -53,9 +53,7 @@ def train_cli(
referenced in the config.
"""
util.set_env_log(verbose)
verify_cli_args(
train_path=train_path, dev_path=dev_path, config_path=config_path,
)
verify_cli_args(train_path, dev_path, config_path)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
train(
@ -63,6 +61,8 @@ def train_cli(
{"train": train_path, "dev": dev_path},
output_path=output_path,
config_overrides=overrides,
use_gpu=use_gpu,
resume_training=resume,
)
@ -72,63 +72,53 @@ def train(
raw_text: Optional[Path] = None,
output_path: Optional[Path] = None,
config_overrides: Dict[str, Any] = {},
use_gpu: int = -1,
resume_training: bool = False,
) -> None:
msg.info(f"Loading config from: {config_path}")
# Read the config first without creating objects, to get to the original nlp_config
with show_validation_error():
config = util.load_config(
config_path,
create_objects=False,
schema=ConfigSchema,
overrides=config_overrides,
)
use_gpu = config["training"]["use_gpu"]
if use_gpu >= 0:
msg.info(f"Using GPU: {use_gpu}")
require_gpu(use_gpu)
else:
msg.info("Using CPU")
msg.info(f"Loading config and nlp from: {config_path}")
config = Config().from_disk(config_path)
with show_validation_error():
nlp, config = util.load_model_from_config(config, overrides=config_overrides)
if config["training"]["base_model"]:
base_nlp = util.load_model(config["training"]["base_model"])
# TODO: do something to check base_nlp against regular nlp described in config?
nlp = base_nlp
verify_config(nlp)
raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
if config["training"]["seed"] is not None:
fix_random_seed(config["training"]["seed"])
if config["training"].get("use_pytorch_for_gpu_memory"):
if config["training"]["use_pytorch_for_gpu_memory"]:
# It feels kind of weird to not have a default for this.
use_pytorch_for_gpu_memory()
nlp_config = config["nlp"]
config = util.load_config(
config_path,
create_objects=True,
schema=ConfigSchema,
overrides=config_overrides,
)
training = config["training"]
msg.info("Creating nlp from config")
nlp = util.load_model_from_config(nlp_config)
optimizer = training["optimizer"]
limit = training["limit"]
corpus = Corpus(data_paths["train"], data_paths["dev"], limit=limit)
if "textcat" in nlp_config["pipeline"]:
verify_textcat_config(nlp, nlp_config)
if training.get("resume", False):
if resume_training:
msg.info("Resuming training")
nlp.resume_training()
else:
msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
train_examples = list(
corpus.train_dataset(
nlp,
shuffle=False,
gold_preproc=training["gold_preproc"],
max_length=training["max_length"],
)
train_examples = corpus.train_dataset(
nlp,
shuffle=False,
gold_preproc=training["gold_preproc"],
max_length=training["max_length"],
)
train_examples = list(train_examples)
nlp.begin_training(lambda: train_examples)
# Replace tag map with provided mapping
nlp.vocab.morphology.load_tag_map(tag_map)
# Load morph rules
nlp.vocab.morphology.load_morph_exceptions(morph_rules)
if tag_map:
# Replace tag map with provided mapping
nlp.vocab.morphology.load_tag_map(tag_map)
if morph_rules:
# Load morph rules
nlp.vocab.morphology.load_morph_exceptions(morph_rules)
# Create empty extra lexeme tables so the data from spacy-lookups-data
# isn't loaded if these features are accessed
@ -151,9 +141,8 @@ def train(
for subpath in tok2vec_path.split("."):
tok2vec = tok2vec.get(subpath)
if not tok2vec:
msg.fail(
f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1,
)
err = f"Could not locate the tok2vec model at {tok2vec_path}"
msg.fail(err, exits=1)
tok2vec.from_bytes(weights_data)
msg.info("Loading training corpus")
@ -169,12 +158,11 @@ def train(
evaluate,
dropout=training["dropout"],
accumulate_gradient=training["accumulate_gradient"],
patience=training.get("patience", 0),
max_steps=training.get("max_steps", 0),
patience=training["patience"],
max_steps=training["max_steps"],
eval_frequency=training["eval_frequency"],
raw_text=raw_text,
)
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
print_row = setup_printer(training, nlp)
@ -209,8 +197,10 @@ def train(
msg.good(f"Saved model to output directory {final_model_path}")
def create_train_batches(nlp, corpus, cfg):
max_epochs = cfg.get("max_epochs", 0)
def create_train_batches(
nlp: Language, corpus: Corpus, cfg: Union[Config, Dict[str, Any]]
):
max_epochs = cfg["max_epochs"]
train_examples = list(
corpus.train_dataset(
nlp,
@ -219,9 +209,8 @@ def create_train_batches(nlp, corpus, cfg):
max_length=cfg["max_length"],
)
)
epoch = 0
batch_strategy = cfg.get("batch_by", "sequences")
batch_strategy = cfg["batch_by"]
while True:
if len(train_examples) == 0:
raise ValueError(Errors.E988)
@ -241,7 +230,6 @@ def create_train_batches(nlp, corpus, cfg):
)
else:
batches = util.minibatch(train_examples, size=cfg["batch_size"])
# make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
try:
first = next(batches)
@ -255,18 +243,20 @@ def create_train_batches(nlp, corpus, cfg):
random.shuffle(train_examples)
def create_evaluation_callback(nlp, optimizer, corpus, cfg):
def evaluate():
dev_examples = list(
corpus.dev_dataset(
nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
)
def create_evaluation_callback(
nlp: Language,
optimizer: Optimizer,
corpus: Corpus,
cfg: Union[Config, Dict[str, Any]],
) -> Callable[[], Tuple[float, Dict[str, float]]]:
def evaluate() -> Tuple[float, Dict[str, float]]:
dev_examples = corpus.dev_dataset(
nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
)
dev_examples = list(dev_examples)
n_words = sum(len(ex.predicted) for ex in dev_examples)
batch_size = cfg.get("evaluation_batch_size", 128)
batch_size = cfg["eval_batch_size"]
start_time = timer()
if optimizer.averages:
with nlp.use_params(optimizer.averages):
scorer = nlp.evaluate(dev_examples, batch_size=batch_size)
@ -280,12 +270,9 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
try:
weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
except KeyError as e:
raise KeyError(
Errors.E983.format(
dict="score_weights", key=str(e), keys=list(scores.keys())
)
)
keys = list(scores.keys())
err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
raise KeyError(err)
scores["speed"] = wps
return weighted_score, scores
@ -293,17 +280,17 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
def train_while_improving(
nlp,
optimizer,
nlp: Language,
optimizer: Optimizer,
train_data,
evaluate,
*,
dropout,
eval_frequency,
accumulate_gradient=1,
patience=0,
max_steps=0,
raw_text=None,
dropout: float,
eval_frequency: int,
accumulate_gradient: int,
patience: int,
max_steps: int,
raw_text: List[Dict[str, str]],
):
"""Train until an evaluation stops improving. Works as a generator,
with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
@ -414,7 +401,9 @@ def subdivide_batch(batch, accumulate_gradient):
yield subbatch
def setup_printer(training, nlp):
def setup_printer(
training: Union[Dict[str, Any], Config], nlp: Language
) -> Callable[[Dict[str, Any]], None]:
score_cols = training["scores"]
score_widths = [max(len(col), 6) for col in score_cols]
loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
@ -423,11 +412,10 @@ def setup_printer(training, nlp):
table_header = [col.upper() for col in table_header]
table_widths = [3, 6] + loss_widths + score_widths + [6]
table_aligns = ["r" for _ in table_widths]
msg.row(table_header, widths=table_widths)
msg.row(["-" * width for width in table_widths])
def print_row(info):
def print_row(info: Dict[str, Any]) -> None:
try:
losses = [
"{0:.2f}".format(float(info["losses"][pipe_name]))
@ -463,7 +451,9 @@ def setup_printer(training, nlp):
return print_row
def update_meta(training, nlp, info):
def update_meta(
training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
) -> None:
score_cols = training["scores"]
nlp.meta["performance"] = {}
for metric in score_cols:
@ -472,7 +462,9 @@ def update_meta(training, nlp, info):
nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
def load_from_paths(config):
def load_from_paths(
config: Config,
) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
# TODO: separate checks from loading
raw_text = util.ensure_path(config["training"]["raw_text"])
if raw_text is not None:
@ -506,7 +498,7 @@ def verify_cli_args(
dev_path: Path,
config_path: Path,
output_path: Optional[Path] = None,
):
) -> None:
# Make sure all files and paths exists if they are needed
if not config_path or not config_path.exists():
msg.fail("Config file not found", config_path, exits=1)
@ -528,12 +520,23 @@ def verify_cli_args(
)
def verify_textcat_config(nlp, nlp_config):
def verify_config(nlp: Language) -> None:
"""Perform additional checks based on the config and loaded nlp object."""
# TODO: maybe we should validate based on the actual components, the list
# in config["nlp"]["pipeline"] instead?
for pipe_config in nlp.config["components"].values():
# We can't assume that the component name == the factory
factory = pipe_config["@factories"]
if factory == "textcat":
verify_textcat_config(nlp, pipe_config)
def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
# if 'positive_label' is provided: double check whether it's in the data and
# the task is binary
if nlp_config["pipeline"]["textcat"].get("positive_label", None):
if pipe_config.get("positive_label"):
textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
pos_label = nlp_config["pipeline"]["textcat"]["positive_label"]
pos_label = pipe_config.get("positive_label")
if pos_label not in textcat_labels:
msg.fail(
f"The textcat's 'positive_label' config setting '{pos_label}' "

spacy/default_config.cfg (new file, 102 lines)
View File

@ -0,0 +1,102 @@
[nlp]
lang = null
stop_words = []
lex_attr_getters = {}
pipeline = []
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.writing_system]
direction = "ltr"
has_case = true
has_letters = true
[components]
# Training hyper-parameters and additional features.
[training]
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
gold_preproc = false
# Limitations on training document length or number of examples.
max_length = 5000
limit = 0
# Data augmentation
orth_variant_level = 0.0
dropout = 0.1
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
eval_batch_size = 128
# Other settings
seed = 0
accumulate_gradient = 1
use_pytorch_for_gpu_memory = false
# Control how scores are printed and checkpoints are evaluated.
scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
score_weights = {"tags_acc": 0.2, "las": 0.4, "ents_f": 0.4}
# These settings are invalid for the transformer models.
init_tok2vec = null
discard_oversize = false
omit_extra_lookups = false
batch_by = "sequences"
raw_text = null
tag_map = null
morph_rules = null
base_model = null
vectors = null
[training.batch_size]
@schedules = "compounding.v1"
start = 1000
stop = 1000
compound = 1.001
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 1e-8
[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 0.001
[pretraining]
max_epochs = 1000
min_length = 5
max_length = 500
dropout = 0.2
n_save_every = null
batch_size = 3000
seed = ${training:seed}
use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory}
tok2vec_model = "components.tok2vec.model"
[pretraining.objective]
type = "characters"
n_characters = 4
[pretraining.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001
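
A small sketch of how the ${training:seed} style references above behave, assuming interpolation is applied when the config is read (as with Thinc's Config): values defined once under [training] are reused in [pretraining].

from thinc.api import Config

cfg = Config().from_str("""
[training]
seed = 0

[pretraining]
seed = ${training:seed}
""")
print(cfg["pretraining"]["seed"])  # 0, copied from [training]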

View File

@ -124,20 +124,24 @@ class Warnings:
@add_codes
class Errors:
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
E002 = ("Can't find factory for '{name}'. This usually happens when spaCy "
"calls `nlp.create_pipe` with a component name that's not built "
"in - for example, when constructing the pipeline from a model's "
"meta.json. If you're using a custom component, you can write to "
"`Language.factories['{name}']` or remove it from the model meta "
"and add it via `nlp.add_pipe` instead.")
E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
"This usually happens when spaCy calls nlp.{method} with custom "
"component name that's not registered on the current language class. "
"If you're using a custom component, make sure you've added the "
"decorator @Language.component (for function components) or "
"@Language.factory (for class components).\n\nAvailable "
"factories: {opts}")
E003 = ("Not a valid pipeline component. Expected callable, but "
"got {component} (name: '{name}').")
E004 = ("If you meant to add a built-in component, use `create_pipe`: "
"`nlp.add_pipe(nlp.create_pipe('{component}'))`")
"got {component} (name: '{name}'). If you're using a custom "
"component factory, double-check that it correctly returns your "
"initialized component.")
E004 = ("Can't set up pipeline component: a factory for '{name}' already exists.")
E005 = ("Pipeline component '{name}' returned None. If you're using a "
"custom component, maybe you forgot to return the processed Doc?")
E006 = ("Invalid constraints. You can only set one of the following: "
"before, after, first, last.")
E006 = ("Invalid constraints for adding pipeline component. You can only "
"set one of the following: before (component name or index), "
"after (component name or index), first (True) or last (True). "
"Invalid configuration: {args}. Existing components: {opts}")
E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
E008 = ("Some current components would be lost when restoring previous "
"pipeline state. If you added components after calling "
@ -184,7 +188,7 @@ class Errors:
"the documentation:\nhttps://spacy.io/usage/models")
E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
"component to the pipeline with: "
"nlp.add_pipe(nlp.create_pipe('sentencizer')). "
"nlp.add_pipe('sentencizer'). "
"Alternatively, add the dependency parser, or set sentence "
"boundaries by setting doc[i].is_sent_start.")
E031 = ("Invalid token: empty string ('') at position {i}.")
@ -365,8 +369,6 @@ class Errors:
E133 = ("The sum of prior probabilities for alias '{alias}' should not "
"exceed 1, but found {sum}.")
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
E135 = ("If you meant to replace a built-in component, use `create_pipe`: "
"`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`")
E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure "
"to provide a valid JSON object as input with either the `text` "
"or `tokens` key. For more info, see the docs:\n"
@ -484,6 +486,62 @@ class Errors:
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
# TODO: fix numbering after merging develop into master
E956 = ("Can't find component '{name}' in [components] block in the config. "
"Available components: {opts}")
E957 = ("Writing directly to Language.factories isn't needed anymore in "
"spaCy v3. Instead, you can use the @Language.factory decorator "
"to register your custom component factory or @Language.component "
"to register a simple stateless function component that just takes "
"a Doc and returns it.")
E958 = ("Language code defined in config ({bad_lang_code}) does not match "
"language code of current Language subclass {lang} ({lang_code})")
E959 = ("Can't insert component {dir} index {idx}. Existing components: {opts}")
E960 = ("No config data found for component '{name}'. This is likely a bug "
"in spaCy.")
E961 = ("Found non-serializable Python object in config. Configs should "
"only include values that can be serialized to JSON. If you need "
"to pass models or other objects to your component, use a reference "
"to a registered function or initialize the object in your "
"component.\n\n{config}")
E962 = ("Received incorrect {style} for pipe '{name}'. Expected dict, "
"got: {cfg_type}.")
E963 = ("Can't read component info from @Language.{decorator} decorator. "
"Maybe you forgot to call it? Make sure you're using "
"@Language.{decorator}() instead of @Language.{decorator}.")
E964 = ("The pipeline component factory for '{name}' needs to have the "
"following named arguments, which are passed in by spaCy:\n- nlp: "
"receives the current nlp object and lets you access the vocab\n- "
"name: the name of the component instance, can be used to identify "
"the component, output losses etc.")
E965 = ("It looks like you're using the @Language.component decorator to "
"register '{name}' on a class instead of a function component. If "
"you need to register a class or function that *returns* a component "
"function, use the @Language.factory decorator instead.")
E966 = ("nlp.add_pipe now takes the string name of the registered component "
"factory, not a callable component. Expected string, but got "
"{component} (name: '{name}').\n\n- If you created your component "
"with nlp.create_pipe('name'): remove nlp.create_pipe and call "
"nlp.add_pipe('name') instead.\n\n- If you passed in a component "
"like TextCategorizer(): call nlp.add_pipe with the string name "
"instead, e.g. nlp.add_pipe('textcat').\n\n- If you're using a custom "
"component: Add the decorator @Language.component (for function "
"components) or @Language.factory (for class components / factories) "
"to your custom component and assign it a name, e.g. "
"@Language.component('your_name'). You can then run "
"nlp.add_pipe('your_name') to add it to the pipeline.")
E967 = ("No {meta} meta information found for '{name}'. This is likely a bug in spaCy.")
E968 = ("nlp.replace_pipe now takes the string name of the registered component "
"factory, not a callable component. Expected string, but got "
"{component}.\n\n- If you created your component with"
"with nlp.create_pipe('name'): remove nlp.create_pipe and call "
"nlp.replace_pipe('{name}', 'name') instead.\n\n- If you passed in a "
"component like TextCategorizer(): call nlp.replace_pipe with the "
"string name instead, e.g. nlp.replace_pipe('{name}', 'textcat').\n\n"
"- If you're using a custom component: Add the decorator "
"@Language.component (for function components) or @Language.factory "
"(for class components / factories) to your custom component and "
"assign it a name, e.g. @Language.component('your_name'). You can "
"then run nlp.replace_pipe('{name}', 'your_name').")
E969 = ("Expected string values for field '{field}', but received {types} instead. ")
E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
@ -506,10 +564,12 @@ class Errors:
"into {values}, but found {value}.")
E983 = ("Invalid key for '{dict}': {key}. Available keys: "
"{keys}")
E985 = ("The pipeline component '{component}' is already available in the base "
"model. The settings in the component block in the config file are "
"being ignored. If you want to replace this component instead, set "
"'replace' to True in the training configuration.")
E984 = ("Invalid component config for '{name}': no @factories key "
"specifying the registered function used to initialize the "
"component. For example, @factories = \"ner\" will use the 'ner' "
"factory and all other settings in the block will be passed "
"to it as arguments.\n\n{config}")
E985 = ("Can't load model from config file: no 'nlp' section found.\n\n{config}")
E986 = ("Could not create any training batches: check your input. "
"Perhaps discard_oversize should be set to False ?")
E987 = ("The text of an example training instance is either a Doc or "
@ -530,9 +590,9 @@ class Errors:
E992 = ("The function `select_pipes` was called with `enable`={enable} "
"and `disable`={disable} but that information is conflicting "
"for the `nlp` pipeline with components {names}.")
E993 = ("The config for 'nlp' should include either a key 'name' to "
"refer to an existing model by name or path, or a key 'lang' "
"to create a new blank model.")
E993 = ("The config for 'nlp' needs to include a key 'lang' specifying "
"the code of the language to initialize it with (for example "
"'en' for English).\n\n{config}")
E996 = ("Could not parse {file}: {msg}")
E997 = ("Tokenizer special cases are not allowed to modify the text. "
"This would map '{chunk}' to '{orth}' given token attributes "
@ -540,9 +600,9 @@ class Errors:
E999 = ("Unable to merge the `Doc` objects because they do not all share "
"the same `Vocab`.")
E1000 = ("No pkuseg model available. Provide a pkuseg model when "
"initializing the pipeline: "
'`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`')
"initializing the pipeline:\n"
'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n'
'nlp = Chinese(config=cfg)')
@add_codes
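
A minimal sketch of the registration pattern the new error messages (E957, E964, E966) point to; the component names and logic are placeholders:

from spacy.language import Language
from spacy.tokens import Doc

@Language.component("my_component")
def my_component(doc: Doc) -> Doc:
    # stateless function component: takes a Doc and returns it
    return doc

@Language.factory("my_component_factory")
def create_my_component(nlp: Language, name: str):
    # factories receive the nlp object and the instance name (see E964)
    return my_component

# Components are then added by their registered string name, e.g.:
# nlp.add_pipe("my_component")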

View File

@ -1,10 +1,9 @@
import re
from .conll_ner2docs import n_sents_info
from ...gold import Example
from ...gold import iob_to_biluo, spans_from_biluo_tags
from ...language import Language
from ...tokens import Doc, Token, Span
from ...vocab import Vocab
from wasabi import Printer
@ -73,7 +72,7 @@ def read_conllx(
ner_map=None,
):
""" Yield docs, one for each sentence """
vocab = Language.Defaults.create_vocab() # need vocab to make a minimal Doc
vocab = Vocab() # need vocab to make a minimal Doc
for sent in input_data.strip().split("\n\n"):
lines = sent.strip().split("\n")
if lines:

View File

@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry
class AfrikaansDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "af"
stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "af"
stop_words = {"@language_data": "spacy.af.stop_words"}
"""
@registry.language_data("spacy.af.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Afrikaans(Language):
lang = "af"
Defaults = AfrikaansDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Afrikaans"]
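
The {"@language_data": "spacy.af.stop_words"} reference in DEFAULT_CONFIG resolves to the function registered above. A rough sketch of that lookup, assuming the reference is resolved through the catalogue registry's get:

from spacy.util import registry
from spacy.lang.af import Afrikaans  # importing the module registers "spacy.af.stop_words"

stop_words = registry.language_data.get("spacy.af.stop_words")()
print(len(stop_words))  # number of Afrikaans stop words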

View File

@ -1,31 +1,48 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "ar"
stop_words = {"@language_data": "spacy.ar.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ar.lex_attr_getters"}
[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
"""
@registry.language_data("spacy.ar.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ar.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class ArabicDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "ar"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
suffixes = TOKENIZER_SUFFIXES
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
class Arabic(Language):
lang = "ar"
Defaults = ArabicDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Arabic"]

View File

@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry
class BulgarianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "bg"
stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "bg"
stop_words = {"@language_data": "spacy.bg.stop_words"}
"""
@registry.language_data("spacy.bg.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Bulgarian(Language):
lang = "bg"
Defaults = BulgarianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Bulgarian"]

View File

@ -1,18 +1,35 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "bn"
stop_words = {"@language_data": "spacy.bn.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.bn.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class BengaliDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "bn"
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
@ -21,6 +38,7 @@ class BengaliDefaults(Language.Defaults):
class Bengali(Language):
lang = "bn"
Defaults = BengaliDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Bengali"]

View File

@ -1,31 +1,49 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry
from .punctuation import TOKENIZER_INFIXES
DEFAULT_CONFIG = """
[nlp]
lang = "ca"
stop_words = {"@language_data": "spacy.ca.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ca.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.ca.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ca.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class CatalanDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "ca"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
infixes = TOKENIZER_INFIXES
class Catalan(Language):
lang = "ca"
Defaults = CatalanDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Catalan"]

View File

@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry
class CzechDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "cs"
stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "cs"
stop_words = {"@language_data": "spacy.cs.stop_words"}
"""
@registry.language_data("spacy.cs.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Czech(Language):
lang = "cs"
Defaults = CzechDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Czech"]

View File

@ -1,27 +1,50 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "da"
stop_words = {"@language_data": "spacy.da.stop_words"}
lex_attr_getters = {"@language_data": "spacy.da.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.da.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.da.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class DanishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "da"
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS
class Danish(Language):
lang = "da"
Defaults = DanishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Danish"]

View File

@ -1,23 +1,40 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "de"
stop_words = {"@language_data": "spacy.de.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.de.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class GermanDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "de"
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
single_orth_variants = [
{"tags": ["$("], "variants": ["", "..."]},
@ -38,6 +55,7 @@ class GermanDefaults(Language.Defaults):
class German(Language):
lang = "de"
Defaults = GermanDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["German"]

View File

@ -1,3 +1,6 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
@ -6,32 +9,51 @@ from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "el"
stop_words = {"@language_data": "spacy.el.stop_words"}
lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.GreekLemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.lemmatizers("spacy.GreekLemmatizer.v1")
def create_greek_lemmatizer(data_paths: dict = {}) -> GreekLemmatizer:
return GreekLemmatizer(data_paths=data_paths)
@registry.language_data("spacy.el.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.el.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class GreekDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "el"
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
syntax_iterators = SYNTAX_ITERATORS
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
if lookups is None:
lookups = Lookups()
return GreekLemmatizer(lookups)
class Greek(Language):
lang = "el"
Defaults = GreekDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Greek"]

View File

@ -1,3 +1,5 @@
from typing import Dict, List
from ...lemmatizer import Lemmatizer
@ -12,7 +14,13 @@ class GreekLemmatizer(Lemmatizer):
not applicable for Greek language.
"""
def lemmatize(self, string, index, exceptions, rules):
def lemmatize(
self,
string: str,
index: Dict[str, List[str]],
exceptions: Dict[str, Dict[str, List[str]]],
rules: Dict[str, List[List[str]]],
) -> List[str]:
string = string.lower()
forms = []
if string in index:

View File

@ -1,25 +1,50 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .lemmatizer import is_base_form
from .punctuation import TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...lemmatizer import Lemmatizer
from ...util import update_exc, registry
def _return_en(_):
return "en"
DEFAULT_CONFIG = """
[nlp]
lang = "en"
stop_words = {"@language_data": "spacy.en.stop_words"}
lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.EnglishLemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.en.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.en.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.lemmatizers("spacy.EnglishLemmatizer.v1")
def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
return Lemmatizer(data_paths=data_paths, is_base_form=is_base_form)
class EnglishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = _return_en
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
infixes = TOKENIZER_INFIXES
single_orth_variants = [
@ -31,45 +56,11 @@ class EnglishDefaults(Language.Defaults):
{"tags": ["``", "''"], "variants": [('"', '"'), ("", "")]},
]
@classmethod
def is_base_form(cls, univ_pos, morphology=None):
"""
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
univ_pos (unicode / int): The token's universal part-of-speech tag.
morphology (dict): The token's morphological features following the
Universal Dependencies scheme.
"""
if morphology is None:
morphology = {}
if univ_pos == "noun" and morphology.get("Number") == "sing":
return True
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
return True
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology
elif univ_pos == "verb" and (
morphology.get("VerbForm") == "fin"
and morphology.get("Tense") == "pres"
and morphology.get("Number") is None
):
return True
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
return True
elif morphology.get("VerbForm") == "inf":
return True
elif morphology.get("VerbForm") == "none":
return True
elif morphology.get("Degree") == "pos":
return True
else:
return False
class English(Language):
lang = "en"
Defaults = EnglishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["English"]

View File

@ -0,0 +1,36 @@
from typing import Optional
def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
"""
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
univ_pos (unicode / int): The token's universal part-of-speech tag.
morphology (dict): The token's morphological features following the
Universal Dependencies scheme.
"""
if morphology is None:
morphology = {}
if univ_pos == "noun" and morphology.get("Number") == "sing":
return True
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
return True
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology
elif univ_pos == "verb" and (
morphology.get("VerbForm") == "fin"
and morphology.get("Tense") == "pres"
and morphology.get("Number") is None
):
return True
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
return True
elif morphology.get("VerbForm") == "inf":
return True
elif morphology.get("VerbForm") == "none":
return True
elif morphology.get("Degree") == "pos":
return True
else:
return False
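A few illustrative calls to the standalone is_base_form helper above (added for clarity, not taken from the diff):

# Singular nouns and infinitive or simple present verb forms are treated as
# base forms, so the lemmatizer can skip them entirely.
assert is_base_form("noun", {"Number": "sing"}) is True
assert is_base_form("verb", {"VerbForm": "fin", "Tense": "pres"}) is True
assert is_base_form("noun", {"Number": "plur"}) is False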

View File

@ -1,47 +1,17 @@
from ...attrs import LIKE_NUM
# fmt: off
_num_words = [
"zero",
"one",
"two",
"three",
"four",
"five",
"six",
"seven",
"eight",
"nine",
"ten",
"eleven",
"twelve",
"thirteen",
"fourteen",
"fifteen",
"sixteen",
"seventeen",
"eighteen",
"nineteen",
"twenty",
"thirty",
"forty",
"fifty",
"sixty",
"seventy",
"eighty",
"ninety",
"hundred",
"thousand",
"million",
"billion",
"trillion",
"quadrillion",
"gajillion",
"bazillion",
"zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
"nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
"sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty",
"fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand",
"million", "billion", "trillion", "quadrillion", "gajillion", "bazillion"
]
# fmt: on
def like_num(text):
def like_num(text: str) -> bool:
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")

View File

@ -1,33 +1,52 @@
from typing import Set, Dict, Callable, Any
from thinc.config import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "es"
stop_words = {"@language_data": "spacy.es.stop_words"}
lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.es.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.es.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class SpanishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "es"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
class Spanish(Language):
lang = "es"
Defaults = SpanishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Spanish"]

View File

@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry
class EstonianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "et"
stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "et"
stop_words = {"@language_data": "spacy.et.stop_words"}
"""
@registry.language_data("spacy.et.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Estonian(Language):
lang = "et"
Defaults = EstonianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Estonian"]

View File

@ -1,25 +1,41 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "eu"
stop_words = {"@language_data": "spacy.eu.stop_words"}
lex_attr_getters = {"@language_data": "spacy.eu.lex_attr_getters"}
"""
@registry.language_data("spacy.eu.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.eu.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class BasqueDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "eu"
tokenizer_exceptions = BASE_EXCEPTIONS
stop_words = STOP_WORDS
suffixes = TOKENIZER_SUFFIXES
class Basque(Language):
lang = "eu"
Defaults = BasqueDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Basque"]

View File

@ -1,7 +1,8 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ..norm_exceptions import BASE_NORMS
from ...util import update_exc, registry
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@ -9,23 +10,46 @@ from .punctuation import TOKENIZER_SUFFIXES
from .syntax_iterators import SYNTAX_ITERATORS
DEFAULT_CONFIG = """
[nlp]
lang = "fa"
stop_words = {"@language_data": "spacy.fa.stop_words"}
lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"}
[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.fa.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.fa.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class PersianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
lex_attr_getters[LANG] = lambda text: "fa"
tokenizer_exceptions = update_exc(TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
suffixes = TOKENIZER_SUFFIXES
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
syntax_iterators = SYNTAX_ITERATORS
class Persian(Language):
lang = "fa"
Defaults = PersianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Persian"]

View File

@ -1,31 +1,43 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "fi"
stop_words = {"@language_data": "spacy.fi.stop_words"}
lex_attr_getters = {"@language_data": "spacy.fi.lex_attr_getters"}
"""
@registry.language_data("spacy.fi.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.fi.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class FinnishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "fi"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
class Finnish(Language):
lang = "fi"
Defaults = FinnishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Finnish"]

View File

@ -1,44 +1,61 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import FrenchLemmatizer
from .lemmatizer import FrenchLemmatizer, is_base_form
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "fr"
stop_words = {"@language_data": "spacy.fr.stop_words"}
lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.FrenchLemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.lemmatizers("spacy.FrenchLemmatizer.v1")
def create_french_lemmatizer(data_paths: dict = {}) -> FrenchLemmatizer:
return FrenchLemmatizer(data_paths=data_paths, is_base_form=is_base_form)
@registry.language_data("spacy.fr.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.fr.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class FrenchDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "fr"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
token_match = TOKEN_MATCH
syntax_iterators = SYNTAX_ITERATORS
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
if lookups is None:
lookups = Lookups()
return FrenchLemmatizer(lookups)
class French(Language):
lang = "fr"
Defaults = FrenchDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["French"]

View File

@ -1,3 +1,5 @@
from typing import Optional, List, Dict
from ...lemmatizer import Lemmatizer
from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
from ...symbols import SCONJ, CCONJ
@ -13,7 +15,9 @@ class FrenchLemmatizer(Lemmatizer):
the lookup table.
"""
def __call__(self, string, univ_pos, morphology=None):
def __call__(
self, string: str, univ_pos: str, morphology: Optional[dict] = None
) -> List[str]:
lookup_table = self.lookups.get_table("lemma_lookup", {})
if "lemma_rules" not in self.lookups:
return [lookup_table.get(string, string)]
@ -52,62 +56,19 @@ class FrenchLemmatizer(Lemmatizer):
)
return lemmas
def is_base_form(self, univ_pos, morphology=None):
"""
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
"""
morphology = {} if morphology is None else morphology
others = [
key
for key in morphology
if key not in (POS, "Number", "POS", "VerbForm", "Tense")
]
if univ_pos == "noun" and morphology.get("Number") == "sing":
return True
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
return True
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology
elif univ_pos == "verb" and (
morphology.get("VerbForm") == "fin"
and morphology.get("Tense") == "pres"
and morphology.get("Number") is None
and not others
):
return True
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
return True
elif "VerbForm=inf" in morphology:
return True
elif "VerbForm=none" in morphology:
return True
elif "Number=sing" in morphology:
return True
elif "Degree=pos" in morphology:
return True
else:
return False
def noun(self, string, morphology=None):
return self(string, "noun", morphology)
def verb(self, string, morphology=None):
return self(string, "verb", morphology)
def adj(self, string, morphology=None):
return self(string, "adj", morphology)
def punct(self, string, morphology=None):
return self(string, "punct", morphology)
def lookup(self, string, orth=None):
def lookup(self, string: str, orth: Optional[int] = None) -> str:
lookup_table = self.lookups.get_table("lemma_lookup", {})
if orth is not None and orth in lookup_table:
return lookup_table[orth][0]
return string
def lemmatize(self, string, index, exceptions, rules):
def lemmatize(
self,
string: str,
index: Dict[str, List[str]],
exceptions: Dict[str, Dict[str, List[str]]],
rules: Dict[str, List[List[str]]],
) -> List[str]:
lookup_table = self.lookups.get_table("lemma_lookup", {})
string = string.lower()
forms = []
@ -133,3 +94,41 @@ class FrenchLemmatizer(Lemmatizer):
if not forms:
forms.append(string)
return list(set(forms))
def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
"""
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
"""
morphology = {} if morphology is None else morphology
others = [
key
for key in morphology
if key not in (POS, "Number", "POS", "VerbForm", "Tense")
]
if univ_pos == "noun" and morphology.get("Number") == "sing":
return True
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
return True
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology
elif univ_pos == "verb" and (
morphology.get("VerbForm") == "fin"
and morphology.get("Tense") == "pres"
and morphology.get("Number") is None
and not others
):
return True
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
return True
elif "VerbForm=inf" in morphology:
return True
elif "VerbForm=none" in morphology:
return True
elif "Number=sing" in morphology:
return True
elif "Degree=pos" in morphology:
return True
else:
return False

View File

@ -1,23 +1,33 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "ga"
stop_words = {"@language_data": "spacy.ga.stop_words"}
"""
@registry.language_data("spacy.ga.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class IrishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "ga"
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Irish(Language):
lang = "ga"
Defaults = IrishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Irish"]

View File

@ -1,15 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry
class GujaratiDefaults(Language.Defaults):
stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "gu"
stop_words = {"@language_data": "spacy.gu.stop_words"}
"""
@registry.language_data("spacy.gu.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Gujarati(Language):
lang = "gu"
Defaults = GujaratiDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Gujarati"]

View File

@ -1,22 +1,37 @@
from .stop_words import STOP_WORDS
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "he"
stop_words = {"@language_data": "spacy.he.stop_words"}
[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
"""
@registry.language_data("spacy.he.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class HebrewDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "he"
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = STOP_WORDS
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
class Hebrew(Language):
lang = "he"
Defaults = HebrewDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Hebrew"]

View File

@ -1,20 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...attrs import LANG
from ...util import registry
class HindiDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "hi"
stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "hi"
stop_words = {"@language_data": "spacy.hi.stop_words"}
lex_attr_getters = {"@language_data": "spacy.hi.lex_attr_getters"}
"""
@registry.language_data("spacy.hi.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.hi.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Hindi(Language):
lang = "hi"
Defaults = HindiDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Hindi"]

View File

@ -1,25 +1,39 @@
from .stop_words import STOP_WORDS
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "hr"
stop_words = {"@language_data": "spacy.hr.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.hr.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class CroatianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "hr"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = STOP_WORDS
class Croatian(Language):
lang = "hr"
Defaults = CroatianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Croatian"]

View File

@ -1,22 +1,35 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "hu"
stop_words = {"@language_data": "spacy.hu.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.hu.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class HungarianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "hu"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
@ -26,6 +39,7 @@ class HungarianDefaults(Language.Defaults):
class Hungarian(Language):
lang = "hu"
Defaults = HungarianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Hungarian"]

View File

@ -1,21 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...attrs import LANG
from ...language import Language
from ...util import registry
class ArmenianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "hy"
DEFAULT_CONFIG = """
[nlp]
lang = "hy"
stop_words = {"@language_data": "spacy.hy.stop_words"}
lex_attr_getters = {"@language_data": "spacy.hy.lex_attr_getters"}
"""
lex_attr_getters.update(LEX_ATTRS)
stop_words = STOP_WORDS
@registry.language_data("spacy.hy.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.hy.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Armenian(Language):
lang = "hy"
Defaults = ArmenianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Armenian"]

View File

@ -1,21 +1,43 @@
from typing import Set, Dict, Callable, Any
from thinc.config import Config
from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "id"
stop_words = {"@language_data": "spacy.id.stop_words"}
lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.id.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.id.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class IndonesianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "id"
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
@ -25,6 +47,7 @@ class IndonesianDefaults(Language.Defaults):
class Indonesian(Language):
lang = "id"
Defaults = IndonesianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Indonesian"]

View File

@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry
class IcelandicDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "is"
stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "is"
stop_words = {"@language_data": "spacy.is.stop_words"}
"""
@registry.language_data("spacy.is.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Icelandic(Language):
lang = "is"
Defaults = IcelandicDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Icelandic"]

View File

@ -1,20 +1,34 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "it"
stop_words = {"@language_data": "spacy.it.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.it.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class ItalianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "it"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES
@ -24,6 +38,7 @@ class ItalianDefaults(Language.Defaults):
class Italian(Language):
lang = "it"
Defaults = ItalianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Italian"]

View File

@ -1,21 +1,187 @@
from typing import Optional, Union, Dict, Any, Set
from pathlib import Path
import srsly
from collections import namedtuple, OrderedDict
from collections import namedtuple
from thinc.api import Config
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tag_map import TAG_MAP
from .tag_orth_map import TAG_ORTH_MAP
from .tag_bigram_map import TAG_BIGRAM_MAP
from ...attrs import LANG
from ...compat import copy_reg
from ...errors import Errors
from ...language import Language
from ...symbols import POS
from ...tokens import Doc
from ...util import DummyTokenizer
from ...util import DummyTokenizer, registry
from ... import util
DEFAULT_CONFIG = """
[nlp]
lang = "ja"
stop_words = {"@language_data": "spacy.ja.stop_words"}
[nlp.tokenizer]
@tokenizers = "spacy.JapaneseTokenizer.v1"
split_mode = null
[nlp.writing_system]
direction = "ltr"
has_case = false
has_letters = false
"""
@registry.language_data("spacy.ja.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.tokenizers("spacy.JapaneseTokenizer.v1")
def create_japanese_tokenizer(split_mode: Optional[str] = None):
def japanese_tokenizer_factory(nlp):
return JapaneseTokenizer(nlp, split_mode=split_mode)
return japanese_tokenizer_factory
class JapaneseTokenizer(DummyTokenizer):
def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
self.vocab = nlp.vocab
self.split_mode = split_mode
self.tokenizer = try_sudachi_import(self.split_mode)
def __call__(self, text: str) -> Doc:
# convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
sudachipy_tokens = self.tokenizer.tokenize(text)
dtokens = self._get_dtokens(sudachipy_tokens)
dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
# create Doc with tag bi-gram based part-of-speech identification rules
words, tags, inflections, lemmas, readings, sub_tokens_list = (
zip(*dtokens) if dtokens else [[]] * 6
)
sub_tokens_list = list(sub_tokens_list)
doc = Doc(self.vocab, words=words, spaces=spaces)
next_pos = None # for bi-gram rules
for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
token.tag_ = dtoken.tag
if next_pos: # already identified in previous iteration
token.pos = next_pos
next_pos = None
else:
token.pos, next_pos = resolve_pos(
token.orth_,
dtoken.tag,
tags[idx + 1] if idx + 1 < len(tags) else None,
)
# if there's no lemma info (it's an unk) just use the surface
token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
doc.user_data["inflections"] = inflections
doc.user_data["reading_forms"] = readings
doc.user_data["sub_tokens"] = sub_tokens_list
return doc
def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True):
sub_tokens_list = (
self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
)
dtokens = [
DetailedToken(
token.surface(), # orth
"-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]), # tag
",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf
token.dictionary_form(), # lemma
token.reading_form(), # user_data['reading_forms']
sub_tokens_list[idx]
if sub_tokens_list
else None, # user_data['sub_tokens']
)
for idx, token in enumerate(sudachipy_tokens)
if len(token.surface()) > 0
# remove empty tokens which can be produced with characters like … that
]
# Sudachi normalizes internally and outputs each space char as a token.
# This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
return [
t
for idx, t in enumerate(dtokens)
if idx == 0
or not t.surface.isspace()
or t.tag != "空白"
or not dtokens[idx - 1].surface.isspace()
or dtokens[idx - 1].tag != "空白"
]
def _get_sub_tokens(self, sudachipy_tokens):
if (
self.split_mode is None or self.split_mode == "A"
): # do nothing for default split mode
return None
sub_tokens_list = [] # list of (list of list of DetailedToken | None)
for token in sudachipy_tokens:
sub_a = token.split(self.tokenizer.SplitMode.A)
if len(sub_a) == 1: # no sub tokens
sub_tokens_list.append(None)
elif self.split_mode == "B":
sub_tokens_list.append([self._get_dtokens(sub_a, False)])
else: # "C"
sub_b = token.split(self.tokenizer.SplitMode.B)
if len(sub_a) == len(sub_b):
dtokens = self._get_dtokens(sub_a, False)
sub_tokens_list.append([dtokens, dtokens])
else:
sub_tokens_list.append(
[
self._get_dtokens(sub_a, False),
self._get_dtokens(sub_b, False),
]
)
return sub_tokens_list
def _get_config(self) -> Dict[str, Any]:
return {"split_mode": self.split_mode}
def _set_config(self, config: Dict[str, Any] = {}) -> None:
self.split_mode = config.get("split_mode", None)
def to_bytes(self, **kwargs) -> bytes:
serializers = {"cfg": lambda: srsly.json_dumps(self._get_config())}
return util.to_bytes(serializers, [])
def from_bytes(self, data: bytes, **kwargs) -> "JapaneseTokenizer":
deserializers = {"cfg": lambda b: self._set_config(srsly.json_loads(b))}
util.from_bytes(data, deserializers, [])
self.tokenizer = try_sudachi_import(self.split_mode)
return self
def to_disk(self, path: Union[str, Path], **kwargs) -> None:
path = util.ensure_path(path)
serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
return util.to_disk(path, serializers, [])
def from_disk(self, path: Union[str, Path], **kwargs) -> "JapaneseTokenizer":
path = util.ensure_path(path)
serializers = {"cfg": lambda p: self._set_config(srsly.read_json(p))}
util.from_disk(path, serializers, [])
self.tokenizer = try_sudachi_import(self.split_mode)
return self
class JapaneseDefaults(Language.Defaults):
tag_map = TAG_MAP
syntax_iterators = SYNTAX_ITERATORS
class Japanese(Language):
lang = "ja"
Defaults = JapaneseDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
# Hold the attributes we need with convenient names
DetailedToken = namedtuple(
"DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]
@ -133,161 +299,6 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
return text_dtokens, text_spaces
class JapaneseTokenizer(DummyTokenizer):
def __init__(self, cls, nlp=None, config={}):
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
self.split_mode = config.get("split_mode", None)
self.tokenizer = try_sudachi_import(self.split_mode)
def __call__(self, text):
# convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
sudachipy_tokens = self.tokenizer.tokenize(text)
dtokens = self._get_dtokens(sudachipy_tokens)
dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
# create Doc with tag bi-gram based part-of-speech identification rules
words, tags, inflections, lemmas, readings, sub_tokens_list = (
zip(*dtokens) if dtokens else [[]] * 6
)
sub_tokens_list = list(sub_tokens_list)
doc = Doc(self.vocab, words=words, spaces=spaces)
next_pos = None # for bi-gram rules
for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
token.tag_ = dtoken.tag
if next_pos: # already identified in previous iteration
token.pos = next_pos
next_pos = None
else:
token.pos, next_pos = resolve_pos(
token.orth_,
dtoken.tag,
tags[idx + 1] if idx + 1 < len(tags) else None,
)
# if there's no lemma info (it's an unk) just use the surface
token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
doc.user_data["inflections"] = inflections
doc.user_data["reading_forms"] = readings
doc.user_data["sub_tokens"] = sub_tokens_list
return doc
def _get_dtokens(self, sudachipy_tokens, need_sub_tokens=True):
sub_tokens_list = (
self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
)
dtokens = [
DetailedToken(
token.surface(), # orth
"-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]), # tag
",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf
token.dictionary_form(), # lemma
token.reading_form(), # user_data['reading_forms']
sub_tokens_list[idx]
if sub_tokens_list
else None, # user_data['sub_tokens']
)
for idx, token in enumerate(sudachipy_tokens)
if len(token.surface()) > 0
# remove empty tokens which can be produced with characters like … that
]
# Sudachi normalizes internally and outputs each space char as a token.
# This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
return [
t
for idx, t in enumerate(dtokens)
if idx == 0
or not t.surface.isspace()
or t.tag != "空白"
or not dtokens[idx - 1].surface.isspace()
or dtokens[idx - 1].tag != "空白"
]
def _get_sub_tokens(self, sudachipy_tokens):
if (
self.split_mode is None or self.split_mode == "A"
): # do nothing for default split mode
return None
sub_tokens_list = [] # list of (list of list of DetailedToken | None)
for token in sudachipy_tokens:
sub_a = token.split(self.tokenizer.SplitMode.A)
if len(sub_a) == 1: # no sub tokens
sub_tokens_list.append(None)
elif self.split_mode == "B":
sub_tokens_list.append([self._get_dtokens(sub_a, False)])
else: # "C"
sub_b = token.split(self.tokenizer.SplitMode.B)
if len(sub_a) == len(sub_b):
dtokens = self._get_dtokens(sub_a, False)
sub_tokens_list.append([dtokens, dtokens])
else:
sub_tokens_list.append(
[
self._get_dtokens(sub_a, False),
self._get_dtokens(sub_b, False),
]
)
return sub_tokens_list
def _get_config(self):
config = OrderedDict((("split_mode", self.split_mode),))
return config
def _set_config(self, config={}):
self.split_mode = config.get("split_mode", None)
def to_bytes(self, **kwargs):
serializers = OrderedDict(
(("cfg", lambda: srsly.json_dumps(self._get_config())),)
)
return util.to_bytes(serializers, [])
def from_bytes(self, data, **kwargs):
deserializers = OrderedDict(
(("cfg", lambda b: self._set_config(srsly.json_loads(b))),)
)
util.from_bytes(data, deserializers, [])
self.tokenizer = try_sudachi_import(self.split_mode)
return self
def to_disk(self, path, **kwargs):
path = util.ensure_path(path)
serializers = OrderedDict(
(("cfg", lambda p: srsly.write_json(p, self._get_config())),)
)
return util.to_disk(path, serializers, [])
def from_disk(self, path, **kwargs):
path = util.ensure_path(path)
serializers = OrderedDict(
(("cfg", lambda p: self._set_config(srsly.read_json(p))),)
)
util.from_disk(path, serializers, [])
self.tokenizer = try_sudachi_import(self.split_mode)
class JapaneseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda _text: "ja"
stop_words = STOP_WORDS
tag_map = TAG_MAP
syntax_iterators = SYNTAX_ITERATORS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
@classmethod
def create_tokenizer(cls, nlp=None, config={}):
return JapaneseTokenizer(cls, nlp, config)
class Japanese(Language):
lang = "ja"
Defaults = JapaneseDefaults
def make_doc(self, text):
return self.tokenizer(text)
def pickle_japanese(instance):
return Japanese, tuple()
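The [nlp.tokenizer] block above works through a two-step factory: the function registered as spacy.JapaneseTokenizer.v1 receives the config arguments (here split_mode) and returns a factory that is only later called with the nlp object, which is how the tokenizer gets the shared vocab. A simplified sketch of that wiring, with the registry lookup written out by hand as an assumption rather than spaCy's actual loader:

from spacy.util import registry

# Step 1: resolve the registered function and apply the config arguments.
make_factory = registry.tokenizers.get("spacy.JapaneseTokenizer.v1")
tokenizer_factory = make_factory(split_mode="B")

# Step 2 is performed by Language once the nlp object exists, roughly:
# nlp.tokenizer = tokenizer_factory(nlp)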

View File

@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry
class KannadaDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "kn"
stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "kn"
stop_words = {"@language_data": "spacy.kn.stop_words"}
"""
@registry.language_data("spacy.kn.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Kannada(Language):
lang = "kn"
Defaults = KannadaDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Kannada"]

View File

@ -1,51 +1,52 @@
from typing import Set, Optional, Any, Dict
from thinc.api import Config
from .stop_words import STOP_WORDS
from .tag_map import TAG_MAP
from ...attrs import LANG
from ...language import Language
from ...tokens import Doc
from ...compat import copy_reg
from ...util import DummyTokenizer
from ...util import DummyTokenizer, registry
def try_mecab_import():
try:
from natto import MeCab
DEFAULT_CONFIG = """
[nlp]
lang = "ko"
stop_words = {"@language_data": "spacy.ko.stop_words"}
return MeCab
except ImportError:
raise ImportError(
"Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
"[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
"and [natto-py](https://github.com/buruzaemon/natto-py)"
)
[nlp.tokenizer]
@tokenizers = "spacy.KoreanTokenizer.v1"
[nlp.writing_system]
direction = "ltr"
has_case = false
has_letters = false
"""
# fmt: on
@registry.language_data("spacy.ko.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
def check_spaces(text, tokens):
prev_end = -1
start = 0
for token in tokens:
idx = text.find(token, start)
if prev_end > 0:
yield prev_end != idx
prev_end = idx + len(token)
start = prev_end
if start > 0:
yield False
@registry.tokenizers("spacy.KoreanTokenizer.v1")
def create_korean_tokenizer():
def korean_tokenizer_factory(nlp):
return KoreanTokenizer(nlp)
return korean_tokenizer_factory
class KoreanTokenizer(DummyTokenizer):
def __init__(self, cls, nlp=None):
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
def __init__(self, nlp: Optional[Language] = None):
self.vocab = nlp.vocab
MeCab = try_mecab_import()
self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
def __del__(self):
self.mecab_tokenizer.__del__()
def __call__(self, text):
def __call__(self, text: str) -> Doc:
dtokens = list(self.detailed_tokens(text))
surfaces = [dt["surface"] for dt in dtokens]
doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
@ -56,7 +57,7 @@ class KoreanTokenizer(DummyTokenizer):
doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
return doc
def detailed_tokens(self, text):
def detailed_tokens(self, text: str) -> Dict[str, Any]:
# 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
# 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
for node in self.mecab_tokenizer.parse(text, as_nodes=True):
@ -72,23 +73,39 @@ class KoreanTokenizer(DummyTokenizer):
class KoreanDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda _text: "ko"
stop_words = STOP_WORDS
tag_map = TAG_MAP
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
@classmethod
def create_tokenizer(cls, nlp=None):
return KoreanTokenizer(cls, nlp)
class Korean(Language):
lang = "ko"
Defaults = KoreanDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
def make_doc(self, text):
return self.tokenizer(text)
def try_mecab_import():
try:
from natto import MeCab
return MeCab
except ImportError:
raise ImportError(
"Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
"[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
"and [natto-py](https://github.com/buruzaemon/natto-py)"
)
def check_spaces(text, tokens):
prev_end = -1
start = 0
for token in tokens:
idx = text.find(token, start)
if prev_end > 0:
yield prev_end != idx
prev_end = idx + len(token)
start = prev_end
if start > 0:
yield False
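For reference, check_spaces walks the raw text and yields, for each token, whether a gap (normally a space) separates it from the next token, ending with False for the final token; Doc consumes these values as its spaces flags. A tiny illustrative run (not part of the diff):

# Two tokens separated by one space: the first is followed by a space, the
# last token gets the conventional trailing False.
assert list(check_spaces("hello world", ["hello", "world"])) == [True, False]
assert list(check_spaces("hello", ["hello"])) == [False]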
def pickle_korean(instance):

View File

@ -1,26 +1,49 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "lb"
stop_words = {"@language_data": "spacy.lb.stop_words"}
lex_attr_getters = {"@language_data": "spacy.lb.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.lb.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.lb.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class LuxembourgishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "lb"
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
infixes = TOKENIZER_INFIXES
class Luxembourgish(Language):
lang = "lb"
Defaults = LuxembourgishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Luxembourgish"]

View File

@ -1,3 +1,4 @@
from typing import Set
import unicodedata
import re
@ -21,21 +22,21 @@ _tlds = set(
)
def is_punct(text):
def is_punct(text: str) -> bool:
for char in text:
if not unicodedata.category(char).startswith("P"):
return False
return True
def is_ascii(text):
def is_ascii(text: str) -> bool:
for char in text:
if ord(char) >= 128:
return False
return True
def like_num(text):
def like_num(text: str) -> bool:
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
# can be overwritten by lang with list of number words
@ -49,64 +50,31 @@ def like_num(text):
return False
def is_bracket(text):
def is_bracket(text: str) -> bool:
brackets = ("(", ")", "[", "]", "{", "}", "<", ">")
return text in brackets
def is_quote(text):
quotes = (
'"',
"'",
"`",
"«",
"»",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"''",
"``",
)
def is_quote(text: str) -> bool:
# fmt: off
quotes = ('"', "'", "`", "«", "»", "", "", "", "", "", "", "", "", "", "", "", "", "''", "``")
# fmt: on
return text in quotes
def is_left_punct(text):
left_punct = (
"(",
"[",
"{",
"<",
'"',
"'",
"«",
"",
"",
"",
"",
"",
"",
"",
"",
"``",
)
def is_left_punct(text: str) -> bool:
# fmt: off
left_punct = ("(", "[", "{", "<", '"', "'", "«", "", "", "", "", "", "", "", "", "``")
# fmt: on
return text in left_punct
def is_right_punct(text):
def is_right_punct(text: str) -> bool:
right_punct = (")", "]", "}", ">", '"', "'", "»", "", "", "", "", "''")
return text in right_punct
def is_currency(text):
def is_currency(text: str) -> bool:
# can be overwritten by lang with list of currency words, e.g. dollar, euro
for char in text:
if unicodedata.category(char) != "Sc":
@ -114,11 +82,11 @@ def is_currency(text):
return True
def like_email(text):
def like_email(text: str) -> bool:
return bool(_like_email(text))
def like_url(text):
def like_url(text: str) -> bool:
# We're looking for things that function in text like URLs. So, valid URL
# or not, anything they say http:// is going to be good.
if text.startswith("http://") or text.startswith("https://"):
@ -144,7 +112,7 @@ def like_url(text):
return False
def word_shape(text):
def word_shape(text: str) -> str:
if len(text) >= 100:
return "LONG"
shape = []
@ -171,46 +139,52 @@ def word_shape(text):
return "".join(shape)
def lower(string):
def lower(string: str) -> str:
return string.lower()
def prefix(string):
def prefix(string: str) -> str:
return string[0]
def suffix(string):
def suffix(string: str) -> str:
return string[-3:]
def is_alpha(string):
def is_alpha(string: str) -> bool:
return string.isalpha()
def is_digit(string):
def is_digit(string: str) -> bool:
return string.isdigit()
def is_lower(string):
def is_lower(string: str) -> bool:
return string.islower()
def is_space(string):
def is_space(string: str) -> bool:
return string.isspace()
def is_title(string):
def is_title(string: str) -> bool:
return string.istitle()
def is_upper(string):
def is_upper(string: str) -> bool:
return string.isupper()
def is_stop(string, stops=set()):
def is_stop(string: str, stops: Set[str] = set()) -> bool:
return string.lower() in stops
def get_lang(text: str, lang: str = "") -> str:
# This function is partially applied so lang code can be passed in
# automatically while still allowing pickling
return lang
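The comment on get_lang above refers to replacing the per-language lambdas (lambda text: "xx") with a module-level function whose language code is bound via functools.partial, which keeps the resulting attribute getter picklable. A minimal sketch of that pattern (an assumption about how the helper is meant to be used, not code from the diff):

import pickle
from functools import partial

# Binding the code up front gives a picklable callable; a partial of a
# module-level function serializes fine, where an inline lambda would not.
get_da_lang = partial(get_lang, lang="da")
assert get_da_lang("any text") == "da"
pickle.dumps(get_da_lang)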
LEX_ATTRS = {
attrs.LOWER: lower,
attrs.NORM: lower,

View File

@ -1,28 +1,35 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "lij"
stop_words = {"@language_data": "spacy.lij.stop_words"}
"""
@registry.language_data("spacy.lij.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class LigurianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "lij"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
infixes = TOKENIZER_INFIXES
class Ligurian(Language):
lang = "lij"
Defaults = LigurianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Ligurian"]

View File

@ -1,27 +1,41 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry
def _return_lt(_):
return "lt"
DEFAULT_CONFIG = """
[nlp]
lang = "lt"
stop_words = {"@language_data": "spacy.lt.stop_words"}
lex_attr_getters = {"@language_data": "spacy.lt.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.lt.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.lt.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class LithuanianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = _return_lt
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
lex_attr_getters.update(LEX_ATTRS)
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
mod_base_exceptions = {
@ -29,12 +43,12 @@ class LithuanianDefaults(Language.Defaults):
}
del mod_base_exceptions["8)"]
tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
class Lithuanian(Language):
lang = "lt"
Defaults = LithuanianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Lithuanian"]

View File

@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry
class LatvianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "lv"
stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "lv"
stop_words = {"@language_data": "spacy.lv.stop_words"}
"""
@registry.language_data("spacy.lv.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Latvian(Language):
lang = "lv"
Defaults = LatvianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Latvian"]

View File

@ -1,15 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...util import registry
class MalayalamDefaults(Language.Defaults):
stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "ml"
stop_words = {"@language_data": "spacy.ml.stop_words"}
"""
@registry.language_data("spacy.ml.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Malayalam(Language):
lang = "ml"
Defaults = MalayalamDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Malayalam"]

View File

@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry
class MarathiDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "mr"
stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "af"
stop_words = {"@language_data": "spacy.mr.stop_words"}
"""
@registry.language_data("spacy.mr.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Marathi(Language):
lang = "mr"
Defaults = MarathiDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Marathi"]

View File

@ -1,33 +1,47 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "nb"
stop_words = {"@language_data": "spacy.nb.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.nb.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class NorwegianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "nb"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
class Norwegian(Language):
lang = "nb"
Defaults = NorwegianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Norwegian"]

View File

@ -1,23 +1,33 @@
# coding: utf8
from __future__ import unicode_literals
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...attrs import LANG
from ...util import registry
class NepaliDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "ne" # Nepali language ISO code
stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "ne"
stop_words = {"@language_data": "spacy.ne.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ne.lex_attr_getters"}
"""
@registry.language_data("spacy.ne.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ne.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Nepali(Language):
lang = "ne"
Defaults = NepaliDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Nepali"]

View File

@ -1,3 +1,6 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
@ -5,36 +8,51 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .lemmatizer import DutchLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "nl"
stop_words = {"@language_data": "spacy.nl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.nl.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.DutchLemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.nl.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.nl.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.lemmatizers("spacy.DutchLemmatizer.v1")
def create_dutch_lemmatizer(data_paths: dict = {}) -> DutchLemmatizer:
return DutchLemmatizer(data_paths=data_paths)
class DutchDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "nl"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
if lookups is None:
lookups = Lookups()
return DutchLemmatizer(lookups)
class Dutch(Language):
lang = "nl"
Defaults = DutchDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Dutch"]

View File

@ -1,3 +1,5 @@
from typing import Optional, List, Dict, Tuple
from ...lemmatizer import Lemmatizer
from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
@ -34,7 +36,9 @@ class DutchLemmatizer(Lemmatizer):
"num": "num",
}
def __call__(self, string, univ_pos, morphology=None):
def __call__(
self, string: str, univ_pos: str, morphology: Optional[dict] = None
) -> List[str]:
# Difference 1: self.rules is assumed to be non-None, so no
# 'is None' check required.
# String lowercased from the get-go. All lemmatization results in
@ -92,7 +96,7 @@ class DutchLemmatizer(Lemmatizer):
# Overrides parent method so that a lowercased version of the string is
# used to search the lookup table. This is necessary because our lookup
# table consists entirely of lowercase keys.
def lookup(self, string, orth=None):
def lookup(self, string: str, orth: Optional[int] = None) -> str:
lookup_table = self.lookups.get_table("lemma_lookup", {})
string = string.lower()
if orth is not None:
@ -102,7 +106,13 @@ class DutchLemmatizer(Lemmatizer):
# Reimplemented to focus more on application of suffix rules and to return
# as early as possible.
def lemmatize(self, string, index, exceptions, rules):
def lemmatize(
self,
string: str,
index: Dict[str, List[str]],
exceptions: Dict[str, Dict[str, List[str]]],
rules: Dict[str, List[List[str]]],
) -> Tuple[List[str], bool]:
# returns (forms, is_known: bool)
oov_forms = []
for old, new in rules:

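The lookup override above works because the Dutch lemma table stores only lowercase keys, so the query has to be lowercased before the table is consulted. A simplified, dictionary-based sketch of that idea (a plain dict stands in for the real "lemma_lookup" table):

from typing import Dict

def lookup_lowercased(table: Dict[str, str], string: str) -> str:
    # Every key in the table is lowercase, so lowercase the query first and
    # fall back to the lowercased form if there is no entry.
    key = string.lower()
    return table.get(key, key)

print(lookup_lowercased({"honden": "hond"}, "Honden"))  # hond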
View File

@ -1,43 +1,60 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import PolishLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import add_lookups
from ...lookups import Lookups
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "pl"
stop_words = {"@language_data": "spacy.pl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.pl.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.PolishLemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.pl.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.pl.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.lemmatizers("spacy.PolishLemmatizer.v1")
def create_polish_lemmatizer(data_paths: dict = {}) -> PolishLemmatizer:
return PolishLemmatizer(data_paths=data_paths)
class PolishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "pl"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
mod_base_exceptions = {
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
}
tokenizer_exceptions = mod_base_exceptions
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
if lookups is None:
lookups = Lookups()
return PolishLemmatizer(lookups)
class Polish(Language):
lang = "pl"
Defaults = PolishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Polish"]

View File

@ -1,3 +1,5 @@
from typing import Optional, List, Dict
from ...lemmatizer import Lemmatizer
from ...parts_of_speech import NAMES
@ -7,30 +9,29 @@ class PolishLemmatizer(Lemmatizer):
# dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS.
# It utilizes some prefix-based improvements for verb and adjective
# lemmatization, as well as case-sensitive lemmatization for nouns.
def __call__(self, string, univ_pos, morphology=None):
def __call__(
self, string: str, univ_pos: str, morphology: Optional[dict] = None
) -> List[str]:
if isinstance(univ_pos, int):
univ_pos = NAMES.get(univ_pos, "X")
univ_pos = univ_pos.upper()
lookup_pos = univ_pos.lower()
if univ_pos == "PROPN":
lookup_pos = "noun"
lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})
if univ_pos == "NOUN":
return self.lemmatize_noun(string, morphology, lookup_table)
if univ_pos != "PROPN":
string = string.lower()
if univ_pos == "ADJ":
return self.lemmatize_adj(string, morphology, lookup_table)
elif univ_pos == "VERB":
return self.lemmatize_verb(string, morphology, lookup_table)
return [lookup_table.get(string, string.lower())]
def lemmatize_adj(self, string, morphology, lookup_table):
def lemmatize_adj(
self, string: str, morphology: dict, lookup_table: Dict[str, str]
) -> List[str]:
# this method utilizes different procedures for adjectives
# with 'nie' and 'naj' prefixes
if string[:3] == "nie":
@ -41,25 +42,26 @@ class PolishLemmatizer(Lemmatizer):
return [lookup_table[naj_search_string]]
if search_string in lookup_table:
return [lookup_table[search_string]]
if string[:3] == "naj":
naj_search_string = string[3:]
if naj_search_string in lookup_table:
return [lookup_table[naj_search_string]]
return [lookup_table.get(string, string)]
def lemmatize_verb(self, string, morphology, lookup_table):
def lemmatize_verb(
self, string: str, morphology: dict, lookup_table: Dict[str, str]
) -> List[str]:
# this method utilizes a different procedure for verbs
# with 'nie' prefix
if string[:3] == "nie":
search_string = string[3:]
if search_string in lookup_table:
return [lookup_table[search_string]]
return [lookup_table.get(string, string)]
def lemmatize_noun(self, string, morphology, lookup_table):
def lemmatize_noun(
self, string: str, morphology: dict, lookup_table: Dict[str, str]
) -> List[str]:
# this method is case-sensitive, in order to work
# for incorrectly tagged proper names
if string != string.lower():
@ -68,11 +70,16 @@ class PolishLemmatizer(Lemmatizer):
elif string in lookup_table:
return [lookup_table[string]]
return [string.lower()]
return [lookup_table.get(string, string)]
def lookup(self, string, orth=None):
def lookup(self, string: str, orth: Optional[int] = None) -> str:
return string.lower()
def lemmatize(self, string, index, exceptions, rules):
def lemmatize(
self,
string: str,
index: Dict[str, List[str]],
exceptions: Dict[str, Dict[str, List[str]]],
rules: Dict[str, List[List[str]]],
) -> List[str]:
raise NotImplementedError
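The adjective and verb branches above share one idea: try the form with the negation or superlative prefix stripped before falling back to a plain table lookup. A simplified sketch of that prefix handling (a plain dict stands in for the lookup table, and the per-POS differences of the real methods are not reproduced):

from typing import Dict, List

def lemmatize_with_prefixes(string: str, table: Dict[str, str]) -> List[str]:
    for prefix in ("nie", "naj"):
        if string.startswith(prefix):
            stripped = string[len(prefix):]
            if stripped in table:
                return [table[stripped]]
    return [table.get(string, string)]

print(lemmatize_with_prefixes("najlepszy", {"lepszy": "dobry"}))  # ['dobry']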

View File

@ -1,20 +1,42 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "pt"
stop_words = {"@language_data": "spacy.pt.stop_words"}
lex_attr_getters = {"@language_data": "spacy.pt.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.pt.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.pt.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class PortugueseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "pt"
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
infixes = TOKENIZER_INFIXES
prefixes = TOKENIZER_PREFIXES
@ -22,6 +44,7 @@ class PortugueseDefaults(Language.Defaults):
class Portuguese(Language):
lang = "pt"
Defaults = PortugueseDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Portuguese"]

View File

@ -1,27 +1,40 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry
# Lemma data note:
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
# Replaced characters using cedillas with the correct ones (ș and ț)
DEFAULT_CONFIG = """
[nlp]
lang = "ro"
stop_words = {"@language_data": "spacy.ro.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.ro.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class RomanianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "ro"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
@ -30,6 +43,7 @@ class RomanianDefaults(Language.Defaults):
class Romanian(Language):
lang = "ro"
Defaults = RomanianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Romanian"]

View File

@ -1,32 +1,49 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import RussianLemmatizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...util import update_exc
from ...util import update_exc, registry
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG
DEFAULT_CONFIG = """
[nlp]
lang = "ru"
stop_words = {"@language_data": "spacy.ru.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ru.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.RussianLemmatizer.v1"
"""
@registry.language_data("spacy.ru.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ru.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.lemmatizers("spacy.RussianLemmatizer.v1")
def create_russian_lemmatizer() -> RussianLemmatizer:
return RussianLemmatizer()
class RussianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "ru"
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
if lookups is None:
lookups = Lookups()
return RussianLemmatizer(lookups)
class Russian(Language):
lang = "ru"
Defaults = RussianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Russian"]

View File

@ -1,11 +1,17 @@
from typing import Optional, Tuple, Dict, List
from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
from ...lemmatizer import Lemmatizer
from ...lookups import Lookups
PUNCT_RULES = {"«": '"', "»": '"'}
class RussianLemmatizer(Lemmatizer):
_morph = None
def __init__(self, lookups=None):
def __init__(self, lookups: Optional[Lookups] = None) -> None:
super(RussianLemmatizer, self).__init__(lookups)
try:
from pymorphy2 import MorphAnalyzer
@ -19,15 +25,15 @@ class RussianLemmatizer(Lemmatizer):
if RussianLemmatizer._morph is None:
RussianLemmatizer._morph = MorphAnalyzer()
def __call__(self, string, univ_pos, morphology=None):
def __call__(
self, string: str, univ_pos: str, morphology: Optional[dict] = None
) -> List[str]:
univ_pos = self.normalize_univ_pos(univ_pos)
if univ_pos == "PUNCT":
return [PUNCT_RULES.get(string, string)]
if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
# Skip unchangeable pos
return [string.lower()]
analyses = self._morph.parse(string)
filtered_analyses = []
for analysis in analyses:
@ -39,12 +45,10 @@ class RussianLemmatizer(Lemmatizer):
analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
):
filtered_analyses.append(analysis)
if not len(filtered_analyses):
return [string.lower()]
if morphology is None or (len(morphology) == 1 and POS in morphology):
return list(set([analysis.normal_form for analysis in filtered_analyses]))
if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
features_to_compare = ["Case", "Number", "Gender"]
elif univ_pos == "NUM":
@ -61,7 +65,6 @@ class RussianLemmatizer(Lemmatizer):
"VerbForm",
"Voice",
]
analyses, filtered_analyses = filtered_analyses, []
for analysis in analyses:
_, analysis_morph = oc2ud(str(analysis.tag))
@ -74,16 +77,14 @@ class RussianLemmatizer(Lemmatizer):
break
else:
filtered_analyses.append(analysis)
if not len(filtered_analyses):
return [string.lower()]
return list(set([analysis.normal_form for analysis in filtered_analyses]))
@staticmethod
def normalize_univ_pos(univ_pos):
def normalize_univ_pos(univ_pos: str) -> Optional[str]:
if isinstance(univ_pos, str):
return univ_pos.upper()
symbols_to_str = {
ADJ: "ADJ",
DET: "DET",
@ -98,14 +99,14 @@ class RussianLemmatizer(Lemmatizer):
return symbols_to_str[univ_pos]
return None
def lookup(self, string, orth=None):
def lookup(self, string: str, orth: Optional[int] = None) -> str:
analyses = self._morph.parse(string)
if len(analyses) == 1:
return analyses[0].normal_form
return string
def oc2ud(oc_tag):
def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
gram_map = {
"_POS": {
"ADJF": "ADJ",
@ -160,11 +161,9 @@ def oc2ud(oc_tag):
"Voice": {"actv": "Act", "pssv": "Pass"},
"Abbr": {"Abbr": "Yes"},
}
pos = "X"
morphology = dict()
unmatched = set()
grams = oc_tag.replace(" ", ",").split(",")
for gram in grams:
match = False
@ -177,7 +176,6 @@ def oc2ud(oc_tag):
morphology[categ] = gmap[gram]
if not match:
unmatched.add(gram)
while len(unmatched) > 0:
gram = unmatched.pop()
if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"):
@ -186,8 +184,4 @@ def oc2ud(oc_tag):
pos = "AUX"
elif gram == "Pltm":
morphology["Number"] = "Ptan"
return pos, morphology
PUNCT_RULES = {"«": '"', "»": '"'}

View File

@ -1,20 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...attrs import LANG
from ...util import registry
class SinhalaDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "si"
stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "si"
stop_words = {"@language_data": "spacy.si.stop_words"}
lex_attr_getters = {"@language_data": "spacy.si.lex_attr_getters"}
"""
@registry.language_data("spacy.si.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.si.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Sinhala(Language):
lang = "si"
Defaults = SinhalaDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Sinhala"]

View File

@ -1,20 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...attrs import LANG
from ...util import registry
class SlovakDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "sk"
stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "sk"
stop_words = {"@language_data": "spacy.sk.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sk.lex_attr_getters"}
"""
@registry.language_data("spacy.sk.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.sk.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Slovak(Language):
lang = "sk"
Defaults = SlovakDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Slovak"]

View File

@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry
class SlovenianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "sl"
stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "sl"
stop_words = {"@language_data": "spacy.sl.stop_words"}
"""
@registry.language_data("spacy.sl.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Slovenian(Language):
lang = "sl"
Defaults = SlovenianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Slovenian"]

View File

@ -1,17 +1,26 @@
from typing import Set
from thinc.api import Config
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
from ...util import registry
class AlbanianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "sq"
stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "sq"
stop_words = {"@language_data": "spacy.sq.stop_words"}
"""
@registry.language_data("spacy.sq.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class Albanian(Language):
lang = "sq"
Defaults = AlbanianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Albanian"]

View File

@ -1,23 +1,47 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "sr"
stop_words = {"@language_data": "spacy.sr.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sr.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.sr.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.sr.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class SerbianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "sr"
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
class Serbian(Language):
lang = "sr"
Defaults = SerbianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Serbian"]

View File

@ -1,35 +1,54 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...util import update_exc, registry
from .syntax_iterators import SYNTAX_ITERATORS
# Punctuation stolen from Danish
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from .syntax_iterators import SYNTAX_ITERATORS
DEFAULT_CONFIG = """
[nlp]
lang = "sv"
stop_words = {"@language_data": "spacy.sv.stop_words"}
lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.sv.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.sv.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class SwedishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "sv"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
class Swedish(Language):
lang = "sv"
Defaults = SwedishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Swedish"]

View File

@ -1,20 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...attrs import LANG
from ...util import registry
class TamilDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "ta"
lex_attr_getters.update(LEX_ATTRS)
stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "ta"
stop_words = {"@language_data": "spacy.ta.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"}
"""
@registry.language_data("spacy.ta.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ta.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Tamil(Language):
lang = "ta"
Defaults = TamilDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Tamil"]

View File

@ -1,20 +1,33 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...attrs import LANG
from ...util import registry
class TeluguDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "te"
stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "te"
stop_words = {"@language_data": "spacy.te.stop_words"}
lex_attr_getters = {"@language_data": "spacy.te.lex_attr_getters"}
"""
@registry.language_data("spacy.te.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.te.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class Telugu(Language):
lang = "te"
Defaults = TeluguDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Telugu"]

View File

@ -1,15 +1,44 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...attrs import LANG
from ...language import Language
from ...tokens import Doc
from ...util import DummyTokenizer
from ...util import DummyTokenizer, registry
DEFAULT_CONFIG = """
[nlp]
lang = "th"
stop_words = {"@language_data": "spacy.th.stop_words"}
lex_attr_getters = {"@language_data": "spacy.th.lex_attr_getters"}
[nlp.tokenizer]
@tokenizers = "spacy.ThaiTokenizer.v1"
"""
@registry.language_data("spacy.th.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.th.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.tokenizers("spacy.ThaiTokenizer.v1")
def create_thai_tokenizer():
def thai_tokenizer_factory(nlp):
return ThaiTokenizer(nlp)
return thai_tokenizer_factory
class ThaiTokenizer(DummyTokenizer):
def __init__(self, cls, nlp=None):
def __init__(self, nlp: Language) -> None:
try:
from pythainlp.tokenize import word_tokenize
except ImportError:
@ -17,34 +46,18 @@ class ThaiTokenizer(DummyTokenizer):
"The Thai tokenizer requires the PyThaiNLP library: "
"https://github.com/PyThaiNLP/pythainlp"
)
self.word_tokenize = word_tokenize
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
self.vocab = nlp.vocab
def __call__(self, text):
def __call__(self, text: str) -> Doc:
words = list(self.word_tokenize(text))
spaces = [False] * len(words)
return Doc(self.vocab, words=words, spaces=spaces)
class ThaiDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda _text: "th"
tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
@classmethod
def create_tokenizer(cls, nlp=None):
return ThaiTokenizer(cls, nlp)
class Thai(Language):
lang = "th"
Defaults = ThaiDefaults
def make_doc(self, text):
return self.tokenizer(text)
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Thai"]

View File

@ -1,31 +1,47 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry
def _return_tl(_):
return "tl"
DEFAULT_CONFIG = """
[nlp]
lang = "tl"
stop_words = {"@language_data": "spacy.tl.stop_words"}
lex_attr_getters = {"@language_data": "spacy.tl.lex_attr_getters"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.tl.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.tl.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class TagalogDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = _return_tl
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
class Tagalog(Language):
lang = "tl"
Defaults = TagalogDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Tagalog"]

View File

@ -1,26 +1,40 @@
from typing import Set
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "tr"
stop_words = {"@language_data": "spacy.tr.stop_words"}
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.tr.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
class TurkishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "tr"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
class Turkish(Language):
lang = "tr"
Defaults = TurkishDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Turkish"]

View File

@ -1,28 +1,42 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...attrs import LANG
from ...language import Language
from ...util import update_exc
from ...util import update_exc, registry
DEFAULT_CONFIG = """
[nlp]
lang = "tt"
stop_words = {"@language_data": "spacy.tt.stop_words"}
lex_attr_getters = {"@language_data": "spacy.tt.lex_attr_getters"}
"""
@registry.language_data("spacy.tt.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.tt.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class TatarDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "tt"
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
infixes = tuple(TOKENIZER_INFIXES)
stop_words = STOP_WORDS
class Tatar(Language):
lang = "tt"
Defaults = TatarDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Tatar"]

View File

@ -1,36 +1,49 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...util import update_exc, add_lookups
from ...util import update_exc, registry
from ...language import Language
from ...lookups import Lookups
from ...attrs import LANG, NORM
from .lemmatizer import UkrainianLemmatizer
class UkrainianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "uk"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
lang = "uk"
stop_words = {"@language_data": "spacy.uk.stop_words"}
lex_attr_getters = {"@language_data": "spacy.uk.lex_attr_getters"}
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
if lookups is None:
lookups = Lookups()
return UkrainianLemmatizer(lookups)
[nlp.lemmatizer]
@lemmatizers = "spacy.UkrainianLemmatizer.v1"
"""
@registry.language_data("spacy.uk.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.uk.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.lemmatizers("spacy.UkrainianLemmatizer.v1")
def create_ukrainian_lemmatizer() -> UkrainianLemmatizer:
return UkrainianLemmatizer()
class UkrainianDefaults(Language.Defaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
class Ukrainian(Language):
lang = "uk"
Defaults = UkrainianDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Ukrainian"]

View File

@ -1,11 +1,17 @@
from typing import Optional, List, Tuple, Dict
from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
from ...lookups import Lookups
from ...lemmatizer import Lemmatizer
PUNCT_RULES = {"«": '"', "»": '"'}
class UkrainianLemmatizer(Lemmatizer):
_morph = None
def __init__(self, lookups=None):
def __init__(self, lookups: Optional[Lookups] = None) -> None:
super(UkrainianLemmatizer, self).__init__(lookups)
try:
from pymorphy2 import MorphAnalyzer
@ -19,15 +25,15 @@ class UkrainianLemmatizer(Lemmatizer):
'"pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"'
)
def __call__(self, string, univ_pos, morphology=None):
def __call__(
self, string: str, univ_pos: str, morphology: Optional[dict] = None
) -> List[str]:
univ_pos = self.normalize_univ_pos(univ_pos)
if univ_pos == "PUNCT":
return [PUNCT_RULES.get(string, string)]
if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
# Skip unchangeable pos
return [string.lower()]
analyses = self._morph.parse(string)
filtered_analyses = []
for analysis in analyses:
@ -39,12 +45,10 @@ class UkrainianLemmatizer(Lemmatizer):
analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
):
filtered_analyses.append(analysis)
if not len(filtered_analyses):
return [string.lower()]
if morphology is None or (len(morphology) == 1 and POS in morphology):
return list(set([analysis.normal_form for analysis in filtered_analyses]))
if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
features_to_compare = ["Case", "Number", "Gender"]
elif univ_pos == "NUM":
@ -61,7 +65,6 @@ class UkrainianLemmatizer(Lemmatizer):
"VerbForm",
"Voice",
]
analyses, filtered_analyses = filtered_analyses, []
for analysis in analyses:
_, analysis_morph = oc2ud(str(analysis.tag))
@ -74,16 +77,14 @@ class UkrainianLemmatizer(Lemmatizer):
break
else:
filtered_analyses.append(analysis)
if not len(filtered_analyses):
return [string.lower()]
return list(set([analysis.normal_form for analysis in filtered_analyses]))
@staticmethod
def normalize_univ_pos(univ_pos):
def normalize_univ_pos(univ_pos: str) -> Optional[str]:
if isinstance(univ_pos, str):
return univ_pos.upper()
symbols_to_str = {
ADJ: "ADJ",
DET: "DET",
@ -98,14 +99,14 @@ class UkrainianLemmatizer(Lemmatizer):
return symbols_to_str[univ_pos]
return None
def lookup(self, string, orth=None):
def lookup(self, string: str, orth: Optional[int] = None) -> str:
analyses = self._morph.parse(string)
if len(analyses) == 1:
return analyses[0].normal_form
return string
def oc2ud(oc_tag):
def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
gram_map = {
"_POS": {
"ADJF": "ADJ",
@ -160,11 +161,9 @@ def oc2ud(oc_tag):
"Voice": {"actv": "Act", "pssv": "Pass"},
"Abbr": {"Abbr": "Yes"},
}
pos = "X"
morphology = dict()
unmatched = set()
grams = oc_tag.replace(" ", ",").split(",")
for gram in grams:
match = False
@ -177,7 +176,6 @@ def oc2ud(oc_tag):
morphology[categ] = gmap[gram]
if not match:
unmatched.add(gram)
while len(unmatched) > 0:
gram = unmatched.pop()
if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"):
@ -186,8 +184,4 @@ def oc2ud(oc_tag):
pos = "AUX"
elif gram == "Pltm":
morphology["Number"] = "Ptan"
return pos, morphology
PUNCT_RULES = {"«": '"', "»": '"'}

View File

@ -1,26 +1,53 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "ur"
stop_words = {"@language_data": "spacy.ur.stop_words"}
lex_attr_getters = {"@language_data": "spacy.ur.lex_attr_getters"}
[nlp.writing_system]
direction = "rtl"
has_case = false
has_letters = true
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[nlp.lemmatizer.data_paths]
@language_data = "spacy-lookups-data"
lang = ${nlp:lang}
"""
@registry.language_data("spacy.ur.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.ur.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class UrduDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "ur"
tokenizer_exceptions = BASE_EXCEPTIONS
stop_words = STOP_WORDS
suffixes = TOKENIZER_SUFFIXES
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
class Urdu(Language):
lang = "ur"
Defaults = UrduDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Urdu"]

View File

@ -1,38 +1,62 @@
from ...attrs import LANG, NORM
from ..norm_exceptions import BASE_NORMS
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from ...language import Language
from ...tokens import Doc
from .stop_words import STOP_WORDS
from ...util import add_lookups
from ...util import DummyTokenizer, registry
from .lex_attrs import LEX_ATTRS
class VietnameseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "vi" # for pickling
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
lex_attr_getters.update(LEX_ATTRS)
stop_words = STOP_WORDS
use_pyvi = True
DEFAULT_CONFIG = """
[nlp]
lang = "vi"
stop_words = {"@language_data": "spacy.vi.stop_words"}
lex_attr_getters = {"@language_data": "spacy.vi.lex_attr_getters"}
[nlp.tokenizer]
@tokenizers = "spacy.VietnameseTokenizer.v1"
use_pyvi = true
"""
class Vietnamese(Language):
lang = "vi"
Defaults = VietnameseDefaults # override defaults
@registry.language_data("spacy.vi.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
def make_doc(self, text):
if self.Defaults.use_pyvi:
@registry.language_data("spacy.vi.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.tokenizers("spacy.VietnameseTokenizer.v1")
def create_vietnamese_tokenizer(use_pyvi: bool = True):
def vietnamese_tokenizer_factory(nlp):
return VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
return vietnamese_tokenizer_factory
class VietnameseTokenizer(DummyTokenizer):
def __init__(self, nlp: Language, use_pyvi: bool = False):
self.vocab = nlp.vocab
self.use_pyvi = use_pyvi
if self.use_pyvi:
try:
from pyvi import ViTokenizer
self.ViTokenizer = ViTokenizer
except ImportError:
msg = (
"Pyvi not installed. Either set Vietnamese.use_pyvi = False, "
"Pyvi not installed. Either set use_pyvi = False, "
"or install it https://pypi.python.org/pypi/pyvi"
)
raise ImportError(msg)
words, spaces = ViTokenizer.spacy_tokenize(text)
def __call__(self, text: str) -> Doc:
if self.use_pyvi:
words, spaces = self.ViTokenizer.spacy_tokenize(text)
return Doc(self.vocab, words=words, spaces=spaces)
else:
words = []
@ -44,4 +68,9 @@ class Vietnamese(Language):
return Doc(self.vocab, words=words, spaces=spaces)
class Vietnamese(Language):
lang = "vi"
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Vietnamese"]

View File

@ -1,17 +1,17 @@
from thinc.api import Config
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
DEFAULT_CONFIG = """
[nlp]
lang = "xx"
"""
class MultiLanguageDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "xx"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
tokenizer_exceptions = BASE_EXCEPTIONS
class MultiLanguage(Language):
@ -21,6 +21,7 @@ class MultiLanguage(Language):
lang = "xx"
Defaults = MultiLanguageDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["MultiLanguage"]

View File

@ -1,21 +1,39 @@
from typing import Set, Dict, Callable, Any
from thinc.api import Config
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import registry
DEFAULT_CONFIG = """
[nlp]
lang = "si"
stop_words = {"@language_data": "spacy.yo.stop_words"}
lex_attr_getters = {"@language_data": "spacy.yo.lex_attr_getters"}
"""
@registry.language_data("spacy.yo.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.yo.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
class YorubaDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "yo"
stop_words = STOP_WORDS
tokenizer_exceptions = BASE_EXCEPTIONS
class Yoruba(Language):
lang = "yo"
Defaults = YorubaDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
__all__ = ["Yoruba"]

View File

@ -1,13 +1,15 @@
from typing import Optional, List, Set, Dict, Callable, Any
from enum import Enum
import tempfile
import srsly
import warnings
from pathlib import Path
from collections import OrderedDict
from ...attrs import LANG
from thinc.api import Config
from ...errors import Warnings, Errors
from ...language import Language
from ...tokens import Doc
from ...util import DummyTokenizer
from ...util import DummyTokenizer, registry
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
@ -16,88 +18,103 @@ from ... import util
_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python"
DEFAULT_CONFIG = """
[nlp]
lang = "zh"
stop_words = {"@language_data": "spacy.zh.stop_words"}
lex_attr_getters = {"@language_data": "spacy.zh.lex_attr_getters"}
def try_jieba_import(segmenter):
try:
import jieba
[nlp.tokenizer]
@tokenizers = "spacy.ChineseTokenizer.v1"
segmenter = "char"
pkuseg_model = null
pkuseg_user_dict = "default"
if segmenter == "jieba":
# segment a short text to have jieba initialize its cache in advance
list(jieba.cut("作为", cut_all=False))
return jieba
except ImportError:
if segmenter == "jieba":
msg = (
"Jieba not installed. To use jieba, install it with `pip "
" install jieba` or from https://github.com/fxsjy/jieba"
)
raise ImportError(msg)
[nlp.writing_system]
direction = "ltr"
has_case = false
has_letters = false
"""
def try_pkuseg_import(segmenter, pkuseg_model, pkuseg_user_dict):
try:
import pkuseg
class Segmenter(str, Enum):
char = "char"
jieba = "jieba"
pkuseg = "pkuseg"
if pkuseg_model:
return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
elif segmenter == "pkuseg":
msg = (
"The Chinese word segmenter is 'pkuseg' but no pkuseg model "
"was specified. Please provide the name of a pretrained model "
"or the path to a model with "
'`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`'
)
raise ValueError(msg)
except ImportError:
if segmenter == "pkuseg":
msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
raise ImportError(msg)
except FileNotFoundError:
if segmenter == "pkuseg":
msg = "Unable to load pkuseg model from: " + pkuseg_model
raise FileNotFoundError(msg)
@classmethod
def values(cls):
return list(cls.__members__.keys())
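Because Segmenter above subclasses both str and Enum, a member compares equal to its plain string value, which is what lets the config pass segmenter = "char" while the tokenizer code compares against Segmenter.char. A self-contained sketch of that behavior, reproducing the enum:

from enum import Enum

class Segmenter(str, Enum):
    char = "char"
    jieba = "jieba"
    pkuseg = "pkuseg"

    @classmethod
    def values(cls):
        return list(cls.__members__.keys())

print(Segmenter.char == "char")               # True: str subclass
print("pkuseg" in Segmenter.values())         # True
print(Segmenter("jieba") is Segmenter.jieba)  # True: lookup by value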
@registry.language_data("spacy.zh.stop_words")
def stop_words() -> Set[str]:
return STOP_WORDS
@registry.language_data("spacy.zh.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
return LEX_ATTRS
@registry.tokenizers("spacy.ChineseTokenizer.v1")
def create_chinese_tokenizer(
segmenter: Segmenter = Segmenter.char,
pkuseg_model: Optional[str] = None,
pkuseg_user_dict: Optional[str] = "default",
):
def chinese_tokenizer_factory(nlp):
return ChineseTokenizer(
nlp,
segmenter=segmenter,
pkuseg_model=pkuseg_model,
pkuseg_user_dict=pkuseg_user_dict,
)
return chinese_tokenizer_factory
class ChineseTokenizer(DummyTokenizer):
def __init__(self, cls, nlp=None, config={}):
self.supported_segmenters = ("char", "jieba", "pkuseg")
self.configure_segmenter(config)
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
# remove relevant settings from config so they're not also saved in
# Language.meta
for key in ["segmenter", "pkuseg_model", "pkuseg_user_dict"]:
if key in config:
del config[key]
self.tokenizer = Language.Defaults().create_tokenizer(nlp)
def __init__(
self,
nlp: Language,
segmenter: Segmenter = Segmenter.char,
pkuseg_model: Optional[str] = None,
pkuseg_user_dict: Optional[str] = None,
):
self.vocab = nlp.vocab
if isinstance(segmenter, Segmenter): # we might have the Enum here
segmenter = segmenter.value
self.segmenter = segmenter
self.pkuseg_model = pkuseg_model
self.pkuseg_user_dict = pkuseg_user_dict
self.pkuseg_seg = None
self.jieba_seg = None
self.configure_segmenter(segmenter)
def configure_segmenter(self, config):
self.segmenter = "char"
if "segmenter" in config:
if config["segmenter"] in self.supported_segmenters:
self.segmenter = config["segmenter"]
else:
warn_msg = Warnings.W103.format(
lang="Chinese",
segmenter=config["segmenter"],
supported=", ".join([repr(s) for s in self.supported_segmenters]),
default="'char' (character segmentation)",
)
warnings.warn(warn_msg)
def configure_segmenter(self, segmenter: str):
if segmenter not in Segmenter.values():
warn_msg = Warnings.W103.format(
lang="Chinese",
segmenter=segmenter,
supported=", ".join(Segmenter.values()),
default="'char' (character segmentation)",
)
warnings.warn(warn_msg)
self.segmenter = Segmenter.char
self.jieba_seg = try_jieba_import(self.segmenter)
self.pkuseg_seg = try_pkuseg_import(
self.segmenter,
pkuseg_model=config.get("pkuseg_model", None),
pkuseg_user_dict=config.get("pkuseg_user_dict", "default"),
pkuseg_model=self.pkuseg_model,
pkuseg_user_dict=self.pkuseg_user_dict,
)
def __call__(self, text):
if self.segmenter == "jieba":
def __call__(self, text: str) -> Doc:
if self.segmenter == Segmenter.jieba:
words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
(words, spaces) = util.get_words_and_spaces(words, text)
return Doc(self.vocab, words=words, spaces=spaces)
elif self.segmenter == "pkuseg":
elif self.segmenter == Segmenter.pkuseg:
if self.pkuseg_seg is None:
raise ValueError(Errors.E1000)
words = self.pkuseg_seg.cut(text)
@ -105,11 +122,11 @@ class ChineseTokenizer(DummyTokenizer):
return Doc(self.vocab, words=words, spaces=spaces)
# warn if segmenter setting is not the only remaining option "char"
if self.segmenter != "char":
if self.segmenter != Segmenter.char:
warn_msg = Warnings.W103.format(
lang="Chinese",
segmenter=self.segmenter,
supported=", ".join([repr(s) for s in self.supported_segmenters]),
supported=", ".join(Segmenter.values()),
default="'char' (character segmentation)",
)
warnings.warn(warn_msg)
@ -119,33 +136,25 @@ class ChineseTokenizer(DummyTokenizer):
(words, spaces) = util.get_words_and_spaces(words, text)
return Doc(self.vocab, words=words, spaces=spaces)
def pkuseg_update_user_dict(self, words, reset=False):
if self.segmenter == "pkuseg":
def pkuseg_update_user_dict(self, words: List[str], reset: bool = False):
if self.segmenter == Segmenter.pkuseg:
if reset:
try:
import pkuseg
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
except ImportError:
if self.segmenter == "pkuseg":
msg = (
"pkuseg not installed: unable to reset pkuseg "
"user dict. Please " + _PKUSEG_INSTALL_MSG
)
raise ImportError(msg)
msg = (
"pkuseg not installed: unable to reset pkuseg "
"user dict. Please " + _PKUSEG_INSTALL_MSG
)
raise ImportError(msg)
for word in words:
self.pkuseg_seg.preprocesser.insert(word.strip(), "")
else:
warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
warnings.warn(warn_msg)
def _get_config(self):
config = OrderedDict((("segmenter", self.segmenter),))
return config
def _set_config(self, config={}):
self.configure_segmenter(config)
def to_bytes(self, **kwargs):
pkuseg_features_b = b""
pkuseg_weights_b = b""
@ -165,17 +174,11 @@ class ChineseTokenizer(DummyTokenizer):
sorted(list(self.pkuseg_seg.postprocesser.common_words)),
sorted(list(self.pkuseg_seg.postprocesser.other_words)),
)
serializers = OrderedDict(
(
("cfg", lambda: srsly.json_dumps(self._get_config())),
("pkuseg_features", lambda: pkuseg_features_b),
("pkuseg_weights", lambda: pkuseg_weights_b),
(
"pkuseg_processors",
lambda: srsly.msgpack_dumps(pkuseg_processors_data),
),
)
)
serializers = {
"pkuseg_features": lambda: pkuseg_features_b,
"pkuseg_weights": lambda: pkuseg_weights_b,
"pkuseg_processors": lambda: srsly.msgpack_dumps(pkuseg_processors_data),
}
return util.to_bytes(serializers, [])
def from_bytes(self, data, **kwargs):
@ -190,14 +193,11 @@ class ChineseTokenizer(DummyTokenizer):
def deserialize_pkuseg_processors(b):
pkuseg_data["processors_data"] = srsly.msgpack_loads(b)
deserializers = OrderedDict(
(
("cfg", lambda b: self._set_config(srsly.json_loads(b))),
("pkuseg_features", deserialize_pkuseg_features),
("pkuseg_weights", deserialize_pkuseg_weights),
("pkuseg_processors", deserialize_pkuseg_processors),
)
)
deserializers = {
"pkuseg_features": deserialize_pkuseg_features,
"pkuseg_weights": deserialize_pkuseg_weights,
"pkuseg_processors": deserialize_pkuseg_processors,
}
util.from_bytes(data, deserializers, [])
if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
@ -245,13 +245,10 @@ class ChineseTokenizer(DummyTokenizer):
)
srsly.write_msgpack(path, data)
serializers = OrderedDict(
(
("cfg", lambda p: srsly.write_json(p, self._get_config())),
("pkuseg_model", lambda p: save_pkuseg_model(p)),
("pkuseg_processors", lambda p: save_pkuseg_processors(p)),
)
)
serializers = {
"pkuseg_model": lambda p: save_pkuseg_model(p),
"pkuseg_processors": lambda p: save_pkuseg_processors(p),
}
return util.to_disk(path, serializers, [])
def from_disk(self, path, **kwargs):
@ -261,7 +258,7 @@ class ChineseTokenizer(DummyTokenizer):
try:
import pkuseg
except ImportError:
if self.segmenter == "pkuseg":
if self.segmenter == Segmenter.pkuseg:
raise ImportError(
"pkuseg not installed. To use this model, "
+ _PKUSEG_INSTALL_MSG
@ -273,9 +270,9 @@ class ChineseTokenizer(DummyTokenizer):
try:
import pkuseg
except ImportError:
if self.segmenter == "pkuseg":
if self.segmenter == Segmenter.pkuseg:
raise ImportError(self._pkuseg_install_msg)
if self.segmenter == "pkuseg":
if self.segmenter == Segmenter.pkuseg:
data = srsly.read_msgpack(path)
(user_dict, do_process, common_words, other_words) = data
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
@ -283,35 +280,64 @@ class ChineseTokenizer(DummyTokenizer):
self.pkuseg_seg.postprocesser.common_words = set(common_words)
self.pkuseg_seg.postprocesser.other_words = set(other_words)
serializers = OrderedDict(
(
("cfg", lambda p: self._set_config(srsly.read_json(p))),
("pkuseg_model", lambda p: load_pkuseg_model(p)),
("pkuseg_processors", lambda p: load_pkuseg_processors(p)),
)
)
serializers = {
"pkuseg_model": lambda p: load_pkuseg_model(p),
"pkuseg_processors": lambda p: load_pkuseg_processors(p),
}
util.from_disk(path, serializers, [])
class ChineseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "zh"
tokenizer_exceptions = BASE_EXCEPTIONS
stop_words = STOP_WORDS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
@classmethod
def create_tokenizer(cls, nlp=None, config={}):
return ChineseTokenizer(cls, nlp, config=config)
class Chinese(Language):
lang = "zh"
Defaults = ChineseDefaults # override defaults
Defaults = ChineseDefaults
default_config = Config().from_str(DEFAULT_CONFIG)
def make_doc(self, text):
return self.tokenizer(text)
def try_jieba_import(segmenter: str) -> Optional[Any]:
try:
import jieba
if segmenter == Segmenter.jieba:
# segment a short text to have jieba initialize its cache in advance
list(jieba.cut("作为", cut_all=False))
return jieba
except ImportError:
if segmenter == Segmenter.jieba:
msg = (
"Jieba not installed. To use jieba, install it with `pip "
" install jieba` or from https://github.com/fxsjy/jieba"
)
raise ImportError(msg)
def try_pkuseg_import(segmenter: str, pkuseg_model: str, pkuseg_user_dict: str) -> Optional[Any]:
try:
import pkuseg
if pkuseg_model:
return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
elif segmenter == Segmenter.pkuseg:
msg = (
"The Chinese word segmenter is 'pkuseg' but no pkuseg model "
"was specified. Please provide the name of a pretrained model "
"or the path to a model with:\n"
'cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path }}\n'
"nlp = Chinese.from_config(cfg)"
)
raise ValueError(msg)
except ImportError:
if segmenter == Segmenter.pkuseg:
msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
raise ImportError(msg)
except FileNotFoundError:
if segmenter == Segmenter.pkuseg:
msg = "Unable to load pkuseg model from: " + pkuseg_model
raise FileNotFoundError(msg)
def _get_pkuseg_trie_data(node, path=""):

File diff suppressed because it is too large

View File

@ -1,5 +1,14 @@
from typing import Optional, Callable, List, Dict
from .lookups import Lookups
from .errors import Errors
from .parts_of_speech import NAMES as UPOS_NAMES
from .util import registry, load_language_data, SimpleFrozenDict
@registry.lemmatizers("spacy.Lemmatizer.v1")
def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
return Lemmatizer(data_paths=data_paths)
class Lemmatizer:
@ -14,17 +23,27 @@ class Lemmatizer:
def load(cls, *args, **kwargs):
raise NotImplementedError(Errors.E172)
def __init__(self, lookups, is_base_form=None):
def __init__(
self,
lookups: Optional[Lookups] = None,
data_paths: dict = SimpleFrozenDict(),
is_base_form: Optional[Callable] = None,
) -> None:
"""Initialize a Lemmatizer.
lookups (Lookups): The lookups object containing the (optional) tables
"lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
RETURNS (Lemmatizer): The newly constructed object.
"""
self.lookups = lookups
self.lookups = lookups if lookups is not None else Lookups()
for name, filename in data_paths.items():
data = load_language_data(filename)
self.lookups.add_table(name, data)
self.is_base_form = is_base_form
def __call__(self, string, univ_pos, morphology=None):
def __call__(
self, string: str, univ_pos: str, morphology: Optional[dict] = None
) -> List[str]:
"""Lemmatize a string.
string (str): The string to lemmatize, e.g. the token text.
@ -39,7 +58,6 @@ class Lemmatizer:
if isinstance(univ_pos, int):
univ_pos = UPOS_NAMES.get(univ_pos, "X")
univ_pos = univ_pos.lower()
if univ_pos in ("", "eol", "space"):
return [string.lower()]
# See Issue #435 for example of where this logic is required.
@ -67,65 +85,31 @@ class Lemmatizer:
)
return lemmas
def is_base_form(self, univ_pos, morphology=None):
"""
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
univ_pos (str / int): The token's universal part-of-speech tag.
morphology (dict): The token's morphological features following the
Universal Dependencies scheme.
"""
if morphology is None:
morphology = {}
if univ_pos == "noun" and morphology.get("Number") == "sing":
return True
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
return True
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology
elif univ_pos == "verb" and (
morphology.get("VerbForm") == "fin"
and morphology.get("Tense") == "pres"
and morphology.get("Number") is None
):
return True
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
return True
elif morphology.get("VerbForm") == "inf":
return True
elif morphology.get("VerbForm") == "none":
return True
elif morphology.get("Degree") == "pos":
return True
else:
return False
def noun(self, string, morphology=None):
def noun(self, string: str, morphology: Optional[dict] = None) -> List[str]:
return self(string, "noun", morphology)
def verb(self, string, morphology=None):
def verb(self, string: str, morphology: Optional[dict] = None) -> List[str]:
return self(string, "verb", morphology)
def adj(self, string, morphology=None):
def adj(self, string: str, morphology: Optional[dict] = None) -> List[str]:
return self(string, "adj", morphology)
def det(self, string, morphology=None):
def det(self, string: str, morphology: Optional[dict] = None) -> List[str]:
return self(string, "det", morphology)
def pron(self, string, morphology=None):
def pron(self, string: str, morphology: Optional[dict] = None) -> List[str]:
return self(string, "pron", morphology)
def adp(self, string, morphology=None):
def adp(self, string: str, morphology: Optional[dict] = None) -> List[str]:
return self(string, "adp", morphology)
def num(self, string, morphology=None):
def num(self, string: str, morphology: Optional[dict] = None) -> List[str]:
return self(string, "num", morphology)
def punct(self, string, morphology=None):
def punct(self, string: str, morphology: Optional[dict] = None) -> List[str]:
return self(string, "punct", morphology)
def lookup(self, string, orth=None):
def lookup(self, string: str, orth: Optional[int] = None) -> str:
"""Look up a lemma in the table, if available. If no lemma is found,
the original string is returned.
@ -141,7 +125,13 @@ class Lemmatizer:
return lookup_table[key]
return string
def lemmatize(self, string, index, exceptions, rules):
def lemmatize(
self,
string: str,
index: Dict[str, List[str]],
exceptions: Dict[str, Dict[str, List[str]]],
rules: Dict[str, List[List[str]]],
) -> List[str]:
orig = string
string = string.lower()
forms = []
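A minimal usage sketch of the refactored Lemmatizer above: lookups is now optional, any tables listed in data_paths are loaded into it on init, and without a "lemma_rules" table a call falls back to the "lemma_lookup" table. The table contents below are invented for illustration.

from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lemma_lookup", {"dogs": "dog"})
lemmatizer = Lemmatizer(lookups=lookups)

print(lemmatizer.lookup("dogs"))   # "dog"
print(lemmatizer("dogs", "noun"))  # ["dog"], via the lookup-table fallback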


@ -1,15 +1,32 @@
from typing import Dict, Any, List, Union, Optional
from pathlib import Path
import srsly
from preshed.bloom import BloomFilter
from collections import OrderedDict
from .errors import Errors
from .util import SimpleFrozenDict, ensure_path
from .util import SimpleFrozenDict, ensure_path, registry
from .strings import get_string_id
UNSET = object()
@registry.language_data("spacy-lookups-data")
def get_lookups(lang: str) -> Dict[str, Any]:
"""Load the data from the spacy-lookups-data package for a given language,
if available. Returns an empty dict if there's no data or if the package
is not installed.
lang (str): The language code (corresponds to entry point exposed by
the spacy-lookups-data package).
RETURNS (Dict[str, Any]): The lookups, keyed by table name.
"""
if lang in registry.lookups:
return registry.lookups.get(lang)
return {}
class Lookups:
"""Container for large lookup tables and dictionaries, e.g. lemmatization
data or tokenizer exception lists. Lookups are available via vocab.lookups,
@ -18,7 +35,7 @@ class Lookups:
via doc.vocab.lookups.
"""
def __init__(self):
def __init__(self) -> None:
"""Initialize the Lookups object.
RETURNS (Lookups): The newly created object.
@ -27,7 +44,7 @@ class Lookups:
"""
self._tables = {}
def __contains__(self, name):
def __contains__(self, name: str) -> bool:
"""Check if the lookups contain a table of a given name. Delegates to
Lookups.has_table.
@ -36,16 +53,16 @@ class Lookups:
"""
return self.has_table(name)
def __len__(self):
def __len__(self) -> int:
"""RETURNS (int): The number of tables in the lookups."""
return len(self._tables)
@property
def tables(self):
"""RETURNS (list): Names of all tables in the lookups."""
def tables(self) -> List[str]:
"""RETURNS (List[str]): Names of all tables in the lookups."""
return list(self._tables.keys())
def add_table(self, name, data=SimpleFrozenDict()):
def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> "Table":
"""Add a new table to the lookups. Raises an error if the table exists.
name (str): Unique name of table.
@ -60,12 +77,12 @@ class Lookups:
self._tables[name] = table
return table
def get_table(self, name, default=UNSET):
def get_table(self, name: str, default: Any = UNSET) -> "Table":
"""Get a table. Raises an error if the table doesn't exist and no
default value is provided.
name (str): Name of the table.
default: Optional default value to return if table doesn't exist.
default (Any): Optional default value to return if table doesn't exist.
RETURNS (Table): The table.
DOCS: https://spacy.io/api/lookups#get_table
@ -76,7 +93,7 @@ class Lookups:
return default
return self._tables[name]
def remove_table(self, name):
def remove_table(self, name: str) -> "Table":
"""Remove a table. Raises an error if the table doesn't exist.
name (str): Name of the table to remove.
@ -88,7 +105,7 @@ class Lookups:
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
return self._tables.pop(name)
def has_table(self, name):
def has_table(self, name: str) -> bool:
"""Check if the lookups contain a table of a given name.
name (str): Name of the table.
@ -98,7 +115,7 @@ class Lookups:
"""
return name in self._tables
def to_bytes(self, **kwargs):
def to_bytes(self, **kwargs) -> bytes:
"""Serialize the lookups to a bytestring.
RETURNS (bytes): The serialized Lookups.
@ -107,7 +124,7 @@ class Lookups:
"""
return srsly.msgpack_dumps(self._tables)
def from_bytes(self, bytes_data, **kwargs):
def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups":
"""Load the lookups from a bytestring.
bytes_data (bytes): The data to load.
@ -120,7 +137,9 @@ class Lookups:
self._tables[key] = Table(key, value)
return self
def to_disk(self, path, filename="lookups.bin", **kwargs):
def to_disk(
self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
) -> None:
"""Save the lookups to a directory as lookups.bin. Expects a path to a
directory, which will be created if it doesn't exist.
@ -136,7 +155,9 @@ class Lookups:
with filepath.open("wb") as file_:
file_.write(self.to_bytes())
def from_disk(self, path, filename="lookups.bin", **kwargs):
def from_disk(
self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
) -> "Lookups":
"""Load lookups from a directory containing a lookups.bin. Will skip
loading if the file doesn't exist.
@ -162,7 +183,7 @@ class Table(OrderedDict):
"""
@classmethod
def from_dict(cls, data, name=None):
def from_dict(cls, data: dict, name: Optional[str] = None) -> "Table":
"""Initialize a new table from a dict.
data (dict): The dictionary.
@ -175,7 +196,7 @@ class Table(OrderedDict):
self.update(data)
return self
def __init__(self, name=None, data=None):
def __init__(self, name: Optional[str] = None, data: Optional[dict] = None) -> None:
"""Initialize a new table.
name (str): Optional table name for reference.
@ -193,7 +214,7 @@ class Table(OrderedDict):
if data:
self.update(data)
def __setitem__(self, key, value):
def __setitem__(self, key: Union[str, int], value: Any) -> None:
"""Set new key/value pair. String keys will be hashed.
key (str / int): The key to set.
@ -203,7 +224,7 @@ class Table(OrderedDict):
OrderedDict.__setitem__(self, key, value)
self.bloom.add(key)
def set(self, key, value):
def set(self, key: Union[str, int], value: Any) -> None:
"""Set new key/value pair. String keys will be hashed.
Same as table[key] = value.
@ -212,7 +233,7 @@ class Table(OrderedDict):
"""
self[key] = value
def __getitem__(self, key):
def __getitem__(self, key: Union[str, int]) -> Any:
"""Get the value for a given key. String keys will be hashed.
key (str / int): The key to get.
@ -221,7 +242,7 @@ class Table(OrderedDict):
key = get_string_id(key)
return OrderedDict.__getitem__(self, key)
def get(self, key, default=None):
def get(self, key: Union[str, int], default: Optional[Any] = None) -> Any:
"""Get the value for a given key. String keys will be hashed.
key (str / int): The key to get.
@ -231,7 +252,7 @@ class Table(OrderedDict):
key = get_string_id(key)
return OrderedDict.get(self, key, default)
def __contains__(self, key):
def __contains__(self, key: Union[str, int]) -> bool:
"""Check whether a key is in the table. String keys will be hashed.
key (str / int): The key to check.
@ -243,7 +264,7 @@ class Table(OrderedDict):
return False
return OrderedDict.__contains__(self, key)
def to_bytes(self):
def to_bytes(self) -> bytes:
"""Serialize table to a bytestring.
RETURNS (bytes): The serialized table.
@ -257,7 +278,7 @@ class Table(OrderedDict):
}
return srsly.msgpack_dumps(data)
def from_bytes(self, bytes_data):
def from_bytes(self, bytes_data: bytes) -> "Table":
"""Load a table from a bytestring.
bytes_data (bytes): The data to load.
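The Lookups and Table behaviour is unchanged by the annotations above; a short usage sketch with invented table contents:

from spacy.lookups import Lookups

lookups = Lookups()
table = lookups.add_table("lemma_lookup", {"dogs": "dog"})
assert "lemma_lookup" in lookups and len(lookups) == 1

# string keys are hashed transparently on set and get
print(table["dogs"])             # "dog"
print(table.get("cats", "cat"))  # key not present, default returned

# round-trip through bytes
lookups2 = Lookups().from_bytes(lookups.to_bytes())
print(lookups2.get_table("lemma_lookup")["dogs"])  # "dog"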


@ -26,8 +26,8 @@ def build_nel_encoder(tok2vec, nO=None):
@registry.assets.register("spacy.KBFromFile.v1")
def load_kb(nlp_path, kb_path) -> KnowledgeBase:
vocab = Vocab().from_disk(Path(nlp_path) / "vocab")
def load_kb(vocab_path, kb_path) -> KnowledgeBase:
vocab = Vocab().from_disk(vocab_path)
kb = KnowledgeBase(vocab=vocab)
kb.load_bulk(kb_path)
return kb
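A hedged sketch of resolving the registered KB loader by name, as a config referencing spacy.KBFromFile.v1 would; the paths are placeholders for an existing serialized vocab and knowledge base.

from spacy.util import registry

load_kb = registry.assets.get("spacy.KBFromFile.v1")
kb = load_kb(vocab_path="/path/to/model/vocab", kb_path="/path/to/kb")
print(kb.get_size_entities())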


@ -1,30 +1,9 @@
from thinc.api import (
Model,
reduce_mean,
Linear,
list2ragged,
Logistic,
ParametricAttention,
)
from thinc.api import chain, concatenate, clone, Dropout
from thinc.api import (
SparseLinear,
Softmax,
softmax_activation,
Maxout,
reduce_sum,
Relu,
residual,
expand_window,
)
from thinc.api import (
HashEmbed,
with_ragged,
with_array,
with_cpu,
uniqued,
FeatureExtractor,
)
from typing import Optional
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued
from thinc.api import Relu, residual, expand_window, FeatureExtractor
from ..spacy_vectors import SpacyVectors
from ... import util
@ -34,7 +13,9 @@ from ..extract_ngrams import extract_ngrams
@registry.architectures.register("spacy.TextCatCNN.v1")
def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None):
def build_simple_cnn_text_classifier(
tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
) -> Model:
"""
Build a simple CNN text classifier, given a token-to-vector model as input.
If exclusive_classes=True, a softmax non-linearity is applied, so that the
@ -90,13 +71,25 @@ def build_text_classifier(
nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10
)
prefix = HashEmbed(
nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout, seed=11
nO=width // 2,
nV=embed_size,
column=cols.index(PREFIX),
dropout=dropout,
seed=11,
)
suffix = HashEmbed(
nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout, seed=12
nO=width // 2,
nV=embed_size,
column=cols.index(SUFFIX),
dropout=dropout,
seed=12,
)
shape = HashEmbed(
nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout, seed=13
nO=width // 2,
nV=embed_size,
column=cols.index(SHAPE),
dropout=dropout,
seed=13,
)
width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
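A sketch of building the now-typed CNN text classifier through the registry. The HashEmbedCNN settings are copied from the default configs elsewhere in this diff; nO=2 stands in for a two-label problem and is only an illustration.

from spacy.util import registry

build_tok2vec = registry.architectures.get("spacy.HashEmbedCNN.v1")
tok2vec = build_tok2vec(
    pretrained_vectors=None, width=96, depth=4, embed_size=2000,
    window_size=1, maxout_pieces=3, subword_features=True, dropout=None,
)
build_textcat = registry.architectures.get("spacy.TextCatCNN.v1")
model = build_textcat(tok2vec=tok2vec, exclusive_classes=True, nO=2)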


@ -11,8 +11,8 @@ from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE
@registry.architectures.register("spacy.Tok2VecTensors.v1")
def tok2vec_tensors_v1(width):
tok2vec = Tok2VecListener("tok2vec", width=width)
def tok2vec_tensors_v1(width, upstream="*"):
tok2vec = Tok2VecListener(upstream_name=upstream, width=width)
return tok2vec
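The listener architecture now exposes the upstream name: "*" (the default) listens to any Tok2Vec component, while a concrete name such as "tok2vec" (assumed here) binds it to one specific component.

from spacy.util import registry

make_listener = registry.architectures.get("spacy.Tok2VecTensors.v1")
listener_any = make_listener(width=96)                        # upstream defaults to "*"
listener_named = make_listener(width=96, upstream="tok2vec")  # bind to one component by name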


@ -1,30 +1,37 @@
from typing import List, Dict, Iterable, Optional, Union, TYPE_CHECKING
from wasabi import Printer
import warnings
from .tokens import Doc, Token, Span
from .errors import Errors, Warnings
from .util import dot_to_dict
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
from .language import Language # noqa: F401
def analyze_pipes(pipeline, name, pipe, index, warn=True):
def analyze_pipes(
nlp: "Language", name: str, index: int, warn: bool = True
) -> List[str]:
"""Analyze a pipeline component with respect to its position in the current
pipeline and the other components. Will check whether requirements are
fulfilled (e.g. if previous components assign the attributes).
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
nlp (Language): The current nlp object.
name (str): The name of the pipeline component to analyze.
pipe (callable): The pipeline component function to analyze.
index (int): The index of the component in the pipeline.
warn (bool): Show user warning if problem is found.
RETURNS (list): The problems found for the given pipeline component.
RETURNS (List[str]): The problems found for the given pipeline component.
"""
assert pipeline[index][0] == name
prev_pipes = pipeline[:index]
pipe_requires = getattr(pipe, "requires", [])
requires = {annot: False for annot in pipe_requires}
assert nlp.pipeline[index][0] == name
prev_pipes = nlp.pipeline[:index]
meta = nlp.get_pipe_meta(name)
requires = {annot: False for annot in meta.requires}
if requires:
for prev_name, prev_pipe in prev_pipes:
prev_assigns = getattr(prev_pipe, "assigns", [])
for annot in prev_assigns:
prev_meta = nlp.get_pipe_meta(prev_name)
for annot in prev_meta.assigns:
requires[annot] = True
problems = []
for annot, fulfilled in requires.items():
@ -35,46 +42,29 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
return problems
def analyze_all_pipes(pipeline, warn=True):
def analyze_all_pipes(nlp: "Language", warn: bool = True) -> Dict[str, List[str]]:
"""Analyze all pipes in the pipeline in order.
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
nlp (Language): The current nlp object.
warn (bool): Show user warning if problem is found.
RETURNS (dict): The problems found, keyed by component name.
RETURNS (Dict[str, List[str]]): The problems found, keyed by component name.
"""
problems = {}
for i, (name, pipe) in enumerate(pipeline):
problems[name] = analyze_pipes(pipeline, name, pipe, i, warn=warn)
for i, name in enumerate(nlp.pipe_names):
problems[name] = analyze_pipes(nlp, name, i, warn=warn)
return problems
def dot_to_dict(values):
"""Convert dot notation to a dict. For example: ["token.pos", "token._.xyz"]
become {"token": {"pos": True, "_": {"xyz": True }}}.
values (iterable): The values to convert.
RETURNS (dict): The converted values.
"""
result = {}
for value in values:
path = result
parts = value.lower().split(".")
for i, item in enumerate(parts):
is_last = i == len(parts) - 1
path = path.setdefault(item, True if is_last else {})
return result
def validate_attrs(values):
def validate_attrs(values: Iterable[str]) -> Iterable[str]:
"""Validate component attributes provided to "assigns", "requires" etc.
Raises error for invalid attributes and formatting. Doesn't check if
custom extension attributes are registered, since this is something the
user might want to do themselves later in the component.
values (iterable): The string attributes to check, e.g. `["token.pos"]`.
RETURNS (iterable): The checked attributes.
values (Iterable[str]): The string attributes to check, e.g. `["token.pos"]`.
RETURNS (Iterable[str]): The checked attributes.
"""
data = dot_to_dict(values)
data = dot_to_dict({value: True for value in values})
objs = {"doc": Doc, "token": Token, "span": Span}
for obj_key, attrs in data.items():
if obj_key == "span":
@ -111,37 +101,40 @@ def validate_attrs(values):
return values
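validate_attrs now delegates to util.dot_to_dict, which takes a mapping of dotted keys; a small sketch of the expansion it performs:

from spacy.util import dot_to_dict

attrs = ["token.pos", "token._.xyz"]
print(dot_to_dict({value: True for value in attrs}))
# {"token": {"pos": True, "_": {"xyz": True}}}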
def _get_feature_for_attr(pipeline, attr, feature):
def _get_feature_for_attr(nlp: "Language", attr: str, feature: str) -> List[str]:
assert feature in ["assigns", "requires"]
result = []
for pipe_name, pipe in pipeline:
pipe_assigns = getattr(pipe, feature, [])
for pipe_name in nlp.pipe_names:
meta = nlp.get_pipe_meta(pipe_name)
pipe_assigns = getattr(meta, feature, [])
if attr in pipe_assigns:
result.append((pipe_name, pipe))
result.append(pipe_name)
return result
def get_assigns_for_attr(pipeline, attr):
def get_assigns_for_attr(nlp: "Language", attr: str) -> List[str]:
"""Get all pipeline components that assign an attr, e.g. "doc.tensor".
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
nlp (Language): The current nlp object.
attr (str): The attribute to check.
RETURNS (list): (name, pipeline) tuples of components that assign the attr.
RETURNS (List[str]): Names of components that assign the attr.
"""
return _get_feature_for_attr(pipeline, attr, "assigns")
return _get_feature_for_attr(nlp, attr, "assigns")
def get_requires_for_attr(pipeline, attr):
def get_requires_for_attr(nlp: "Language", attr: str) -> List[str]:
"""Get all pipeline components that require an attr, e.g. "doc.tensor".
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
nlp (Language): The current nlp object.
attr (str): The attribute to check.
RETURNS (list): (name, pipeline) tuples of components that require the attr.
RETURNS (List[str]): Names of components that require the attr.
"""
return _get_feature_for_attr(pipeline, attr, "requires")
return _get_feature_for_attr(nlp, attr, "requires")
def print_summary(nlp, pretty=True, no_print=False):
def print_summary(
nlp: "Language", pretty: bool = True, no_print: bool = False
) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]:
"""Print a formatted summary for the current nlp object's pipeline. Shows
a table with the pipeline components and what they assign and require, as
well as any problems if available.
@ -154,12 +147,10 @@ def print_summary(nlp, pretty=True, no_print=False):
msg = Printer(pretty=pretty, no_print=no_print)
overview = []
problems = {}
for i, (name, pipe) in enumerate(nlp.pipeline):
requires = getattr(pipe, "requires", [])
assigns = getattr(pipe, "assigns", [])
retok = getattr(pipe, "retokenizes", False)
overview.append((i, name, requires, assigns, retok))
problems[name] = analyze_pipes(nlp.pipeline, name, pipe, i, warn=False)
for i, name in enumerate(nlp.pipe_names):
meta = nlp.get_pipe_meta(name)
overview.append((i, name, meta.requires, meta.assigns, meta.retokenizes))
problems[name] = analyze_pipes(nlp, name, i, warn=False)
msg.divider("Pipeline Overview")
header = ("#", "Component", "Requires", "Assigns", "Retokenizes")
msg.table(overview, header=header, divider=True, multiline=True)
@ -175,15 +166,19 @@ def print_summary(nlp, pretty=True, no_print=False):
return {"overview": overview, "problems": problems}
def count_pipeline_interdependencies(pipeline):
def count_pipeline_interdependencies(nlp: "Language") -> List[int]:
"""Count how many subsequent components require an annotation set by each
component in the pipeline.
nlp (Language): The current nlp object.
RETURNS (List[int]): The interdependency counts.
"""
pipe_assigns = []
pipe_requires = []
for name, pipe in pipeline:
pipe_assigns.append(set(getattr(pipe, "assigns", [])))
pipe_requires.append(set(getattr(pipe, "requires", [])))
for name in nlp.pipe_names:
meta = nlp.get_pipe_meta(name)
pipe_assigns.append(set(meta.assigns))
pipe_requires.append(set(meta.requires))
counts = []
for i, assigns in enumerate(pipe_assigns):
count = 0
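A toy illustration of the count this helper produces: for each component, the number of later components that require at least one attribute it assigns. The component attrs below are made up.

pipe_assigns = [{"token.tag"}, {"token.dep"}, set()]
pipe_requires = [set(), {"token.tag"}, {"token.tag", "token.dep"}]
counts = []
for i, assigns in enumerate(pipe_assigns):
    count = 0
    for requires in pipe_requires[i + 1:]:
        if assigns & requires:
            count += 1
    counts.append(count)
print(counts)  # [2, 1, 0]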


@ -1,28 +1,33 @@
from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
from .pipes import TextCategorizer, Pipe, Sentencizer
from .pipes import SentenceRecognizer
from .simple_ner import SimpleNER
from .morphologizer import Morphologizer
from .dep_parser import DependencyParser
from .entity_linker import EntityLinker
from .ner import EntityRecognizer
from .entityruler import EntityRuler
from .morphologizer import Morphologizer
from .pipe import Pipe
from .senter import SentenceRecognizer
from .sentencizer import Sentencizer
from .simple_ner import SimpleNER
from .tagger import Tagger
from .textcat import TextCategorizer
from .tok2vec import Tok2Vec
from .hooks import SentenceSegmenter, SimilarityHook
from .functions import merge_entities, merge_noun_chunks, merge_subtokens
__all__ = [
"Tagger",
"DependencyParser",
"EntityRecognizer",
"EntityLinker",
"TextCategorizer",
"Tok2Vec",
"Pipe",
"Morphologizer",
"EntityRecognizer",
"EntityRuler",
"Sentencizer",
"SentenceSegmenter",
"Morphologizer",
"Pipe",
"SentenceRecognizer",
"SentenceSegmenter",
"Sentencizer",
"SimilarityHook",
"SimpleNER",
"Tagger",
"TextCategorizer",
"Tok2Vec",
"merge_entities",
"merge_noun_chunks",
"merge_subtokens",


@ -1,93 +0,0 @@
from pathlib import Path
from ... import util
def default_nel_config():
loc = Path(__file__).parent / "entity_linker_defaults.cfg"
return util.load_config(loc, create_objects=False)
def default_nel():
loc = Path(__file__).parent / "entity_linker_defaults.cfg"
return util.load_config(loc, create_objects=True)["model"]
def default_morphologizer_config():
loc = Path(__file__).parent / "morphologizer_defaults.cfg"
return util.load_config(loc, create_objects=False)
def default_morphologizer():
loc = Path(__file__).parent / "morphologizer_defaults.cfg"
return util.load_config(loc, create_objects=True)["model"]
def default_parser_config():
loc = Path(__file__).parent / "parser_defaults.cfg"
return util.load_config(loc, create_objects=False)
def default_parser():
loc = Path(__file__).parent / "parser_defaults.cfg"
return util.load_config(loc, create_objects=True)["model"]
def default_ner_config():
loc = Path(__file__).parent / "ner_defaults.cfg"
return util.load_config(loc, create_objects=False)
def default_ner():
loc = Path(__file__).parent / "ner_defaults.cfg"
return util.load_config(loc, create_objects=True)["model"]
def default_senter_config():
loc = Path(__file__).parent / "senter_defaults.cfg"
return util.load_config(loc, create_objects=False)
def default_senter():
loc = Path(__file__).parent / "senter_defaults.cfg"
return util.load_config(loc, create_objects=True)["model"]
def default_tagger_config():
loc = Path(__file__).parent / "tagger_defaults.cfg"
return util.load_config(loc, create_objects=False)
def default_tagger():
loc = Path(__file__).parent / "tagger_defaults.cfg"
return util.load_config(loc, create_objects=True)["model"]
def default_textcat_config():
loc = Path(__file__).parent / "textcat_defaults.cfg"
return util.load_config(loc, create_objects=False)
def default_textcat():
loc = Path(__file__).parent / "textcat_defaults.cfg"
return util.load_config(loc, create_objects=True)["model"]
def default_tok2vec_config():
loc = Path(__file__).parent / "tok2vec_defaults.cfg"
return util.load_config(loc, create_objects=False)
def default_tok2vec():
loc = Path(__file__).parent / "tok2vec_defaults.cfg"
return util.load_config(loc, create_objects=True)["model"]
def default_simple_ner_config():
loc = Path(__file__).parent / "simple_ner_defaults.cfg"
return util.load_config(loc, create_objects=False)
def default_simple_ner():
loc = Path(__file__).parent / "simple_ner_defaults.cfg"
return util.load_config(loc, create_objects=True)["model"]
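With these helpers removed, a default model is built from the registered architectures directly (or taken from the component's own default config). A sketch for the tagger, with settings mirroring the deleted cfg files:

from spacy.util import registry

build_tok2vec = registry.architectures.get("spacy.HashEmbedCNN.v1")
tok2vec = build_tok2vec(
    pretrained_vectors=None, width=96, depth=4, embed_size=2000,
    window_size=1, maxout_pieces=3, subword_features=True, dropout=None,
)
build_tagger_model = registry.architectures.get("spacy.Tagger.v1")
tagger_model = build_tagger_model(tok2vec=tok2vec)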


@ -1,13 +0,0 @@
[model]
@architectures = "spacy.EntityLinker.v1"
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 96
depth = 2
embed_size = 300
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null


@ -1,14 +0,0 @@
[model]
@architectures = "spacy.Tagger.v1"
[model.tok2vec]
@architectures = "spacy.HashCharEmbedCNN.v1"
pretrained_vectors = null
width = 128
depth = 4
embed_size = 7000
window_size = 1
maxout_pieces = 3
nM = 64
nC = 8
dropout = null


@ -1,15 +0,0 @@
[model]
@architectures = "spacy.MultiTask.v1"
maxout_pieces = 3
token_vector_width = 96
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 2
subword_features = true
dropout = null


@ -1,16 +0,0 @@
[model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 6
hidden_width = 64
maxout_pieces = 2
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
dropout = null

Some files were not shown because too many files have changed in this diff.