mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Refactor pipeline components, config and language data (#5759)
* Update with WIP * Update with WIP * Update with pipeline serialization * Update types and pipe factories * Add deep merge, tidy up and add tests * Fix pipe creation from config * Don't validate default configs on load * Update spacy/language.py Co-authored-by: Ines Montani <ines@ines.io> * Adjust factory/component meta error * Clean up factory args and remove defaults * Add test for failing empty dict defaults * Update pipeline handling and methods * provide KB as registry function instead of as object * small change in test to make functionality more clear * update example script for EL configuration * Fix typo * Simplify test * Simplify test * splitting pipes.pyx into separate files * moving default configs to each component file * fix batch_size type * removing default values from component constructors where possible (TODO: test 4725) * skip instead of xfail * Add test for config -> nlp with multiple instances * pipeline.pipes -> pipeline.pipe * Tidy up, document, remove kwargs * small cleanup/generalization for Tok2VecListener * use DEFAULT_UPSTREAM field * revert to avoid circular imports * Fix tests * Replace deprecated arg * Make model dirs require config * fix pickling of keyword-only arguments in constructor * WIP: clean up and integrate full config * Add helper to handle function args more reliably Now also includes keyword-only args * Fix config composition and serialization * Improve config debugging and add visual diff * Remove unused defaults and fix type * Remove pipeline and factories from meta * Update spacy/default_config.cfg Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/default_config.cfg * small UX edits * avoid printing stack trace for debug CLI commands * Add support for language-specific factories * specify the section of the config which holds the model to debug * WIP: add Language.from_config * Update with language data refactor WIP * Auto-format * Add backwards-compat handling for 
Language.factories * Update morphologizer.pyx * Fix morphologizer * Update and simplify lemmatizers * Fix Japanese tests * Port over tagger changes * Fix Chinese and tests * Update to latest Thinc * WIP: xfail first Russian lemmatizer test * Fix component-specific overrides * fix nO for output layers in debug_model * Fix default value * Fix tests and don't pass objects in config * Fix deep merging * Fix lemma lookup data registry Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed) * Add types * Add Vocab.from_config * Fix typo * Fix tests * Make config copying more elegant * Fix pipe analysis * Fix lemmatizers and is_base_form * WIP: move language defaults to config * Fix morphology type * Fix vocab * Remove comment * Update to latest Thinc * Add morph rules to config * Tidy up * Remove set_morphology option from tagger factory * Hack use_gpu * Move [pipeline] to top-level block and make [nlp.pipeline] list Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them * Fix use_gpu and resume in CLI * Auto-format * Remove resume from config * Fix formatting and error * [pipeline] -> [components] * Fix types * Fix tagger test: requires set_morphology? Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
parent
311d0bde29
commit
43b960c01b
|
@ -17,7 +17,6 @@ import plac
|
|||
import random
|
||||
from pathlib import Path
|
||||
import spacy
|
||||
from spacy.kb import KnowledgeBase
|
||||
|
||||
from spacy.gold import Example
|
||||
from spacy.pipeline import EntityRuler
|
||||
|
@ -82,12 +81,16 @@ def main(kb_path, vocab_path, output_dir=None, n_iter=50):
|
|||
|
||||
# Create the Entity Linker component and add it to the pipeline.
|
||||
if "entity_linker" not in nlp.pipe_names:
|
||||
kb = KnowledgeBase(vocab=nlp.vocab)
|
||||
kb.load_bulk(kb_path)
|
||||
print("Loaded Knowledge Base from '%s'" % kb_path)
|
||||
|
||||
# use only the predicted EL score and not the prior probability (for demo purposes)
|
||||
cfg = {"kb": kb, "incl_prior": False}
|
||||
print("Loading Knowledge Base from '%s'" % kb_path)
|
||||
cfg = {
|
||||
"kb": {
|
||||
"@assets": "spacy.KBFromFile.v1",
|
||||
"vocab_path": vocab_path,
|
||||
"kb_path": kb_path,
|
||||
},
|
||||
# use only the predicted EL score and not the prior probability (for demo purposes)
|
||||
"incl_prior": False,
|
||||
}
|
||||
entity_linker = nlp.create_pipe("entity_linker", cfg)
|
||||
nlp.add_pipe(entity_linker, last=True)
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ requires = [
|
|||
"cymem>=2.0.2,<2.1.0",
|
||||
"preshed>=3.0.2,<3.1.0",
|
||||
"murmurhash>=0.28.0,<1.1.0",
|
||||
"thinc>=8.0.0a18,<8.0.0a20",
|
||||
"thinc>=8.0.0a19,<8.0.0a30",
|
||||
"blis>=0.4.0,<0.5.0",
|
||||
"pytokenizations"
|
||||
]
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
# Our libraries
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.0.0a18,<8.0.0a20
|
||||
thinc>=8.0.0a19,<8.0.0a30
|
||||
blis>=0.4.0,<0.5.0
|
||||
ml_datasets>=0.1.1
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
wasabi>=0.7.0,<1.1.0
|
||||
wasabi>=0.7.1,<1.1.0
|
||||
srsly>=2.1.0,<3.0.0
|
||||
catalogue>=0.0.7,<1.1.0
|
||||
typer>=0.3.0,<0.4.0
|
||||
|
|
|
@ -34,15 +34,15 @@ setup_requires =
|
|||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
thinc>=8.0.0a18,<8.0.0a20
|
||||
thinc>=8.0.0a19,<8.0.0a30
|
||||
install_requires =
|
||||
# Our libraries
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.0.0a18,<8.0.0a20
|
||||
thinc>=8.0.0a19,<8.0.0a30
|
||||
blis>=0.4.0,<0.5.0
|
||||
wasabi>=0.7.0,<1.1.0
|
||||
wasabi>=0.7.1,<1.1.0
|
||||
srsly>=2.1.0,<3.0.0
|
||||
catalogue>=0.0.7,<1.1.0
|
||||
typer>=0.3.0,<0.4.0
|
||||
|
|
8
setup.py
8
setup.py
|
@ -32,8 +32,14 @@ MOD_NAMES = [
|
|||
"spacy.attrs",
|
||||
"spacy.kb",
|
||||
"spacy.morphology",
|
||||
"spacy.pipeline.pipes",
|
||||
"spacy.pipeline.dep_parser",
|
||||
"spacy.pipeline.morphologizer",
|
||||
"spacy.pipeline.multitask",
|
||||
"spacy.pipeline.ner",
|
||||
"spacy.pipeline.pipe",
|
||||
"spacy.pipeline.sentencizer",
|
||||
"spacy.pipeline.senter",
|
||||
"spacy.pipeline.tagger",
|
||||
"spacy.syntax.stateclass",
|
||||
"spacy.syntax._state",
|
||||
"spacy.tokenizer",
|
||||
|
|
|
@ -14,7 +14,6 @@ from .about import __version__
|
|||
from .errors import Errors, Warnings
|
||||
from . import util
|
||||
from .util import registry
|
||||
from .language import component
|
||||
|
||||
|
||||
if sys.maxunicode == 65535:
|
||||
|
|
|
@ -63,7 +63,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
|
|||
result = {}
|
||||
while args:
|
||||
opt = args.pop(0)
|
||||
err = f"Invalid config override '{opt}'"
|
||||
err = f"Invalid CLI argument '{opt}'"
|
||||
if opt.startswith("--"): # new argument
|
||||
opt = opt.replace("--", "").replace("-", "_")
|
||||
if "." not in opt:
|
||||
|
@ -73,7 +73,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
|
|||
else:
|
||||
value = args.pop(0)
|
||||
# Just like we do in the config, we're calling json.loads on the
|
||||
# values. But since they come from the CLI, it'd b unintuitive to
|
||||
# values. But since they come from the CLI, it'd be unintuitive to
|
||||
# explicitly mark strings with escaped quotes. So we're working
|
||||
# around that here by falling back to a string if parsing fails.
|
||||
# TODO: improve logic to handle simple types like list of strings?
|
||||
|
@ -82,7 +82,7 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
|
|||
except ValueError:
|
||||
result[opt] = str(value)
|
||||
else:
|
||||
msg.fail(f"{err}: options need to start with --", exits=1)
|
||||
msg.fail(f"{err}: override option should start with --", exits=1)
|
||||
return result
|
||||
|
||||
|
||||
|
|
|
@ -3,12 +3,12 @@ from pathlib import Path
|
|||
from collections import Counter
|
||||
import sys
|
||||
import srsly
|
||||
from wasabi import Printer, MESSAGES, msg
|
||||
from wasabi import Printer, MESSAGES, msg, diff_strings
|
||||
import typer
|
||||
from thinc.api import Config
|
||||
|
||||
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
|
||||
from ._util import import_code, debug_cli
|
||||
from ..schemas import ConfigSchema
|
||||
from ..gold import Corpus, Example
|
||||
from ..syntax import nonproj
|
||||
from ..language import Language
|
||||
|
@ -33,6 +33,9 @@ def debug_config_cli(
|
|||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
output_path: Optional[Path] = Opt(None, "--output", "-o", help="Output path for filled config or '-' for standard output", allow_dash=True),
|
||||
auto_fill: bool = Opt(False, "--auto-fill", "-F", help="Whether or not to auto-fill the config with built-in defaults if possible"),
|
||||
diff: bool = Opt(False, "--diff", "-D", help="Show a visual diff if config was auto-filled")
|
||||
# fmt: on
|
||||
):
|
||||
"""Debug a config.cfg file and show validation errors. The command will
|
||||
|
@ -40,14 +43,37 @@ def debug_config_cli(
|
|||
validation errors are blocking and will prevent the rest of the config from
|
||||
being resolved. This means that you may not see all validation errors at
|
||||
once and some issues are only shown once previous errors have been fixed.
|
||||
As with the 'train' command, you can override settings from the config
|
||||
as command line options. For instance, --training.batch_size 128 overrides
|
||||
the value of "batch_size" in the block "[training]".
|
||||
"""
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
import_code(code_path)
|
||||
with show_validation_error():
|
||||
util.load_config(
|
||||
config_path, create_objects=False, schema=ConfigSchema, overrides=overrides,
|
||||
)
|
||||
msg.good("Config is valid")
|
||||
config = Config().from_disk(config_path)
|
||||
try:
|
||||
nlp, _ = util.load_model_from_config(
|
||||
config, overrides=overrides, auto_fill=auto_fill
|
||||
)
|
||||
except ValueError as e:
|
||||
msg.fail(str(e), exits=1)
|
||||
is_stdout = output_path is not None and str(output_path) == "-"
|
||||
if auto_fill:
|
||||
orig_config = config.to_str()
|
||||
filled_config = nlp.config.to_str()
|
||||
if orig_config == filled_config:
|
||||
msg.good("Original config is valid, no values were auto-filled")
|
||||
else:
|
||||
msg.good("Auto-filled config is valid")
|
||||
if diff:
|
||||
print(diff_strings(config.to_str(), nlp.config.to_str()))
|
||||
else:
|
||||
msg.good("Original config is valid", show=not is_stdout)
|
||||
if is_stdout:
|
||||
print(nlp.config.to_str())
|
||||
elif output_path is not None:
|
||||
nlp.config.to_disk(output_path)
|
||||
msg.good(f"Saved updated config to {output_path}")
|
||||
|
||||
|
||||
@debug_cli.command(
|
||||
|
@ -117,16 +143,13 @@ def debug_data(
|
|||
if not config_path.exists():
|
||||
msg.fail("Config file not found", config_path, exists=1)
|
||||
with show_validation_error():
|
||||
config = util.load_config(
|
||||
config_path,
|
||||
create_objects=False,
|
||||
schema=ConfigSchema,
|
||||
overrides=config_overrides,
|
||||
)
|
||||
nlp = util.load_model_from_config(config["nlp"])
|
||||
cfg = Config().from_disk(config_path)
|
||||
nlp, config = util.load_model_from_config(cfg, overrides=config_overrides)
|
||||
# TODO: handle base model
|
||||
lang = config["nlp"]["lang"]
|
||||
base_model = config["nlp"]["base_model"]
|
||||
pipeline = list(config["nlp"]["pipeline"].keys())
|
||||
base_model = config["training"]["base_model"]
|
||||
pipeline = nlp.pipe_names
|
||||
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
|
||||
tag_map_path = util.ensure_path(config["training"]["tag_map"])
|
||||
tag_map = {}
|
||||
if tag_map_path is not None:
|
||||
|
@ -164,19 +187,17 @@ def debug_data(
|
|||
msg.good("Corpus is loadable")
|
||||
|
||||
# Create all gold data here to avoid iterating over the train_dataset constantly
|
||||
gold_train_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=True)
|
||||
gold_train_data = _compile_gold(train_dataset, factory_names, nlp, make_proj=True)
|
||||
gold_train_unpreprocessed_data = _compile_gold(
|
||||
train_dataset, pipeline, nlp, make_proj=False
|
||||
train_dataset, factory_names, nlp, make_proj=False
|
||||
)
|
||||
gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp, make_proj=True)
|
||||
gold_dev_data = _compile_gold(dev_dataset, factory_names, nlp, make_proj=True)
|
||||
|
||||
train_texts = gold_train_data["texts"]
|
||||
dev_texts = gold_dev_data["texts"]
|
||||
|
||||
msg.divider("Training stats")
|
||||
msg.text(f"Training pipeline: {', '.join(pipeline)}")
|
||||
for pipe in [p for p in pipeline if p not in nlp.factories]:
|
||||
msg.fail(f"Pipeline component '{pipe}' not available in factories")
|
||||
if base_model:
|
||||
msg.text(f"Starting with base model '{base_model}'")
|
||||
else:
|
||||
|
@ -244,7 +265,7 @@ def debug_data(
|
|||
else:
|
||||
msg.info("No word vectors present in the model")
|
||||
|
||||
if "ner" in pipeline:
|
||||
if "ner" in factory_names:
|
||||
# Get all unique NER labels present in the data
|
||||
labels = set(
|
||||
label for label in gold_train_data["ner"] if label not in ("O", "-", None)
|
||||
|
@ -332,7 +353,7 @@ def debug_data(
|
|||
"with punctuation can not be trained with a noise level > 0."
|
||||
)
|
||||
|
||||
if "textcat" in pipeline:
|
||||
if "textcat" in factory_names:
|
||||
msg.divider("Text Classification")
|
||||
labels = [label for label in gold_train_data["cats"]]
|
||||
model_labels = _get_labels_from_model(nlp, "textcat")
|
||||
|
@ -379,7 +400,7 @@ def debug_data(
|
|||
"contains only instances with mutually-exclusive classes."
|
||||
)
|
||||
|
||||
if "tagger" in pipeline:
|
||||
if "tagger" in factory_names:
|
||||
msg.divider("Part-of-speech Tagging")
|
||||
labels = [label for label in gold_train_data["tags"]]
|
||||
tag_map = nlp.vocab.morphology.tag_map
|
||||
|
@ -394,7 +415,7 @@ def debug_data(
|
|||
for label in non_tagmap:
|
||||
msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")
|
||||
|
||||
if "parser" in pipeline:
|
||||
if "parser" in factory_names:
|
||||
has_low_data_warning = False
|
||||
msg.divider("Dependency Parsing")
|
||||
|
||||
|
@ -541,7 +562,10 @@ def _load_file(file_path: Path, msg: Printer) -> None:
|
|||
|
||||
|
||||
def _compile_gold(
|
||||
examples: Sequence[Example], pipeline: List[str], nlp: Language, make_proj: bool
|
||||
examples: Sequence[Example],
|
||||
factory_names: List[str],
|
||||
nlp: Language,
|
||||
make_proj: bool,
|
||||
) -> Dict[str, Any]:
|
||||
data = {
|
||||
"ner": Counter(),
|
||||
|
@ -573,7 +597,7 @@ def _compile_gold(
|
|||
for word in valid_words:
|
||||
if nlp.vocab.strings[word] not in nlp.vocab.vectors:
|
||||
data["words_missing_vectors"].update([word])
|
||||
if "ner" in pipeline:
|
||||
if "ner" in factory_names:
|
||||
for i, label in enumerate(eg.get_aligned_ner()):
|
||||
if label is None:
|
||||
continue
|
||||
|
@ -595,14 +619,14 @@ def _compile_gold(
|
|||
data["ner"][combined_label] += 1
|
||||
elif label == "-":
|
||||
data["ner"]["-"] += 1
|
||||
if "textcat" in pipeline:
|
||||
if "textcat" in factory_names:
|
||||
data["cats"].update(gold.cats)
|
||||
if list(gold.cats.values()).count(1.0) != 1:
|
||||
data["n_cats_multilabel"] += 1
|
||||
if "tagger" in pipeline:
|
||||
if "tagger" in factory_names:
|
||||
tags = eg.get_aligned("TAG", as_string=True)
|
||||
data["tags"].update([x for x in tags if x is not None])
|
||||
if "parser" in pipeline:
|
||||
if "parser" in factory_names:
|
||||
aligned_heads, aligned_deps = eg.get_aligned_parse(projectivize=make_proj)
|
||||
data["deps"].update([x for x in aligned_deps if x is not None])
|
||||
for i, (dep, head) in enumerate(zip(aligned_deps, aligned_heads)):
|
||||
|
|
|
@ -1,8 +1,11 @@
|
|||
from typing import Dict, Any, Optional
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
|
||||
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam, Config
|
||||
from thinc.api import Model
|
||||
import typer
|
||||
|
||||
from ._util import Arg, Opt, debug_cli
|
||||
from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides
|
||||
from .. import util
|
||||
from ..lang.en import English
|
||||
|
||||
|
@ -10,8 +13,10 @@ from ..lang.en import English
|
|||
@debug_cli.command("model")
|
||||
def debug_model_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||
layers: str = Opt("", "--layers", "-l", help="Comma-separated names of pipeline components to train"),
|
||||
section: str = Arg(..., help="Section that defines the model to be analysed"),
|
||||
layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"),
|
||||
dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
|
||||
parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
|
||||
gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),
|
||||
|
@ -20,14 +25,18 @@ def debug_model_cli(
|
|||
P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
|
||||
P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
|
||||
P3: bool = Opt(True, "--print-step3", "-P3", help="Print final predictions"),
|
||||
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
|
||||
seed: int = Opt(None, "--seed", "-s", help="Use GPU"),
|
||||
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU")
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Analyze a Thinc model implementation. Includes checks for internal structure
|
||||
and activations during training.
|
||||
"""
|
||||
if use_gpu >= 0:
|
||||
msg.info("Using GPU")
|
||||
require_gpu(use_gpu)
|
||||
else:
|
||||
msg.info("Using CPU")
|
||||
print_settings = {
|
||||
"dimensions": dimensions,
|
||||
"parameters": parameters,
|
||||
|
@ -39,27 +48,47 @@ def debug_model_cli(
|
|||
"print_after_training": P2,
|
||||
"print_prediction": P3,
|
||||
}
|
||||
|
||||
config_overrides = parse_config_overrides(ctx.args)
|
||||
cfg = Config().from_disk(config_path)
|
||||
with show_validation_error():
|
||||
try:
|
||||
_, config = util.load_model_from_config(cfg, overrides=config_overrides)
|
||||
except ValueError as e:
|
||||
msg.fail(str(e), exits=1)
|
||||
seed = config["pretraining"]["seed"]
|
||||
if seed is not None:
|
||||
msg.info(f"Fixing random seed: {seed}")
|
||||
fix_random_seed(seed)
|
||||
if use_gpu >= 0:
|
||||
msg.info(f"Using GPU: {use_gpu}")
|
||||
require_gpu(use_gpu)
|
||||
|
||||
component = config
|
||||
parts = section.split(".")
|
||||
for item in parts:
|
||||
try:
|
||||
component = component[item]
|
||||
except KeyError:
|
||||
msg.fail(
|
||||
f"The section '{section}' is not a valid section in the provided config.",
|
||||
exits=1,
|
||||
)
|
||||
if hasattr(component, "model"):
|
||||
model = component.model
|
||||
else:
|
||||
msg.info(f"Using CPU")
|
||||
|
||||
debug_model(
|
||||
config_path, print_settings=print_settings,
|
||||
)
|
||||
msg.fail(
|
||||
f"The section '{section}' does not specify an object that holds a Model.",
|
||||
exits=1,
|
||||
)
|
||||
debug_model(model, print_settings=print_settings)
|
||||
|
||||
|
||||
def debug_model(config_path: Path, *, print_settings=None):
|
||||
def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None):
|
||||
if not isinstance(model, Model):
|
||||
msg.fail(
|
||||
f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
|
||||
exits=1,
|
||||
)
|
||||
if print_settings is None:
|
||||
print_settings = {}
|
||||
|
||||
model = util.load_config(config_path, create_objects=True)["model"]
|
||||
|
||||
# STEP 0: Printing before training
|
||||
msg.info(f"Analysing model with ID {model.id}")
|
||||
if print_settings.get("print_before_training"):
|
||||
|
@ -67,7 +96,9 @@ def debug_model(config_path: Path, *, print_settings=None):
|
|||
_print_model(model, print_settings)
|
||||
|
||||
# STEP 1: Initializing the model and printing again
|
||||
model.initialize(X=_get_docs(), Y=_get_output(model.ops.xp))
|
||||
Y = _get_output(model.ops.xp)
|
||||
_set_output_dim(nO=Y.shape[-1], model=model)
|
||||
model.initialize(X=_get_docs(), Y=Y)
|
||||
if print_settings.get("print_after_init"):
|
||||
msg.info(f"After initialization:")
|
||||
_print_model(model, print_settings)
|
||||
|
@ -110,12 +141,16 @@ def _get_docs():
|
|||
|
||||
|
||||
def _get_output(xp):
|
||||
return xp.asarray(
|
||||
[
|
||||
xp.asarray([i + 10, i + 20, i + 30], dtype="float32")
|
||||
for i, _ in enumerate(_get_docs())
|
||||
]
|
||||
)
|
||||
return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32")
|
||||
|
||||
|
||||
def _set_output_dim(model, nO):
|
||||
# Dim inference doesn't always work reliably, so we need the same hack that is used in pipe.pyx
|
||||
if model.has_dim("nO") is None:
|
||||
model.set_dim("nO", nO)
|
||||
if model.has_ref("output_layer"):
|
||||
if model.get_ref("output_layer").has_dim("nO") is None:
|
||||
model.get_ref("output_layer").set_dim("nO", nO)
|
||||
|
||||
|
||||
def _print_model(model, print_settings):
|
||||
|
|
|
@ -105,9 +105,10 @@ def evaluate(
|
|||
print_textcats_auc_per_cat(msg, scorer.textcats_auc_per_cat)
|
||||
|
||||
if displacy_path:
|
||||
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
|
||||
docs = [ex.predicted for ex in dev_dataset]
|
||||
render_deps = "parser" in nlp.meta.get("pipeline", [])
|
||||
render_ents = "ner" in nlp.meta.get("pipeline", [])
|
||||
render_deps = "parser" in factory_names
|
||||
render_ents = "ner" in factory_names
|
||||
render_parses(
|
||||
docs,
|
||||
displacy_path,
|
||||
|
|
|
@ -84,7 +84,6 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
|
|||
msg.fail("Can't find model meta.json", meta_path, exits=1)
|
||||
meta = srsly.read_json(meta_path)
|
||||
if model_path.resolve() != model_path:
|
||||
meta["link"] = str(model_path)
|
||||
meta["source"] = str(model_path.resolve())
|
||||
else:
|
||||
meta["source"] = str(model_path)
|
||||
|
|
|
@ -125,7 +125,6 @@ def get_meta(
|
|||
meta.update(existing_meta)
|
||||
nlp = util.load_model_from_path(Path(model_path))
|
||||
meta["spacy_version"] = util.get_model_version_range(about.__version__)
|
||||
meta["pipeline"] = nlp.pipe_names
|
||||
meta["vectors"] = {
|
||||
"width": nlp.vocab.vectors_length,
|
||||
"vectors": len(nlp.vocab.vectors),
|
||||
|
|
|
@ -5,7 +5,7 @@ import time
|
|||
import re
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
|
||||
from thinc.api import use_pytorch_for_gpu_memory, require_gpu, Config
|
||||
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
|
||||
from thinc.api import CosineDistance, L2Distance
|
||||
from wasabi import msg
|
||||
|
@ -15,7 +15,6 @@ import typer
|
|||
|
||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
||||
from ._util import import_code
|
||||
from ..schemas import ConfigSchema
|
||||
from ..errors import Errors
|
||||
from ..ml.models.multi_task import build_cloze_multi_task_model
|
||||
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
|
||||
|
@ -37,6 +36,7 @@ def pretrain_cli(
|
|||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
|
||||
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
|
||||
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
@ -67,6 +67,7 @@ def pretrain_cli(
|
|||
config_overrides=overrides,
|
||||
resume_path=resume_path,
|
||||
epoch_resume=epoch_resume,
|
||||
use_gpu=use_gpu,
|
||||
)
|
||||
|
||||
|
||||
|
@ -77,40 +78,29 @@ def pretrain(
|
|||
config_overrides: Dict[str, Any] = {},
|
||||
resume_path: Optional[Path] = None,
|
||||
epoch_resume: Optional[int] = None,
|
||||
use_gpu: int = -1,
|
||||
):
|
||||
verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume)
|
||||
msg.info(f"Loading config from: {config_path}")
|
||||
with show_validation_error():
|
||||
config = util.load_config(
|
||||
config_path,
|
||||
create_objects=False,
|
||||
validate=True,
|
||||
schema=ConfigSchema,
|
||||
overrides=config_overrides,
|
||||
)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
msg.good(f"Created output directory: {output_dir}")
|
||||
|
||||
use_gpu = config["training"]["use_gpu"]
|
||||
if use_gpu >= 0:
|
||||
msg.info("Using GPU")
|
||||
require_gpu(use_gpu)
|
||||
else:
|
||||
msg.info("Using CPU")
|
||||
|
||||
msg.info(f"Loading config from: {config_path}")
|
||||
config = Config().from_disk(config_path)
|
||||
with show_validation_error():
|
||||
nlp, config = util.load_model_from_config(config, overrides=config_overrides)
|
||||
# TODO: validate that [pretraining] block exists
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
msg.good(f"Created output directory: {output_dir}")
|
||||
seed = config["pretraining"]["seed"]
|
||||
if seed is not None:
|
||||
fix_random_seed(seed)
|
||||
if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]:
|
||||
use_pytorch_for_gpu_memory()
|
||||
|
||||
nlp_config = config["nlp"]
|
||||
srsly.write_json(output_dir / "config.json", config)
|
||||
config.to_disk(output_dir / "config.cfg")
|
||||
msg.good("Saved config file in the output directory")
|
||||
|
||||
config = util.load_config(config_path, create_objects=True)
|
||||
nlp = util.load_model_from_config(nlp_config)
|
||||
pretrain_config = config["pretraining"]
|
||||
|
||||
if texts_loc != "-": # reading from a file
|
||||
|
|
|
@ -25,7 +25,7 @@ def profile_cli(
|
|||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Profile a spaCy pipeline, to find out which functions take the most time.
|
||||
Profile which functions take the most time in a spaCy pipeline.
|
||||
Input should be formatted as one JSON object per line with a key "text".
|
||||
It can either be provided as a JSONL file, or be read from sys.stdin.
|
||||
If no input file is specified, the IMDB dataset is loaded via Thinc.
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Optional, Dict, Any
|
||||
from typing import Optional, Dict, Any, Tuple, Union, Callable, List
|
||||
from timeit import default_timer as timer
|
||||
import srsly
|
||||
import tqdm
|
||||
|
@ -7,6 +7,7 @@ from wasabi import msg
|
|||
import thinc
|
||||
import thinc.schedules
|
||||
from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
|
||||
from thinc.api import Config, Optimizer
|
||||
import random
|
||||
import typer
|
||||
|
||||
|
@ -14,18 +15,15 @@ from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
|||
from ._util import import_code
|
||||
from ..gold import Corpus, Example
|
||||
from ..lookups import Lookups
|
||||
from ..language import Language
|
||||
from .. import util
|
||||
from ..errors import Errors
|
||||
from ..schemas import ConfigSchema
|
||||
|
||||
|
||||
# Don't remove - required to load the built-in architectures
|
||||
from ..ml import models # noqa: F401
|
||||
|
||||
|
||||
registry = util.registry
|
||||
|
||||
|
||||
@app.command(
|
||||
"train", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
|
||||
)
|
||||
|
@ -38,6 +36,8 @@ def train_cli(
|
|||
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
|
||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="GPU ID or -1 for CPU"),
|
||||
resume: bool = Opt(False, "--resume", "-R", help="Resume training"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
@ -53,9 +53,7 @@ def train_cli(
|
|||
referenced in the config.
|
||||
"""
|
||||
util.set_env_log(verbose)
|
||||
verify_cli_args(
|
||||
train_path=train_path, dev_path=dev_path, config_path=config_path,
|
||||
)
|
||||
verify_cli_args(train_path, dev_path, config_path)
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
import_code(code_path)
|
||||
train(
|
||||
|
@ -63,6 +61,8 @@ def train_cli(
|
|||
{"train": train_path, "dev": dev_path},
|
||||
output_path=output_path,
|
||||
config_overrides=overrides,
|
||||
use_gpu=use_gpu,
|
||||
resume_training=resume,
|
||||
)
|
||||
|
||||
|
||||
|
@ -72,63 +72,53 @@ def train(
|
|||
raw_text: Optional[Path] = None,
|
||||
output_path: Optional[Path] = None,
|
||||
config_overrides: Dict[str, Any] = {},
|
||||
use_gpu: int = -1,
|
||||
resume_training: bool = False,
|
||||
) -> None:
|
||||
msg.info(f"Loading config from: {config_path}")
|
||||
# Read the config first without creating objects, to get to the original nlp_config
|
||||
with show_validation_error():
|
||||
config = util.load_config(
|
||||
config_path,
|
||||
create_objects=False,
|
||||
schema=ConfigSchema,
|
||||
overrides=config_overrides,
|
||||
)
|
||||
use_gpu = config["training"]["use_gpu"]
|
||||
if use_gpu >= 0:
|
||||
msg.info(f"Using GPU: {use_gpu}")
|
||||
require_gpu(use_gpu)
|
||||
else:
|
||||
msg.info("Using CPU")
|
||||
msg.info(f"Loading config and nlp from: {config_path}")
|
||||
config = Config().from_disk(config_path)
|
||||
with show_validation_error():
|
||||
nlp, config = util.load_model_from_config(config, overrides=config_overrides)
|
||||
if config["training"]["base_model"]:
|
||||
base_nlp = util.load_model(config["training"]["base_model"])
|
||||
# TODO: do something to check base_nlp against regular nlp described in config?
|
||||
nlp = base_nlp
|
||||
verify_config(nlp)
|
||||
raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
|
||||
if config["training"]["seed"] is not None:
|
||||
fix_random_seed(config["training"]["seed"])
|
||||
if config["training"].get("use_pytorch_for_gpu_memory"):
|
||||
if config["training"]["use_pytorch_for_gpu_memory"]:
|
||||
# It feels kind of weird to not have a default for this.
|
||||
use_pytorch_for_gpu_memory()
|
||||
nlp_config = config["nlp"]
|
||||
config = util.load_config(
|
||||
config_path,
|
||||
create_objects=True,
|
||||
schema=ConfigSchema,
|
||||
overrides=config_overrides,
|
||||
)
|
||||
training = config["training"]
|
||||
msg.info("Creating nlp from config")
|
||||
nlp = util.load_model_from_config(nlp_config)
|
||||
optimizer = training["optimizer"]
|
||||
limit = training["limit"]
|
||||
corpus = Corpus(data_paths["train"], data_paths["dev"], limit=limit)
|
||||
if "textcat" in nlp_config["pipeline"]:
|
||||
verify_textcat_config(nlp, nlp_config)
|
||||
if training.get("resume", False):
|
||||
if resume_training:
|
||||
msg.info("Resuming training")
|
||||
nlp.resume_training()
|
||||
else:
|
||||
msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
|
||||
train_examples = list(
|
||||
corpus.train_dataset(
|
||||
nlp,
|
||||
shuffle=False,
|
||||
gold_preproc=training["gold_preproc"],
|
||||
max_length=training["max_length"],
|
||||
)
|
||||
train_examples = corpus.train_dataset(
|
||||
nlp,
|
||||
shuffle=False,
|
||||
gold_preproc=training["gold_preproc"],
|
||||
max_length=training["max_length"],
|
||||
)
|
||||
train_examples = list(train_examples)
|
||||
nlp.begin_training(lambda: train_examples)
|
||||
|
||||
# Replace tag map with provided mapping
|
||||
nlp.vocab.morphology.load_tag_map(tag_map)
|
||||
|
||||
# Load morph rules
|
||||
nlp.vocab.morphology.load_morph_exceptions(morph_rules)
|
||||
if tag_map:
|
||||
# Replace tag map with provided mapping
|
||||
nlp.vocab.morphology.load_tag_map(tag_map)
|
||||
if morph_rules:
|
||||
# Load morph rules
|
||||
nlp.vocab.morphology.load_morph_exceptions(morph_rules)
|
||||
|
||||
# Create empty extra lexeme tables so the data from spacy-lookups-data
|
||||
# isn't loaded if these features are accessed
|
||||
|
@ -151,9 +141,8 @@ def train(
|
|||
for subpath in tok2vec_path.split("."):
|
||||
tok2vec = tok2vec.get(subpath)
|
||||
if not tok2vec:
|
||||
msg.fail(
|
||||
f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1,
|
||||
)
|
||||
err = f"Could not locate the tok2vec model at {tok2vec_path}"
|
||||
msg.fail(err, exits=1)
|
||||
tok2vec.from_bytes(weights_data)
|
||||
|
||||
msg.info("Loading training corpus")
|
||||
|
@ -169,12 +158,11 @@ def train(
|
|||
evaluate,
|
||||
dropout=training["dropout"],
|
||||
accumulate_gradient=training["accumulate_gradient"],
|
||||
patience=training.get("patience", 0),
|
||||
max_steps=training.get("max_steps", 0),
|
||||
patience=training["patience"],
|
||||
max_steps=training["max_steps"],
|
||||
eval_frequency=training["eval_frequency"],
|
||||
raw_text=raw_text,
|
||||
)
|
||||
|
||||
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
|
||||
print_row = setup_printer(training, nlp)
|
||||
|
||||
|
@ -209,8 +197,10 @@ def train(
|
|||
msg.good(f"Saved model to output directory {final_model_path}")
|
||||
|
||||
|
||||
def create_train_batches(nlp, corpus, cfg):
|
||||
max_epochs = cfg.get("max_epochs", 0)
|
||||
def create_train_batches(
|
||||
nlp: Language, corpus: Corpus, cfg: Union[Config, Dict[str, Any]]
|
||||
):
|
||||
max_epochs = cfg["max_epochs"]
|
||||
train_examples = list(
|
||||
corpus.train_dataset(
|
||||
nlp,
|
||||
|
@ -219,9 +209,8 @@ def create_train_batches(nlp, corpus, cfg):
|
|||
max_length=cfg["max_length"],
|
||||
)
|
||||
)
|
||||
|
||||
epoch = 0
|
||||
batch_strategy = cfg.get("batch_by", "sequences")
|
||||
batch_strategy = cfg["batch_by"]
|
||||
while True:
|
||||
if len(train_examples) == 0:
|
||||
raise ValueError(Errors.E988)
|
||||
|
@ -241,7 +230,6 @@ def create_train_batches(nlp, corpus, cfg):
|
|||
)
|
||||
else:
|
||||
batches = util.minibatch(train_examples, size=cfg["batch_size"])
|
||||
|
||||
# make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
|
||||
try:
|
||||
first = next(batches)
|
||||
|
@ -255,18 +243,20 @@ def create_train_batches(nlp, corpus, cfg):
|
|||
random.shuffle(train_examples)
|
||||
|
||||
|
||||
def create_evaluation_callback(nlp, optimizer, corpus, cfg):
|
||||
def evaluate():
|
||||
dev_examples = list(
|
||||
corpus.dev_dataset(
|
||||
nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
|
||||
)
|
||||
def create_evaluation_callback(
|
||||
nlp: Language,
|
||||
optimizer: Optimizer,
|
||||
corpus: Corpus,
|
||||
cfg: Union[Config, Dict[str, Any]],
|
||||
) -> Callable[[], Tuple[float, Dict[str, float]]]:
|
||||
def evaluate() -> Tuple[float, Dict[str, float]]:
|
||||
dev_examples = corpus.dev_dataset(
|
||||
nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
|
||||
)
|
||||
|
||||
dev_examples = list(dev_examples)
|
||||
n_words = sum(len(ex.predicted) for ex in dev_examples)
|
||||
batch_size = cfg.get("evaluation_batch_size", 128)
|
||||
batch_size = cfg["eval_batch_size"]
|
||||
start_time = timer()
|
||||
|
||||
if optimizer.averages:
|
||||
with nlp.use_params(optimizer.averages):
|
||||
scorer = nlp.evaluate(dev_examples, batch_size=batch_size)
|
||||
|
@ -280,12 +270,9 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
|
|||
try:
|
||||
weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights)
|
||||
except KeyError as e:
|
||||
raise KeyError(
|
||||
Errors.E983.format(
|
||||
dict="score_weights", key=str(e), keys=list(scores.keys())
|
||||
)
|
||||
)
|
||||
|
||||
keys = list(scores.keys())
|
||||
err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys)
|
||||
raise KeyError(err)
|
||||
scores["speed"] = wps
|
||||
return weighted_score, scores
|
||||
|
||||
|
@ -293,17 +280,17 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
|
|||
|
||||
|
||||
def train_while_improving(
|
||||
nlp,
|
||||
optimizer,
|
||||
nlp: Language,
|
||||
optimizer: Optimizer,
|
||||
train_data,
|
||||
evaluate,
|
||||
*,
|
||||
dropout,
|
||||
eval_frequency,
|
||||
accumulate_gradient=1,
|
||||
patience=0,
|
||||
max_steps=0,
|
||||
raw_text=None,
|
||||
dropout: float,
|
||||
eval_frequency: int,
|
||||
accumulate_gradient: int,
|
||||
patience: int,
|
||||
max_steps: int,
|
||||
raw_text: List[Dict[str, str]],
|
||||
):
|
||||
"""Train until an evaluation stops improving. Works as a generator,
|
||||
with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
|
||||
|
@ -414,7 +401,9 @@ def subdivide_batch(batch, accumulate_gradient):
|
|||
yield subbatch
|
||||
|
||||
|
||||
def setup_printer(training, nlp):
|
||||
def setup_printer(
|
||||
training: Union[Dict[str, Any], Config], nlp: Language
|
||||
) -> Callable[[Dict[str, Any]], None]:
|
||||
score_cols = training["scores"]
|
||||
score_widths = [max(len(col), 6) for col in score_cols]
|
||||
loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
|
||||
|
@ -423,11 +412,10 @@ def setup_printer(training, nlp):
|
|||
table_header = [col.upper() for col in table_header]
|
||||
table_widths = [3, 6] + loss_widths + score_widths + [6]
|
||||
table_aligns = ["r" for _ in table_widths]
|
||||
|
||||
msg.row(table_header, widths=table_widths)
|
||||
msg.row(["-" * width for width in table_widths])
|
||||
|
||||
def print_row(info):
|
||||
def print_row(info: Dict[str, Any]) -> None:
|
||||
try:
|
||||
losses = [
|
||||
"{0:.2f}".format(float(info["losses"][pipe_name]))
|
||||
|
@ -463,7 +451,9 @@ def setup_printer(training, nlp):
|
|||
return print_row
|
||||
|
||||
|
||||
def update_meta(training, nlp, info):
|
||||
def update_meta(
|
||||
training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any]
|
||||
) -> None:
|
||||
score_cols = training["scores"]
|
||||
nlp.meta["performance"] = {}
|
||||
for metric in score_cols:
|
||||
|
@ -472,7 +462,9 @@ def update_meta(training, nlp, info):
|
|||
nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
|
||||
|
||||
|
||||
def load_from_paths(config):
|
||||
def load_from_paths(
|
||||
config: Config,
|
||||
) -> Tuple[List[Dict[str, str]], Dict[str, dict], bytes]:
|
||||
# TODO: separate checks from loading
|
||||
raw_text = util.ensure_path(config["training"]["raw_text"])
|
||||
if raw_text is not None:
|
||||
|
@ -506,7 +498,7 @@ def verify_cli_args(
|
|||
dev_path: Path,
|
||||
config_path: Path,
|
||||
output_path: Optional[Path] = None,
|
||||
):
|
||||
) -> None:
|
||||
# Make sure all files and paths exist if they are needed
|
||||
if not config_path or not config_path.exists():
|
||||
msg.fail("Config file not found", config_path, exits=1)
|
||||
|
@ -528,12 +520,23 @@ def verify_cli_args(
|
|||
)
|
||||
|
||||
|
||||
def verify_textcat_config(nlp, nlp_config):
|
||||
def verify_config(nlp: Language) -> None:
|
||||
"""Perform additional checks based on the config and loaded nlp object."""
|
||||
# TODO: maybe we should validate based on the actual components, the list
|
||||
# in config["nlp"]["pipeline"] instead?
|
||||
for pipe_config in nlp.config["components"].values():
|
||||
# We can't assume that the component name == the factory
|
||||
factory = pipe_config["@factories"]
|
||||
if factory == "textcat":
|
||||
verify_textcat_config(nlp, pipe_config)
|
||||
|
||||
|
||||
def verify_textcat_config(nlp: Language, pipe_config: Dict[str, Any]) -> None:
|
||||
# if 'positive_label' is provided: double check whether it's in the data and
|
||||
# the task is binary
|
||||
if nlp_config["pipeline"]["textcat"].get("positive_label", None):
|
||||
if pipe_config.get("positive_label"):
|
||||
textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
|
||||
pos_label = nlp_config["pipeline"]["textcat"]["positive_label"]
|
||||
pos_label = pipe_config.get("positive_label")
|
||||
if pos_label not in textcat_labels:
|
||||
msg.fail(
|
||||
f"The textcat's 'positive_label' config setting '{pos_label}' "
|
||||
|
|
102
spacy/default_config.cfg
Normal file
102
spacy/default_config.cfg
Normal file
|
@ -0,0 +1,102 @@
|
|||
[nlp]
|
||||
lang = null
|
||||
stop_words = []
|
||||
lex_attr_getters = {}
|
||||
pipeline = []
|
||||
|
||||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.Tokenizer.v1"
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.writing_system]
|
||||
direction = "ltr"
|
||||
has_case = true
|
||||
has_letters = true
|
||||
|
||||
[components]
|
||||
|
||||
# Training hyper-parameters and additional features.
|
||||
[training]
|
||||
# Whether to train on sequences with 'gold standard' sentence boundaries
|
||||
# and tokens. If you set this to true, take care to ensure your run-time
|
||||
# data is passed in sentence-by-sentence via some prior preprocessing.
|
||||
gold_preproc = false
|
||||
# Limitations on training document length or number of examples.
|
||||
max_length = 5000
|
||||
limit = 0
|
||||
# Data augmentation
|
||||
orth_variant_level = 0.0
|
||||
dropout = 0.1
|
||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||
patience = 1600
|
||||
max_epochs = 0
|
||||
max_steps = 20000
|
||||
eval_frequency = 200
|
||||
eval_batch_size = 128
|
||||
# Other settings
|
||||
seed = 0
|
||||
accumulate_gradient = 1
|
||||
use_pytorch_for_gpu_memory = false
|
||||
# Control how scores are printed and checkpoints are evaluated.
|
||||
scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
|
||||
score_weights = {"tags_acc": 0.2, "las": 0.4, "ents_f": 0.4}
|
||||
# These settings are invalid for the transformer models.
|
||||
init_tok2vec = null
|
||||
discard_oversize = false
|
||||
omit_extra_lookups = false
|
||||
batch_by = "sequences"
|
||||
raw_text = null
|
||||
tag_map = null
|
||||
morph_rules = null
|
||||
base_model = null
|
||||
vectors = null
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 1000
|
||||
stop = 1000
|
||||
compound = 1.001
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
L2_is_weight_decay = true
|
||||
L2 = 0.01
|
||||
grad_clip = 1.0
|
||||
use_averages = false
|
||||
eps = 1e-8
|
||||
|
||||
[training.optimizer.learn_rate]
|
||||
@schedules = "warmup_linear.v1"
|
||||
warmup_steps = 250
|
||||
total_steps = 20000
|
||||
initial_rate = 0.001
|
||||
|
||||
[pretraining]
|
||||
max_epochs = 1000
|
||||
min_length = 5
|
||||
max_length = 500
|
||||
dropout = 0.2
|
||||
n_save_every = null
|
||||
batch_size = 3000
|
||||
seed = ${training:seed}
|
||||
use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory}
|
||||
tok2vec_model = "components.tok2vec.model"
|
||||
|
||||
[pretraining.objective]
|
||||
type = "characters"
|
||||
n_characters = 4
|
||||
|
||||
[pretraining.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
L2_is_weight_decay = true
|
||||
L2 = 0.01
|
||||
grad_clip = 1.0
|
||||
use_averages = true
|
||||
eps = 1e-8
|
||||
learn_rate = 0.001
|
108
spacy/errors.py
108
spacy/errors.py
|
@ -124,20 +124,24 @@ class Warnings:
|
|||
@add_codes
|
||||
class Errors:
|
||||
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
|
||||
E002 = ("Can't find factory for '{name}'. This usually happens when spaCy "
|
||||
"calls `nlp.create_pipe` with a component name that's not built "
|
||||
"in - for example, when constructing the pipeline from a model's "
|
||||
"meta.json. If you're using a custom component, you can write to "
|
||||
"`Language.factories['{name}']` or remove it from the model meta "
|
||||
"and add it via `nlp.add_pipe` instead.")
|
||||
E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
|
||||
"This usually happens when spaCy calls nlp.{method} with a custom "
|
||||
"component name that's not registered on the current language class. "
|
||||
"If you're using a custom component, make sure you've added the "
|
||||
"decorator @Language.component (for function components) or "
|
||||
"@Language.factory (for class components).\n\nAvailable "
|
||||
"factories: {opts}")
|
||||
E003 = ("Not a valid pipeline component. Expected callable, but "
|
||||
"got {component} (name: '{name}').")
|
||||
E004 = ("If you meant to add a built-in component, use `create_pipe`: "
|
||||
"`nlp.add_pipe(nlp.create_pipe('{component}'))`")
|
||||
"got {component} (name: '{name}'). If you're using a custom "
|
||||
"component factory, double-check that it correctly returns your "
|
||||
"initialized component.")
|
||||
E004 = ("Can't set up pipeline component: a factory for '{name}' already exists.")
|
||||
E005 = ("Pipeline component '{name}' returned None. If you're using a "
|
||||
"custom component, maybe you forgot to return the processed Doc?")
|
||||
E006 = ("Invalid constraints. You can only set one of the following: "
|
||||
"before, after, first, last.")
|
||||
E006 = ("Invalid constraints for adding pipeline component. You can only "
|
||||
"set one of the following: before (component name or index), "
|
||||
"after (component name or index), first (True) or last (True). "
|
||||
"Invalid configuration: {args}. Existing components: {opts}")
|
||||
E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
|
||||
E008 = ("Some current components would be lost when restoring previous "
|
||||
"pipeline state. If you added components after calling "
|
||||
|
@ -184,7 +188,7 @@ class Errors:
|
|||
"the documentation:\nhttps://spacy.io/usage/models")
|
||||
E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
|
||||
"component to the pipeline with: "
|
||||
"nlp.add_pipe(nlp.create_pipe('sentencizer')). "
|
||||
"nlp.add_pipe('sentencizer'). "
|
||||
"Alternatively, add the dependency parser, or set sentence "
|
||||
"boundaries by setting doc[i].is_sent_start.")
|
||||
E031 = ("Invalid token: empty string ('') at position {i}.")
|
||||
|
@ -365,8 +369,6 @@ class Errors:
|
|||
E133 = ("The sum of prior probabilities for alias '{alias}' should not "
|
||||
"exceed 1, but found {sum}.")
|
||||
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
|
||||
E135 = ("If you meant to replace a built-in component, use `create_pipe`: "
|
||||
"`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`")
|
||||
E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure "
|
||||
"to provide a valid JSON object as input with either the `text` "
|
||||
"or `tokens` key. For more info, see the docs:\n"
|
||||
|
@ -484,6 +486,62 @@ class Errors:
|
|||
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
|
||||
|
||||
# TODO: fix numbering after merging develop into master
|
||||
E956 = ("Can't find component '{name}' in [components] block in the config. "
|
||||
"Available components: {opts}")
|
||||
E957 = ("Writing directly to Language.factories isn't needed anymore in "
|
||||
"spaCy v3. Instead, you can use the @Language.factory decorator "
|
||||
"to register your custom component factory or @Language.component "
|
||||
"to register a simple stateless function component that just takes "
|
||||
"a Doc and returns it.")
|
||||
E958 = ("Language code defined in config ({bad_lang_code}) does not match "
|
||||
"language code of current Language subclass {lang} ({lang_code})")
|
||||
E959 = ("Can't insert component {dir} index {idx}. Existing components: {opts}")
|
||||
E960 = ("No config data found for component '{name}'. This is likely a bug "
|
||||
"in spaCy.")
|
||||
E961 = ("Found non-serializable Python object in config. Configs should "
|
||||
"only include values that can be serialized to JSON. If you need "
|
||||
"to pass models or other objects to your component, use a reference "
|
||||
"to a registered function or initialize the object in your "
|
||||
"component.\n\n{config}")
|
||||
E962 = ("Received incorrect {style} for pipe '{name}'. Expected dict, "
|
||||
"got: {cfg_type}.")
|
||||
E963 = ("Can't read component info from @Language.{decorator} decorator. "
|
||||
"Maybe you forgot to call it? Make sure you're using "
|
||||
"@Language.{decorator}() instead of @Language.{decorator}.")
|
||||
E964 = ("The pipeline component factory for '{name}' needs to have the "
|
||||
"following named arguments, which are passed in by spaCy:\n- nlp: "
|
||||
"receives the current nlp object and lets you access the vocab\n- "
|
||||
"name: the name of the component instance, can be used to identify "
|
||||
"the component, output losses etc.")
|
||||
E965 = ("It looks like you're using the @Language.component decorator to "
|
||||
"register '{name}' on a class instead of a function component. If "
|
||||
"you need to register a class or function that *returns* a component "
|
||||
"function, use the @Language.factory decorator instead.")
|
||||
E966 = ("nlp.add_pipe now takes the string name of the registered component "
|
||||
"factory, not a callable component. Expected string, but got "
|
||||
"{component} (name: '{name}').\n\n- If you created your component "
|
||||
"with nlp.create_pipe('name'): remove nlp.create_pipe and call "
|
||||
"nlp.add_pipe('name') instead.\n\n- If you passed in a component "
|
||||
"like TextCategorizer(): call nlp.add_pipe with the string name "
|
||||
"instead, e.g. nlp.add_pipe('textcat').\n\n- If you're using a custom "
|
||||
"component: Add the decorator @Language.component (for function "
|
||||
"components) or @Language.factory (for class components / factories) "
|
||||
"to your custom component and assign it a name, e.g. "
|
||||
"@Language.component('your_name'). You can then run "
|
||||
"nlp.add_pipe('your_name') to add it to the pipeline.")
|
||||
E967 = ("No {meta} meta information found for '{name}'. This is likely a bug in spaCy.")
|
||||
E968 = ("nlp.replace_pipe now takes the string name of the registered component "
|
||||
"factory, not a callable component. Expected string, but got "
|
||||
"{component}.\n\n- If you created your component "
|
||||
"with nlp.create_pipe('name'): remove nlp.create_pipe and call "
|
||||
"nlp.replace_pipe('{name}', 'name') instead.\n\n- If you passed in a "
|
||||
"component like TextCategorizer(): call nlp.replace_pipe with the "
|
||||
"string name instead, e.g. nlp.replace_pipe('{name}', 'textcat').\n\n"
|
||||
"- If you're using a custom component: Add the decorator "
|
||||
"@Language.component (for function components) or @Language.factory "
|
||||
"(for class components / factories) to your custom component and "
|
||||
"assign it a name, e.g. @Language.component('your_name'). You can "
|
||||
"then run nlp.replace_pipe('{name}', 'your_name').")
|
||||
E969 = ("Expected string values for field '{field}', but received {types} instead.")
|
||||
E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
|
||||
E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
|
||||
|
@ -506,10 +564,12 @@ class Errors:
|
|||
"into {values}, but found {value}.")
|
||||
E983 = ("Invalid key for '{dict}': {key}. Available keys: "
|
||||
"{keys}")
|
||||
E985 = ("The pipeline component '{component}' is already available in the base "
|
||||
"model. The settings in the component block in the config file are "
|
||||
"being ignored. If you want to replace this component instead, set "
|
||||
"'replace' to True in the training configuration.")
|
||||
E984 = ("Invalid component config for '{name}': no @factories key "
|
||||
"specifying the registered function used to initialize the "
|
||||
"component. For example, @factories = \"ner\" will use the 'ner' "
|
||||
"factory and all other settings in the block will be passed "
|
||||
"to it as arguments.\n\n{config}")
|
||||
E985 = ("Can't load model from config file: no 'nlp' section found.\n\n{config}")
|
||||
E986 = ("Could not create any training batches: check your input. "
|
||||
"Perhaps discard_oversize should be set to False ?")
|
||||
E987 = ("The text of an example training instance is either a Doc or "
|
||||
|
@ -530,9 +590,9 @@ class Errors:
|
|||
E992 = ("The function `select_pipes` was called with `enable`={enable} "
|
||||
"and `disable`={disable} but that information is conflicting "
|
||||
"for the `nlp` pipeline with components {names}.")
|
||||
E993 = ("The config for 'nlp' should include either a key 'name' to "
|
||||
"refer to an existing model by name or path, or a key 'lang' "
|
||||
"to create a new blank model.")
|
||||
E993 = ("The config for 'nlp' needs to include a key 'lang' specifying "
|
||||
"the code of the language to initialize it with (for example "
|
||||
"'en' for English).\n\n{config}")
|
||||
E996 = ("Could not parse {file}: {msg}")
|
||||
E997 = ("Tokenizer special cases are not allowed to modify the text. "
|
||||
"This would map '{chunk}' to '{orth}' given token attributes "
|
||||
|
@ -540,9 +600,9 @@ class Errors:
|
|||
E999 = ("Unable to merge the `Doc` objects because they do not all share "
|
||||
"the same `Vocab`.")
|
||||
E1000 = ("No pkuseg model available. Provide a pkuseg model when "
|
||||
"initializing the pipeline: "
|
||||
'`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
|
||||
'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`')
|
||||
"initializing the pipeline:\n"
|
||||
'cfg = {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}\n'
|
||||
'nlp = Chinese(config=cfg)')
|
||||
|
||||
|
||||
@add_codes
|
||||
|
|
|
@ -1,10 +1,9 @@
|
|||
import re
|
||||
|
||||
from .conll_ner2docs import n_sents_info
|
||||
from ...gold import Example
|
||||
from ...gold import iob_to_biluo, spans_from_biluo_tags
|
||||
from ...language import Language
|
||||
from ...tokens import Doc, Token, Span
|
||||
from ...vocab import Vocab
|
||||
from wasabi import Printer
|
||||
|
||||
|
||||
|
@ -73,7 +72,7 @@ def read_conllx(
|
|||
ner_map=None,
|
||||
):
|
||||
""" Yield docs, one for each sentence """
|
||||
vocab = Language.Defaults.create_vocab() # need vocab to make a minimal Doc
|
||||
vocab = Vocab() # need vocab to make a minimal Doc
|
||||
for sent in input_data.strip().split("\n\n"):
|
||||
lines = sent.strip().split("\n")
|
||||
if lines:
|
||||
|
|
|
@ -1,17 +1,26 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import registry
|
||||
|
||||
|
||||
class AfrikaansDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "af"
|
||||
stop_words = STOP_WORDS
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "af"
|
||||
stop_words = {"@language_data": "spacy.af.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.af.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
class Afrikaans(Language):
|
||||
lang = "af"
|
||||
Defaults = AfrikaansDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Afrikaans"]
|
||||
|
|
|
@ -1,31 +1,48 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "ar"
|
||||
stop_words = {"@language_data": "spacy.ar.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.ar.lex_attr_getters"}
|
||||
|
||||
[nlp.writing_system]
|
||||
direction = "rtl"
|
||||
has_case = false
|
||||
has_letters = true
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.ar.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.ar.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class ArabicDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "ar"
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
|
||||
|
||||
|
||||
class Arabic(Language):
|
||||
lang = "ar"
|
||||
Defaults = ArabicDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Arabic"]
|
||||
|
|
|
@ -1,17 +1,26 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import registry
|
||||
|
||||
|
||||
class BulgarianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "bg"
|
||||
stop_words = STOP_WORDS
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "bg"
|
||||
stop_words = {"@language_data": "spacy.bg.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.bg.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
class Bulgarian(Language):
|
||||
lang = "bg"
|
||||
Defaults = BulgarianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Bulgarian"]
|
||||
|
|
|
@ -1,18 +1,35 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "bn"
|
||||
stop_words = {"@language_data": "spacy.bn.stop_words"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.bn.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
class BengaliDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "bn"
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
|
@ -21,6 +38,7 @@ class BengaliDefaults(Language.Defaults):
|
|||
class Bengali(Language):
|
||||
lang = "bn"
|
||||
Defaults = BengaliDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Bengali"]
|
||||
|
|
|
@ -1,31 +1,49 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
from ...util import update_exc, registry
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "ca"
|
||||
stop_words = {"@language_data": "spacy.ca.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.ca.lex_attr_getters"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.ca.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.ca.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class CatalanDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "ca"
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
|
||||
|
||||
class Catalan(Language):
|
||||
lang = "ca"
|
||||
Defaults = CatalanDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Catalan"]
|
||||
|
|
|
@ -1,17 +1,26 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import registry
|
||||
|
||||
|
||||
class CzechDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "cs"
|
||||
stop_words = STOP_WORDS
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "cs"
|
||||
stop_words = {"@language_data": "spacy.cs.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.cs.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
class Czech(Language):
|
||||
lang = "cs"
|
||||
Defaults = CzechDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Czech"]
|
||||
|
|
|
@ -1,27 +1,50 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "da"
|
||||
stop_words = {"@language_data": "spacy.da.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.da.lex_attr_getters"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.da.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.da.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class DanishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "da"
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Danish(Language):
|
||||
lang = "da"
|
||||
Defaults = DanishDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Danish"]
|
||||
|
|
|
@ -1,23 +1,40 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "de"
|
||||
stop_words = {"@language_data": "spacy.de.stop_words"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.de.stop_words")
def stop_words() -> Set[str]:
    """Return the German stop words, registered as "spacy.de.stop_words".

    RETURNS (Set[str]): The module's STOP_WORDS object.
    """
    return STOP_WORDS
|
||||
|
||||
|
||||
class GermanDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "de"
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
stop_words = STOP_WORDS
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
single_orth_variants = [
|
||||
{"tags": ["$("], "variants": ["…", "..."]},
|
||||
|
@ -38,6 +55,7 @@ class GermanDefaults(Language.Defaults):
|
|||
class German(Language):
|
||||
lang = "de"
|
||||
Defaults = GermanDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["German"]
|
||||
|
|
|
@ -1,3 +1,6 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
@ -6,32 +9,51 @@ from .syntax_iterators import SYNTAX_ITERATORS
|
|||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "el"
|
||||
stop_words = {"@language_data": "spacy.el.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.el.lex_attr_getters"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.GreekLemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.lemmatizers("spacy.GreekLemmatizer.v1")
def create_greek_lemmatizer(data_paths: dict = {}) -> GreekLemmatizer:
    """Build the Greek lemmatizer registered as "spacy.GreekLemmatizer.v1".

    data_paths (dict): Passed through unchanged to GreekLemmatizer.
    RETURNS (GreekLemmatizer): The newly created lemmatizer.
    """
    # NOTE(review): the `{}` default is a single dict shared by every call;
    # that is only safe if GreekLemmatizer never mutates it -- confirm.
    return GreekLemmatizer(data_paths=data_paths)
|
||||
|
||||
|
||||
@registry.language_data("spacy.el.stop_words")
def stop_words() -> Set[str]:
    """Return the Greek stop words, registered as "spacy.el.stop_words".

    RETURNS (Set[str]): The module's STOP_WORDS object.
    """
    return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.el.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    """Return the Greek lexical attribute getters, registered as
    "spacy.el.lex_attr_getters".

    RETURNS (Dict[int, Callable[[str], Any]]): The module's LEX_ATTRS object.
    """
    return LEX_ATTRS
|
||||
|
||||
|
||||
class GreekDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "el"
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None, lookups=None):
|
||||
if lookups is None:
|
||||
lookups = Lookups()
|
||||
return GreekLemmatizer(lookups)
|
||||
|
||||
|
||||
class Greek(Language):
|
||||
lang = "el"
|
||||
Defaults = GreekDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Greek"]
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
from typing import Dict, List
|
||||
|
||||
from ...lemmatizer import Lemmatizer
|
||||
|
||||
|
||||
|
@ -12,7 +14,13 @@ class GreekLemmatizer(Lemmatizer):
|
|||
not applicable for Greek language.
|
||||
"""
|
||||
|
||||
def lemmatize(self, string, index, exceptions, rules):
|
||||
def lemmatize(
|
||||
self,
|
||||
string: str,
|
||||
index: Dict[str, List[str]],
|
||||
exceptions: Dict[str, Dict[str, List[str]]],
|
||||
rules: Dict[str, List[List[str]]],
|
||||
) -> List[str]:
|
||||
string = string.lower()
|
||||
forms = []
|
||||
if string in index:
|
||||
|
|
|
@ -1,25 +1,50 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
from .lemmatizer import is_base_form
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...lemmatizer import Lemmatizer
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
def _return_en(_):
|
||||
return "en"
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "en"
|
||||
stop_words = {"@language_data": "spacy.en.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.en.lex_attr_getters"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.EnglishLemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.en.stop_words")
def stop_words() -> Set[str]:
    """Return the English stop words, registered as "spacy.en.stop_words".

    RETURNS (Set[str]): The module's STOP_WORDS object.
    """
    return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.en.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    """Return the English lexical attribute getters, registered as
    "spacy.en.lex_attr_getters".

    RETURNS (Dict[int, Callable[[str], Any]]): The module's LEX_ATTRS object.
    """
    return LEX_ATTRS
|
||||
|
||||
|
||||
@registry.lemmatizers("spacy.EnglishLemmatizer.v1")
def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
    """Build the lemmatizer registered as "spacy.EnglishLemmatizer.v1".

    data_paths (dict): Passed through unchanged to Lemmatizer.
    RETURNS (Lemmatizer): A Lemmatizer constructed with this package's
        `is_base_form` function as its `is_base_form` argument.
    """
    # NOTE(review): the `{}` default is a single dict shared by every call;
    # that is only safe if Lemmatizer never mutates it -- confirm.
    return Lemmatizer(data_paths=data_paths, is_base_form=is_base_form)
|
||||
|
||||
|
||||
class EnglishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = _return_en
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
single_orth_variants = [
|
||||
|
@ -31,45 +56,11 @@ class EnglishDefaults(Language.Defaults):
|
|||
{"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
|
||||
]
|
||||
|
||||
@classmethod
|
||||
def is_base_form(cls, univ_pos, morphology=None):
|
||||
"""
|
||||
Check whether we're dealing with an uninflected paradigm, so we can
|
||||
avoid lemmatization entirely.
|
||||
|
||||
univ_pos (unicode / int): The token's universal part-of-speech tag.
|
||||
morphology (dict): The token's morphological features following the
|
||||
Universal Dependencies scheme.
|
||||
"""
|
||||
if morphology is None:
|
||||
morphology = {}
|
||||
if univ_pos == "noun" and morphology.get("Number") == "sing":
|
||||
return True
|
||||
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
|
||||
return True
|
||||
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
|
||||
# morphology
|
||||
elif univ_pos == "verb" and (
|
||||
morphology.get("VerbForm") == "fin"
|
||||
and morphology.get("Tense") == "pres"
|
||||
and morphology.get("Number") is None
|
||||
):
|
||||
return True
|
||||
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
|
||||
return True
|
||||
elif morphology.get("VerbForm") == "inf":
|
||||
return True
|
||||
elif morphology.get("VerbForm") == "none":
|
||||
return True
|
||||
elif morphology.get("Degree") == "pos":
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
|
||||
class English(Language):
|
||||
lang = "en"
|
||||
Defaults = EnglishDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["English"]
|
||||
|
|
36
spacy/lang/en/lemmatizer.py
Normal file
36
spacy/lang/en/lemmatizer.py
Normal file
|
@ -0,0 +1,36 @@
|
|||
from typing import Optional
|
||||
|
||||
|
||||
def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
    """
    Check whether we're dealing with an uninflected paradigm, so we can
    avoid lemmatization entirely.

    univ_pos (str): The token's universal part-of-speech tag. It is compared
        against the lowercase strings "noun" and "verb".
    morphology (Optional[dict]): The token's morphological features following
        the Universal Dependencies scheme. None is treated as no features.
    RETURNS (bool): True if the features describe a base form.
    """
    features = {} if morphology is None else morphology
    verb_form = features.get("VerbForm")
    # A VerbForm of "inf" or "none", or a Degree of "pos", marks a base form
    # regardless of the part-of-speech. This also covers the verb+inf and
    # adj+pos combinations, so they need no branch of their own.
    if verb_form in ("inf", "none") or features.get("Degree") == "pos":
        return True
    if univ_pos == "noun":
        return features.get("Number") == "sing"
    if univ_pos == "verb":
        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
        # morphology
        return (
            verb_form == "fin"
            and features.get("Tense") == "pres"
            and features.get("Number") is None
        )
    return False
|
|
@ -1,47 +1,17 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
# fmt: off
|
||||
_num_words = [
|
||||
"zero",
|
||||
"one",
|
||||
"two",
|
||||
"three",
|
||||
"four",
|
||||
"five",
|
||||
"six",
|
||||
"seven",
|
||||
"eight",
|
||||
"nine",
|
||||
"ten",
|
||||
"eleven",
|
||||
"twelve",
|
||||
"thirteen",
|
||||
"fourteen",
|
||||
"fifteen",
|
||||
"sixteen",
|
||||
"seventeen",
|
||||
"eighteen",
|
||||
"nineteen",
|
||||
"twenty",
|
||||
"thirty",
|
||||
"forty",
|
||||
"fifty",
|
||||
"sixty",
|
||||
"seventy",
|
||||
"eighty",
|
||||
"ninety",
|
||||
"hundred",
|
||||
"thousand",
|
||||
"million",
|
||||
"billion",
|
||||
"trillion",
|
||||
"quadrillion",
|
||||
"gajillion",
|
||||
"bazillion",
|
||||
"zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
|
||||
"nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
|
||||
"sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty",
|
||||
"fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand",
|
||||
"million", "billion", "trillion", "quadrillion", "gajillion", "bazillion"
|
||||
]
|
||||
# fmt: on
|
||||
|
||||
|
||||
def like_num(text):
|
||||
def like_num(text: str) -> bool:
|
||||
if text.startswith(("+", "-", "±", "~")):
|
||||
text = text[1:]
|
||||
text = text.replace(",", "").replace(".", "")
|
||||
|
|
|
@ -1,33 +1,52 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.config import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "es"
|
||||
stop_words = {"@language_data": "spacy.es.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.es.lex_attr_getters"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.es.stop_words")
def stop_words() -> Set[str]:
    """Return the Spanish stop words, registered as "spacy.es.stop_words".

    RETURNS (Set[str]): The module's STOP_WORDS object.
    """
    return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.es.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    """Return the Spanish lexical attribute getters, registered as
    "spacy.es.lex_attr_getters".

    RETURNS (Dict[int, Callable[[str], Any]]): The module's LEX_ATTRS object.
    """
    return LEX_ATTRS
|
||||
|
||||
|
||||
class SpanishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "es"
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
stop_words = STOP_WORDS
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class Spanish(Language):
|
||||
lang = "es"
|
||||
Defaults = SpanishDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Spanish"]
|
||||
|
|
|
@ -1,17 +1,26 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import registry
|
||||
|
||||
|
||||
class EstonianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "et"
|
||||
stop_words = STOP_WORDS
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "et"
|
||||
stop_words = {"@language_data": "spacy.et.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.et.stop_words")
def stop_words() -> Set[str]:
    """Return the Estonian stop words, registered as "spacy.et.stop_words".

    RETURNS (Set[str]): The module's STOP_WORDS object.
    """
    return STOP_WORDS
|
||||
|
||||
|
||||
class Estonian(Language):
|
||||
lang = "et"
|
||||
Defaults = EstonianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Estonian"]
|
||||
|
|
|
@ -1,25 +1,41 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "eu"
|
||||
stop_words = {"@language_data": "spacy.eu.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.eu.lex_attr_getters"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.eu.stop_words")
def stop_words() -> Set[str]:
    """Return the Basque stop words, registered as "spacy.eu.stop_words".

    RETURNS (Set[str]): The module's STOP_WORDS object.
    """
    return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.eu.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    """Return the Basque lexical attribute getters, registered as
    "spacy.eu.lex_attr_getters".

    RETURNS (Dict[int, Callable[[str], Any]]): The module's LEX_ATTRS object.
    """
    return LEX_ATTRS
|
||||
|
||||
|
||||
class BasqueDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "eu"
|
||||
|
||||
tokenizer_exceptions = BASE_EXCEPTIONS
|
||||
stop_words = STOP_WORDS
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
|
||||
|
||||
class Basque(Language):
|
||||
lang = "eu"
|
||||
Defaults = BasqueDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Basque"]
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...util import update_exc, registry
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
|
@ -9,23 +10,46 @@ from .punctuation import TOKENIZER_SUFFIXES
|
|||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "fa"
|
||||
stop_words = {"@language_data": "spacy.fa.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.fa.lex_attr_getters"}
|
||||
|
||||
[nlp.writing_system]
|
||||
direction = "rtl"
|
||||
has_case = false
|
||||
has_letters = true
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.fa.stop_words")
def stop_words() -> Set[str]:
    """Return the Persian stop words, registered as "spacy.fa.stop_words".

    RETURNS (Set[str]): The module's STOP_WORDS object.
    """
    return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.fa.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    """Return the Persian lexical attribute getters, registered as
    "spacy.fa.lex_attr_getters".

    RETURNS (Dict[int, Callable[[str], Any]]): The module's LEX_ATTRS object.
    """
    return LEX_ATTRS
|
||||
|
||||
|
||||
class PersianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
lex_attr_getters[LANG] = lambda text: "fa"
|
||||
tokenizer_exceptions = update_exc(TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class Persian(Language):
|
||||
lang = "fa"
|
||||
Defaults = PersianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Persian"]
|
||||
|
|
|
@ -1,31 +1,43 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "fi"
|
||||
stop_words = {"@language_data": "spacy.fi.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.fi.lex_attr_getters"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.fi.stop_words")
def stop_words() -> Set[str]:
    """Return the Finnish stop words, registered as "spacy.fi.stop_words".

    RETURNS (Set[str]): The module's STOP_WORDS object.
    """
    return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.fi.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    """Return the Finnish lexical attribute getters, registered as
    "spacy.fi.lex_attr_getters".

    RETURNS (Dict[int, Callable[[str], Any]]): The module's LEX_ATTRS object.
    """
    return LEX_ATTRS
|
||||
|
||||
|
||||
class FinnishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "fi"
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Finnish(Language):
|
||||
lang = "fi"
|
||||
Defaults = FinnishDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Finnish"]
|
||||
|
|
|
@ -1,44 +1,61 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .lemmatizer import FrenchLemmatizer
|
||||
from .lemmatizer import FrenchLemmatizer, is_base_form
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "fr"
|
||||
stop_words = {"@language_data": "spacy.fr.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.fr.lex_attr_getters"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.FrenchLemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.lemmatizers("spacy.FrenchLemmatizer.v1")
def create_french_lemmatizer(data_paths: dict = {}) -> FrenchLemmatizer:
    """Build the French lemmatizer registered as "spacy.FrenchLemmatizer.v1".

    data_paths (dict): Passed through unchanged to FrenchLemmatizer.
    RETURNS (FrenchLemmatizer): A lemmatizer constructed with the
        `is_base_form` function imported from `.lemmatizer`.
    """
    # NOTE(review): the `{}` default is a single dict shared by every call;
    # that is only safe if FrenchLemmatizer never mutates it -- confirm.
    return FrenchLemmatizer(data_paths=data_paths, is_base_form=is_base_form)
|
||||
|
||||
|
||||
@registry.language_data("spacy.fr.stop_words")
def stop_words() -> Set[str]:
    """Return the French stop words, registered as "spacy.fr.stop_words".

    RETURNS (Set[str]): The module's STOP_WORDS object.
    """
    return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.fr.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    """Return the French lexical attribute getters, registered as
    "spacy.fr.lex_attr_getters".

    RETURNS (Dict[int, Callable[[str], Any]]): The module's LEX_ATTRS object.
    """
    return LEX_ATTRS
|
||||
|
||||
|
||||
class FrenchDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "fr"
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
token_match = TOKEN_MATCH
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None, lookups=None):
|
||||
if lookups is None:
|
||||
lookups = Lookups()
|
||||
return FrenchLemmatizer(lookups)
|
||||
|
||||
|
||||
class French(Language):
|
||||
lang = "fr"
|
||||
Defaults = FrenchDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["French"]
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
from typing import Optional, List, Dict
|
||||
|
||||
from ...lemmatizer import Lemmatizer
|
||||
from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
|
||||
from ...symbols import SCONJ, CCONJ
|
||||
|
@ -13,7 +15,9 @@ class FrenchLemmatizer(Lemmatizer):
|
|||
the lookup table.
|
||||
"""
|
||||
|
||||
def __call__(self, string, univ_pos, morphology=None):
|
||||
def __call__(
|
||||
self, string: str, univ_pos: str, morphology: Optional[dict] = None
|
||||
) -> List[str]:
|
||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||
if "lemma_rules" not in self.lookups:
|
||||
return [lookup_table.get(string, string)]
|
||||
|
@ -52,62 +56,19 @@ class FrenchLemmatizer(Lemmatizer):
|
|||
)
|
||||
return lemmas
|
||||
|
||||
def is_base_form(self, univ_pos, morphology=None):
|
||||
"""
|
||||
Check whether we're dealing with an uninflected paradigm, so we can
|
||||
avoid lemmatization entirely.
|
||||
"""
|
||||
morphology = {} if morphology is None else morphology
|
||||
others = [
|
||||
key
|
||||
for key in morphology
|
||||
if key not in (POS, "Number", "POS", "VerbForm", "Tense")
|
||||
]
|
||||
if univ_pos == "noun" and morphology.get("Number") == "sing":
|
||||
return True
|
||||
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
|
||||
return True
|
||||
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
|
||||
# morphology
|
||||
elif univ_pos == "verb" and (
|
||||
morphology.get("VerbForm") == "fin"
|
||||
and morphology.get("Tense") == "pres"
|
||||
and morphology.get("Number") is None
|
||||
and not others
|
||||
):
|
||||
return True
|
||||
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
|
||||
return True
|
||||
elif "VerbForm=inf" in morphology:
|
||||
return True
|
||||
elif "VerbForm=none" in morphology:
|
||||
return True
|
||||
elif "Number=sing" in morphology:
|
||||
return True
|
||||
elif "Degree=pos" in morphology:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def noun(self, string, morphology=None):
|
||||
return self(string, "noun", morphology)
|
||||
|
||||
def verb(self, string, morphology=None):
|
||||
return self(string, "verb", morphology)
|
||||
|
||||
def adj(self, string, morphology=None):
|
||||
return self(string, "adj", morphology)
|
||||
|
||||
def punct(self, string, morphology=None):
|
||||
return self(string, "punct", morphology)
|
||||
|
||||
def lookup(self, string, orth=None):
|
||||
def lookup(self, string: str, orth: Optional[int] = None) -> str:
    """Look up a lemma in the "lemma_lookup" table by `orth`.

    string (str): Returned unchanged when no entry is found.
    orth (Optional[int]): Key to search for in the "lemma_lookup" table.
    RETURNS (str): Element 0 of the table entry stored under `orth`, or
        `string` if `orth` is None or not in the table.
    """
    table = self.lookups.get_table("lemma_lookup", {})
    if orth is None or orth not in table:
        return string
    # NOTE(review): `[0]` takes the first element of the stored entry --
    # presumably entries are sequences of lemmas; confirm.
    return table[orth][0]
|
||||
|
||||
def lemmatize(self, string, index, exceptions, rules):
|
||||
def lemmatize(
|
||||
self,
|
||||
string: str,
|
||||
index: Dict[str, List[str]],
|
||||
exceptions: Dict[str, Dict[str, List[str]]],
|
||||
rules: Dict[str, List[List[str]]],
|
||||
) -> List[str]:
|
||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||
string = string.lower()
|
||||
forms = []
|
||||
|
@ -133,3 +94,41 @@ class FrenchLemmatizer(Lemmatizer):
|
|||
if not forms:
|
||||
forms.append(string)
|
||||
return list(set(forms))
|
||||
|
||||
|
||||
def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool:
    """
    Check whether we're dealing with an uninflected paradigm, so we can
    avoid lemmatization entirely.

    univ_pos (str): The part-of-speech tag, compared against the lowercase
        strings "noun", "verb" and "adj".
    morphology (Optional[dict]): The morphological features. None is treated
        as no features.
    RETURNS (bool): True if the features describe a base form.
    """
    feats = morphology if morphology is not None else {}
    # Any feature besides these keys blocks the finite-present-verb case.
    others = [
        key
        for key in feats
        if key not in (POS, "Number", "POS", "VerbForm", "Tense")
    ]
    number = feats.get("Number")
    verb_form = feats.get("VerbForm")
    if univ_pos == "noun" and number == "sing":
        return True
    if univ_pos == "verb":
        if verb_form == "inf":
            return True
        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
        # morphology
        if (
            verb_form == "fin"
            and feats.get("Tense") == "pres"
            and number is None
            and not others
        ):
            return True
    if univ_pos == "adj" and feats.get("Degree") == "pos":
        return True
    # These test for the literal "Feature=value" string being a member (key)
    # of `morphology`, unlike the `.get(...)` value comparisons above.
    combined = ("VerbForm=inf", "VerbForm=none", "Number=sing", "Degree=pos")
    return any(flag in feats for flag in combined)
|
||||
|
|
|
@ -1,23 +1,33 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "ga"
|
||||
stop_words = {"@language_data": "spacy.ga.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.ga.stop_words")
def stop_words() -> Set[str]:
    """Return the Irish stop words, registered as "spacy.ga.stop_words".

    RETURNS (Set[str]): The module's STOP_WORDS object.
    """
    return STOP_WORDS
|
||||
|
||||
|
||||
class IrishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "ga"
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
|
||||
class Irish(Language):
|
||||
lang = "ga"
|
||||
Defaults = IrishDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Irish"]
|
||||
|
|
|
@ -1,15 +1,26 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
class GujaratiDefaults(Language.Defaults):
|
||||
stop_words = STOP_WORDS
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "gu"
|
||||
stop_words = {"@language_data": "spacy.gu.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.gu.stop_words")
def stop_words() -> Set[str]:
    """Return the Gujarati stop words, registered as "spacy.gu.stop_words".

    RETURNS (Set[str]): The module's STOP_WORDS object.
    """
    return STOP_WORDS
|
||||
|
||||
|
||||
class Gujarati(Language):
|
||||
lang = "gu"
|
||||
Defaults = GujaratiDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Gujarati"]
|
||||
|
|
|
@ -1,22 +1,37 @@
|
|||
from .stop_words import STOP_WORDS
|
||||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "he"
|
||||
stop_words = {"@language_data": "spacy.he.stop_words"}
|
||||
|
||||
[nlp.writing_system]
|
||||
direction = "rtl"
|
||||
has_case = false
|
||||
has_letters = true
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.he.stop_words")
def stop_words() -> Set[str]:
    """Return the Hebrew stop words, registered as "spacy.he.stop_words".

    RETURNS (Set[str]): The module's STOP_WORDS object.
    """
    return STOP_WORDS
|
||||
|
||||
|
||||
class HebrewDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "he"
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
|
||||
|
||||
|
||||
class Hebrew(Language):
|
||||
lang = "he"
|
||||
Defaults = HebrewDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Hebrew"]
|
||||
|
|
|
@ -1,20 +1,33 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import registry
|
||||
|
||||
|
||||
class HindiDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "hi"
|
||||
stop_words = STOP_WORDS
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "hi"
|
||||
stop_words = {"@language_data": "spacy.hi.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.hi.lex_attr_getters"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.hi.stop_words")
def stop_words() -> Set[str]:
    """Return the Hindi stop words, exposed as ``spacy.hi.stop_words``."""
    return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.hi.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    """Return the Hindi lexical attribute getters (``spacy.hi.lex_attr_getters``)."""
    return LEX_ATTRS
|
||||
|
||||
|
||||
class Hindi(Language):
    """Hindi language class (language code ``hi``)."""

    lang = "hi"
    Defaults = HindiDefaults
    # Built once, at class creation, from the module-level config string.
    default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Hindi"]
|
||||
|
|
|
@ -1,25 +1,39 @@
|
|||
from .stop_words import STOP_WORDS
|
||||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "hr"
|
||||
stop_words = {"@language_data": "spacy.hr.stop_words"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.hr.stop_words")
def stop_words() -> Set[str]:
    """Return the Croatian stop words, exposed as ``spacy.hr.stop_words``."""
    return STOP_WORDS
|
||||
|
||||
|
||||
class CroatianDefaults(Language.Defaults):
    """Language defaults for Croatian."""

    # Work on a copy so the getters of the base class are not mutated.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "hr"
    # Wrap the base NORM getter with the shared norm exception table.
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
    stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Croatian(Language):
    """Croatian language class (language code ``hr``)."""

    lang = "hr"
    Defaults = CroatianDefaults
    # Built once, at class creation, from the module-level config string.
    default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Croatian"]
|
||||
|
|
|
@ -1,22 +1,35 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "hu"
|
||||
stop_words = {"@language_data": "spacy.hu.stop_words"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.hu.stop_words")
def stop_words() -> Set[str]:
    """Return the Hungarian stop words, exposed as ``spacy.hu.stop_words``."""
    return STOP_WORDS
|
||||
|
||||
|
||||
class HungarianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "hu"
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
|
@ -26,6 +39,7 @@ class HungarianDefaults(Language.Defaults):
|
|||
class Hungarian(Language):
    """Hungarian language class (language code ``hu``)."""

    lang = "hu"
    Defaults = HungarianDefaults
    # Built once, at class creation, from the module-level config string.
    default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Hungarian"]
|
||||
|
|
|
@ -1,21 +1,33 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
from ...attrs import LANG
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
class ArmenianDefaults(Language.Defaults):
    """Language defaults for Armenian."""

    # Work on a copy so the getters of the base class are not mutated.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "hy"
    lex_attr_getters.update(LEX_ATTRS)
    stop_words = STOP_WORDS


# NOTE(review): in the rendered diff this string was interleaved with the class
# body above; the two are separated here without changing either one.
DEFAULT_CONFIG = """
[nlp]
lang = "hy"
stop_words = {"@language_data": "spacy.hy.stop_words"}
lex_attr_getters = {"@language_data": "spacy.hy.lex_attr_getters"}
"""
|
||||
|
||||
@registry.language_data("spacy.hy.stop_words")
def stop_words() -> Set[str]:
    """Return the Armenian stop words, exposed as ``spacy.hy.stop_words``."""
    return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.hy.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    """Return the Armenian lexical attribute getters (``spacy.hy.lex_attr_getters``)."""
    return LEX_ATTRS
|
||||
|
||||
|
||||
class Armenian(Language):
    """Armenian language class (language code ``hy``)."""

    lang = "hy"
    Defaults = ArmenianDefaults
    # Built once, at class creation, from the module-level config string.
    default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Armenian"]
|
||||
|
|
|
@ -1,21 +1,43 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.config import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "id"
|
||||
stop_words = {"@language_data": "spacy.id.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.id.lex_attr_getters"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.id.stop_words")
def stop_words() -> Set[str]:
    """Return the Indonesian stop words, exposed as ``spacy.id.stop_words``."""
    return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.id.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    """Return the Indonesian lexical attribute getters (``spacy.id.lex_attr_getters``)."""
    return LEX_ATTRS
|
||||
|
||||
|
||||
class IndonesianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "id"
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
|
@ -25,6 +47,7 @@ class IndonesianDefaults(Language.Defaults):
|
|||
class Indonesian(Language):
    """Indonesian language class (language code ``id``)."""

    lang = "id"
    Defaults = IndonesianDefaults
    # Built once, at class creation, from the module-level config string.
    default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Indonesian"]
|
||||
|
|
|
@ -1,17 +1,26 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import registry
|
||||
|
||||
|
||||
class IcelandicDefaults(Language.Defaults):
    """Language defaults for Icelandic."""

    # Work on a copy so the getters of the base class are not mutated.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "is"
    stop_words = STOP_WORDS
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "is"
|
||||
stop_words = {"@language_data": "spacy.is.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.is.stop_words")
def stop_words() -> Set[str]:
    """Return the Icelandic stop words, exposed as ``spacy.is.stop_words``."""
    return STOP_WORDS
|
||||
|
||||
|
||||
class Icelandic(Language):
    """Icelandic language class (language code ``is``)."""

    lang = "is"
    Defaults = IcelandicDefaults
    # Built once, at class creation, from the module-level config string.
    default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Icelandic"]
|
||||
|
|
|
@ -1,20 +1,34 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "it"
|
||||
stop_words = {"@language_data": "spacy.it.stop_words"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.it.stop_words")
def stop_words() -> Set[str]:
    """Return the Italian stop words, exposed as ``spacy.it.stop_words``."""
    return STOP_WORDS
|
||||
|
||||
|
||||
class ItalianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "it"
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
|
@ -24,6 +38,7 @@ class ItalianDefaults(Language.Defaults):
|
|||
class Italian(Language):
    """Italian language class (language code ``it``)."""

    lang = "it"
    Defaults = ItalianDefaults
    # Built once, at class creation, from the module-level config string.
    default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Italian"]
|
||||
|
|
|
@ -1,21 +1,187 @@
|
|||
from typing import Optional, Union, Dict, Any, Set
|
||||
from pathlib import Path
|
||||
import srsly
|
||||
from collections import namedtuple, OrderedDict
|
||||
from collections import namedtuple
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .tag_map import TAG_MAP
|
||||
from .tag_orth_map import TAG_ORTH_MAP
|
||||
from .tag_bigram_map import TAG_BIGRAM_MAP
|
||||
from ...attrs import LANG
|
||||
from ...compat import copy_reg
|
||||
from ...errors import Errors
|
||||
from ...language import Language
|
||||
from ...symbols import POS
|
||||
from ...tokens import Doc
|
||||
from ...util import DummyTokenizer
|
||||
from ...util import DummyTokenizer, registry
|
||||
from ... import util
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "ja"
|
||||
stop_words = {"@language_data": "spacy.ja.stop_words"}
|
||||
|
||||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.JapaneseTokenizer.v1"
|
||||
split_mode = null
|
||||
|
||||
[nlp.writing_system]
|
||||
direction = "ltr"
|
||||
has_case = false
|
||||
has_letters = false
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.ja.stop_words")
def stop_words() -> Set[str]:
    """Return the Japanese stop words, exposed as ``spacy.ja.stop_words``."""
    return STOP_WORDS
|
||||
|
||||
|
||||
@registry.tokenizers("spacy.JapaneseTokenizer.v1")
def create_japanese_tokenizer(split_mode: Optional[str] = None):
    """Return a factory that builds a ``JapaneseTokenizer`` for a given ``nlp``.

    split_mode: forwarded unchanged to every tokenizer the factory creates.
    RETURNS: a callable taking the ``nlp`` object and returning the tokenizer.
    """

    def japanese_tokenizer_factory(nlp):
        return JapaneseTokenizer(nlp, split_mode=split_mode)

    return japanese_tokenizer_factory
|
||||
|
||||
|
||||
class JapaneseTokenizer(DummyTokenizer):
    """Japanese tokenizer built on the object returned by ``try_sudachi_import``.

    Calling the tokenizer returns a ``Doc`` whose tokens carry tag, POS and
    lemma. Inflections, reading forms and sub-tokens are stored in
    ``doc.user_data``.
    """

    def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None:
        """Create the tokenizer.

        nlp: object whose ``vocab`` is shared with the docs that are created.
        split_mode: passed to ``try_sudachi_import``; ``None`` and "A" mean
            that no sub-tokens are collected (see ``_get_sub_tokens``).
        """
        self.vocab = nlp.vocab
        self.split_mode = split_mode
        self.tokenizer = try_sudachi_import(self.split_mode)

    def __call__(self, text: str) -> Doc:
        """Tokenize ``text`` and return the resulting ``Doc``."""
        # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
        sudachipy_tokens = self.tokenizer.tokenize(text)
        dtokens = self._get_dtokens(sudachipy_tokens)
        dtokens, spaces = get_dtokens_and_spaces(dtokens, text)

        # create Doc with tag bi-gram based part-of-speech identification rules
        # For empty input use six *distinct* lists, so that the objects stored
        # in doc.user_data below do not alias one another.
        words, tags, inflections, _lemmas, readings, sub_tokens_list = (
            zip(*dtokens) if dtokens else tuple([] for _ in range(6))
        )
        sub_tokens_list = list(sub_tokens_list)
        doc = Doc(self.vocab, words=words, spaces=spaces)
        next_pos = None  # for bi-gram rules
        for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
            token.tag_ = dtoken.tag
            if next_pos:  # already identified in previous iteration
                token.pos = next_pos
                next_pos = None
            else:
                token.pos, next_pos = resolve_pos(
                    token.orth_,
                    dtoken.tag,
                    tags[idx + 1] if idx + 1 < len(tags) else None,
                )
            # if there's no lemma info (it's an unk) just use the surface
            token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
        doc.user_data["inflections"] = inflections
        doc.user_data["reading_forms"] = readings
        doc.user_data["sub_tokens"] = sub_tokens_list
        return doc

    def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True):
        """Convert Sudachi morphemes to ``DetailedToken`` tuples.

        sudachipy_tokens: morphemes as returned by the wrapped tokenizer.
        need_sub_tokens: if False, the ``sub_tokens`` field is always None.
        RETURNS: list of DetailedToken with repeated space tokens collapsed.
        """
        sub_tokens_list = (
            self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
        )
        dtokens = [
            DetailedToken(
                token.surface(),  # orth
                "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]),  # tag
                ",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]),  # inf
                token.dictionary_form(),  # lemma
                token.reading_form(),  # user_data['reading_forms']
                sub_tokens_list[idx]
                if sub_tokens_list
                else None,  # user_data['sub_tokens']
            )
            for idx, token in enumerate(sudachipy_tokens)
            # skip empty tokens, which can be produced for characters like …
            if len(token.surface()) > 0
        ]
        # Sudachi normalizes internally and outputs each space char as a token.
        # This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
        # A token is dropped only when both it and its predecessor are
        # whitespace tokens tagged "空白".
        return [
            t
            for idx, t in enumerate(dtokens)
            if idx == 0
            or not t.surface.isspace()
            or t.tag != "空白"
            or not dtokens[idx - 1].surface.isspace()
            or dtokens[idx - 1].tag != "空白"
        ]

    def _get_sub_tokens(self, sudachipy_tokens):
        """Collect finer-grained splits for each morpheme.

        RETURNS: None for split mode None/"A"; otherwise one entry per input
            token, which is None when the token has no sub-tokens, a one-item
            list (mode A split) for "B", or a two-item list (mode A and mode B
            splits) for any other mode.
        """
        if (
            self.split_mode is None or self.split_mode == "A"
        ):  # do nothing for default split mode
            return None

        sub_tokens_list = []  # list of (list of list of DetailedToken | None)
        for token in sudachipy_tokens:
            sub_a = token.split(self.tokenizer.SplitMode.A)
            if len(sub_a) == 1:  # no sub tokens
                sub_tokens_list.append(None)
            elif self.split_mode == "B":
                sub_tokens_list.append([self._get_dtokens(sub_a, False)])
            else:  # "C"
                sub_b = token.split(self.tokenizer.SplitMode.B)
                if len(sub_a) == len(sub_b):
                    # Both splits have the same length: reuse the A result.
                    dtokens = self._get_dtokens(sub_a, False)
                    sub_tokens_list.append([dtokens, dtokens])
                else:
                    sub_tokens_list.append(
                        [
                            self._get_dtokens(sub_a, False),
                            self._get_dtokens(sub_b, False),
                        ]
                    )
        return sub_tokens_list

    def _get_config(self) -> Dict[str, Any]:
        """Return the serializable settings of the tokenizer."""
        return {"split_mode": self.split_mode}

    def _set_config(self, config: Optional[Dict[str, Any]] = None) -> None:
        """Restore settings from ``config``; a missing key resets to None.

        The default is None rather than a shared mutable ``{}``.
        """
        config = {} if config is None else config
        self.split_mode = config.get("split_mode", None)

    def to_bytes(self, **kwargs) -> bytes:
        """Serialize the tokenizer settings to a bytestring."""
        serializers = {"cfg": lambda: srsly.json_dumps(self._get_config())}
        return util.to_bytes(serializers, [])

    def from_bytes(self, data: bytes, **kwargs) -> "JapaneseTokenizer":
        """Load settings from ``data`` and rebuild the wrapped tokenizer."""
        deserializers = {"cfg": lambda b: self._set_config(srsly.json_loads(b))}
        util.from_bytes(data, deserializers, [])
        # split_mode may have changed, so the backend must be recreated.
        self.tokenizer = try_sudachi_import(self.split_mode)
        return self

    def to_disk(self, path: Union[str, Path], **kwargs) -> None:
        """Write the tokenizer settings below ``path``."""
        path = util.ensure_path(path)
        serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
        return util.to_disk(path, serializers, [])

    def from_disk(self, path: Union[str, Path], **kwargs) -> "JapaneseTokenizer":
        """Load settings from ``path`` and rebuild the wrapped tokenizer."""
        path = util.ensure_path(path)
        serializers = {"cfg": lambda p: self._set_config(srsly.read_json(p))}
        util.from_disk(path, serializers, [])
        # split_mode may have changed, so the backend must be recreated.
        self.tokenizer = try_sudachi_import(self.split_mode)
        return self
|
||||
|
||||
|
||||
class JapaneseDefaults(Language.Defaults):
    """Language defaults for Japanese."""

    tag_map = TAG_MAP
    syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class Japanese(Language):
    """Japanese language class (language code ``ja``)."""

    lang = "ja"
    Defaults = JapaneseDefaults
    # Built once, at class creation, from the module-level config string.
    default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
# Hold the attributes we need with convenient names
|
||||
DetailedToken = namedtuple(
|
||||
"DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]
|
||||
|
@ -133,161 +299,6 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
|
|||
return text_dtokens, text_spaces
|
||||
|
||||
|
||||
class JapaneseTokenizer(DummyTokenizer):
    """Japanese tokenizer built on the object returned by ``try_sudachi_import``.

    Calling the tokenizer returns a ``Doc`` whose tokens carry tag, POS and
    lemma. Inflections, reading forms and sub-tokens are stored in
    ``doc.user_data``.
    """

    def __init__(self, cls, nlp=None, config=None):
        """Create the tokenizer.

        cls: defaults class used to create a vocab when ``nlp`` is None.
        nlp: optional object whose ``vocab`` is reused.
        config: optional dict; only the "split_mode" key is read. The default
            is None rather than a shared mutable ``{}``.
        """
        config = {} if config is None else config
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        self.split_mode = config.get("split_mode", None)
        self.tokenizer = try_sudachi_import(self.split_mode)

    def __call__(self, text):
        """Tokenize ``text`` and return the resulting ``Doc``."""
        # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
        sudachipy_tokens = self.tokenizer.tokenize(text)
        dtokens = self._get_dtokens(sudachipy_tokens)
        dtokens, spaces = get_dtokens_and_spaces(dtokens, text)

        # create Doc with tag bi-gram based part-of-speech identification rules
        # For empty input use six *distinct* lists, so that the objects stored
        # in doc.user_data below do not alias one another.
        words, tags, inflections, _lemmas, readings, sub_tokens_list = (
            zip(*dtokens) if dtokens else tuple([] for _ in range(6))
        )
        sub_tokens_list = list(sub_tokens_list)
        doc = Doc(self.vocab, words=words, spaces=spaces)
        next_pos = None  # for bi-gram rules
        for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
            token.tag_ = dtoken.tag
            if next_pos:  # already identified in previous iteration
                token.pos = next_pos
                next_pos = None
            else:
                token.pos, next_pos = resolve_pos(
                    token.orth_,
                    dtoken.tag,
                    tags[idx + 1] if idx + 1 < len(tags) else None,
                )
            # if there's no lemma info (it's an unk) just use the surface
            token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface

        doc.user_data["inflections"] = inflections
        doc.user_data["reading_forms"] = readings
        doc.user_data["sub_tokens"] = sub_tokens_list

        return doc

    def _get_dtokens(self, sudachipy_tokens, need_sub_tokens=True):
        """Convert Sudachi morphemes to ``DetailedToken`` tuples.

        sudachipy_tokens: morphemes as returned by the wrapped tokenizer.
        need_sub_tokens: if False, the ``sub_tokens`` field is always None.
        RETURNS: list of DetailedToken with repeated space tokens collapsed.
        """
        sub_tokens_list = (
            self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
        )
        dtokens = [
            DetailedToken(
                token.surface(),  # orth
                "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]),  # tag
                ",".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]),  # inf
                token.dictionary_form(),  # lemma
                token.reading_form(),  # user_data['reading_forms']
                sub_tokens_list[idx]
                if sub_tokens_list
                else None,  # user_data['sub_tokens']
            )
            for idx, token in enumerate(sudachipy_tokens)
            # skip empty tokens, which can be produced for characters like …
            if len(token.surface()) > 0
        ]
        # Sudachi normalizes internally and outputs each space char as a token.
        # This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
        # A token is dropped only when both it and its predecessor are
        # whitespace tokens tagged "空白".
        return [
            t
            for idx, t in enumerate(dtokens)
            if idx == 0
            or not t.surface.isspace()
            or t.tag != "空白"
            or not dtokens[idx - 1].surface.isspace()
            or dtokens[idx - 1].tag != "空白"
        ]

    def _get_sub_tokens(self, sudachipy_tokens):
        """Collect finer-grained splits for each morpheme.

        RETURNS: None for split mode None/"A"; otherwise one entry per input
            token, which is None when the token has no sub-tokens, a one-item
            list (mode A split) for "B", or a two-item list (mode A and mode B
            splits) for any other mode.
        """
        if (
            self.split_mode is None or self.split_mode == "A"
        ):  # do nothing for default split mode
            return None

        sub_tokens_list = []  # list of (list of list of DetailedToken | None)
        for token in sudachipy_tokens:
            sub_a = token.split(self.tokenizer.SplitMode.A)
            if len(sub_a) == 1:  # no sub tokens
                sub_tokens_list.append(None)
            elif self.split_mode == "B":
                sub_tokens_list.append([self._get_dtokens(sub_a, False)])
            else:  # "C"
                sub_b = token.split(self.tokenizer.SplitMode.B)
                if len(sub_a) == len(sub_b):
                    # Both splits have the same length: reuse the A result.
                    dtokens = self._get_dtokens(sub_a, False)
                    sub_tokens_list.append([dtokens, dtokens])
                else:
                    sub_tokens_list.append(
                        [
                            self._get_dtokens(sub_a, False),
                            self._get_dtokens(sub_b, False),
                        ]
                    )
        return sub_tokens_list

    def _get_config(self):
        """Return the serializable settings of the tokenizer."""
        config = OrderedDict((("split_mode", self.split_mode),))
        return config

    def _set_config(self, config=None):
        """Restore settings from ``config``; a missing key resets to None."""
        config = {} if config is None else config
        self.split_mode = config.get("split_mode", None)

    def to_bytes(self, **kwargs):
        """Serialize the tokenizer settings to a bytestring."""
        serializers = OrderedDict(
            (("cfg", lambda: srsly.json_dumps(self._get_config())),)
        )
        return util.to_bytes(serializers, [])

    def from_bytes(self, data, **kwargs):
        """Load settings from ``data`` and rebuild the wrapped tokenizer."""
        deserializers = OrderedDict(
            (("cfg", lambda b: self._set_config(srsly.json_loads(b))),)
        )
        util.from_bytes(data, deserializers, [])
        self.tokenizer = try_sudachi_import(self.split_mode)
        return self

    def to_disk(self, path, **kwargs):
        """Write the tokenizer settings below ``path``."""
        path = util.ensure_path(path)
        serializers = OrderedDict(
            (("cfg", lambda p: srsly.write_json(p, self._get_config())),)
        )
        return util.to_disk(path, serializers, [])

    def from_disk(self, path, **kwargs):
        """Load settings from ``path`` and rebuild the wrapped tokenizer.

        Returns ``self`` so the call can be chained, matching ``from_bytes``.
        """
        path = util.ensure_path(path)
        serializers = OrderedDict(
            (("cfg", lambda p: self._set_config(srsly.read_json(p))),)
        )
        util.from_disk(path, serializers, [])
        self.tokenizer = try_sudachi_import(self.split_mode)
        return self
|
||||
|
||||
|
||||
class JapaneseDefaults(Language.Defaults):
    """Language defaults for Japanese, including the tokenizer factory."""

    # Work on a copy so the getters of the base class are not mutated.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda _text: "ja"
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    syntax_iterators = SYNTAX_ITERATORS
    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

    @classmethod
    def create_tokenizer(cls, nlp=None, config=None):
        """Build a ``JapaneseTokenizer``.

        config: optional settings dict; None is replaced by a fresh empty dict
            so that no mutable default is shared between calls.
        """
        config = {} if config is None else config
        return JapaneseTokenizer(cls, nlp, config)
|
||||
|
||||
|
||||
class Japanese(Language):
    """Japanese language class (language code ``ja``)."""

    lang = "ja"
    Defaults = JapaneseDefaults

    def make_doc(self, text):
        """Create a ``Doc`` by running the tokenizer on ``text``."""
        return self.tokenizer(text)
|
||||
|
||||
|
||||
def pickle_japanese(instance):
    """Reduce helper: rebuild as ``Japanese()`` with no arguments.

    ``instance`` is ignored; no state of the pickled object is kept.
    """
    return Japanese, tuple()
|
||||
|
||||
|
|
|
@ -1,17 +1,26 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import registry
|
||||
|
||||
|
||||
class KannadaDefaults(Language.Defaults):
    """Language defaults for Kannada."""

    # Work on a copy so the getters of the base class are not mutated.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "kn"
    stop_words = STOP_WORDS
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "kn"
|
||||
stop_words = {"@language_data": "spacy.kn.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.kn.stop_words")
def stop_words() -> Set[str]:
    """Return the Kannada stop words, exposed as ``spacy.kn.stop_words``."""
    return STOP_WORDS
|
||||
|
||||
|
||||
class Kannada(Language):
    """Kannada language class (language code ``kn``)."""

    lang = "kn"
    Defaults = KannadaDefaults
    # Built once, at class creation, from the module-level config string.
    default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Kannada"]
|
||||
|
|
|
@ -1,51 +1,52 @@
|
|||
from typing import Set, Optional, Any, Dict
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tag_map import TAG_MAP
|
||||
from ...attrs import LANG
|
||||
from ...language import Language
|
||||
from ...tokens import Doc
|
||||
from ...compat import copy_reg
|
||||
from ...util import DummyTokenizer
|
||||
from ...util import DummyTokenizer, registry
|
||||
|
||||
|
||||
def try_mecab_import():
    """Import and return the ``MeCab`` class from ``natto``.

    RAISES: ImportError with installation hints if ``natto`` is unavailable.
    """
    try:
        from natto import MeCab

        return MeCab
    except ImportError:
        raise ImportError(
            "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
            "and [natto-py](https://github.com/buruzaemon/natto-py)"
        )


# NOTE(review): in the rendered diff this string was interleaved with the
# function above; the two are separated here without changing either one.
DEFAULT_CONFIG = """
[nlp]
lang = "ko"
stop_words = {"@language_data": "spacy.ko.stop_words"}

[nlp.tokenizer]
@tokenizers = "spacy.KoreanTokenizer.v1"

[nlp.writing_system]
direction = "ltr"
has_case = false
has_letters = false
"""
|
||||
|
||||
|
||||
# fmt: on
|
||||
@registry.language_data("spacy.ko.stop_words")
def stop_words() -> Set[str]:
    """Return the Korean stop words, exposed as ``spacy.ko.stop_words``."""
    return STOP_WORDS
|
||||
|
||||
|
||||
def check_spaces(text, tokens):
    """Yield, per token, whether a gap follows it in ``text``.

    Each token is located with ``str.find`` starting where the previous token
    ended. For every token after the first, one flag is yielded for the
    *previous* token: True if the current token does not start exactly where
    the previous one ended. A final False is yielded for the last token.

    text (str): the original text.
    tokens: iterable of token strings, in order of appearance in ``text``.
    YIELDS (bool): the trailing-space flags.
    """
    prev_end = -1
    start = 0
    for token in tokens:
        idx = text.find(token, start)
        if prev_end > 0:
            yield prev_end != idx
        prev_end = idx + len(token)
        start = prev_end
    if start > 0:
        # The last token is never followed by a space.
        yield False
|
||||
@registry.tokenizers("spacy.KoreanTokenizer.v1")
def create_korean_tokenizer():
    """Return a factory that builds a ``KoreanTokenizer`` for a given ``nlp``."""

    def korean_tokenizer_factory(nlp):
        return KoreanTokenizer(nlp)

    return korean_tokenizer_factory
|
||||
|
||||
|
||||
class KoreanTokenizer(DummyTokenizer):
|
||||
def __init__(self, cls, nlp=None):
|
||||
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
|
||||
def __init__(self, nlp: Optional[Language] = None):
|
||||
self.vocab = nlp.vocab
|
||||
MeCab = try_mecab_import()
|
||||
self.mecab_tokenizer = MeCab("-F%f[0],%f[7]")
|
||||
|
||||
def __del__(self):
|
||||
self.mecab_tokenizer.__del__()
|
||||
|
||||
def __call__(self, text):
|
||||
def __call__(self, text: str) -> Doc:
|
||||
dtokens = list(self.detailed_tokens(text))
|
||||
surfaces = [dt["surface"] for dt in dtokens]
|
||||
doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
|
||||
|
@ -56,7 +57,7 @@ class KoreanTokenizer(DummyTokenizer):
|
|||
doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
|
||||
return doc
|
||||
|
||||
def detailed_tokens(self, text):
|
||||
def detailed_tokens(self, text: str) -> Dict[str, Any]:
|
||||
# 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
|
||||
# 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
|
||||
for node in self.mecab_tokenizer.parse(text, as_nodes=True):
|
||||
|
@ -72,23 +73,39 @@ class KoreanTokenizer(DummyTokenizer):
|
|||
|
||||
|
||||
class KoreanDefaults(Language.Defaults):
    """Language defaults for Korean, including the tokenizer factory."""

    # Work on a copy so the getters of the base class are not mutated.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda _text: "ko"
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}

    @classmethod
    def create_tokenizer(cls, nlp=None):
        """Build a ``KoreanTokenizer`` for ``nlp``."""
        return KoreanTokenizer(cls, nlp)
|
||||
|
||||
|
||||
class Korean(Language):
    """Korean language class (language code ``ko``)."""

    lang = "ko"
    Defaults = KoreanDefaults
    # Built once, at class creation, from the module-level config string.
    default_config = Config().from_str(DEFAULT_CONFIG)

    def make_doc(self, text):
        """Create a ``Doc`` by running the tokenizer on ``text``."""
        return self.tokenizer(text)
|
||||
|
||||
def try_mecab_import() -> type:
    """Import and return the ``MeCab`` class from ``natto``.

    The import is done lazily so that the module can be loaded without the
    optional Korean dependencies installed.

    RETURNS (type): the ``natto.MeCab`` class (not an instance).
    RAISES: ImportError with installation hints if ``natto`` is unavailable.
    """
    try:
        from natto import MeCab

        return MeCab
    except ImportError:
        raise ImportError(
            "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
            "and [natto-py](https://github.com/buruzaemon/natto-py)"
        )
|
||||
|
||||
|
||||
def check_spaces(text, tokens):
    """Yield, per token, whether a gap follows it in ``text``.

    Each token is located with ``str.find`` starting where the previous token
    ended. For every token after the first, one flag is yielded for the
    *previous* token: True if the current token does not start exactly where
    the previous one ended. A final False is yielded for the last token.

    text (str): the original text.
    tokens: iterable of token strings, in order of appearance in ``text``.
    YIELDS (bool): the trailing-space flags.
    """
    prev_end = -1
    start = 0
    for token in tokens:
        idx = text.find(token, start)
        if prev_end > 0:
            yield prev_end != idx
        prev_end = idx + len(token)
        start = prev_end
    if start > 0:
        # The last token is never followed by a space.
        yield False
|
||||
|
||||
|
||||
def pickle_korean(instance):
|
||||
|
|
|
@ -1,26 +1,49 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .stop_words import STOP_WORDS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "lb"
|
||||
stop_words = {"@language_data": "spacy.lb.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.lb.lex_attr_getters"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.lb.stop_words")
def stop_words() -> Set[str]:
    """Return the Luxembourgish stop words, exposed as ``spacy.lb.stop_words``."""
    return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.lb.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    """Return the Luxembourgish lexical attribute getters (``spacy.lb.lex_attr_getters``)."""
    return LEX_ATTRS
|
||||
|
||||
|
||||
class LuxembourgishDefaults(Language.Defaults):
    """Language defaults for Luxembourgish."""

    # Work on a copy so the getters of the base class are not mutated.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "lb"
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    infixes = TOKENIZER_INFIXES
|
||||
|
||||
|
||||
class Luxembourgish(Language):
    """Luxembourgish language class (language code ``lb``)."""

    lang = "lb"
    Defaults = LuxembourgishDefaults
    # Built once, at class creation, from the module-level config string.
    default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Luxembourgish"]
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
from typing import Set
|
||||
import unicodedata
|
||||
import re
|
||||
|
||||
|
@ -21,21 +22,21 @@ _tlds = set(
|
|||
)
|
||||
|
||||
|
||||
def is_punct(text: str) -> bool:
    """Return True if every character of ``text`` is Unicode punctuation.

    A character counts as punctuation when its Unicode general category
    starts with "P". The empty string yields True.
    """
    return all(unicodedata.category(char).startswith("P") for char in text)
|
||||
|
||||
|
||||
def is_ascii(text: str) -> bool:
    """Return True if every character of ``text`` has a code point below 128.

    The empty string yields True.
    """
    return all(ord(char) < 128 for char in text)
|
||||
|
||||
|
||||
def like_num(text):
|
||||
def like_num(text: str) -> bool:
|
||||
if text.startswith(("+", "-", "±", "~")):
|
||||
text = text[1:]
|
||||
# can be overwritten by lang with list of number words
|
||||
|
@ -49,64 +50,31 @@ def like_num(text):
|
|||
return False
|
||||
|
||||
|
||||
def is_bracket(text: str) -> bool:
    """Return True if ``text`` is exactly one round, square, curly or angle bracket."""
    brackets = ("(", ")", "[", "]", "{", "}", "<", ">")
    return text in brackets
|
||||
|
||||
|
||||
def is_quote(text: str) -> bool:
    """Return True if ``text`` is exactly one of the known quotation marks.

    Besides single characters, the two-character forms '' and `` match.
    """
    # fmt: off
    quotes = ('"', "'", "`", "«", "»", "‘", "’", "‚", "‛", "“", "”", "„", "‟", "‹", "›", "❮", "❯", "''", "``")
    # fmt: on
    return text in quotes
|
||||
|
||||
|
||||
def is_left_punct(text: str) -> bool:
    """Return True if ``text`` is exactly one opening bracket or opening quote."""
    # fmt: off
    left_punct = ("(", "[", "{", "<", '"', "'", "«", "‘", "‚", "‛", "“", "„", "‟", "‹", "❮", "``")
    # fmt: on
    return text in left_punct
|
||||
|
||||
|
||||
def is_right_punct(text: str) -> bool:
    """Check whether the text is exactly one closing bracket or quote.

    Straight quotes (" and ') are listed here as well as in the
    left-hand counterpart, since they can open or close.

    text (str): The string to check.
    RETURNS (bool): True if the string equals one of the listed marks.
    """
    right_punct = (")", "]", "}", ">", '"', "'", "»", "’", "”", "›", "❯", "''")
    return text in right_punct
|
||||
|
||||
|
||||
def is_currency(text):
|
||||
def is_currency(text: str) -> bool:
|
||||
# can be overwritten by lang with list of currency words, e.g. dollar, euro
|
||||
for char in text:
|
||||
if unicodedata.category(char) != "Sc":
|
||||
|
@ -114,11 +82,11 @@ def is_currency(text):
|
|||
return True
|
||||
|
||||
|
||||
def like_email(text: str) -> bool:
    """Check whether the text looks like an e-mail address.

    Delegates to the module-level `_like_email` callable and coerces its
    result to a bool.

    text (str): The string to check.
    RETURNS (bool): Truthiness of `_like_email(text)`.
    """
    return bool(_like_email(text))
|
||||
|
||||
|
||||
def like_url(text):
|
||||
def like_url(text: str) -> bool:
|
||||
# We're looking for things that function in text like URLs. So, valid URL
|
||||
# or not, anything they say http:// is going to be good.
|
||||
if text.startswith("http://") or text.startswith("https://"):
|
||||
|
@ -144,7 +112,7 @@ def like_url(text):
|
|||
return False
|
||||
|
||||
|
||||
def word_shape(text):
|
||||
def word_shape(text: str) -> str:
|
||||
if len(text) >= 100:
|
||||
return "LONG"
|
||||
shape = []
|
||||
|
@ -171,46 +139,52 @@ def word_shape(text):
|
|||
return "".join(shape)
|
||||
|
||||
|
||||
def lower(string: str) -> str:
    """Return the lowercased form of the string.

    string (str): The string to convert.
    RETURNS (str): Result of `str.lower`.
    """
    return string.lower()
|
||||
|
||||
|
||||
def prefix(string: str) -> str:
    """Return the first character of the string.

    string (str): A non-empty string; indexing an empty string raises
        IndexError.
    RETURNS (str): The first character.
    """
    return string[0]
|
||||
|
||||
|
||||
def suffix(string: str) -> str:
    """Return the last three characters of the string.

    Strings shorter than three characters are returned unchanged.

    string (str): The string to slice.
    RETURNS (str): Up to three trailing characters.
    """
    return string[-3:]
|
||||
|
||||
|
||||
def is_alpha(string: str) -> bool:
    """Check whether the string is alphabetic, as defined by `str.isalpha`.

    string (str): The string to check.
    RETURNS (bool): Result of `str.isalpha` (False for an empty string).
    """
    return string.isalpha()
|
||||
|
||||
|
||||
def is_digit(string: str) -> bool:
    """Check whether the string consists of digits, as defined by `str.isdigit`.

    string (str): The string to check.
    RETURNS (bool): Result of `str.isdigit` (False for an empty string).
    """
    return string.isdigit()
|
||||
|
||||
|
||||
def is_lower(string: str) -> bool:
    """Check whether the string is lowercase, as defined by `str.islower`.

    string (str): The string to check.
    RETURNS (bool): Result of `str.islower`.
    """
    return string.islower()
|
||||
|
||||
|
||||
def is_space(string: str) -> bool:
    """Check whether the string is whitespace only, as defined by `str.isspace`.

    string (str): The string to check.
    RETURNS (bool): Result of `str.isspace` (False for an empty string).
    """
    return string.isspace()
|
||||
|
||||
|
||||
def is_title(string: str) -> bool:
    """Check whether the string is titlecased, as defined by `str.istitle`.

    string (str): The string to check.
    RETURNS (bool): Result of `str.istitle`.
    """
    return string.istitle()
|
||||
|
||||
|
||||
def is_upper(string: str) -> bool:
    """Check whether the string is uppercase, as defined by `str.isupper`.

    string (str): The string to check.
    RETURNS (bool): Result of `str.isupper`.
    """
    return string.isupper()
|
||||
|
||||
|
||||
def is_stop(string: str, stops: Set[str] = set()) -> bool:
    """Check whether the lowercased string is in a set of stop words.

    Only the input is lowercased; entries in `stops` are compared as given.

    string (str): The string to check.
    stops (Set[str]): The stop words to test against. The shared default
        set is only read here, never mutated.
    RETURNS (bool): True if `string.lower()` is a member of `stops`.
    """
    return string.lower() in stops
|
||||
|
||||
|
||||
def get_lang(text: str, lang: str = "") -> str:
    """Return the given language code, ignoring the text.

    text (str): Unused; present so the function has the same call shape
        as the other lexical attribute getters.
    lang (str): The language code to return.
    RETURNS (str): The value of `lang`.
    """
    # This function is partially applied so lang code can be passed in
    # automatically while still allowing pickling
    return lang
|
||||
|
||||
|
||||
LEX_ATTRS = {
|
||||
attrs.LOWER: lower,
|
||||
attrs.NORM: lower,
|
||||
|
|
|
@ -1,28 +1,35 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "lij"
|
||||
stop_words = {"@language_data": "spacy.lij.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.lij.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
class LigurianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "lij"
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
|
||||
|
||||
class Ligurian(Language):
|
||||
lang = "lij"
|
||||
Defaults = LigurianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Ligurian"]
|
||||
|
|
|
@ -1,27 +1,41 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
def _return_lt(_):
|
||||
return "lt"
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "lt"
|
||||
stop_words = {"@language_data": "spacy.lt.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.lt.lex_attr_getters"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.lt.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.lt.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class LithuanianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = _return_lt
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
mod_base_exceptions = {
|
||||
|
@ -29,12 +43,12 @@ class LithuanianDefaults(Language.Defaults):
|
|||
}
|
||||
del mod_base_exceptions["8)"]
|
||||
tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Lithuanian(Language):
|
||||
lang = "lt"
|
||||
Defaults = LithuanianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Lithuanian"]
|
||||
|
|
|
@ -1,17 +1,26 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import registry
|
||||
|
||||
|
||||
class LatvianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "lv"
|
||||
stop_words = STOP_WORDS
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "lv"
|
||||
stop_words = {"@language_data": "spacy.lv.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.lv.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
class Latvian(Language):
|
||||
lang = "lv"
|
||||
Defaults = LatvianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Latvian"]
|
||||
|
|
|
@ -1,15 +1,26 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
|
||||
from ...language import Language
|
||||
from ...util import registry
|
||||
|
||||
|
||||
class MalayalamDefaults(Language.Defaults):
|
||||
stop_words = STOP_WORDS
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "ml"
|
||||
stop_words = {"@language_data": "spacy.ml.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.ml.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
class Malayalam(Language):
|
||||
lang = "ml"
|
||||
Defaults = MalayalamDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Malayalam"]
|
||||
|
|
|
@ -1,17 +1,26 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import registry
|
||||
|
||||
|
||||
class MarathiDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "mr"
|
||||
stop_words = STOP_WORDS
|
||||
# Default [nlp] settings for Marathi. The language code is "mr"; it was
# previously "af" (Afrikaans), which disagreed with the "spacy.mr.stop_words"
# registry entry referenced on the next config line.
DEFAULT_CONFIG = """
[nlp]
lang = "mr"
stop_words = {"@language_data": "spacy.mr.stop_words"}
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.mr.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
class Marathi(Language):
|
||||
lang = "mr"
|
||||
Defaults = MarathiDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Marathi"]
|
||||
|
|
|
@ -1,33 +1,47 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "nb"
|
||||
stop_words = {"@language_data": "spacy.nb.stop_words"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.nb.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
class NorwegianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "nb"
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
stop_words = STOP_WORDS
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class Norwegian(Language):
|
||||
lang = "nb"
|
||||
Defaults = NorwegianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Norwegian"]
|
||||
|
|
|
@ -1,23 +1,33 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import registry
|
||||
|
||||
|
||||
class NepaliDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "ne" # Nepali language ISO code
|
||||
stop_words = STOP_WORDS
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "ne"
|
||||
stop_words = {"@language_data": "spacy.ne.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.ne.lex_attr_getters"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.ne.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.ne.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class Nepali(Language):
|
||||
lang = "ne"
|
||||
Defaults = NepaliDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Nepali"]
|
||||
|
|
|
@ -1,3 +1,6 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
|
@ -5,36 +8,51 @@ from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
|||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .lemmatizer import DutchLemmatizer
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "nl"
|
||||
stop_words = {"@language_data": "spacy.nl.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.nl.lex_attr_getters"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.DutchLemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.nl.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.nl.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
@registry.lemmatizers("spacy.DutchLemmatizer.v1")
|
||||
def create_dutch_lemmatizer(data_paths: dict = {}) -> DutchLemmatizer:
|
||||
return DutchLemmatizer(data_paths=data_paths)
|
||||
|
||||
|
||||
class DutchDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "nl"
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None, lookups=None):
|
||||
if lookups is None:
|
||||
lookups = Lookups()
|
||||
return DutchLemmatizer(lookups)
|
||||
|
||||
|
||||
class Dutch(Language):
|
||||
lang = "nl"
|
||||
Defaults = DutchDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Dutch"]
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
from typing import Optional, List, Dict, Tuple
|
||||
|
||||
from ...lemmatizer import Lemmatizer
|
||||
from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
|
||||
|
||||
|
@ -34,7 +36,9 @@ class DutchLemmatizer(Lemmatizer):
|
|||
"num": "num",
|
||||
}
|
||||
|
||||
def __call__(self, string, univ_pos, morphology=None):
|
||||
def __call__(
|
||||
self, string: str, univ_pos: str, morphology: Optional[dict] = None
|
||||
) -> List[str]:
|
||||
# Difference 1: self.rules is assumed to be non-None, so no
|
||||
# 'is None' check required.
|
||||
# String lowercased from the get-go. All lemmatization results in
|
||||
|
@ -92,7 +96,7 @@ class DutchLemmatizer(Lemmatizer):
|
|||
# Overrides parent method so that a lowercased version of the string is
|
||||
# used to search the lookup table. This is necessary because our lookup
|
||||
# table consists entirely of lowercase keys.
|
||||
def lookup(self, string, orth=None):
|
||||
def lookup(self, string: str, orth: Optional[int] = None) -> str:
|
||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||
string = string.lower()
|
||||
if orth is not None:
|
||||
|
@ -102,7 +106,13 @@ class DutchLemmatizer(Lemmatizer):
|
|||
|
||||
# Reimplemented to focus more on application of suffix rules and to return
|
||||
# as early as possible.
|
||||
def lemmatize(self, string, index, exceptions, rules):
|
||||
def lemmatize(
|
||||
self,
|
||||
string: str,
|
||||
index: Dict[str, List[str]],
|
||||
exceptions: Dict[str, Dict[str, List[str]]],
|
||||
rules: Dict[str, List[List[str]]],
|
||||
) -> Tuple[List[str], bool]:
|
||||
# returns (forms, is_known: bool)
|
||||
oov_forms = []
|
||||
for old, new in rules:
|
||||
|
|
|
@ -1,43 +1,60 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .lemmatizer import PolishLemmatizer
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import add_lookups
|
||||
from ...lookups import Lookups
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "pl"
|
||||
stop_words = {"@language_data": "spacy.pl.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.pl.lex_attr_getters"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.PolishLemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.pl.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.pl.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
@registry.lemmatizers("spacy.PolishLemmatizer.v1")
|
||||
def create_polish_lemmatizer(data_paths: dict = {}) -> PolishLemmatizer:
|
||||
return PolishLemmatizer(data_paths=data_paths)
|
||||
|
||||
|
||||
class PolishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "pl"
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
mod_base_exceptions = {
|
||||
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
|
||||
}
|
||||
tokenizer_exceptions = mod_base_exceptions
|
||||
stop_words = STOP_WORDS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None, lookups=None):
|
||||
if lookups is None:
|
||||
lookups = Lookups()
|
||||
return PolishLemmatizer(lookups)
|
||||
|
||||
|
||||
class Polish(Language):
|
||||
lang = "pl"
|
||||
Defaults = PolishDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Polish"]
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
from typing import Optional, List, Dict
|
||||
|
||||
from ...lemmatizer import Lemmatizer
|
||||
from ...parts_of_speech import NAMES
|
||||
|
||||
|
@ -7,30 +9,29 @@ class PolishLemmatizer(Lemmatizer):
|
|||
# dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS.
|
||||
# It utilizes some prefix based improvements for verb and adjectives
|
||||
# lemmatization, as well as case-sensitive lemmatization for nouns.
|
||||
def __call__(self, string, univ_pos, morphology=None):
|
||||
def __call__(
|
||||
self, string: str, univ_pos: str, morphology: Optional[dict] = None
|
||||
) -> List[str]:
|
||||
if isinstance(univ_pos, int):
|
||||
univ_pos = NAMES.get(univ_pos, "X")
|
||||
univ_pos = univ_pos.upper()
|
||||
|
||||
lookup_pos = univ_pos.lower()
|
||||
if univ_pos == "PROPN":
|
||||
lookup_pos = "noun"
|
||||
lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})
|
||||
|
||||
if univ_pos == "NOUN":
|
||||
return self.lemmatize_noun(string, morphology, lookup_table)
|
||||
|
||||
if univ_pos != "PROPN":
|
||||
string = string.lower()
|
||||
|
||||
if univ_pos == "ADJ":
|
||||
return self.lemmatize_adj(string, morphology, lookup_table)
|
||||
elif univ_pos == "VERB":
|
||||
return self.lemmatize_verb(string, morphology, lookup_table)
|
||||
|
||||
return [lookup_table.get(string, string.lower())]
|
||||
|
||||
def lemmatize_adj(self, string, morphology, lookup_table):
|
||||
def lemmatize_adj(
|
||||
self, string: str, morphology: dict, lookup_table: Dict[str, str]
|
||||
) -> List[str]:
|
||||
# this method utilizes different procedures for adjectives
|
||||
# with 'nie' and 'naj' prefixes
|
||||
if string[:3] == "nie":
|
||||
|
@ -41,25 +42,26 @@ class PolishLemmatizer(Lemmatizer):
|
|||
return [lookup_table[naj_search_string]]
|
||||
if search_string in lookup_table:
|
||||
return [lookup_table[search_string]]
|
||||
|
||||
if string[:3] == "naj":
|
||||
naj_search_string = string[3:]
|
||||
if naj_search_string in lookup_table:
|
||||
return [lookup_table[naj_search_string]]
|
||||
|
||||
return [lookup_table.get(string, string)]
|
||||
|
||||
def lemmatize_verb(
    self, string: str, morphology: dict, lookup_table: Dict[str, str]
) -> List[str]:
    """Lemmatize a verb, with special handling of the 'nie' prefix.

    If the string starts with "nie" and the remainder is in the lookup
    table, the remainder's lemma is returned. Otherwise the full string is
    looked up, falling back to the string itself.

    string (str): The verb form to lemmatize.
    morphology (dict): Not used by this method.
    lookup_table (Dict[str, str]): Mapping of forms to lemmas.
    RETURNS (List[str]): A single-element list holding the lemma.
    """
    # this method utilizes a different procedure for verbs
    # with 'nie' prefix
    if string[:3] == "nie":
        stem = string[3:]
        if stem in lookup_table:
            return [lookup_table[stem]]
    return [lookup_table.get(string, string)]
|
||||
|
||||
def lemmatize_noun(self, string, morphology, lookup_table):
|
||||
def lemmatize_noun(
|
||||
self, string: str, morphology: dict, lookup_table: Dict[str, str]
|
||||
) -> List[str]:
|
||||
# this method is case-sensitive, in order to work
|
||||
# for incorrectly tagged proper names
|
||||
if string != string.lower():
|
||||
|
@ -68,11 +70,16 @@ class PolishLemmatizer(Lemmatizer):
|
|||
elif string in lookup_table:
|
||||
return [lookup_table[string]]
|
||||
return [string.lower()]
|
||||
|
||||
return [lookup_table.get(string, string)]
|
||||
|
||||
def lookup(self, string: str, orth: Optional[int] = None) -> str:
    """Return the lowercased string; no lookup table is consulted.

    string (str): The string to look up.
    orth (Optional[int]): Not used by this method.
    RETURNS (str): The lowercased input.
    """
    return string.lower()
|
||||
|
||||
def lemmatize(
    self,
    string: str,
    index: Dict[str, List[str]],
    exceptions: Dict[str, Dict[str, List[str]]],
    rules: Dict[str, List[List[str]]],
) -> List[str]:
    """Not supported by this lemmatizer; always raises.

    RAISES (NotImplementedError): On every call.
    """
    raise NotImplementedError
|
||||
|
|
|
@ -1,20 +1,42 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "pt"
|
||||
stop_words = {"@language_data": "spacy.pt.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.pt.lex_attr_getters"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.pt.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.pt.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class PortugueseDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "pt"
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
|
||||
|
@ -22,6 +44,7 @@ class PortugueseDefaults(Language.Defaults):
|
|||
class Portuguese(Language):
|
||||
lang = "pt"
|
||||
Defaults = PortugueseDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Portuguese"]
|
||||
|
|
|
@ -1,27 +1,40 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
from ...util import update_exc, registry
|
||||
|
||||
# Lemma data note:
|
||||
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
|
||||
# Replaced characters using cedillas with the correct ones (ș and ț)
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "ro"
|
||||
stop_words = {"@language_data": "spacy.ro.stop_words"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.ro.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
class RomanianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "ro"
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
|
@ -30,6 +43,7 @@ class RomanianDefaults(Language.Defaults):
|
|||
class Romanian(Language):
|
||||
lang = "ro"
|
||||
Defaults = RomanianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Romanian"]
|
||||
|
|
|
@ -1,32 +1,49 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .lemmatizer import RussianLemmatizer
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...util import update_exc
|
||||
from ...util import update_exc, registry
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...attrs import LANG
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "ru"
|
||||
stop_words = {"@language_data": "spacy.ru.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.ru.lex_attr_getters"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.RussianLemmatizer.v1"
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.ru.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.ru.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
@registry.lemmatizers("spacy.RussianLemmatizer.v1")
|
||||
def create_russian_lemmatizer() -> RussianLemmatizer:
|
||||
return RussianLemmatizer()
|
||||
|
||||
|
||||
class RussianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "ru"
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None, lookups=None):
|
||||
if lookups is None:
|
||||
lookups = Lookups()
|
||||
return RussianLemmatizer(lookups)
|
||||
|
||||
|
||||
class Russian(Language):
|
||||
lang = "ru"
|
||||
Defaults = RussianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Russian"]
|
||||
|
|
|
@ -1,11 +1,17 @@
|
|||
from typing import Optional, Tuple, Dict, List
|
||||
|
||||
from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
|
||||
from ...lemmatizer import Lemmatizer
|
||||
from ...lookups import Lookups
|
||||
|
||||
|
||||
PUNCT_RULES = {"«": '"', "»": '"'}
|
||||
|
||||
|
||||
class RussianLemmatizer(Lemmatizer):
|
||||
_morph = None
|
||||
|
||||
def __init__(self, lookups=None):
|
||||
def __init__(self, lookups: Optional[Lookups] = None) -> None:
|
||||
super(RussianLemmatizer, self).__init__(lookups)
|
||||
try:
|
||||
from pymorphy2 import MorphAnalyzer
|
||||
|
@ -19,15 +25,15 @@ class RussianLemmatizer(Lemmatizer):
|
|||
if RussianLemmatizer._morph is None:
|
||||
RussianLemmatizer._morph = MorphAnalyzer()
|
||||
|
||||
def __call__(self, string, univ_pos, morphology=None):
|
||||
def __call__(
|
||||
self, string: str, univ_pos: str, morphology: Optional[dict] = None
|
||||
) -> List[str]:
|
||||
univ_pos = self.normalize_univ_pos(univ_pos)
|
||||
if univ_pos == "PUNCT":
|
||||
return [PUNCT_RULES.get(string, string)]
|
||||
|
||||
if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
|
||||
# Skip unchangeable pos
|
||||
return [string.lower()]
|
||||
|
||||
analyses = self._morph.parse(string)
|
||||
filtered_analyses = []
|
||||
for analysis in analyses:
|
||||
|
@ -39,12 +45,10 @@ class RussianLemmatizer(Lemmatizer):
|
|||
analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
|
||||
):
|
||||
filtered_analyses.append(analysis)
|
||||
|
||||
if not len(filtered_analyses):
|
||||
return [string.lower()]
|
||||
if morphology is None or (len(morphology) == 1 and POS in morphology):
|
||||
return list(set([analysis.normal_form for analysis in filtered_analyses]))
|
||||
|
||||
if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
|
||||
features_to_compare = ["Case", "Number", "Gender"]
|
||||
elif univ_pos == "NUM":
|
||||
|
@ -61,7 +65,6 @@ class RussianLemmatizer(Lemmatizer):
|
|||
"VerbForm",
|
||||
"Voice",
|
||||
]
|
||||
|
||||
analyses, filtered_analyses = filtered_analyses, []
|
||||
for analysis in analyses:
|
||||
_, analysis_morph = oc2ud(str(analysis.tag))
|
||||
|
@ -74,16 +77,14 @@ class RussianLemmatizer(Lemmatizer):
|
|||
break
|
||||
else:
|
||||
filtered_analyses.append(analysis)
|
||||
|
||||
if not len(filtered_analyses):
|
||||
return [string.lower()]
|
||||
return list(set([analysis.normal_form for analysis in filtered_analyses]))
|
||||
|
||||
@staticmethod
|
||||
def normalize_univ_pos(univ_pos):
|
||||
def normalize_univ_pos(univ_pos: str) -> Optional[str]:
|
||||
if isinstance(univ_pos, str):
|
||||
return univ_pos.upper()
|
||||
|
||||
symbols_to_str = {
|
||||
ADJ: "ADJ",
|
||||
DET: "DET",
|
||||
|
@ -98,14 +99,14 @@ class RussianLemmatizer(Lemmatizer):
|
|||
return symbols_to_str[univ_pos]
|
||||
return None
|
||||
|
||||
def lookup(self, string, orth=None):
|
||||
def lookup(self, string: str, orth: Optional[int] = None) -> str:
|
||||
analyses = self._morph.parse(string)
|
||||
if len(analyses) == 1:
|
||||
return analyses[0].normal_form
|
||||
return string
|
||||
|
||||
|
||||
def oc2ud(oc_tag):
|
||||
def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
|
||||
gram_map = {
|
||||
"_POS": {
|
||||
"ADJF": "ADJ",
|
||||
|
@ -160,11 +161,9 @@ def oc2ud(oc_tag):
|
|||
"Voice": {"actv": "Act", "pssv": "Pass"},
|
||||
"Abbr": {"Abbr": "Yes"},
|
||||
}
|
||||
|
||||
pos = "X"
|
||||
morphology = dict()
|
||||
unmatched = set()
|
||||
|
||||
grams = oc_tag.replace(" ", ",").split(",")
|
||||
for gram in grams:
|
||||
match = False
|
||||
|
@ -177,7 +176,6 @@ def oc2ud(oc_tag):
|
|||
morphology[categ] = gmap[gram]
|
||||
if not match:
|
||||
unmatched.add(gram)
|
||||
|
||||
while len(unmatched) > 0:
|
||||
gram = unmatched.pop()
|
||||
if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"):
|
||||
|
@ -186,8 +184,4 @@ def oc2ud(oc_tag):
|
|||
pos = "AUX"
|
||||
elif gram == "Pltm":
|
||||
morphology["Number"] = "Ptan"
|
||||
|
||||
return pos, morphology
|
||||
|
||||
|
||||
PUNCT_RULES = {"«": '"', "»": '"'}
|
||||
|
|
|
@ -1,20 +1,33 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import registry
|
||||
|
||||
|
||||
class SinhalaDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "si"
|
||||
stop_words = STOP_WORDS
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "si"
|
||||
stop_words = {"@language_data": "spacy.si.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.si.lex_attr_getters"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.si.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.si.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class Sinhala(Language):
|
||||
lang = "si"
|
||||
Defaults = SinhalaDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Sinhala"]
|
||||
|
|
|
@ -1,20 +1,33 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import registry
|
||||
|
||||
|
||||
class SlovakDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "sk"
|
||||
stop_words = STOP_WORDS
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "sk"
|
||||
stop_words = {"@language_data": "spacy.sk.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.sk.lex_attr_getters"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.sk.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.sk.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class Slovak(Language):
|
||||
lang = "sk"
|
||||
Defaults = SlovakDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Slovak"]
|
||||
|
|
|
@ -1,17 +1,26 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import registry
|
||||
|
||||
|
||||
class SlovenianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "sl"
|
||||
stop_words = STOP_WORDS
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "sl"
|
||||
stop_words = {"@language_data": "spacy.sl.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.sl.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
class Slovenian(Language):
|
||||
lang = "sl"
|
||||
Defaults = SlovenianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Slovenian"]
|
||||
|
|
|
@ -1,17 +1,26 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import registry
|
||||
|
||||
|
||||
class AlbanianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "sq"
|
||||
stop_words = STOP_WORDS
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "sq"
|
||||
stop_words = {"@language_data": "spacy.sq.stop_words"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.sq.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
class Albanian(Language):
|
||||
lang = "sq"
|
||||
Defaults = AlbanianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Albanian"]
|
||||
|
|
|
@ -1,23 +1,47 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "sr"
|
||||
stop_words = {"@language_data": "spacy.sr.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.sr.lex_attr_getters"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.sr.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.sr.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class SerbianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "sr"
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Serbian(Language):
|
||||
lang = "sr"
|
||||
Defaults = SerbianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Serbian"]
|
||||
|
|
|
@ -1,35 +1,54 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...util import update_exc, registry
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
# Punctuation stolen from Danish
|
||||
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "sv"
|
||||
stop_words = {"@language_data": "spacy.sv.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.sv.lex_attr_getters"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.sv.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.sv.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class SwedishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "sv"
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
stop_words = STOP_WORDS
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class Swedish(Language):
|
||||
lang = "sv"
|
||||
Defaults = SwedishDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Swedish"]
|
||||
|
|
|
@ -1,20 +1,33 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import registry
|
||||
|
||||
|
||||
class TamilDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "ta"
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
stop_words = STOP_WORDS
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "ta"
|
||||
stop_words = {"@language_data": "spacy.ta.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.ta.lex_attr_getters"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.ta.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.ta.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class Tamil(Language):
|
||||
lang = "ta"
|
||||
Defaults = TamilDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Tamil"]
|
||||
|
|
|
@ -1,20 +1,33 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import registry
|
||||
|
||||
|
||||
class TeluguDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "te"
|
||||
stop_words = STOP_WORDS
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "te"
|
||||
stop_words = {"@language_data": "spacy.te.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.te.lex_attr_getters"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.te.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.te.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class Telugu(Language):
|
||||
lang = "te"
|
||||
Defaults = TeluguDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Telugu"]
|
||||
|
|
|
@ -1,15 +1,44 @@
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
from ...attrs import LANG
|
||||
from ...language import Language
|
||||
from ...tokens import Doc
|
||||
from ...util import DummyTokenizer
|
||||
from ...util import DummyTokenizer, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "th"
|
||||
stop_words = {"@language_data": "spacy.th.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.th.lex_attr_getters"}
|
||||
|
||||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.ThaiTokenizer.v1"
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.th.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.th.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
@registry.tokenizers("spacy.ThaiTokenizer.v1")
def create_thai_tokenizer():
    """Registered entry point for the "spacy.ThaiTokenizer.v1" tokenizer.

    Takes no settings. Returns a one-argument callable; the tokenizer itself
    is only constructed once that callable is handed the nlp object.
    """

    def thai_tokenizer_factory(nlp):
        # ThaiTokenizer takes its vocab from the nlp object it is given.
        tokenizer = ThaiTokenizer(nlp)
        return tokenizer

    return thai_tokenizer_factory
|
||||
|
||||
|
||||
class ThaiTokenizer(DummyTokenizer):
|
||||
def __init__(self, cls, nlp=None):
|
||||
def __init__(self, nlp: Language) -> None:
|
||||
try:
|
||||
from pythainlp.tokenize import word_tokenize
|
||||
except ImportError:
|
||||
|
@ -17,34 +46,18 @@ class ThaiTokenizer(DummyTokenizer):
|
|||
"The Thai tokenizer requires the PyThaiNLP library: "
|
||||
"https://github.com/PyThaiNLP/pythainlp"
|
||||
)
|
||||
|
||||
self.word_tokenize = word_tokenize
|
||||
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
|
||||
self.vocab = nlp.vocab
|
||||
|
||||
def __call__(self, text):
|
||||
def __call__(self, text: str) -> Doc:
|
||||
words = list(self.word_tokenize(text))
|
||||
spaces = [False] * len(words)
|
||||
return Doc(self.vocab, words=words, spaces=spaces)
|
||||
|
||||
|
||||
class ThaiDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda _text: "th"
|
||||
tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
@classmethod
|
||||
def create_tokenizer(cls, nlp=None):
|
||||
return ThaiTokenizer(cls, nlp)
|
||||
|
||||
|
||||
class Thai(Language):
|
||||
lang = "th"
|
||||
Defaults = ThaiDefaults
|
||||
|
||||
def make_doc(self, text):
|
||||
return self.tokenizer(text)
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Thai"]
|
||||
|
|
|
@ -1,31 +1,47 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
def _return_tl(_):
|
||||
return "tl"
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "tl"
|
||||
stop_words = {"@language_data": "spacy.tl.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.tl.lex_attr_getters"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.tl.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.tl.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class TagalogDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = _return_tl
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Tagalog(Language):
|
||||
lang = "tl"
|
||||
Defaults = TagalogDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Tagalog"]
|
||||
|
|
|
@ -1,26 +1,40 @@
|
|||
from typing import Set
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "tr"
|
||||
stop_words = {"@language_data": "spacy.tr.stop_words"}
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.tr.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
class TurkishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "tr"
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Turkish(Language):
|
||||
lang = "tr"
|
||||
Defaults = TurkishDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Turkish"]
|
||||
|
|
|
@ -1,28 +1,42 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...attrs import LANG
|
||||
from ...language import Language
|
||||
from ...util import update_exc
|
||||
from ...util import update_exc, registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "tt"
|
||||
stop_words = {"@language_data": "spacy.tt.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.tt.lex_attr_getters"}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.tt.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.tt.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class TatarDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "tt"
|
||||
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Tatar(Language):
|
||||
lang = "tt"
|
||||
Defaults = TatarDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Tatar"]
|
||||
|
|
|
@ -1,36 +1,49 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...util import update_exc, add_lookups
|
||||
from ...util import update_exc, registry
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...attrs import LANG, NORM
|
||||
from .lemmatizer import UkrainianLemmatizer
|
||||
|
||||
|
||||
class UkrainianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "uk"
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "uk"
|
||||
stop_words = {"@language_data": "spacy.uk.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.uk.lex_attr_getters"}
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None, lookups=None):
|
||||
if lookups is None:
|
||||
lookups = Lookups()
|
||||
return UkrainianLemmatizer(lookups)
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.UkrainianLemmatizer.v1"
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.uk.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.uk.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
@registry.lemmatizers("spacy.UkrainianLemmatizer.v1")
def create_ukrainian_lemmatizer() -> UkrainianLemmatizer:
    """Registered entry point for the "spacy.UkrainianLemmatizer.v1" lemmatizer.

    Builds the lemmatizer with no arguments, i.e. without passing a lookups
    table to its constructor.
    """
    lemmatizer = UkrainianLemmatizer()
    return lemmatizer
|
||||
|
||||
|
||||
class UkrainianDefaults(Language.Defaults):
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
|
||||
|
||||
class Ukrainian(Language):
|
||||
lang = "uk"
|
||||
Defaults = UkrainianDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Ukrainian"]
|
||||
|
|
|
@ -1,11 +1,17 @@
|
|||
from typing import Optional, List, Tuple, Dict
|
||||
|
||||
from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS
|
||||
from ...lookups import Lookups
|
||||
from ...lemmatizer import Lemmatizer
|
||||
|
||||
|
||||
PUNCT_RULES = {"«": '"', "»": '"'}
|
||||
|
||||
|
||||
class UkrainianLemmatizer(Lemmatizer):
|
||||
_morph = None
|
||||
|
||||
def __init__(self, lookups=None):
|
||||
def __init__(self, lookups: Optional[Lookups] = None) -> None:
|
||||
super(UkrainianLemmatizer, self).__init__(lookups)
|
||||
try:
|
||||
from pymorphy2 import MorphAnalyzer
|
||||
|
@ -19,15 +25,15 @@ class UkrainianLemmatizer(Lemmatizer):
|
|||
'"pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"'
|
||||
)
|
||||
|
||||
def __call__(self, string, univ_pos, morphology=None):
|
||||
def __call__(
|
||||
self, string: str, univ_pos: str, morphology: Optional[dict] = None
|
||||
) -> List[str]:
|
||||
univ_pos = self.normalize_univ_pos(univ_pos)
|
||||
if univ_pos == "PUNCT":
|
||||
return [PUNCT_RULES.get(string, string)]
|
||||
|
||||
if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
|
||||
# Skip unchangeable pos
|
||||
return [string.lower()]
|
||||
|
||||
analyses = self._morph.parse(string)
|
||||
filtered_analyses = []
|
||||
for analysis in analyses:
|
||||
|
@ -39,12 +45,10 @@ class UkrainianLemmatizer(Lemmatizer):
|
|||
analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN")
|
||||
):
|
||||
filtered_analyses.append(analysis)
|
||||
|
||||
if not len(filtered_analyses):
|
||||
return [string.lower()]
|
||||
if morphology is None or (len(morphology) == 1 and POS in morphology):
|
||||
return list(set([analysis.normal_form for analysis in filtered_analyses]))
|
||||
|
||||
if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
|
||||
features_to_compare = ["Case", "Number", "Gender"]
|
||||
elif univ_pos == "NUM":
|
||||
|
@ -61,7 +65,6 @@ class UkrainianLemmatizer(Lemmatizer):
|
|||
"VerbForm",
|
||||
"Voice",
|
||||
]
|
||||
|
||||
analyses, filtered_analyses = filtered_analyses, []
|
||||
for analysis in analyses:
|
||||
_, analysis_morph = oc2ud(str(analysis.tag))
|
||||
|
@ -74,16 +77,14 @@ class UkrainianLemmatizer(Lemmatizer):
|
|||
break
|
||||
else:
|
||||
filtered_analyses.append(analysis)
|
||||
|
||||
if not len(filtered_analyses):
|
||||
return [string.lower()]
|
||||
return list(set([analysis.normal_form for analysis in filtered_analyses]))
|
||||
|
||||
@staticmethod
|
||||
def normalize_univ_pos(univ_pos):
|
||||
def normalize_univ_pos(univ_pos: str) -> Optional[str]:
|
||||
if isinstance(univ_pos, str):
|
||||
return univ_pos.upper()
|
||||
|
||||
symbols_to_str = {
|
||||
ADJ: "ADJ",
|
||||
DET: "DET",
|
||||
|
@ -98,14 +99,14 @@ class UkrainianLemmatizer(Lemmatizer):
|
|||
return symbols_to_str[univ_pos]
|
||||
return None
|
||||
|
||||
def lookup(self, string, orth=None):
|
||||
def lookup(self, string: str, orth: Optional[int] = None) -> str:
|
||||
analyses = self._morph.parse(string)
|
||||
if len(analyses) == 1:
|
||||
return analyses[0].normal_form
|
||||
return string
|
||||
|
||||
|
||||
def oc2ud(oc_tag):
|
||||
def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
|
||||
gram_map = {
|
||||
"_POS": {
|
||||
"ADJF": "ADJ",
|
||||
|
@ -160,11 +161,9 @@ def oc2ud(oc_tag):
|
|||
"Voice": {"actv": "Act", "pssv": "Pass"},
|
||||
"Abbr": {"Abbr": "Yes"},
|
||||
}
|
||||
|
||||
pos = "X"
|
||||
morphology = dict()
|
||||
unmatched = set()
|
||||
|
||||
grams = oc_tag.replace(" ", ",").split(",")
|
||||
for gram in grams:
|
||||
match = False
|
||||
|
@ -177,7 +176,6 @@ def oc2ud(oc_tag):
|
|||
morphology[categ] = gmap[gram]
|
||||
if not match:
|
||||
unmatched.add(gram)
|
||||
|
||||
while len(unmatched) > 0:
|
||||
gram = unmatched.pop()
|
||||
if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"):
|
||||
|
@ -186,8 +184,4 @@ def oc2ud(oc_tag):
|
|||
pos = "AUX"
|
||||
elif gram == "Pltm":
|
||||
morphology["Number"] = "Ptan"
|
||||
|
||||
return pos, morphology
|
||||
|
||||
|
||||
PUNCT_RULES = {"«": '"', "»": '"'}
|
||||
|
|
|
@ -1,26 +1,53 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import registry
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "ur"
|
||||
stop_words = {"@language_data": "spacy.ur.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.ur.lex_attr_getters"}
|
||||
|
||||
[nlp.writing_system]
|
||||
direction = "rtl"
|
||||
has_case = false
|
||||
has_letters = true
|
||||
|
||||
[nlp.lemmatizer]
|
||||
@lemmatizers = "spacy.Lemmatizer.v1"
|
||||
|
||||
[nlp.lemmatizer.data_paths]
|
||||
@language_data = "spacy-lookups-data"
|
||||
lang = ${nlp:lang}
|
||||
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.ur.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.ur.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
class UrduDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "ur"
|
||||
|
||||
tokenizer_exceptions = BASE_EXCEPTIONS
|
||||
stop_words = STOP_WORDS
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}
|
||||
|
||||
|
||||
class Urdu(Language):
|
||||
lang = "ur"
|
||||
Defaults = UrduDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Urdu"]
|
||||
|
|
|
@ -1,38 +1,62 @@
|
|||
from ...attrs import LANG, NORM
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from ...language import Language
|
||||
from ...tokens import Doc
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...util import add_lookups
|
||||
from ...util import DummyTokenizer, registry
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
|
||||
class VietnameseDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "vi" # for pickling
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
stop_words = STOP_WORDS
|
||||
use_pyvi = True
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "vi"
|
||||
stop_words = {"@language_data": "spacy.vi.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.vi.lex_attr_getters"}
|
||||
|
||||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.VietnameseTokenizer.v1"
|
||||
use_pyvi = true
|
||||
"""
|
||||
|
||||
|
||||
class Vietnamese(Language):
|
||||
lang = "vi"
|
||||
Defaults = VietnameseDefaults # override defaults
|
||||
@registry.language_data("spacy.vi.stop_words")
|
||||
def stop_words() -> Set[str]:
|
||||
return STOP_WORDS
|
||||
|
||||
def make_doc(self, text):
|
||||
if self.Defaults.use_pyvi:
|
||||
|
||||
@registry.language_data("spacy.vi.lex_attr_getters")
|
||||
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
|
||||
return LEX_ATTRS
|
||||
|
||||
|
||||
@registry.tokenizers("spacy.VietnameseTokenizer.v1")
def create_vietnamese_tokenizer(use_pyvi: bool = True):
    """Registered entry point for the "spacy.VietnameseTokenizer.v1" tokenizer.

    use_pyvi (bool): Forwarded unchanged to VietnameseTokenizer. Note that
        the default here is True, whereas the tokenizer class itself
        defaults to False.
    RETURNS: A one-argument callable that builds the tokenizer once it is
        handed the nlp object.
    """

    def vietnamese_tokenizer_factory(nlp):
        # The setting is captured by the closure, so the caller only has to
        # supply the nlp object.
        tokenizer = VietnameseTokenizer(nlp, use_pyvi=use_pyvi)
        return tokenizer

    return vietnamese_tokenizer_factory
|
||||
|
||||
|
||||
class VietnameseTokenizer(DummyTokenizer):
|
||||
def __init__(self, nlp: Language, use_pyvi: bool = False):
|
||||
self.vocab = nlp.vocab
|
||||
self.use_pyvi = use_pyvi
|
||||
if self.use_pyvi:
|
||||
try:
|
||||
from pyvi import ViTokenizer
|
||||
|
||||
self.ViTokenizer = ViTokenizer
|
||||
except ImportError:
|
||||
msg = (
|
||||
"Pyvi not installed. Either set Vietnamese.use_pyvi = False, "
|
||||
"Pyvi not installed. Either set use_pyvi = False, "
|
||||
"or install it https://pypi.python.org/pypi/pyvi"
|
||||
)
|
||||
raise ImportError(msg)
|
||||
words, spaces = ViTokenizer.spacy_tokenize(text)
|
||||
|
||||
def __call__(self, text: str) -> Doc:
|
||||
if self.use_pyvi:
|
||||
words, spaces = self.ViTokenizer.spacy_tokenize(text)
|
||||
return Doc(self.vocab, words=words, spaces=spaces)
|
||||
else:
|
||||
words = []
|
||||
|
@ -44,4 +68,9 @@ class Vietnamese(Language):
|
|||
return Doc(self.vocab, words=words, spaces=spaces)
|
||||
|
||||
|
||||
class Vietnamese(Language):
|
||||
lang = "vi"
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Vietnamese"]
|
||||
|
|
|
@ -1,17 +1,17 @@
|
|||
from thinc.api import Config
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "xx"
|
||||
"""
|
||||
|
||||
|
||||
class MultiLanguageDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: "xx"
|
||||
lex_attr_getters[NORM] = add_lookups(
|
||||
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
|
||||
)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
tokenizer_exceptions = BASE_EXCEPTIONS
|
||||
|
||||
|
||||
class MultiLanguage(Language):
|
||||
|
@ -21,6 +21,7 @@ class MultiLanguage(Language):
|
|||
|
||||
lang = "xx"
|
||||
Defaults = MultiLanguageDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["MultiLanguage"]
|
||||
|
|
|
@ -1,21 +1,39 @@
|
|||
from typing import Set, Dict, Callable, Any
|
||||
from thinc.api import Config
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import registry
|
||||
|
||||
|
||||
# Default config for Yoruba. The language code has to be "yo" so that it agrees
# with Yoruba.lang and with the "spacy.yo.*" language data entries referenced
# below (it previously said "si", a copy-paste leftover).
DEFAULT_CONFIG = """
[nlp]
lang = "yo"
stop_words = {"@language_data": "spacy.yo.stop_words"}
lex_attr_getters = {"@language_data": "spacy.yo.lex_attr_getters"}
"""
|
||||
|
||||
|
||||
@registry.language_data("spacy.yo.stop_words")
def stop_words() -> Set[str]:
    """Provide the stop words registered under "spacy.yo.stop_words".

    RETURNS (Set[str]): The STOP_WORDS object imported from .stop_words,
        returned as-is.
    """
    return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.yo.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    """Provide the lexical attribute getters registered under
    "spacy.yo.lex_attr_getters".

    RETURNS (Dict[int, Callable[[str], Any]]): The LEX_ATTRS object imported
        from .lex_attrs, returned as-is.
    """
    return LEX_ATTRS
|
||||
|
||||
|
||||
class YorubaDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "yo"
|
||||
stop_words = STOP_WORDS
|
||||
tokenizer_exceptions = BASE_EXCEPTIONS
|
||||
|
||||
|
||||
class Yoruba(Language):
|
||||
lang = "yo"
|
||||
Defaults = YorubaDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
|
||||
__all__ = ["Yoruba"]
|
||||
|
|
|
@ -1,13 +1,15 @@
|
|||
from typing import Optional, List, Set, Dict, Callable, Any
|
||||
from enum import Enum
|
||||
import tempfile
|
||||
import srsly
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from collections import OrderedDict
|
||||
from ...attrs import LANG
|
||||
from thinc.api import Config
|
||||
|
||||
from ...errors import Warnings, Errors
|
||||
from ...language import Language
|
||||
from ...tokens import Doc
|
||||
from ...util import DummyTokenizer
|
||||
from ...util import DummyTokenizer, registry
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .stop_words import STOP_WORDS
|
||||
|
@ -16,88 +18,103 @@ from ... import util
|
|||
|
||||
_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python"
|
||||
|
||||
DEFAULT_CONFIG = """
|
||||
[nlp]
|
||||
lang = "zh"
|
||||
stop_words = {"@language_data": "spacy.zh.stop_words"}
|
||||
lex_attr_getters = {"@language_data": "spacy.zh.lex_attr_getters"}
|
||||
|
||||
def try_jieba_import(segmenter):
|
||||
try:
|
||||
import jieba
|
||||
[nlp.tokenizer]
|
||||
@tokenizers = "spacy.ChineseTokenizer.v1"
|
||||
segmenter = "char"
|
||||
pkuseg_model = null
|
||||
pkuseg_user_dict = "default"
|
||||
|
||||
if segmenter == "jieba":
|
||||
# segment a short text to have jieba initialize its cache in advance
|
||||
list(jieba.cut("作为", cut_all=False))
|
||||
|
||||
return jieba
|
||||
except ImportError:
|
||||
if segmenter == "jieba":
|
||||
msg = (
|
||||
"Jieba not installed. To use jieba, install it with `pip "
|
||||
" install jieba` or from https://github.com/fxsjy/jieba"
|
||||
)
|
||||
raise ImportError(msg)
|
||||
[nlp.writing_system]
|
||||
direction = "ltr"
|
||||
has_case = false
|
||||
has_letters = false
|
||||
"""
|
||||
|
||||
|
||||
def try_pkuseg_import(segmenter, pkuseg_model, pkuseg_user_dict):
|
||||
try:
|
||||
import pkuseg
|
||||
class Segmenter(str, Enum):
|
||||
char = "char"
|
||||
jieba = "jieba"
|
||||
pkuseg = "pkuseg"
|
||||
|
||||
if pkuseg_model:
|
||||
return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
|
||||
elif segmenter == "pkuseg":
|
||||
msg = (
|
||||
"The Chinese word segmenter is 'pkuseg' but no pkuseg model "
|
||||
"was specified. Please provide the name of a pretrained model "
|
||||
"or the path to a model with "
|
||||
'`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
|
||||
'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`'
|
||||
)
|
||||
raise ValueError(msg)
|
||||
except ImportError:
|
||||
if segmenter == "pkuseg":
|
||||
msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
|
||||
raise ImportError(msg)
|
||||
except FileNotFoundError:
|
||||
if segmenter == "pkuseg":
|
||||
msg = "Unable to load pkuseg model from: " + pkuseg_model
|
||||
raise FileNotFoundError(msg)
|
||||
@classmethod
|
||||
def values(cls):
|
||||
return list(cls.__members__.keys())
|
||||
|
||||
|
||||
@registry.language_data("spacy.zh.stop_words")
def stop_words() -> Set[str]:
    """Provide the stop words registered under "spacy.zh.stop_words".

    RETURNS (Set[str]): The STOP_WORDS object imported from .stop_words,
        returned as-is.
    """
    return STOP_WORDS
|
||||
|
||||
|
||||
@registry.language_data("spacy.zh.lex_attr_getters")
def lex_attr_getters() -> Dict[int, Callable[[str], Any]]:
    """Provide the lexical attribute getters registered under
    "spacy.zh.lex_attr_getters".

    RETURNS (Dict[int, Callable[[str], Any]]): The LEX_ATTRS object imported
        from .lex_attrs, returned as-is.
    """
    return LEX_ATTRS
|
||||
|
||||
|
||||
@registry.tokenizers("spacy.ChineseTokenizer.v1")
def create_chinese_tokenizer(
    segmenter: Segmenter = Segmenter.char,
    pkuseg_model: Optional[str] = None,
    pkuseg_user_dict: Optional[str] = "default",
):
    """Build the tokenizer factory registered as "spacy.ChineseTokenizer.v1".

    segmenter (Segmenter): Which segmenter the tokenizer should use.
    pkuseg_model (Optional[str]): Forwarded to ChineseTokenizer unchanged.
    pkuseg_user_dict (Optional[str]): Forwarded to ChineseTokenizer unchanged.
    RETURNS (Callable): A function that takes the nlp object and returns a
        ChineseTokenizer configured with the settings above.
    """
    # Collect the settings once; the closure below forwards them as keywords.
    settings = {
        "segmenter": segmenter,
        "pkuseg_model": pkuseg_model,
        "pkuseg_user_dict": pkuseg_user_dict,
    }

    def chinese_tokenizer_factory(nlp):
        return ChineseTokenizer(nlp, **settings)

    return chinese_tokenizer_factory
|
||||
|
||||
|
||||
class ChineseTokenizer(DummyTokenizer):
|
||||
def __init__(self, cls, nlp=None, config={}):
|
||||
self.supported_segmenters = ("char", "jieba", "pkuseg")
|
||||
self.configure_segmenter(config)
|
||||
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
|
||||
# remove relevant settings from config so they're not also saved in
|
||||
# Language.meta
|
||||
for key in ["segmenter", "pkuseg_model", "pkuseg_user_dict"]:
|
||||
if key in config:
|
||||
del config[key]
|
||||
self.tokenizer = Language.Defaults().create_tokenizer(nlp)
|
||||
def __init__(
|
||||
self,
|
||||
nlp: Language,
|
||||
segmenter: Segmenter = Segmenter.char,
|
||||
pkuseg_model: Optional[str] = None,
|
||||
pkuseg_user_dict: Optional[str] = None,
|
||||
):
|
||||
self.vocab = nlp.vocab
|
||||
if isinstance(segmenter, Segmenter): # we might have the Enum here
|
||||
segmenter = segmenter.value
|
||||
self.segmenter = segmenter
|
||||
self.pkuseg_model = pkuseg_model
|
||||
self.pkuseg_user_dict = pkuseg_user_dict
|
||||
self.pkuseg_seg = None
|
||||
self.jieba_seg = None
|
||||
self.configure_segmenter(segmenter)
|
||||
|
||||
def configure_segmenter(self, config):
|
||||
self.segmenter = "char"
|
||||
if "segmenter" in config:
|
||||
if config["segmenter"] in self.supported_segmenters:
|
||||
self.segmenter = config["segmenter"]
|
||||
else:
|
||||
warn_msg = Warnings.W103.format(
|
||||
lang="Chinese",
|
||||
segmenter=config["segmenter"],
|
||||
supported=", ".join([repr(s) for s in self.supported_segmenters]),
|
||||
default="'char' (character segmentation)",
|
||||
)
|
||||
warnings.warn(warn_msg)
|
||||
def configure_segmenter(self, segmenter: str):
|
||||
if segmenter not in Segmenter.values():
|
||||
warn_msg = Warnings.W103.format(
|
||||
lang="Chinese",
|
||||
segmenter=segmenter,
|
||||
supported=", ".join(Segmenter.values()),
|
||||
default="'char' (character segmentation)",
|
||||
)
|
||||
warnings.warn(warn_msg)
|
||||
self.segmenter = Segmenter.char
|
||||
self.jieba_seg = try_jieba_import(self.segmenter)
|
||||
self.pkuseg_seg = try_pkuseg_import(
|
||||
self.segmenter,
|
||||
pkuseg_model=config.get("pkuseg_model", None),
|
||||
pkuseg_user_dict=config.get("pkuseg_user_dict", "default"),
|
||||
pkuseg_model=self.pkuseg_model,
|
||||
pkuseg_user_dict=self.pkuseg_user_dict,
|
||||
)
|
||||
|
||||
def __call__(self, text):
|
||||
if self.segmenter == "jieba":
|
||||
def __call__(self, text: str) -> Doc:
|
||||
if self.segmenter == Segmenter.jieba:
|
||||
words = list([x for x in self.jieba_seg.cut(text, cut_all=False) if x])
|
||||
(words, spaces) = util.get_words_and_spaces(words, text)
|
||||
return Doc(self.vocab, words=words, spaces=spaces)
|
||||
elif self.segmenter == "pkuseg":
|
||||
elif self.segmenter == Segmenter.pkuseg:
|
||||
if self.pkuseg_seg is None:
|
||||
raise ValueError(Errors.E1000)
|
||||
words = self.pkuseg_seg.cut(text)
|
||||
|
@ -105,11 +122,11 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
return Doc(self.vocab, words=words, spaces=spaces)
|
||||
|
||||
# warn if segmenter setting is not the only remaining option "char"
|
||||
if self.segmenter != "char":
|
||||
if self.segmenter != Segmenter.char:
|
||||
warn_msg = Warnings.W103.format(
|
||||
lang="Chinese",
|
||||
segmenter=self.segmenter,
|
||||
supported=", ".join([repr(s) for s in self.supported_segmenters]),
|
||||
supported=", ".join(Segmenter.values()),
|
||||
default="'char' (character segmentation)",
|
||||
)
|
||||
warnings.warn(warn_msg)
|
||||
|
@ -119,33 +136,25 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
(words, spaces) = util.get_words_and_spaces(words, text)
|
||||
return Doc(self.vocab, words=words, spaces=spaces)
|
||||
|
||||
def pkuseg_update_user_dict(self, words, reset=False):
|
||||
if self.segmenter == "pkuseg":
|
||||
def pkuseg_update_user_dict(self, words: List[str], reset: bool = False):
|
||||
if self.segmenter == Segmenter.pkuseg:
|
||||
if reset:
|
||||
try:
|
||||
import pkuseg
|
||||
|
||||
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
|
||||
except ImportError:
|
||||
if self.segmenter == "pkuseg":
|
||||
msg = (
|
||||
"pkuseg not installed: unable to reset pkuseg "
|
||||
"user dict. Please " + _PKUSEG_INSTALL_MSG
|
||||
)
|
||||
raise ImportError(msg)
|
||||
msg = (
|
||||
"pkuseg not installed: unable to reset pkuseg "
|
||||
"user dict. Please " + _PKUSEG_INSTALL_MSG
|
||||
)
|
||||
raise ImportError(msg)
|
||||
for word in words:
|
||||
self.pkuseg_seg.preprocesser.insert(word.strip(), "")
|
||||
else:
|
||||
warn_msg = Warnings.W104.format(target="pkuseg", current=self.segmenter)
|
||||
warnings.warn(warn_msg)
|
||||
|
||||
def _get_config(self):
|
||||
config = OrderedDict((("segmenter", self.segmenter),))
|
||||
return config
|
||||
|
||||
def _set_config(self, config={}):
|
||||
self.configure_segmenter(config)
|
||||
|
||||
def to_bytes(self, **kwargs):
|
||||
pkuseg_features_b = b""
|
||||
pkuseg_weights_b = b""
|
||||
|
@ -165,17 +174,11 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
sorted(list(self.pkuseg_seg.postprocesser.common_words)),
|
||||
sorted(list(self.pkuseg_seg.postprocesser.other_words)),
|
||||
)
|
||||
serializers = OrderedDict(
|
||||
(
|
||||
("cfg", lambda: srsly.json_dumps(self._get_config())),
|
||||
("pkuseg_features", lambda: pkuseg_features_b),
|
||||
("pkuseg_weights", lambda: pkuseg_weights_b),
|
||||
(
|
||||
"pkuseg_processors",
|
||||
lambda: srsly.msgpack_dumps(pkuseg_processors_data),
|
||||
),
|
||||
)
|
||||
)
|
||||
serializers = {
|
||||
"pkuseg_features": lambda: pkuseg_features_b,
|
||||
"pkuseg_weights": lambda: pkuseg_weights_b,
|
||||
"pkuseg_processors": lambda: srsly.msgpack_dumps(pkuseg_processors_data),
|
||||
}
|
||||
return util.to_bytes(serializers, [])
|
||||
|
||||
def from_bytes(self, data, **kwargs):
|
||||
|
@ -190,14 +193,11 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
def deserialize_pkuseg_processors(b):
|
||||
pkuseg_data["processors_data"] = srsly.msgpack_loads(b)
|
||||
|
||||
deserializers = OrderedDict(
|
||||
(
|
||||
("cfg", lambda b: self._set_config(srsly.json_loads(b))),
|
||||
("pkuseg_features", deserialize_pkuseg_features),
|
||||
("pkuseg_weights", deserialize_pkuseg_weights),
|
||||
("pkuseg_processors", deserialize_pkuseg_processors),
|
||||
)
|
||||
)
|
||||
deserializers = {
|
||||
"pkuseg_features": deserialize_pkuseg_features,
|
||||
"pkuseg_weights": deserialize_pkuseg_weights,
|
||||
"pkuseg_processors": deserialize_pkuseg_processors,
|
||||
}
|
||||
util.from_bytes(data, deserializers, [])
|
||||
|
||||
if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
|
||||
|
@ -245,13 +245,10 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
)
|
||||
srsly.write_msgpack(path, data)
|
||||
|
||||
serializers = OrderedDict(
|
||||
(
|
||||
("cfg", lambda p: srsly.write_json(p, self._get_config())),
|
||||
("pkuseg_model", lambda p: save_pkuseg_model(p)),
|
||||
("pkuseg_processors", lambda p: save_pkuseg_processors(p)),
|
||||
)
|
||||
)
|
||||
serializers = {
|
||||
"pkuseg_model": lambda p: save_pkuseg_model(p),
|
||||
"pkuseg_processors": lambda p: save_pkuseg_processors(p),
|
||||
}
|
||||
return util.to_disk(path, serializers, [])
|
||||
|
||||
def from_disk(self, path, **kwargs):
|
||||
|
@ -261,7 +258,7 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
try:
|
||||
import pkuseg
|
||||
except ImportError:
|
||||
if self.segmenter == "pkuseg":
|
||||
if self.segmenter == Segmenter.pkuseg:
|
||||
raise ImportError(
|
||||
"pkuseg not installed. To use this model, "
|
||||
+ _PKUSEG_INSTALL_MSG
|
||||
|
@ -273,9 +270,9 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
try:
|
||||
import pkuseg
|
||||
except ImportError:
|
||||
if self.segmenter == "pkuseg":
|
||||
if self.segmenter == Segmenter.pkuseg:
|
||||
raise ImportError(self._pkuseg_install_msg)
|
||||
if self.segmenter == "pkuseg":
|
||||
if self.segmenter == Segmenter.pkuseg:
|
||||
data = srsly.read_msgpack(path)
|
||||
(user_dict, do_process, common_words, other_words) = data
|
||||
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
|
||||
|
@ -283,35 +280,64 @@ class ChineseTokenizer(DummyTokenizer):
|
|||
self.pkuseg_seg.postprocesser.common_words = set(common_words)
|
||||
self.pkuseg_seg.postprocesser.other_words = set(other_words)
|
||||
|
||||
serializers = OrderedDict(
|
||||
(
|
||||
("cfg", lambda p: self._set_config(srsly.read_json(p))),
|
||||
("pkuseg_model", lambda p: load_pkuseg_model(p)),
|
||||
("pkuseg_processors", lambda p: load_pkuseg_processors(p)),
|
||||
)
|
||||
)
|
||||
serializers = {
|
||||
"pkuseg_model": lambda p: load_pkuseg_model(p),
|
||||
"pkuseg_processors": lambda p: load_pkuseg_processors(p),
|
||||
}
|
||||
util.from_disk(path, serializers, [])
|
||||
|
||||
|
||||
class ChineseDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: "zh"
|
||||
tokenizer_exceptions = BASE_EXCEPTIONS
|
||||
stop_words = STOP_WORDS
|
||||
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
|
||||
|
||||
@classmethod
|
||||
def create_tokenizer(cls, nlp=None, config={}):
|
||||
return ChineseTokenizer(cls, nlp, config=config)
|
||||
|
||||
|
||||
class Chinese(Language):
|
||||
lang = "zh"
|
||||
Defaults = ChineseDefaults # override defaults
|
||||
Defaults = ChineseDefaults
|
||||
default_config = Config().from_str(DEFAULT_CONFIG)
|
||||
|
||||
def make_doc(self, text):
|
||||
return self.tokenizer(text)
|
||||
|
||||
def try_jieba_import(segmenter: str) -> Optional[Any]:
    """Import jieba if it is available and warm it up when it is the segmenter.

    segmenter (str): The configured segmenter name.
    RETURNS (Optional[Any]): The imported jieba module, or None if jieba is not
        installed and the segmenter is not "jieba".
    RAISES (ImportError): If the segmenter is "jieba" but jieba can't be
        imported.
    """
    # Only the import itself is guarded, so an unrelated ImportError raised
    # while jieba is running is not misreported as "not installed".
    try:
        import jieba
    except ImportError:
        if segmenter == Segmenter.jieba:
            msg = (
                "Jieba not installed. To use jieba, install it with `pip "
                "install jieba` or from https://github.com/fxsjy/jieba"
            )
            raise ImportError(msg)
        # jieba is optional for every other segmenter.
        return None
    if segmenter == Segmenter.jieba:
        # segment a short text to have jieba initialize its cache in advance
        list(jieba.cut("作为", cut_all=False))
    return jieba
|
||||
|
||||
|
||||
def try_pkuseg_import(
    segmenter: str, pkuseg_model: Optional[str], pkuseg_user_dict: Optional[str]
) -> Optional[Any]:
    """Import pkuseg and load a pkuseg model if one was specified.

    segmenter (str): The configured segmenter name.
    pkuseg_model (Optional[str]): Name or path of the pkuseg model to load.
    pkuseg_user_dict (Optional[str]): User dict setting passed to pkuseg.
    RETURNS (Optional[Any]): The object returned by pkuseg.pkuseg, or None if
        no model was specified or loading failed while the segmenter is not
        "pkuseg".
    RAISES (ValueError): If the segmenter is "pkuseg" but no model was given.
    RAISES (ImportError): If the segmenter is "pkuseg" but pkuseg is missing.
    RAISES (FileNotFoundError): If the segmenter is "pkuseg" and the model
        can't be loaded from the given location.
    """
    try:
        import pkuseg

        if pkuseg_model:
            return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
        elif segmenter == Segmenter.pkuseg:
            msg = (
                "The Chinese word segmenter is 'pkuseg' but no pkuseg model "
                "was specified. Please provide the name of a pretrained model "
                "or the path to a model with:\n"
                'cfg = {"nlp": {"tokenizer": {"segmenter": "pkuseg", "pkuseg_model": name_or_path}}}\n'
                "nlp = Chinese.from_config(cfg)"
            )
            raise ValueError(msg)
    except ImportError:
        # Missing pkuseg only matters if it is the selected segmenter.
        if segmenter == Segmenter.pkuseg:
            msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
            raise ImportError(msg)
    except FileNotFoundError:
        if segmenter == Segmenter.pkuseg:
            # Formatting (rather than "+") also works for non-str paths.
            msg = f"Unable to load pkuseg model from: {pkuseg_model}"
            raise FileNotFoundError(msg)
    return None
|
||||
|
||||
|
||||
def _get_pkuseg_trie_data(node, path=""):
|
||||
|
|
1052
spacy/language.py
1052
spacy/language.py
File diff suppressed because it is too large
Load Diff
|
@ -1,5 +1,14 @@
|
|||
from typing import Optional, Callable, List, Dict
|
||||
|
||||
from .lookups import Lookups
|
||||
from .errors import Errors
|
||||
from .parts_of_speech import NAMES as UPOS_NAMES
|
||||
from .util import registry, load_language_data, SimpleFrozenDict
|
||||
|
||||
|
||||
@registry.lemmatizers("spacy.Lemmatizer.v1")
def create_lemmatizer(data_paths: dict = {}) -> "Lemmatizer":
    """Create the lemmatizer registered as "spacy.Lemmatizer.v1".

    data_paths (dict): Passed through unchanged as the Lemmatizer's data_paths
        argument. This function only forwards it and never modifies it.
    RETURNS (Lemmatizer): The newly constructed lemmatizer.
    """
    lemmatizer = Lemmatizer(data_paths=data_paths)
    return lemmatizer
|
||||
|
||||
|
||||
class Lemmatizer:
|
||||
|
@ -14,17 +23,27 @@ class Lemmatizer:
|
|||
def load(cls, *args, **kwargs):
|
||||
raise NotImplementedError(Errors.E172)
|
||||
|
||||
def __init__(self, lookups, is_base_form=None):
|
||||
def __init__(
|
||||
self,
|
||||
lookups: Optional[Lookups] = None,
|
||||
data_paths: dict = SimpleFrozenDict(),
|
||||
is_base_form: Optional[Callable] = None,
|
||||
) -> None:
|
||||
"""Initialize a Lemmatizer.
|
||||
|
||||
lookups (Lookups): The lookups object containing the (optional) tables
|
||||
"lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
|
||||
RETURNS (Lemmatizer): The newly constructed object.
|
||||
"""
|
||||
self.lookups = lookups
|
||||
self.lookups = lookups if lookups is not None else Lookups()
|
||||
for name, filename in data_paths.items():
|
||||
data = load_language_data(filename)
|
||||
self.lookups.add_table(name, data)
|
||||
self.is_base_form = is_base_form
|
||||
|
||||
def __call__(self, string, univ_pos, morphology=None):
|
||||
def __call__(
|
||||
self, string: str, univ_pos: str, morphology: Optional[dict] = None
|
||||
) -> List[str]:
|
||||
"""Lemmatize a string.
|
||||
|
||||
string (str): The string to lemmatize, e.g. the token text.
|
||||
|
@ -39,7 +58,6 @@ class Lemmatizer:
|
|||
if isinstance(univ_pos, int):
|
||||
univ_pos = UPOS_NAMES.get(univ_pos, "X")
|
||||
univ_pos = univ_pos.lower()
|
||||
|
||||
if univ_pos in ("", "eol", "space"):
|
||||
return [string.lower()]
|
||||
# See Issue #435 for example of where this logic is requied.
|
||||
|
@ -67,65 +85,31 @@ class Lemmatizer:
|
|||
)
|
||||
return lemmas
|
||||
|
||||
def is_base_form(self, univ_pos, morphology=None):
|
||||
"""
|
||||
Check whether we're dealing with an uninflected paradigm, so we can
|
||||
avoid lemmatization entirely.
|
||||
|
||||
univ_pos (str / int): The token's universal part-of-speech tag.
|
||||
morphology (dict): The token's morphological features following the
|
||||
Universal Dependencies scheme.
|
||||
"""
|
||||
if morphology is None:
|
||||
morphology = {}
|
||||
if univ_pos == "noun" and morphology.get("Number") == "sing":
|
||||
return True
|
||||
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
|
||||
return True
|
||||
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
|
||||
# morphology
|
||||
elif univ_pos == "verb" and (
|
||||
morphology.get("VerbForm") == "fin"
|
||||
and morphology.get("Tense") == "pres"
|
||||
and morphology.get("Number") is None
|
||||
):
|
||||
return True
|
||||
elif univ_pos == "adj" and morphology.get("Degree") == "pos":
|
||||
return True
|
||||
elif morphology.get("VerbForm") == "inf":
|
||||
return True
|
||||
elif morphology.get("VerbForm") == "none":
|
||||
return True
|
||||
elif morphology.get("Degree") == "pos":
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def noun(self, string, morphology=None):
|
||||
def noun(self, string: str, morphology: Optional[dict] = None) -> List[str]:
|
||||
return self(string, "noun", morphology)
|
||||
|
||||
def verb(self, string, morphology=None):
|
||||
def verb(self, string: str, morphology: Optional[dict] = None) -> List[str]:
|
||||
return self(string, "verb", morphology)
|
||||
|
||||
def adj(self, string, morphology=None):
|
||||
def adj(self, string: str, morphology: Optional[dict] = None) -> List[str]:
|
||||
return self(string, "adj", morphology)
|
||||
|
||||
def det(self, string, morphology=None):
|
||||
def det(self, string: str, morphology: Optional[dict] = None) -> List[str]:
|
||||
return self(string, "det", morphology)
|
||||
|
||||
def pron(self, string, morphology=None):
|
||||
def pron(self, string: str, morphology: Optional[dict] = None) -> List[str]:
|
||||
return self(string, "pron", morphology)
|
||||
|
||||
def adp(self, string, morphology=None):
|
||||
def adp(self, string: str, morphology: Optional[dict] = None) -> List[str]:
|
||||
return self(string, "adp", morphology)
|
||||
|
||||
def num(self, string, morphology=None):
|
||||
def num(self, string: str, morphology: Optional[dict] = None) -> List[str]:
|
||||
return self(string, "num", morphology)
|
||||
|
||||
def punct(self, string, morphology=None):
|
||||
def punct(self, string: str, morphology: Optional[dict] = None) -> List[str]:
|
||||
return self(string, "punct", morphology)
|
||||
|
||||
def lookup(self, string, orth=None):
|
||||
def lookup(self, string: str, orth: Optional[int] = None) -> str:
|
||||
"""Look up a lemma in the table, if available. If no lemma is found,
|
||||
the original string is returned.
|
||||
|
||||
|
@ -141,7 +125,13 @@ class Lemmatizer:
|
|||
return lookup_table[key]
|
||||
return string
|
||||
|
||||
def lemmatize(self, string, index, exceptions, rules):
|
||||
def lemmatize(
|
||||
self,
|
||||
string: str,
|
||||
index: Dict[str, List[str]],
|
||||
exceptions: Dict[str, Dict[str, List[str]]],
|
||||
rules: Dict[str, List[List[str]]],
|
||||
) -> List[str]:
|
||||
orig = string
|
||||
string = string.lower()
|
||||
forms = []
|
||||
|
|
|
@ -1,15 +1,32 @@
|
|||
from typing import Dict, Any, List, Union, Optional
|
||||
from pathlib import Path
|
||||
import srsly
|
||||
from preshed.bloom import BloomFilter
|
||||
from collections import OrderedDict
|
||||
|
||||
from .errors import Errors
|
||||
from .util import SimpleFrozenDict, ensure_path
|
||||
from .util import SimpleFrozenDict, ensure_path, registry
|
||||
from .strings import get_string_id
|
||||
|
||||
|
||||
UNSET = object()
|
||||
|
||||
|
||||
@registry.language_data("spacy-lookups-data")
def get_lookups(lang: str) -> Dict[str, Any]:
    """Fetch the spacy-lookups-data tables for a language, if there are any.
    An empty dict is returned when the language has no entry in the lookups
    registry, e.g. because there's no data or the package is not installed.

    lang (str): The language code (corresponds to entry point exposed by
        the spacy-lookups-data package).
    RETURNS (Dict[str, Any]): The lookups, keyed by table name.
    """
    # Guard clause: nothing registered for this language.
    if lang not in registry.lookups:
        return {}
    return registry.lookups.get(lang)
|
||||
|
||||
|
||||
class Lookups:
|
||||
"""Container for large lookup tables and dictionaries, e.g. lemmatization
|
||||
data or tokenizer exception lists. Lookups are available via vocab.lookups,
|
||||
|
@ -18,7 +35,7 @@ class Lookups:
|
|||
via doc.vocab.lookups.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
"""Initialize the Lookups object.
|
||||
|
||||
RETURNS (Lookups): The newly created object.
|
||||
|
@ -27,7 +44,7 @@ class Lookups:
|
|||
"""
|
||||
self._tables = {}
|
||||
|
||||
def __contains__(self, name):
|
||||
def __contains__(self, name: str) -> bool:
|
||||
"""Check if the lookups contain a table of a given name. Delegates to
|
||||
Lookups.has_table.
|
||||
|
||||
|
@ -36,16 +53,16 @@ class Lookups:
|
|||
"""
|
||||
return self.has_table(name)
|
||||
|
||||
def __len__(self):
|
||||
def __len__(self) -> int:
|
||||
"""RETURNS (int): The number of tables in the lookups."""
|
||||
return len(self._tables)
|
||||
|
||||
@property
|
||||
def tables(self):
|
||||
"""RETURNS (list): Names of all tables in the lookups."""
|
||||
def tables(self) -> List[str]:
|
||||
"""RETURNS (List[str]): Names of all tables in the lookups."""
|
||||
return list(self._tables.keys())
|
||||
|
||||
def add_table(self, name, data=SimpleFrozenDict()):
|
||||
def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> "Table":
|
||||
"""Add a new table to the lookups. Raises an error if the table exists.
|
||||
|
||||
name (str): Unique name of table.
|
||||
|
@ -60,12 +77,12 @@ class Lookups:
|
|||
self._tables[name] = table
|
||||
return table
|
||||
|
||||
def get_table(self, name, default=UNSET):
|
||||
def get_table(self, name: str, default: Any = UNSET) -> "Table":
|
||||
"""Get a table. Raises an error if the table doesn't exist and no
|
||||
default value is provided.
|
||||
|
||||
name (str): Name of the table.
|
||||
default: Optional default value to return if table doesn't exist.
|
||||
default (Any): Optional default value to return if table doesn't exist.
|
||||
RETURNS (Table): The table.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#get_table
|
||||
|
@ -76,7 +93,7 @@ class Lookups:
|
|||
return default
|
||||
return self._tables[name]
|
||||
|
||||
def remove_table(self, name):
|
||||
def remove_table(self, name: str) -> "Table":
|
||||
"""Remove a table. Raises an error if the table doesn't exist.
|
||||
|
||||
name (str): Name of the table to remove.
|
||||
|
@ -88,7 +105,7 @@ class Lookups:
|
|||
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
|
||||
return self._tables.pop(name)
|
||||
|
||||
def has_table(self, name):
|
||||
def has_table(self, name: str) -> bool:
|
||||
"""Check if the lookups contain a table of a given name.
|
||||
|
||||
name (str): Name of the table.
|
||||
|
@ -98,7 +115,7 @@ class Lookups:
|
|||
"""
|
||||
return name in self._tables
|
||||
|
||||
def to_bytes(self, **kwargs):
|
||||
def to_bytes(self, **kwargs) -> bytes:
|
||||
"""Serialize the lookups to a bytestring.
|
||||
|
||||
RETURNS (bytes): The serialized Lookups.
|
||||
|
@ -107,7 +124,7 @@ class Lookups:
|
|||
"""
|
||||
return srsly.msgpack_dumps(self._tables)
|
||||
|
||||
def from_bytes(self, bytes_data, **kwargs):
|
||||
def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups":
|
||||
"""Load the lookups from a bytestring.
|
||||
|
||||
bytes_data (bytes): The data to load.
|
||||
|
@ -120,7 +137,9 @@ class Lookups:
|
|||
self._tables[key] = Table(key, value)
|
||||
return self
|
||||
|
||||
def to_disk(self, path, filename="lookups.bin", **kwargs):
|
||||
def to_disk(
|
||||
self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
|
||||
) -> None:
|
||||
"""Save the lookups to a directory as lookups.bin. Expects a path to a
|
||||
directory, which will be created if it doesn't exist.
|
||||
|
||||
|
@ -136,7 +155,9 @@ class Lookups:
|
|||
with filepath.open("wb") as file_:
|
||||
file_.write(self.to_bytes())
|
||||
|
||||
def from_disk(self, path, filename="lookups.bin", **kwargs):
|
||||
def from_disk(
|
||||
self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
|
||||
) -> "Lookups":
|
||||
"""Load lookups from a directory containing a lookups.bin. Will skip
|
||||
loading if the file doesn't exist.
|
||||
|
||||
|
@ -162,7 +183,7 @@ class Table(OrderedDict):
|
|||
"""
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data, name=None):
|
||||
def from_dict(cls, data: dict, name: Optional[str] = None) -> "Table":
|
||||
"""Initialize a new table from a dict.
|
||||
|
||||
data (dict): The dictionary.
|
||||
|
@ -175,7 +196,7 @@ class Table(OrderedDict):
|
|||
self.update(data)
|
||||
return self
|
||||
|
||||
def __init__(self, name=None, data=None):
|
||||
def __init__(self, name: Optional[str] = None, data: Optional[dict] = None) -> None:
|
||||
"""Initialize a new table.
|
||||
|
||||
name (str): Optional table name for reference.
|
||||
|
@ -193,7 +214,7 @@ class Table(OrderedDict):
|
|||
if data:
|
||||
self.update(data)
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
def __setitem__(self, key: Union[str, int], value: Any) -> None:
|
||||
"""Set new key/value pair. String keys will be hashed.
|
||||
|
||||
key (str / int): The key to set.
|
||||
|
@ -203,7 +224,7 @@ class Table(OrderedDict):
|
|||
OrderedDict.__setitem__(self, key, value)
|
||||
self.bloom.add(key)
|
||||
|
||||
def set(self, key, value):
|
||||
def set(self, key: Union[str, int], value: Any) -> None:
|
||||
"""Set new key/value pair. String keys will be hashed.
|
||||
Same as table[key] = value.
|
||||
|
||||
|
@ -212,7 +233,7 @@ class Table(OrderedDict):
|
|||
"""
|
||||
self[key] = value
|
||||
|
||||
def __getitem__(self, key):
|
||||
def __getitem__(self, key: Union[str, int]) -> Any:
|
||||
"""Get the value for a given key. String keys will be hashed.
|
||||
|
||||
key (str / int): The key to get.
|
||||
|
@ -221,7 +242,7 @@ class Table(OrderedDict):
|
|||
key = get_string_id(key)
|
||||
return OrderedDict.__getitem__(self, key)
|
||||
|
||||
def get(self, key, default=None):
|
||||
def get(self, key: Union[str, int], default: Optional[Any] = None) -> Any:
|
||||
"""Get the value for a given key. String keys will be hashed.
|
||||
|
||||
key (str / int): The key to get.
|
||||
|
@ -231,7 +252,7 @@ class Table(OrderedDict):
|
|||
key = get_string_id(key)
|
||||
return OrderedDict.get(self, key, default)
|
||||
|
||||
def __contains__(self, key):
|
||||
def __contains__(self, key: Union[str, int]) -> bool:
|
||||
"""Check whether a key is in the table. String keys will be hashed.
|
||||
|
||||
key (str / int): The key to check.
|
||||
|
@ -243,7 +264,7 @@ class Table(OrderedDict):
|
|||
return False
|
||||
return OrderedDict.__contains__(self, key)
|
||||
|
||||
def to_bytes(self):
|
||||
def to_bytes(self) -> bytes:
|
||||
"""Serialize table to a bytestring.
|
||||
|
||||
RETURNS (bytes): The serialized table.
|
||||
|
@ -257,7 +278,7 @@ class Table(OrderedDict):
|
|||
}
|
||||
return srsly.msgpack_dumps(data)
|
||||
|
||||
def from_bytes(self, bytes_data):
|
||||
def from_bytes(self, bytes_data: bytes) -> "Table":
|
||||
"""Load a table from a bytestring.
|
||||
|
||||
bytes_data (bytes): The data to load.
|
||||
|
|
|
@ -26,8 +26,8 @@ def build_nel_encoder(tok2vec, nO=None):
|
|||
|
||||
|
||||
@registry.assets.register("spacy.KBFromFile.v1")
|
||||
def load_kb(nlp_path, kb_path) -> KnowledgeBase:
|
||||
vocab = Vocab().from_disk(Path(nlp_path) / "vocab")
|
||||
def load_kb(vocab_path, kb_path) -> KnowledgeBase:
|
||||
vocab = Vocab().from_disk(vocab_path)
|
||||
kb = KnowledgeBase(vocab=vocab)
|
||||
kb.load_bulk(kb_path)
|
||||
return kb
|
||||
|
|
|
@ -1,30 +1,9 @@
|
|||
from thinc.api import (
|
||||
Model,
|
||||
reduce_mean,
|
||||
Linear,
|
||||
list2ragged,
|
||||
Logistic,
|
||||
ParametricAttention,
|
||||
)
|
||||
from thinc.api import chain, concatenate, clone, Dropout
|
||||
from thinc.api import (
|
||||
SparseLinear,
|
||||
Softmax,
|
||||
softmax_activation,
|
||||
Maxout,
|
||||
reduce_sum,
|
||||
Relu,
|
||||
residual,
|
||||
expand_window,
|
||||
)
|
||||
from thinc.api import (
|
||||
HashEmbed,
|
||||
with_ragged,
|
||||
with_array,
|
||||
with_cpu,
|
||||
uniqued,
|
||||
FeatureExtractor,
|
||||
)
|
||||
from typing import Optional
|
||||
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
|
||||
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
|
||||
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
|
||||
from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued
|
||||
from thinc.api import Relu, residual, expand_window, FeatureExtractor
|
||||
|
||||
from ..spacy_vectors import SpacyVectors
|
||||
from ... import util
|
||||
|
@ -34,7 +13,9 @@ from ..extract_ngrams import extract_ngrams
|
|||
|
||||
|
||||
@registry.architectures.register("spacy.TextCatCNN.v1")
|
||||
def build_simple_cnn_text_classifier(tok2vec, exclusive_classes, nO=None):
|
||||
def build_simple_cnn_text_classifier(
|
||||
tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
|
||||
) -> Model:
|
||||
"""
|
||||
Build a simple CNN text classifier, given a token-to-vector model as inputs.
|
||||
If exclusive_classes=True, a softmax non-linearity is applied, so that the
|
||||
|
@ -90,13 +71,25 @@ def build_text_classifier(
|
|||
nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10
|
||||
)
|
||||
prefix = HashEmbed(
|
||||
nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout, seed=11
|
||||
nO=width // 2,
|
||||
nV=embed_size,
|
||||
column=cols.index(PREFIX),
|
||||
dropout=dropout,
|
||||
seed=11,
|
||||
)
|
||||
suffix = HashEmbed(
|
||||
nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout, seed=12
|
||||
nO=width // 2,
|
||||
nV=embed_size,
|
||||
column=cols.index(SUFFIX),
|
||||
dropout=dropout,
|
||||
seed=12,
|
||||
)
|
||||
shape = HashEmbed(
|
||||
nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout, seed=13
|
||||
nO=width // 2,
|
||||
nV=embed_size,
|
||||
column=cols.index(SHAPE),
|
||||
dropout=dropout,
|
||||
seed=13,
|
||||
)
|
||||
|
||||
width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
|
||||
|
|
|
@ -11,8 +11,8 @@ from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE
|
|||
|
||||
|
||||
@registry.architectures.register("spacy.Tok2VecTensors.v1")
|
||||
def tok2vec_tensors_v1(width):
|
||||
tok2vec = Tok2VecListener("tok2vec", width=width)
|
||||
def tok2vec_tensors_v1(width, upstream="*"):
|
||||
tok2vec = Tok2VecListener(upstream_name=upstream, width=width)
|
||||
return tok2vec
|
||||
|
||||
|
||||
|
|
|
@ -1,30 +1,37 @@
|
|||
from typing import List, Dict, Iterable, Optional, Union, TYPE_CHECKING
|
||||
from wasabi import Printer
|
||||
import warnings
|
||||
|
||||
from .tokens import Doc, Token, Span
|
||||
from .errors import Errors, Warnings
|
||||
from .util import dot_to_dict
|
||||
|
||||
if TYPE_CHECKING:
|
||||
# This lets us add type hints for mypy etc. without causing circular imports
|
||||
from .language import Language # noqa: F401
|
||||
|
||||
|
||||
def analyze_pipes(pipeline, name, pipe, index, warn=True):
|
||||
def analyze_pipes(
|
||||
nlp: "Language", name: str, index: int, warn: bool = True
|
||||
) -> List[str]:
|
||||
"""Analyze a pipeline component with respect to its position in the current
|
||||
pipeline and the other components. Will check whether requirements are
|
||||
fulfilled (e.g. if previous components assign the attributes).
|
||||
|
||||
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
|
||||
nlp (Language): The current nlp object.
|
||||
name (str): The name of the pipeline component to analyze.
|
||||
pipe (callable): The pipeline component function to analyze.
|
||||
index (int): The index of the component in the pipeline.
|
||||
warn (bool): Show user warning if problem is found.
|
||||
RETURNS (list): The problems found for the given pipeline component.
|
||||
RETURNS (List[str]): The problems found for the given pipeline component.
|
||||
"""
|
||||
assert pipeline[index][0] == name
|
||||
prev_pipes = pipeline[:index]
|
||||
pipe_requires = getattr(pipe, "requires", [])
|
||||
requires = {annot: False for annot in pipe_requires}
|
||||
assert nlp.pipeline[index][0] == name
|
||||
prev_pipes = nlp.pipeline[:index]
|
||||
meta = nlp.get_pipe_meta(name)
|
||||
requires = {annot: False for annot in meta.requires}
|
||||
if requires:
|
||||
for prev_name, prev_pipe in prev_pipes:
|
||||
prev_assigns = getattr(prev_pipe, "assigns", [])
|
||||
for annot in prev_assigns:
|
||||
prev_meta = nlp.get_pipe_meta(prev_name)
|
||||
for annot in prev_meta.assigns:
|
||||
requires[annot] = True
|
||||
problems = []
|
||||
for annot, fulfilled in requires.items():
|
||||
|
@ -35,46 +42,29 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
|
|||
return problems
|
||||
|
||||
|
||||
def analyze_all_pipes(pipeline, warn=True):
|
||||
def analyze_all_pipes(nlp: "Language", warn: bool = True) -> Dict[str, List[str]]:
|
||||
"""Analyze all pipes in the pipeline in order.
|
||||
|
||||
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
|
||||
nlp (Language): The current nlp object.
|
||||
warn (bool): Show user warning if problem is found.
|
||||
RETURNS (dict): The problems found, keyed by component name.
|
||||
RETURNS (Dict[str, List[str]]): The problems found, keyed by component name.
|
||||
"""
|
||||
problems = {}
|
||||
for i, (name, pipe) in enumerate(pipeline):
|
||||
problems[name] = analyze_pipes(pipeline, name, pipe, i, warn=warn)
|
||||
for i, name in enumerate(nlp.pipe_names):
|
||||
problems[name] = analyze_pipes(nlp, name, i, warn=warn)
|
||||
return problems
|
||||
|
||||
|
||||
def dot_to_dict(values):
|
||||
"""Convert dot notation to a dict. For example: ["token.pos", "token._.xyz"]
|
||||
become {"token": {"pos": True, "_": {"xyz": True }}}.
|
||||
|
||||
values (iterable): The values to convert.
|
||||
RETURNS (dict): The converted values.
|
||||
"""
|
||||
result = {}
|
||||
for value in values:
|
||||
path = result
|
||||
parts = value.lower().split(".")
|
||||
for i, item in enumerate(parts):
|
||||
is_last = i == len(parts) - 1
|
||||
path = path.setdefault(item, True if is_last else {})
|
||||
return result
|
||||
|
||||
|
||||
def validate_attrs(values):
|
||||
def validate_attrs(values: Iterable[str]) -> Iterable[str]:
|
||||
"""Validate component attributes provided to "assigns", "requires" etc.
|
||||
Raises error for invalid attributes and formatting. Doesn't check if
|
||||
custom extension attributes are registered, since this is something the
|
||||
user might want to do themselves later in the component.
|
||||
|
||||
values (iterable): The string attributes to check, e.g. `["token.pos"]`.
|
||||
RETURNS (iterable): The checked attributes.
|
||||
values (Iterable[str]): The string attributes to check, e.g. `["token.pos"]`.
|
||||
RETURNS (Iterable[str]): The checked attributes.
|
||||
"""
|
||||
data = dot_to_dict(values)
|
||||
data = dot_to_dict({value: True for value in values})
|
||||
objs = {"doc": Doc, "token": Token, "span": Span}
|
||||
for obj_key, attrs in data.items():
|
||||
if obj_key == "span":
|
||||
|
@ -111,37 +101,40 @@ def validate_attrs(values):
|
|||
return values
|
||||
|
||||
|
||||
def _get_feature_for_attr(pipeline, attr, feature):
|
||||
def _get_feature_for_attr(nlp: "Language", attr: str, feature: str) -> List[str]:
|
||||
assert feature in ["assigns", "requires"]
|
||||
result = []
|
||||
for pipe_name, pipe in pipeline:
|
||||
pipe_assigns = getattr(pipe, feature, [])
|
||||
for pipe_name in nlp.pipe_names:
|
||||
meta = nlp.get_pipe_meta(pipe_name)
|
||||
pipe_assigns = getattr(meta, feature, [])
|
||||
if attr in pipe_assigns:
|
||||
result.append((pipe_name, pipe))
|
||||
result.append(pipe_name)
|
||||
return result
|
||||
|
||||
|
||||
def get_assigns_for_attr(pipeline, attr):
|
||||
def get_assigns_for_attr(nlp: "Language", attr: str) -> List[str]:
|
||||
"""Get all pipeline components that assign an attr, e.g. "doc.tensor".
|
||||
|
||||
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
|
||||
nlp (Language): The current nlp object.
|
||||
attr (str): The attribute to check.
|
||||
RETURNS (list): (name, pipeline) tuples of components that assign the attr.
|
||||
RETURNS (List[str]): Names of components that assign the attr.
|
||||
"""
|
||||
return _get_feature_for_attr(pipeline, attr, "assigns")
|
||||
return _get_feature_for_attr(nlp, attr, "assigns")
|
||||
|
||||
|
||||
def get_requires_for_attr(pipeline, attr):
|
||||
def get_requires_for_attr(nlp: "Language", attr: str) -> List[str]:
|
||||
"""Get all pipeline components that require an attr, e.g. "doc.tensor".
|
||||
|
||||
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
|
||||
nlp (Language): The current nlp object.
|
||||
attr (str): The attribute to check.
|
||||
RETURNS (list): (name, pipeline) tuples of components that require the attr.
|
||||
RETURNS (List[str]): Names of components that require the attr.
|
||||
"""
|
||||
return _get_feature_for_attr(pipeline, attr, "requires")
|
||||
return _get_feature_for_attr(nlp, attr, "requires")
|
||||
|
||||
|
||||
def print_summary(nlp, pretty=True, no_print=False):
|
||||
def print_summary(
|
||||
nlp: "Language", pretty: bool = True, no_print: bool = False
|
||||
) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]:
|
||||
"""Print a formatted summary for the current nlp object's pipeline. Shows
|
||||
a table with the pipeline components and what they assign and require, as
|
||||
well as any problems if available.
|
||||
|
@ -154,12 +147,10 @@ def print_summary(nlp, pretty=True, no_print=False):
|
|||
msg = Printer(pretty=pretty, no_print=no_print)
|
||||
overview = []
|
||||
problems = {}
|
||||
for i, (name, pipe) in enumerate(nlp.pipeline):
|
||||
requires = getattr(pipe, "requires", [])
|
||||
assigns = getattr(pipe, "assigns", [])
|
||||
retok = getattr(pipe, "retokenizes", False)
|
||||
overview.append((i, name, requires, assigns, retok))
|
||||
problems[name] = analyze_pipes(nlp.pipeline, name, pipe, i, warn=False)
|
||||
for i, name in enumerate(nlp.pipe_names):
|
||||
meta = nlp.get_pipe_meta(name)
|
||||
overview.append((i, name, meta.requires, meta.assigns, meta.retokenizes))
|
||||
problems[name] = analyze_pipes(nlp, name, i, warn=False)
|
||||
msg.divider("Pipeline Overview")
|
||||
header = ("#", "Component", "Requires", "Assigns", "Retokenizes")
|
||||
msg.table(overview, header=header, divider=True, multiline=True)
|
||||
|
@ -175,15 +166,19 @@ def print_summary(nlp, pretty=True, no_print=False):
|
|||
return {"overview": overview, "problems": problems}
|
||||
|
||||
|
||||
def count_pipeline_interdependencies(pipeline):
|
||||
def count_pipeline_interdependencies(nlp: "Language") -> List[int]:
|
||||
"""Count how many subsequent components require an annotation set by each
|
||||
component in the pipeline.
|
||||
|
||||
nlp (Language): The current nlp object.
|
||||
RETURNS (List[int]): The interdependency counts.
|
||||
"""
|
||||
pipe_assigns = []
|
||||
pipe_requires = []
|
||||
for name, pipe in pipeline:
|
||||
pipe_assigns.append(set(getattr(pipe, "assigns", [])))
|
||||
pipe_requires.append(set(getattr(pipe, "requires", [])))
|
||||
for name in nlp.pipe_names:
|
||||
meta = nlp.get_pipe_meta(name)
|
||||
pipe_assigns.append(set(meta.assigns))
|
||||
pipe_requires.append(set(meta.requires))
|
||||
counts = []
|
||||
for i, assigns in enumerate(pipe_assigns):
|
||||
count = 0
|
||||
|
|
|
@ -1,28 +1,33 @@
|
|||
from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
|
||||
from .pipes import TextCategorizer, Pipe, Sentencizer
|
||||
from .pipes import SentenceRecognizer
|
||||
from .simple_ner import SimpleNER
|
||||
from .morphologizer import Morphologizer
|
||||
from .dep_parser import DependencyParser
|
||||
from .entity_linker import EntityLinker
|
||||
from .ner import EntityRecognizer
|
||||
from .entityruler import EntityRuler
|
||||
from .morphologizer import Morphologizer
|
||||
from .pipe import Pipe
|
||||
from spacy.pipeline.senter import SentenceRecognizer
|
||||
from .sentencizer import Sentencizer
|
||||
from .simple_ner import SimpleNER
|
||||
from .tagger import Tagger
|
||||
from .textcat import TextCategorizer
|
||||
from .tok2vec import Tok2Vec
|
||||
from .hooks import SentenceSegmenter, SimilarityHook
|
||||
from .functions import merge_entities, merge_noun_chunks, merge_subtokens
|
||||
|
||||
__all__ = [
|
||||
"Tagger",
|
||||
"DependencyParser",
|
||||
"EntityRecognizer",
|
||||
"EntityLinker",
|
||||
"TextCategorizer",
|
||||
"Tok2Vec",
|
||||
"Pipe",
|
||||
"Morphologizer",
|
||||
"EntityRecognizer",
|
||||
"EntityRuler",
|
||||
"Sentencizer",
|
||||
"SentenceSegmenter",
|
||||
"Morphologizer",
|
||||
"Pipe",
|
||||
"SentenceRecognizer",
|
||||
"SentenceSegmenter",
|
||||
"Sentencizer",
|
||||
"SimilarityHook",
|
||||
"SimpleNER",
|
||||
"Tagger",
|
||||
"TextCategorizer",
|
||||
"Tok2Vec",
|
||||
"merge_entities",
|
||||
"merge_noun_chunks",
|
||||
"merge_subtokens",
|
||||
|
|
|
@ -1,93 +0,0 @@
|
|||
from pathlib import Path
|
||||
|
||||
from ... import util
|
||||
|
||||
|
||||
def default_nel_config():
|
||||
loc = Path(__file__).parent / "entity_linker_defaults.cfg"
|
||||
return util.load_config(loc, create_objects=False)
|
||||
|
||||
|
||||
def default_nel():
|
||||
loc = Path(__file__).parent / "entity_linker_defaults.cfg"
|
||||
return util.load_config(loc, create_objects=True)["model"]
|
||||
|
||||
|
||||
def default_morphologizer_config():
|
||||
loc = Path(__file__).parent / "morphologizer_defaults.cfg"
|
||||
return util.load_config(loc, create_objects=False)
|
||||
|
||||
|
||||
def default_morphologizer():
|
||||
loc = Path(__file__).parent / "morphologizer_defaults.cfg"
|
||||
return util.load_config(loc, create_objects=True)["model"]
|
||||
|
||||
|
||||
def default_parser_config():
|
||||
loc = Path(__file__).parent / "parser_defaults.cfg"
|
||||
return util.load_config(loc, create_objects=False)
|
||||
|
||||
|
||||
def default_parser():
|
||||
loc = Path(__file__).parent / "parser_defaults.cfg"
|
||||
return util.load_config(loc, create_objects=True)["model"]
|
||||
|
||||
|
||||
def default_ner_config():
|
||||
loc = Path(__file__).parent / "ner_defaults.cfg"
|
||||
return util.load_config(loc, create_objects=False)
|
||||
|
||||
|
||||
def default_ner():
|
||||
loc = Path(__file__).parent / "ner_defaults.cfg"
|
||||
return util.load_config(loc, create_objects=True)["model"]
|
||||
|
||||
|
||||
def default_senter_config():
|
||||
loc = Path(__file__).parent / "senter_defaults.cfg"
|
||||
return util.load_config(loc, create_objects=False)
|
||||
|
||||
|
||||
def default_senter():
|
||||
loc = Path(__file__).parent / "senter_defaults.cfg"
|
||||
return util.load_config(loc, create_objects=True)["model"]
|
||||
|
||||
|
||||
def default_tagger_config():
|
||||
loc = Path(__file__).parent / "tagger_defaults.cfg"
|
||||
return util.load_config(loc, create_objects=False)
|
||||
|
||||
|
||||
def default_tagger():
|
||||
loc = Path(__file__).parent / "tagger_defaults.cfg"
|
||||
return util.load_config(loc, create_objects=True)["model"]
|
||||
|
||||
|
||||
def default_textcat_config():
|
||||
loc = Path(__file__).parent / "textcat_defaults.cfg"
|
||||
return util.load_config(loc, create_objects=False)
|
||||
|
||||
|
||||
def default_textcat():
|
||||
loc = Path(__file__).parent / "textcat_defaults.cfg"
|
||||
return util.load_config(loc, create_objects=True)["model"]
|
||||
|
||||
|
||||
def default_tok2vec_config():
|
||||
loc = Path(__file__).parent / "tok2vec_defaults.cfg"
|
||||
return util.load_config(loc, create_objects=False)
|
||||
|
||||
|
||||
def default_tok2vec():
|
||||
loc = Path(__file__).parent / "tok2vec_defaults.cfg"
|
||||
return util.load_config(loc, create_objects=True)["model"]
|
||||
|
||||
|
||||
def default_simple_ner_config():
|
||||
loc = Path(__file__).parent / "simple_ner_defaults.cfg"
|
||||
return util.load_config(loc, create_objects=False)
|
||||
|
||||
|
||||
def default_simple_ner():
|
||||
loc = Path(__file__).parent / "simple_ner_defaults.cfg"
|
||||
return util.load_config(loc, create_objects=True)["model"]
|
|
@ -1,13 +0,0 @@
|
|||
[model]
|
||||
@architectures = "spacy.EntityLinker.v1"
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
pretrained_vectors = null
|
||||
width = 96
|
||||
depth = 2
|
||||
embed_size = 300
|
||||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
dropout = null
|
|
@ -1,14 +0,0 @@
|
|||
[model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashCharEmbedCNN.v1"
|
||||
pretrained_vectors = null
|
||||
width = 128
|
||||
depth = 4
|
||||
embed_size = 7000
|
||||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
nM = 64
|
||||
nC = 8
|
||||
dropout = null
|
|
@ -1,15 +0,0 @@
|
|||
[model]
|
||||
@architectures = "spacy.MultiTask.v1"
|
||||
maxout_pieces = 3
|
||||
token_vector_width = 96
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
pretrained_vectors = null
|
||||
width = 96
|
||||
depth = 4
|
||||
embed_size = 2000
|
||||
window_size = 1
|
||||
maxout_pieces = 2
|
||||
subword_features = true
|
||||
dropout = null
|
|
@ -1,16 +0,0 @@
|
|||
[model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 6
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
pretrained_vectors = null
|
||||
width = 96
|
||||
depth = 4
|
||||
embed_size = 2000
|
||||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
dropout = null
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user